## Notebook to experiment with porting the Hooke model and its latent spaces to Python
The eventual goal is to write code that can infer latent position and pseudostage for hotfish and other perturbed embryos.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import os
import glob2 as glob
import patsy

# set paths
# Root folder for figures written by this notebook (machine-specific absolute path).
fig_root = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/figures/seq_data/PLN/"


# specify which regression to use
# `ccm` is used as a sub-folder name under both the figure root and the Hooke
# model directory; get_covariate_df also checks whether it contains "inter".
ccm = "t_spline_inter2" #"t_spline_inter"

# Per-model figure folder. The empty last component makes the joined path end
# with a separator; the folder is created if it does not exist yet.
fig_folder = os.path.join(fig_root, ccm, "")
os.makedirs(fig_folder, exist_ok=True)

# set path to data
hooke_data_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/seq_data/emb_projections/hooke_model_files/"
ccs_data_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/seq_data/emb_projections/ccs_data_cell_type_broad/"
# Alternative "test" locations, kept for reference:
# hooke_data_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/seq_data/emb_projections/hooke_model_test/"
# ccs_data_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/seq_data/emb_projections/ccs_data_test/"
# Folder for the selected model; it ends with a separator so file names can be
# appended with `+` when the model files are read.
model_path = os.path.join(hooke_data_path, ccm, "")

### Load in metadata, model params, and counts matrices

In [None]:
# load full counts dataset
# Long-format table; the first CSV column is used as the index.
hooke_counts_long = pd.read_csv(model_path + "abundance_estimates.csv", index_col=0)
cols = list(hooke_counts_long.columns)
# Every column that precedes "cell_group" is treated as a covariate column.
cell_ind = cols.index("cell_group")
cov_cols = cols[:cell_ind]
# Long -> wide: rows are keyed by the covariate columns, with one "log_abund"
# column per cell_group value.
hooke_counts_df = hooke_counts_long.pivot(index=cov_cols,
                                           columns=["cell_group"], values = ["log_abund"])
# Flatten the two-level column labels produced by pivot into single strings
# ("log_abund_<cell_group>").
hooke_counts_df.columns = ['_'.join(map(str, col)).strip('_') for col in hooke_counts_df.columns.values]
hooke_counts_df.reset_index(inplace=True)
# Drop the "log_abund_" text so the abundance columns are named by cell group only.
new_cols = [col.replace("log_abund_", "") for col in hooke_counts_df.columns.values]
hooke_counts_df.columns = new_cols
# Keep the first `cell_ind` (covariate) columns in place and order the remaining
# columns alphabetically, ignoring case.
sort_cols = new_cols[:cell_ind] + sorted(new_cols[cell_ind:], key=str.lower)
hooke_counts_df = hooke_counts_df.loc[:, sort_cols]
# meta_df = pd.read_csv(ccs_data_path + "mdl_embryo_metadata.csv", index_col=0)
# meta_df["dis_protocol_str"] = meta_df["dis_protocol"].astype(str)

# model formula
with open(model_path + "model_string.txt", "r") as file:
    formula_str = file.read()
# Prepend a placeholder response name; get_covariate_df adds a column of that
# name before calling patsy.dmatrices.
# NOTE(review): presumably the file holds only the right-hand side of the
# formula (starting with "~") - confirm against model_string.txt.
formula_str = "dummy_response " + formula_str 
# Textual rewrites of the formula: "ns(" -> "cr(", "c(" -> "(", and newlines removed.
# NOTE(review): the "c(" replacement hits every occurrence of those two
# characters, not only stand-alone c(...) calls.
formula_str = formula_str.replace("ns(", "cr(")
formula_str = formula_str.replace("c(", "(")
formula_str = formula_str.replace("\n", "")

# load hooke predictions (for comparison purposes)
latent_df = pd.read_csv(model_path + "latents.csv", index_col=0)
# Lookup table used by get_spline_basis, which reads its "timepoint" column and
# interpolates every column after the first.
time_splines = pd.read_csv(model_path + "time_splines.csv")

# load hooke model files
# b_array = pd.read_csv(model_path + "B.csv", index_col=0)
cov_array = pd.read_csv(model_path + "COV.csv", index_col=0)
theta_array = pd.read_csv(model_path + "Theta.csv", index_col=0)

# latent_df.head()
# Normalise the coefficient column names: "(Intercept)" becomes "Intercept" and
# " = c" becomes "=". get_covariate_df keeps only design-matrix columns whose
# cleaned names appear in `cols_from_clean`.
theta_array = theta_array.rename(columns={"(Intercept)":"Intercept"})
cols_from = theta_array.columns
cols_from_clean = [col.replace(" = c", "=") for col in cols_from]
theta_array.columns = cols_from_clean

# Notebook display of the first rows of the spline lookup table.
time_splines.head()

In [None]:
from scipy.interpolate import interp1d
# Assume the lookup table has columns: "timepoint", "V1", "V2", "V3", "V4"
# (The actual names might differ; adjust as necessary.)

# Define a function to interpolate the spline basis for a new time value.
def get_spline_basis(new_time_vec, lookup_df):
    """Interpolate a tabulated spline basis at new time values.

    Parameters
    ----------
    new_time_vec : array-like
        One-dimensional collection of time values to evaluate.
    lookup_df : pandas.DataFrame
        Lookup table with a "timepoint" column. Every column after the first
        is treated as one basis function tabulated against "timepoint".

    Returns
    -------
    pandas.DataFrame
        A "timepoint" column holding ``new_time_vec`` followed by one column
        per interpolated basis function, named as in ``lookup_df``.
    """
    known_times = lookup_df["timepoint"]

    # Start from the query times; basis columns are appended one by one.
    basis = pd.DataFrame(new_time_vec, columns=["timepoint"])

    # The first column of the lookup table is skipped by position.
    for name in lookup_df.columns[1:]:
        # Piecewise-linear interpolation; values outside the tabulated range
        # are linearly extrapolated.
        interpolator = interp1d(
            known_times,
            lookup_df[name],
            kind='linear',
            fill_value="extrapolate",
        )
        basis[name] = interpolator(new_time_vec)

    return basis

### Experiment with building covariate matrix with patsy

In [None]:
def get_covariate_df(formula_str, meta_df, time_splines):
    """Build the covariate (design) matrix for ``meta_df``.

    The matrix is generated with patsy from ``formula_str``, its column names
    are cleaned so they can be compared with the module-level
    ``cols_from_clean`` list, and only matching columns are kept. The spline
    columns are then overwritten with values interpolated from
    ``time_splines`` (patsy's splines do not match R's ``ns``).

    Parameters
    ----------
    formula_str : str
        Patsy formula whose response is named ``dummy_response``.
    meta_df : pandas.DataFrame
        Covariates, including a "timepoint" column. It is not modified: the
        placeholder response column is added to a copy.
    time_splines : pandas.DataFrame
        Spline lookup table passed to ``get_spline_basis``; the columns after
        the first are the basis functions.

    Returns
    -------
    X : pandas.DataFrame
        Design matrix restricted to the columns found in ``cols_from_clean``.
    timepoints : numpy.ndarray
        The time values the spline basis was evaluated at.

    Notes
    -----
    Reads the module-level names ``cols_from_clean`` and ``ccm``. When ``ccm``
    contains "inter", the spline columns beyond the first block are filled
    with the basis multiplied row-wise by the "dis_protocol" column of ``X``.
    """
    # patsy needs the response column named in the formula; add it to a copy
    # so the caller's frame is left untouched.
    design_input = meta_df.assign(dummy_response=0)
    _, X = patsy.dmatrices(formula_str, design_input, return_type='dataframe')

    # Strip patsy's bracket decorations and undo the "ns(" -> "cr(" rewrite
    # applied to the formula. Only the call prefix "cr(" is mapped back, so
    # other names that merely contain "cr" are left alone.
    cols_to_clean = [
        col.replace("[T.", "").replace("]", "").replace("[", "").replace("cr(", "ns(")
        for col in X.columns
    ]
    cols_to_keep = [col for col in cols_to_clean if col in cols_from_clean]
    X.columns = cols_to_clean
    # Explicit copy: the spline columns are assigned into below.
    X = X.loc[:, cols_to_keep].copy()

    # replace spline cols with lookups (can't get patsy to match ns from R)
    spline_cols = [col for col in cols_to_keep if "ns(" in col]
    spline_vals = get_spline_basis(meta_df.loc[:, "timepoint"].to_numpy(), time_splines)
    basis = spline_vals.iloc[:, 1:].to_numpy()
    # Number of basis functions comes from the lookup table, not a constant.
    n_basis = basis.shape[1]

    if "inter" in ccm:
        # Main-effect spline columns first, then the interaction block.
        X.loc[:, spline_cols[:n_basis]] = basis
        X.loc[:, spline_cols[n_basis:]] = np.multiply(
            basis, X.loc[:, "dis_protocol"].to_numpy()[:, None]
        )
    else:
        X.loc[:, spline_cols] = basis

    return X, spline_vals.iloc[:, 0].to_numpy()

In [None]:
# Covariate columns of the observed samples, plus the constant placeholder
# response column.
meta_df = hooke_counts_df.loc[:, cov_cols].assign(dummy_response=0)

# Design matrix for the observed samples and the time values it was built from.
X, splines = get_covariate_df(
    formula_str,
    meta_df,
    time_splines,
)

X.head()

### Verify that our predictions are consistent with output of Hooke's "estimate_abundances" function

In [None]:
# mu_python = np.matmul(X, theta_array.T)
# mu_python[mu_python < -5] = -5 # looks like Hooke applies a lower bound at log(counts)=-5
# python_pd = mu_python.to_numpy().ravel()
# hooke_pd = hooke_counts_df.iloc[:, cell_ind:].to_numpy().ravel()
#
# plot_indices = np.random.choice(range(len(hooke_pd)), 1000)
# fig = px.scatter(x=hooke_pd[plot_indices], y=python_pd[plot_indices])
#
# fig.update_layout(
#     xaxis_title="Hooke log abundance predictions",
#     yaxis_title="Python log abundance predictions"
# )
#
# fig.update_layout(width=800, height=600)
# fig.show()
#
# fig.write_image(fig_folder + "python_pd_validation.png", scale=2)
# fig.write_html(fig_folder + "python_pd_validation.html")

### Generate mean WT trajectories
Let's generate a high-res time trajectory. Look at differences between bead and enzymatic protocols. Average across experiment offsets (does that make sense?)

In [None]:
import itertools

# Evenly spaced grid of `nt` time values between 10 and 74.
nt = 250
time_vals = np.linspace(10, 74, nt).tolist()

# Distinct covariate values present in the observed metadata.
dis_protocol_vals = np.unique(meta_df["dis_protocol"]).tolist()
expt_vals = np.unique(meta_df["expt"]).tolist()

# One query row per (timepoint, dis_protocol, expt) combination; timepoint
# varies slowest because it is the first factor of the product.
query_df = pd.DataFrame(
    itertools.product(time_vals, dis_protocol_vals, expt_vals),
    columns=["timepoint", "dis_protocol", "expt"],
)

# get covariate matrix
X_ref, t_vec = get_covariate_df(formula_str, query_df, time_splines)

# Experiment indicator columns of the design matrix.
expt_cols = [name for name in X_ref.columns if "expt" in name]
# Earlier filtering experiments, kept for reference:
# null_expt_filter = np.all(X_ref.loc[:, expt_cols].to_numpy()==0, axis=1)
# null_expt_filter = X_ref.loc[:, "dis_protocol"]==2
# X_ref = X_ref.loc[null_expt_filter, :]
# t_vec = t_vec[null_expt_filter]

# Prepend an all-zero column so that rows with no experiment indicator set
# resolve to index 0 when taking the arg-max.
expt_array = np.hstack(
    (np.zeros((len(X_ref), 1)), X_ref[expt_cols].to_numpy())
)
expt_vec = expt_array.argmax(axis=1)
expt_names = ["baseline (bead)"]
expt_names += [name.replace("expt", "") for name in expt_cols]

# get log abundance predictions
hooke_trend_df = np.matmul(X_ref, theta_array.T)
# hooke_trend_df.to_csv("/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/analyses/crossmodal/hotfish/hooke_time_trends.csv", index=False)

In [None]:
# filter for baseline

# Rows of the reference grid with dis_protocol == 1 and no experiment indicator
# set (expt_vec == 0 corresponds to the all-zero column prepended to expt_array).
enz_filter = (X_ref["dis_protocol"]==1).to_numpy()
expt_filter = expt_vec==0
baseline_filter = enz_filter & expt_filter

# Take an explicit copy so the new column is written to an independent frame
# rather than to a selection of hooke_trend_df.
hooke_trend_baseline = hooke_trend_df.loc[baseline_filter].copy()
# Use the same mask on the per-row time vector so every prediction is paired
# with its own time value, instead of relying on the row count matching the grid.
hooke_trend_baseline["stage_hpf"] = t_vec[baseline_filter]
hooke_trend_baseline.to_csv("/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/analyses/crossmodal/hotfish/hooke_time_trends.csv", index=False)