#### Use CCA to look for axes of correspondence between morph and seq modalities

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
from glob2 import glob
# from sklearn.cross_decomposition import CCA

### Set paths

In [None]:
# Project root; the training-data and results folders below are built from it.
root = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
train_name = "20241107_ds"
model_name = "SeqVAE_z100_ne150_sweep_01_block01_iter030"
train_dir = os.path.join(root, "training_data", train_name, "")
output_dir = os.path.join(train_dir, model_name)

# Morph model: use the entry inside the model folder that sorts last.
run_glob = os.path.join(output_dir, "*")
training_path = sorted(glob(run_glob))[-1]
# NOTE(review): dirname() returns the parent directory (i.e. output_dir), not the
# name of the run folder -- confirm os.path.basename() was not intended.
training_name = os.path.dirname(training_path)
morph_read_path = os.path.join(training_path, "figures", "")

# Hooke projections
hooke_model_name = "bead_expt_linear"
latent_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/seq_data/emb_projections/latent_projections/"
hooke_model_path = os.path.join(latent_path, hooke_model_name, "")

# Where intermediate data (out_path) and figures (fig_path) are written
out_path = os.path.join(root, "results", "20240303", "")
fig_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/slides/morphseq/20250312/morphseq_cca/"

# Create both output folders if they do not exist yet
for folder in (out_path, fig_path):
    os.makedirs(folder, exist_ok=True)

### Load data

In [None]:
# Latent encodings for each modality (morph first, then hooke/seq, which is
# indexed by its first CSV column)
morph_df = pd.read_csv(os.path.join(out_path, "hf_morph_df.csv"))
seq_df = pd.read_csv(os.path.join(out_path, "hf_seq_df.csv"), index_col=0)

# Metadata table used to link the two modalities
morphseq_df = pd.read_csv(os.path.join(root, "metadata", "morphseq_metadata.csv"))

# Spline datasets for each space, indexed by stage_hpf -- we will use these to
# pretrain our MLP
morph_spline_df = pd.read_csv(os.path.join(out_path, "spline_morph_df.csv")).set_index("stage_hpf")
seq_spline_df = pd.read_csv(os.path.join(out_path, "spline_seq_df.csv")).set_index("stage_hpf")

### Subset for hotfish2 

In [None]:
import re

# Restrict the metadata to the hotfish2 experiments
hf_experiments = np.asarray(["20240813_24hpf", "20240813_30hpf", "20240813_36hpf"])
hf_morphseq_df = morphseq_df.loc[np.isin(morphseq_df["experiment_date"], hf_experiments), :].reset_index(drop=True)

# Columns holding the PCA coordinates of each latent space
# mu_cols = [col for col in morph_df.columns.tolist() if "z_mu_b" in col]
pattern = r"PCA_.*_bio"
pca_cols_morph = [col for col in morph_df.columns if re.search(pattern, col)]
pca_cols_seq = [col for col in seq_df.columns if "PCA" in col]

# subset morph: keep hotfish2 snips that have a morph encoding, indexed by snip_id
hf_morph_df = hf_morphseq_df.loc[:, ["snip_id", "sample"]].merge(morph_df, how="inner", on="snip_id")
hf_morph_df = hf_morph_df.set_index("snip_id")
hf_morph_df = hf_morph_df.loc[:, pca_cols_morph + ["sample"]]

# subset seq dataset: look up each embryo's seq projection via its sample ID
hf_seq_df = hf_morph_df.loc[:, ["sample"]].merge(seq_df, how="inner", right_index=True, left_on="sample")

# The inner join drops embryos that have no seq projection. Drop the same
# embryos from the morph table so the two stay paired row for row; everything
# downstream pairs morph and seq observations by position.
hf_morph_df = hf_morph_df.loc[hf_morph_df["sample"].isin(hf_seq_df["sample"]), :]
hf_seq_df = hf_seq_df.set_index("sample")
if not np.array_equal(hf_seq_df.index.to_numpy(), hf_morph_df["sample"].to_numpy()):
    raise ValueError(
        "morph and seq tables are not row-aligned after subsetting "
        f"(morph: {hf_morph_df.shape[0]} rows, seq: {hf_seq_df.shape[0]} rows); "
        "check for duplicated sample IDs"
    )
print(hf_seq_df.shape)

# get rid of sample col
hf_morph_df = hf_morph_df.drop(labels=["sample"], axis=1)
print(hf_morph_df.shape)

# filter out a couple observations that had QC problems
# (drop=True: do not leak the old positional index into the table as an "index" column)
hf_morphseq_df = hf_morphseq_df.loc[np.isin(hf_morphseq_df["snip_id"], hf_morph_df.index), :].reset_index(drop=True)
# attach the model-inferred stage; joins on whatever columns the two frames share
hf_morphseq_df = hf_morphseq_df.merge(morph_df.loc[:, ["snip_id", "mdl_stage_hpf"]])
print(hf_morphseq_df.shape)

### Extract spline and obs columns to fit

In [None]:
from sklearn.decomposition import PCA
# n_components = len(pca_cols_morph) # captures over 99% of variance in both modalities

# Observed embeddings as plain arrays (one row per observation in each table)
morph_pca = hf_morph_df.loc[:, pca_cols_morph].to_numpy()
seq_pca = hf_seq_df.loc[:, pca_cols_seq].to_numpy()

# Matching spline tables, restricted to the same PCA columns
morph_spline_pca = morph_spline_df.loc[:, pca_cols_morph].to_numpy()
seq_spline_pca = seq_spline_df.loc[:, pca_cols_seq].to_numpy()

### Visualize the two latent spaces

In [None]:
# First three morph PCs of the observations, coloured by temperature
morph_xyz = {axis: morph_pca[:, k] for k, axis in enumerate("xyz")}
fig = px.scatter_3d(**morph_xyz, color=hf_morphseq_df["temperature"])

# Overlay the morph spline as a line through the same three PCs
morph_spline_xyz = {axis: morph_spline_pca[:, k] for k, axis in enumerate("xyz")}
spline_trace = go.Scatter3d(**morph_spline_xyz, mode="lines")
fig.add_traces(spline_trace)

fig.update_traces(marker={"size": 5})
fig.update_layout(title="morphology space")
fig.show()

In [None]:
# First three seq PCs of the observations, coloured by temperature; stage_hpf on hover
seq_xyz = {axis: seq_pca[:, k] for k, axis in enumerate("xyz")}
fig = px.scatter_3d(
    **seq_xyz,
    color=hf_morphseq_df["temperature"],
    hover_data=[hf_morphseq_df["stage_hpf"]],
)

# Overlay the seq spline as a line through the same three PCs
seq_spline_xyz = {axis: seq_spline_pca[:, k] for k, axis in enumerate("xyz")}
spline_trace = go.Scatter3d(**seq_spline_xyz, mode="lines")
fig.add_traces(spline_trace)

fig.update_traces(marker={"size": 5})
fig.update_layout(title="transcriptional space")
fig.show()

### Use K-fold cross validation to identify the optimal MLP architecture

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold, cross_val_score
from itertools import product
from tqdm import tqdm

n_kf_splits = 5

# Regression target: every morph PCA column
n_dim_out = morph_pca.shape[1]
y = morph_pca[:, :n_dim_out]

# Sweep axes: number of leading seq PCs fed to the model, and hidden-layer widths
n_dim_in_vec = [10, 15, 20, 25, 50, 75, 100]
layer_size_list = [10, 25, 50, 100]

# Hidden-layer layouts: all 1- and 2-layer combinations, plus a reduced 3-layer grid
one_layer_configs = [(width,) for width in layer_size_list]
two_layer_configs = list(product(layer_size_list, repeat=2))
three_layer_configs = list(product(layer_size_list[:2], layer_size_list[:3], layer_size_list[:3]))
mdl_configs = [*one_layer_configs, *two_layer_configs, *three_layer_configs]

# Every (layout, n_dim_in) pair gets one row of the results table
model_specs = list(product(mdl_configs, n_dim_in_vec))
model_specs_arr = [list(spec) for spec in model_specs]
mdl_df = pd.DataFrame(model_specs_arr, columns=["mdl_config", "n_dim_in"])

# Shuffled k-fold splitter, seeded so every model sees the same folds
kf = KFold(n_splits=n_kf_splits, shuffle=True, random_state=42)

# mdl_df rows are in model_specs order, so the enumerate position is the row label
for m, (mdl_config, n_dim_in) in enumerate(tqdm(model_specs)):

    # predictors: leading n_dim_in seq PCs
    X = seq_pca[:, :n_dim_in]

    mlp = MLPRegressor(hidden_layer_sizes=mdl_config, max_iter=20000, tol=1e-8, random_state=42)

    # mean cross-validated R² across the folds
    scores = cross_val_score(mlp, X, y, cv=kf, scoring="r2")
    mdl_df.loc[m, "score"] = scores.mean()

In [None]:
# Persist the sweep results; swap in the commented line to reload instead of re-running the sweep
cv_scores_file = os.path.join(out_path, "mlp_cv_scores.csv")
mdl_df.to_csv(cv_scores_file, index=False)
# mdl_df = pd.read_csv(os.path.join(out_path, "mlp_cv_scores.csv"))

In [None]:
# CV score of every model variant against the number of seq PCs it was given
fig = px.scatter(mdl_df, x="n_dim_in", y="score", hover_data=["mdl_config"])

# global font for the figure (size sets the overall text size)
cv_plot_font = {"family": "Arial, sans-serif", "size": 18, "color": "black"}
fig.update_layout(
    width=600,
    height=600,
    xaxis={"title": "number of seq PC components"},
    yaxis={"title": "CV score (R2)"},
    title="MLP model performance",
    font=cv_plot_font,
)
fig.show()

fig.write_image(os.path.join(fig_path, "mlp_cv_scores.png"))

Not as interpretable as I'd hoped. There is no single architecture or input dim that reigns supreme. I could spend more time parsing this, but I think the main takeaway is that it does not matter too much, within reason. Moderately complex 2-layer models tend to do best ((10, 50), for instance). Overly simple or overly complex architectures generalize less well, though even this trend is not absolute.

The trend with number of input dimensions is more complicated. The optimum does not change, but the average certainly varies: dropping at 10 and 15 components, recovering at 20 (??) and then dropping again.

### Next step: use bootstrap resampling to assess predictive performance
This procedure is a little loopy but I think it will work. The idea is to fit the model using N bootstrap samples. For each fit, I will obtain predictions for whatever observations are not included in the bootstrap sample. After it is all said and done, I should have a dataset with multiple unbiased predictions for each observation that I can use to get a decent gauge of the true predictive capacity of the model.

In [None]:
from tqdm import tqdm
from sklearn.neural_network import MLPRegressor

n_boots = 250
boot_size = 250
n_dim_in = 100 # nice to have richer transcriptional info
mdl_config = (25, 100) # this was consistently a strong performer
n_dim_out = morph_pca.shape[1]

# fix the global NumPy seed so the bootstrap draws are reproducible
np.random.seed(371)

# positions to draw from, and the snip ID at each position
n_obs = hf_morph_df.shape[0]
boot_options = np.arange(n_obs)
snip_ids = hf_morph_df.index

# predictors (seq) and targets (morph)
X = seq_pca[:, :n_dim_in]
Y = morph_pca[:, :n_dim_out]

# accumulators for the out-of-sample predictions
boot_id_vec = []
morph_pd_vec = []
snip_id_vec = []

for n in tqdm(range(n_boots)):

    # draw a bootstrap sample (with replacement) and fit a fresh model on it
    boot_indices = np.random.choice(boot_options, boot_size, replace=True)
    X_boot, Y_boot = X[boot_indices], Y[boot_indices]

    mlp = MLPRegressor(hidden_layer_sizes=mdl_config, max_iter=20000, tol=1e-8, random_state=42)
    mlp.fit(X_boot, Y_boot)

    # observations never drawn into this sample serve as the test set
    held_out = np.ones(n_obs, dtype=bool)
    held_out[boot_indices] = False
    test_indices = boot_options[held_out]

    if test_indices.size == 0:
        continue

    X_test = X[test_indices]
    Y_pd = mlp.predict(X_test)

    # record which bootstrap / snip each prediction row belongs to
    boot_id_vec.extend([n] * test_indices.size)
    snip_id_vec.extend(snip_ids[k] for k in test_indices)
    morph_pd_vec.append(Y_pd)

In [None]:
# convert vectors to DF and get summary stats
morph_pd_df_full = pd.DataFrame(snip_id_vec, columns=["snip_id"])
# morph_pd_df_full["boot_id"] = boot_id_vec
morph_pd_df_full[pca_cols_morph[:n_dim_out]] = np.vstack(morph_pd_vec)

# get summary stats
morph_pd_df = morph_pd_df_full.groupby("snip_id").agg(["mean", "std", "count"])

# Flatten the MultiIndex columns to a single level:
morph_pd_df.columns = [f"{col[0]}_{col[1]}" for col in morph_pd_df.columns]

# Optionally, you can rename the index back to a column if needed:
morph_pd_df = morph_pd_df.reset_index()
morph_pd_df = pd.DataFrame(hf_morphseq_df.loc[:, ["snip_id", "temperature", "timepoint", "mdl_stage_hpf"]]).merge(morph_pd_df, how="inner", on="snip_id")
morph_pd_df.head()

In [None]:
# Columns holding the bootstrap-mean prediction for each morph PC.
# Built from the PCA column names instead of searching for "_mean": a later cell
# adds "mdl_stage_hpf_mean" / "seq_stage_hpf_mean" to morph_pd_df, and a substring
# search would pick those up if this cell were re-run afterwards.
mean_cols = [f"{col}_mean" for col in pca_cols_morph[:n_dim_out]]

# Predicted morph embeddings (first three PCs), coloured by temperature
fig = px.scatter_3d(morph_pd_df, x=mean_cols[0], y=mean_cols[1], z=mean_cols[2], color="temperature", hover_data=["timepoint", "snip_id"])
fig.update_traces(marker=dict(size=5))
fig.show()

In [None]:
# Observed morph embeddings (first three PCs) for comparison with the predictions
morph_xyz = {axis: morph_pca[:, k] for k, axis in enumerate("xyz")}
fig = px.scatter_3d(
    **morph_xyz,
    color=hf_morphseq_df["temperature"],
    hover_data=[morph_pd_df["snip_id"]],
)

# fig.add_traces(go.Scatter3d(x=morph_spline_pca[:, 0], y=morph_spline_pca[:, 1], z=morph_spline_pca[:, 2], mode="lines"))

fig.update_traces(marker={"size": 5})
fig.update_layout(title="morphology space")
fig.show()

### Assess how well the model predicts morphological stage

In [None]:
import joblib

# Load the saved staging model from the results folder
stage_model_file = os.path.join(out_path, 'morph_stage_model.joblib')
morph_stage_model = joblib.load(stage_model_file)

# Feed the seq->morph mean embeddings through the staging model
seq_morph_embeddings = morph_pd_df[mean_cols].values
morph_pd_df["seq_stage_hpf"] = morph_stage_model.predict(seq_morph_embeddings)

# stage_pd_vec_check = morph_stage_model.predict(morph_pca)

In [None]:
# Stage from the morph model (x) vs. stage predicted from the seq->morph embeddings (y)
fig = px.scatter(morph_pd_df, x="mdl_stage_hpf", y="seq_stage_hpf", color="temperature")

fig.update_traces(marker={"size": 8})

# global font for the figure (size sets the overall text size)
stage_plot_font = {"family": "Arial, sans-serif", "size": 16, "color": "black"}
fig.update_layout(
    xaxis={"title": "morphological stage (actual)"},
    yaxis={"title": "morphological stage (predicted)"},
    # title="PCA decomposition of morphVAE latent space",
    font=stage_plot_font,
)

# dashed y = x reference line, drawn from (10, 10) to (42, 42)
diag_lo, diag_hi = 10, 42
fig.add_shape(
    type="line",
    x0=diag_lo,
    y0=diag_lo,
    x1=diag_hi,
    y1=diag_hi,
    line={"dash": "dash", "color": "black", "width": 2},
)


fig.show()

fig.write_image(os.path.join(fig_path, "seq_to_morph_stage_scatter.png"))

### What about intra-group residuals?

In [None]:
# Cohort = one (temperature, timepoint) combination
group_cols = ["temperature", "timepoint"]
stage_cols = ["mdl_stage_hpf", "seq_stage_hpf"]

# Mean actual and predicted stage within each cohort
stage_mean_df = morph_pd_df.loc[:, group_cols + stage_cols].groupby(group_cols).agg(["mean"])

# Flatten the MultiIndex columns to a single level:
stage_mean_df.columns = [f"{col[0]}_{col[1]}" for col in stage_mean_df.columns]
stage_mean_df = stage_mean_df.reset_index()

# join back onto original data frame
# If this cell has already been run, morph_pd_df still carries the previous
# "*_mean" columns; merging again would produce "_x"/"_y" duplicates and break
# the residual lookups below. Drop the stale copies first so the cell can be re-run.
stale_cols = [col for col in stage_mean_df.columns if col not in group_cols and col in morph_pd_df.columns]
morph_pd_df = morph_pd_df.drop(columns=stale_cols).merge(stage_mean_df, on=group_cols, how="left")

# residual of each embryo relative to its own cohort mean
morph_pd_df["true_resid"] = morph_pd_df["mdl_stage_hpf"] - morph_pd_df["mdl_stage_hpf_mean"]
morph_pd_df["pd_resid"] = morph_pd_df["seq_stage_hpf"] - morph_pd_df["seq_stage_hpf_mean"]

In [None]:
# Within-cohort residuals: actual (x) vs. predicted (y)
fig = px.scatter(morph_pd_df, x="true_resid", y="pd_resid", color="temperature", symbol="timepoint")

fig.update_traces(marker={"size": 8})

# global font for the figure (size sets the overall text size)
resid_plot_font = {"family": "Arial, sans-serif", "size": 16, "color": "black"}

# Left as the final expression so the notebook renders the returned figure
fig.update_layout(
    # xaxis=dict(title="morphological stage (actual)"),
    # yaxis=dict(title="morphological stage (predicted)"),
    # title="PCA decomposition of morphVAE latent space",
    width=1000,
    height=800,
    font=resid_plot_font,
)

In [None]:
# corr_matrix_by_group = morph_pd_df.groupby(["temperature", "timepoint"])[['true_resid', 'pd_resid']].corr()
# Correlation between actual and predicted residuals within each cohort.
# The two residual columns are selected before apply() so the function never
# operates on the grouping columns (deprecated groupby.apply behaviour in recent pandas).
corr_by_group = (
    morph_pd_df.groupby(["temperature", "timepoint"])[["true_resid", "pd_resid"]]
    .apply(lambda grp: grp["true_resid"].corr(grp["pd_resid"]))
)

In [None]:
# Average the per-(temperature, timepoint) residual correlations into one summary number
np.mean(corr_by_group)

### Step back and assess morph predictions more generally: are they better than just looking at the pop average?

In [None]:
from sklearn.metrics import mean_squared_error


def _summarise_sq_error(sq_error, pca_cols, cohort_df):
    """Tabulate per-observation squared errors and average them within each cohort.

    Parameters
    ----------
    sq_error : 2-D array of squared errors, one column per entry of ``pca_cols``.
    pca_cols : list of column names to give the error columns.
    cohort_df : DataFrame with "timepoint" and "temperature" columns, in the
        same row order as ``sq_error``.

    Returns
    -------
    (err_df, err_mean_df) : the per-observation table, and its per-(temperature,
        timepoint) means with columns named "<column>_mean".
    """
    err_df = pd.DataFrame(sq_error, columns=pca_cols)
    # "total_se" is the square root of the summed squared errors, i.e. the
    # Euclidean distance between truth and prediction in PCA space
    err_df["total_se"] = np.sqrt(err_df[pca_cols].sum(axis=1))
    err_df["timepoint"] = cohort_df["timepoint"].to_numpy()
    err_df["temperature"] = cohort_df["temperature"].to_numpy()

    err_mean_df = err_df.groupby(["temperature", "timepoint"]).agg(["mean"])
    # Flatten the MultiIndex columns to a single level:
    err_mean_df.columns = [f"{col[0]}_{col[1]}" for col in err_mean_df.columns]
    return err_df, err_mean_df.reset_index()


# get cohort averages
morph_df_true = hf_morph_df.copy().reset_index()
morph_df_true = morph_df_true.merge(morphseq_df.loc[:, ["snip_id", "timepoint", "temperature"]], how="left", on="snip_id")
morph_df_mean = morph_df_true.drop(labels=["snip_id"], axis=1).groupby(["temperature", "timepoint"]).agg(["mean"])

# Flatten the MultiIndex columns to a single level:
morph_df_mean.columns = [f"{col[0]}_{col[1]}" for col in morph_df_mean.columns]
morph_df_mean = morph_df_mean.reset_index()

# merge back to original obs: the null model predicts each embryo's cohort mean
morph_df_null = morph_df_true.loc[:, ["snip_id", "timepoint", "temperature"]].merge(
                morph_df_mean, how="left", on=["timepoint", "temperature"])

# Pair predictions with ground truth by snip_id rather than by row position:
# morph_pd_df comes out of a separate merge chain, so its row order (and row
# count, if an embryo was never held out) is not guaranteed to match morph_df_true.
true_cols = pca_cols_morph[:n_dim_out]
pd_by_snip = morph_pd_df.set_index("snip_id")
has_pd = morph_df_true["snip_id"].isin(pd_by_snip.index).to_numpy()
if not has_pd.all():
    print(f"excluding {int((~has_pd).sum())} observations with no held-out prediction")
eval_df = morph_df_true.loc[has_pd, :]

# extract just the PCA values to compare
Y_pd = pd_by_snip.loc[eval_df["snip_id"], mean_cols].to_numpy()
Y_mean = morph_df_null.loc[has_pd, mean_cols].to_numpy()
Y_true = eval_df[true_cols].to_numpy()

# squared error per observation and PC, for the MLP and for the cohort-mean null model
pd_error = (Y_true-Y_pd)**2
null_error = (Y_true-Y_mean)**2

# convert to DFs and average within each (temperature, timepoint) cohort
pd_df, pd_df_mean = _summarise_sq_error(pd_error, true_cols, eval_df)
null_df, null_df_mean = _summarise_sq_error(null_error, true_cols, eval_df)

In [None]:
ind = 0

# Cohort-mean total error: MLP prediction (x) vs. cohort-average null model (y)
fig = px.scatter(
    pd_df_mean,
    x="total_se_mean",
    y=null_df_mean["total_se_mean"],
    color="temperature",
    symbol="timepoint",
)
# log_x=True, log_y=True)
fig.update_traces(marker={"size": 8})
fig.update_layout(width=1000, height=800)

# same fixed range on both axes
fig.update_xaxes(range=[0, 4])
fig.update_yaxes(range=[0, 4])
fig.show()

In [None]:
# Display the morph_df column names that matched the "PCA_.*_bio" pattern
pca_cols_morph

In [None]:
# Display the morph_pd_df columns selected as bootstrap-mean predictions
mean_cols