## Train MLP to predict morph embeddings from seq vectors

In [5]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
from glob2 import glob
# from sklearn.cross_decomposition import CCA

### Set paths

In [6]:
# root = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
root = "/media/nick/hdd021/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"

# path to save data
out_path = os.path.join(root, "analyses", "crossmodal", "hotfish", "")
os.makedirs(out_path, exist_ok=True)

# path to figures and data
fig_path = os.path.join(root, "figures", "crossmodal", "hotfish", "")
os.makedirs(fig_path, exist_ok=True)

# set path to hooke projections
# hooke_model_name = "bead_expt_linear"
# latent_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/seq_data/emb_projections/latent_projections/"
# hooke_model_path = os.path.join(latent_path, hooke_model_name, "")

### Load data

In [7]:
# morph latent encodings
morph_df = pd.read_csv(out_path + "hf_morph_df.csv")

# hooke latent encodings
seq_df = pd.read_csv(out_path + "hf_seq_df.csv", index_col=0)

# metadata df that allows us to link the two
morphseq_df = pd.read_csv(os.path.join(root, "metadata", "morphseq_metadata.csv"))

# load spline datasets for each space--we will use these to pretrain our MLP
morph_spline_df = pd.read_csv(out_path + "spline_morph_df.csv")
morph_spline_df = morph_spline_df.set_index("stage_hpf")
seq_spline_df = pd.read_csv(out_path + "spline_seq_df.csv")
seq_spline_df = seq_spline_df.set_index("stage_hpf")

### Subset for hotfish2 

In [8]:
import re
hf_experiments = np.asarray(["20240813_24hpf", "20240813_30hpf", "20240813_36hpf"])
hf_morphseq_df = morphseq_df.loc[np.isin(morphseq_df["experiment_date"], hf_experiments), :].reset_index(drop=True)

# subset morph 
# mu_cols = [col for col in morph_df.columns.tolist() if "z_mu_b" in col]
pattern = r"PCA_.*_bio"
pca_cols_morph = [col for col in morph_df.columns if re.search(pattern, col)]
pca_cols_seq = [col for col in seq_df.columns if "PCA" in col]

hf_morph_df = pd.DataFrame(hf_morphseq_df.loc[:, ["snip_id", "sample"]]).merge(morph_df, how="inner", on="snip_id")
hf_morph_df = hf_morph_df.set_index("snip_id")
hf_morph_df = hf_morph_df.loc[:, pca_cols_morph + ["sample"]]


# subset seq dataset
hf_seq_df = pd.DataFrame(hf_morph_df.loc[:, "sample"]).merge(seq_df, how="inner", right_index=True, left_on="sample")
hf_seq_df = hf_seq_df.set_index("sample")
print(hf_seq_df.shape)

# get rid of sample col
hf_morph_df = hf_morph_df.drop(labels=["sample"], axis=1)
print(hf_morph_df.shape)

# filter out a couple observations that had QC problems
hf_morphseq_df = hf_morphseq_df.loc[np.isin(hf_morphseq_df["snip_id"], hf_morph_df.index), :].reset_index()
hf_morphseq_df = hf_morphseq_df.merge(morph_df.loc[:, ["snip_id", "mdl_stage_hpf"]])
print(hf_morphseq_df.shape)

(141, 100)
(141, 10)
(141, 43)


### Extract spline and obs columns to fit

In [9]:
from sklearn.decomposition import PCA
# n_components = len(pca_cols_morph) # captures over 99% of variance in both modalities

# get morph array
morph_pca = hf_morph_df[pca_cols_morph].to_numpy() #morph_pca.transform(hf_morph_df)

# get morph spline
morph_spline_pca = morph_spline_df[pca_cols_morph].to_numpy()

# get seq array
seq_pca = hf_seq_df[pca_cols_seq].to_numpy() #morph_pca.transform(hf_morph_df)

# get seq spline
# seq_spline_pca = seq_spline_df[pca_cols_seq].to_numpy()

In [None]:
from src.functions.spline_fitting_v2 import spline_fit_wrapper

n_boots = 5
n_spline_points = 100

seq_fit_df = hf_seq_df.copy()
seq_fit_df["timepoint"] = hf_morphseq_df["mdl_stage_hpf"].values.copy()
seq_fit_df[pca_cols_seq] = seq_fit_df[pca_cols_seq].values / 3 # rescale to help with fitting
# fit HF-specific spline--WT fit diverges too much in the early time points
seq_spline_df2 = spline_fit_wrapper(seq_fit_df, fit_cols=pca_cols_seq, n_boots=n_boots, n_spline_points=n_spline_points, stage_col="timepoint",
                               obs_weights=None, boot_size=seq_pca.shape[0], bandwidth=1.5)

  0%|          | 0/5 [00:00<?, ?it/s]

In [91]:
from scipy.spatial import KDTree


def estimate_times(
    times: np.ndarray,     # shape (M,)
    ref_feats: np.ndarray, # shape (M, D)
    query_feats: np.ndarray, # shape (N, D)
    k: int = 5,
    weight_scheme: str = "inverse_distance"  # or "inverse_distance"
) -> np.ndarray:
    """
    For each row in query_feats, find its k nearest neighbors among ref_feats,
    then estimate its time as a weighted average of times[idxs].

    Args:
        times:       1D array of length M with the known timestamps.
        ref_feats:   2D array (M, D) of features corresponding to those times.
        query_feats: 2D array (N, D) of features to estimate times for.
        k:           number of neighbors to use.
        weight_scheme: "uniform" or "inverse_distance".

    Returns:
        est_times: 1D array of length N with the estimated times.
    """
    # Build KD‑tree on the reference features
    tree = KDTree(ref_feats)

    # Query k nearest neighbors for each point in query_feats
    dists, idxs = tree.query(query_feats, k=k)
    if k == 1:
        dists = dists[:, None]
        idxs  = idxs[:, None]

    # Build weighting matrix
    if weight_scheme == "uniform":
        w = np.ones_like(dists)
    elif weight_scheme == "inverse_distance":
        eps = 1e-8
        w = 1.0 / (dists + eps)
    else:
        raise ValueError(f"Unknown weight_scheme {weight_scheme!r}")

    # Normalize weights across each query
    w = w / w.sum(axis=1, keepdims=True)

    # Gather neighbor times and compute weighted average
    neigh_times = times[idxs]                # shape (N, k)
    est_times   = np.sum(neigh_times * w, axis=1)  # shape (N,)

    return est_times

In [92]:
seq_spline_pca = seq_spline_df2[pca_cols_seq].to_numpy() * 3

spline_times = estimate_times(seq_fit_df["timepoint"].to_numpy(),
                              ref_feats=seq_fit_df[pca_cols_seq].to_numpy(),
                              query_feats=seq_spline_pca)




In [93]:
spline_times

array([13.67736317, 13.67808197, 13.67876996, 13.67950504, 13.67997102,
       13.68001491, 13.68015924, 13.68027279, 13.68057853, 13.68097701,
       13.68148301, 13.68194783, 13.68253229, 14.09824185, 14.10134537,
       14.34405227, 14.98381947, 16.08957191, 17.22982231, 17.9913288 ,
       17.99322584, 17.99522374, 18.13055831, 18.13351783, 18.8632925 ,
       18.53076096, 18.53091891, 18.53106831, 18.53120435, 18.53132924,
       18.53143954, 18.53153196, 18.07502186, 18.0748649 , 18.07468775,
       18.0744919 , 18.3328274 , 19.29878854, 19.30830362, 20.15057099,
       22.35909233, 22.36495028, 23.17714009, 23.1770169 , 23.17687373,
       23.42697498, 23.42828783, 23.99598934, 24.35112418, 25.36010365,
       25.45542475, 26.85951746, 26.97720903, 26.9784612 , 27.89786897,
       27.31034732, 26.67958359, 27.89455278, 27.8973982 , 27.38765058,
       27.99981464, 29.27473397, 32.51772646, 32.527847  , 34.50807413,
       33.62209014, 35.60120228, 35.60139401, 35.40221279, 35.96

### Visualize the two latent spaces

In [46]:
fig = px.scatter_3d(x=morph_pca[:, 0], y=morph_pca[:, 1], z=morph_pca[:, 2], color=hf_morphseq_df["temperature"])

fig.add_traces(go.Scatter3d(x=morph_spline_pca[:, 0], y=morph_spline_pca[:, 1], z=morph_spline_pca[:, 2], mode="lines"))

fig.update_traces(marker=dict(size=5))
fig.update_layout(title="morphology space")
fig.show()

In [69]:
fig = px.scatter_3d(x=seq_pca[:, 0], y=seq_pca[:, 1], z=seq_pca[:, 2], 
                     color=hf_morphseq_df["temperature"], hover_data=[hf_morphseq_df["stage_hpf"]])

fig.add_traces(go.Scatter3d(x=seq_spline_pca[:, 0], y=seq_spline_pca[:, 1], z=seq_spline_pca[:, 2], mode="lines"))

fig.update_traces(marker=dict(size=5))
fig.update_layout(title="transcriptional space")
fig.show()

### Next step: use bootstrap resampling to assess predictive performance
This procedure is a little loopy but I think it will work. Idea is to fit model using N bootrap samples. For each fit, I will obtain predictions for whatever obervations are not included in the bootstrap sample. After it is all said and done, I should have a dataset with multiple unbiased predictions for each observation that I can use to get a decent gauge for true predictive capacity of the model

In [72]:
from tqdm import tqdm
from sklearn.neural_network import MLPRegressor

n_boots = 100
boot_size = 250
n_dim_in = 100 # nice to have richer transcriptional info
mdl_config = (25, 100) # this was consitently a strong performer
n_dim_out = morph_pca.shape[1]

np.random.seed(371)

# index vector to select from
n_obs = hf_morph_df.shape[0]
boot_options = np.arange(n_obs)

# snip IDs
snip_ids = hf_morph_df.index

# Also generate predictions for seq spline fit
X_spline = seq_spline_pca

# predictors
X = seq_pca[:, :n_dim_in]
Y = morph_pca[:, :n_dim_out]

# initialize vectors
boot_id_vec = []
morph_pd_vec = []
morph_spline_pd_vec = []
snip_id_vec = []
spline_ind_vec = []

for n in tqdm(range(n_boots)):
    
    # take bootstrap sample
    boot_indices = np.random.choice(boot_options, boot_size, replace=True)
    X_boot = X[boot_indices]
    Y_boot = Y[boot_indices]

    # initialize model
    mlp = MLPRegressor(random_state=42, max_iter=20000, hidden_layer_sizes=mdl_config, tol=1e-8)

    # fit
    mlp.fit(X_boot, Y_boot)

    # identify held-out samples and get predictions
    test_indices = boot_options[~np.isin(boot_options, boot_indices)]

    if len(test_indices) > 0:
        X_test = X[test_indices]
        Y_pd = mlp.predict(X_test)
        Y_pd_spline = mlp.predict(X_spline)
        # add info
        boot_id_vec += [n]*len(test_indices)
        snip_id_vec += [snip_ids[i] for i in test_indices]
        spline_ind_vec += np.arange(Y_pd_spline.shape[0]).tolist()
        morph_pd_vec.append(Y_pd)
        morph_spline_pd_vec.append(Y_pd_spline)

100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


In [71]:
# convert vectors to DF and get summary stats
morph_pd_df_full = pd.DataFrame(snip_id_vec, columns=["snip_id"])
# morph_pd_df_full["boot_id"] = boot_id_vec
morph_pd_df_full[pca_cols_morph[:n_dim_out]] = np.vstack(morph_pd_vec)

# get summary stats
morph_pd_df = morph_pd_df_full.groupby("snip_id").agg(["mean", "std", "count"])

# Flatten the MultiIndex columns to a single level:
morph_pd_df.columns = [f"{col[0]}_{col[1]}" for col in morph_pd_df.columns]

# Optionally, you can rename the index back to a column if needed:
morph_pd_df = morph_pd_df.reset_index()
morph_pd_df = pd.DataFrame(hf_morphseq_df.loc[:, ["snip_id", "temperature", "timepoint", "mdl_stage_hpf"]]).merge(morph_pd_df, how="inner", on="snip_id")
morph_pd_df.head()

Unnamed: 0,snip_id,temperature,timepoint,mdl_stage_hpf,PCA_00_bio_mean,PCA_00_bio_std,PCA_00_bio_count,PCA_01_bio_mean,PCA_01_bio_std,PCA_01_bio_count,...,PCA_06_bio_count,PCA_07_bio_mean,PCA_07_bio_std,PCA_07_bio_count,PCA_08_bio_mean,PCA_08_bio_std,PCA_08_bio_count,PCA_09_bio_mean,PCA_09_bio_std,PCA_09_bio_count
0,20240813_24hpf_A02_e00_t0000,19.0,24.0,14.149216,2.698856,0.297216,21,-0.602625,0.299559,21,...,21,0.035788,0.262206,21,-0.195713,0.198986,21,0.444504,0.321302,21
1,20240813_24hpf_A03_e00_t0000,25.0,24.0,17.759886,3.260033,0.288147,19,-0.858078,0.241823,19,...,19,0.746951,0.219817,19,0.607586,0.220142,19,0.083383,0.194309,19
2,20240813_24hpf_A04_e00_t0000,28.5,24.0,23.274317,1.815614,0.296725,20,2.147515,0.405279,20,...,20,-0.707741,0.260227,20,-0.753263,0.302706,20,-0.163953,0.234202,20
3,20240813_24hpf_A05_e00_t0000,32.0,24.0,27.391227,0.317162,0.357852,16,2.715318,0.397722,16,...,16,-0.318832,0.169303,16,-0.985337,0.265364,16,-0.117934,0.342201,16
4,20240813_24hpf_A06_e00_t0000,33.5,24.0,22.078289,0.863132,0.340431,20,3.176517,0.337411,20,...,20,-0.553219,0.216479,20,-0.886337,0.222979,20,0.268169,0.250501,20


### Now make DF for spline

In [94]:
# convert vectors to DF and get summary stats
morph_spline_df_full = pd.DataFrame(spline_ind_vec, columns=["spline_id"])

# morph_pd_df_full["boot_id"] = boot_id_vec
morph_spline_df_full[pca_cols_morph[:n_dim_out]] = np.vstack(morph_spline_pd_vec)

# get summary stats
morph_spline_pd_df = morph_spline_df_full.groupby("spline_id").agg(["mean", "std", "count"])

# Flatten the MultiIndex columns to a single level:
morph_spline_pd_df.columns = [f"{col[0]}_{col[1]}" for col in morph_spline_pd_df.columns]

# Optionally, you can rename the index back to a column if needed:
morph_spline_pd_df = morph_spline_pd_df.reset_index()

morph_spline_pd_df["stage_hpf"] = spline_times
# morph_spline_df = pd.DataFrame(hf_morphseq_df.loc[:, ["snip_id", "temperature", "timepoint", "mdl_stage_hpf"]]).merge(morph_pd_df, how="inner", on="snip_id")
morph_spline_pd_df.head()

Unnamed: 0,spline_id,PCA_00_bio_mean,PCA_00_bio_std,PCA_00_bio_count,PCA_01_bio_mean,PCA_01_bio_std,PCA_01_bio_count,PCA_02_bio_mean,PCA_02_bio_std,PCA_02_bio_count,...,PCA_07_bio_mean,PCA_07_bio_std,PCA_07_bio_count,PCA_08_bio_mean,PCA_08_bio_std,PCA_08_bio_count,PCA_09_bio_mean,PCA_09_bio_std,PCA_09_bio_count,stage_hpf
0,0,3.53896,0.148063,100,-0.706217,0.164389,100,2.721482,0.153615,100,...,0.93649,0.232867,100,0.08517,0.158999,100,-0.466601,0.154145,100,13.677363
1,1,3.524507,0.14263,100,-0.710661,0.154066,100,2.700989,0.146433,100,...,0.909485,0.218897,100,0.07747,0.150966,100,-0.439511,0.14628,100,13.678082
2,2,3.505994,0.137256,100,-0.714246,0.145603,100,2.681423,0.14127,100,...,0.885241,0.206473,100,0.070365,0.142618,100,-0.413504,0.138443,100,13.67877
3,3,3.503647,0.13259,100,-0.73532,0.137885,100,2.680892,0.134176,100,...,0.854123,0.192692,100,0.052518,0.131954,100,-0.375139,0.129378,100,13.679505
4,4,3.501479,0.128914,100,-0.754663,0.132587,100,2.681501,0.12754,100,...,0.83048,0.183873,100,0.036245,0.123122,100,-0.340037,0.12242,100,13.679971


In [76]:
mean_cols = [col for col in morph_pd_df.columns if "_mean" in col]

fig = px.scatter_3d(morph_pd_df, x=mean_cols[0], y=mean_cols[1], z=mean_cols[2], color="temperature", hover_data=["timepoint", "snip_id"])
fig.update_traces(marker=dict(size=5))
fig.show()

In [95]:
mean_cols = [col for col in morph_spline_pd_df.columns if "_mean" in col]

fig = px.scatter_3d(morph_spline_pd_df, x=mean_cols[0], y=mean_cols[1], z=mean_cols[2], color="stage_hpf")#, hover_data=["timepoint", "snip_id"])
fig.update_traces(marker=dict(size=5))
fig.show()

In [96]:
fig = px.scatter_3d(x=morph_pca[:, 0], y=morph_pca[:, 1], z=morph_pca[:, 2], color=hf_morphseq_df["temperature"],
                   hover_data=[morph_pd_df["snip_id"]])

morph_spline_pd = morph_spline_pd_df[mean_cols].values
fig.add_traces(go.Scatter3d(x=morph_spline_pd[:, 0], y=morph_spline_pd[:, 1], z=morph_spline_pd[:, 2], mode="lines"))

fig.update_traces(marker=dict(size=5))
fig.update_layout(title="morphology space")
fig.show()

### Assess how well the model predicts morphological stage

In [97]:
import joblib

# Load staging model
morph_stage_model = joblib.load(os.path.join(out_path, 'morph_stage_model.joblib'))

# get predicted morphological stages using the seq->morph embeddings
morph_pd_df["seq_stage_hpf"] = morph_stage_model.predict(morph_pd_df[mean_cols].values)
morph_spline_pd_df["seq_stage_hpf"] = morph_stage_model.predict(morph_spline_pd_df[mean_cols].values)

# stage_pd_vec_check = morph_stage_model.predict(morph_pca)


Trying to unpickle estimator PolynomialFeatures from version 1.6.1 when using version 1.5.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator LinearRegression from version 1.6.1 when using version 1.5.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator Pipeline from version 1.6.1 when using version 1.5.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



In [None]:
morph_pd_df.to_csv(os.path.join(out_path, 'seq_to_morph_pd.csv'), index=False)
morph_spline_pd_df.to_csv(os.path.join(out_path, 'seq_to_morph_spline_pd.csv'), index=False)