In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
import umap.umap_ as umap
import os

In [None]:
# Root folders for the image-derived and sequencing-derived lmx1b outputs
lmx_image_dir = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/analysis/lmx1b"
lmx_seq_dir = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/sci-PLEX/processed_sci_data/lmx1b_combined_analysis"

# load morph metadata DF; the stored "index" column is discarded right after reading
morph_csv_path = os.path.join(lmx_image_dir, "embryo_df.csv")
morph_df = pd.read_csv(morph_csv_path, index_col=0).drop(columns=["index"])

# load hooke latent log counts, latent positions and raw cell counts
# (each table uses its first CSV column as the index)
latent_csv_path = os.path.join(lmx_seq_dir, "lmx_gene_timepoint_latent_encodings.csv")
latent_pos_csv_path = os.path.join(lmx_seq_dir, "lmx_hooke_latent_pos.csv")
cell_counts_csv_path = os.path.join(lmx_seq_dir, "lmx_all_cell_counts.csv")

hooke_latent_df = pd.read_csv(latent_csv_path, index_col=0)
hooke_latent_pos_df = pd.read_csv(latent_pos_csv_path, index_col=0)
raw_cell_counts_df = pd.read_csv(cell_counts_csv_path, index_col=0)

# add embryo names and cell types to latent DF
# NOTE(review): this relabels positionally -- it assumes the latent table's columns are in
# the same order as the rows of raw_cell_counts_df, and its rows in the same order as the
# columns of raw_cell_counts_df. Confirm against how the CSVs were exported.
hooke_latent_df.columns = raw_cell_counts_df.index
hooke_latent_df["embryo"] = raw_cell_counts_df.columns
hooke_latent_df.reset_index(inplace=True)

# parse metadata from embryo field: split every "."-delimited name once, for all rows,
# instead of writing cell-by-cell with .loc inside a Python loop
embryo_fields = hooke_latent_df["embryo"].str.split(".", expand=True)
hooke_latent_df["perturbation"] = embryo_fields[0]
hooke_latent_df["stage_hpf"] = embryo_fields[1]  # stays a string token, as parsed
hooke_latent_df["temperature"] = embryo_fields[2]
hooke_latent_df["hash_plate"] = embryo_fields[4]

# two-character well names get a "0" inserted after the first character ("A1" -> "A01");
# every other name is kept unchanged
hash_well_raw = embryo_fields[5]
hash_well_padded = hash_well_raw.str[0] + "0" + hash_well_raw.str[1]
hooke_latent_df["hash_well"] = hash_well_padded.where(hash_well_raw.str.len() == 2, hash_well_raw)

# adjust col ordering: the six metadata columns (embryo + the five parsed fields) move to
# the front, and the leading column created by reset_index is dropped
cols_init = hooke_latent_df.columns.tolist()
cols_new = cols_init[-6:] + cols_init[1:-6]

hooke_latent_df = hooke_latent_df.loc[:, cols_new]

In [None]:
# Plot UMAP embeddings for the lmx1b images, one color per master_perturbation value
fig = px.scatter_3d(
    morph_df,
    x="UMAP_00_bio_3",
    y="UMAP_01_bio_3",
    z="UMAP_02_bio_3",
    color="master_perturbation",
)
fig.show()

In [None]:
# Same image-derived UMAP coordinates, colored by the predicted_stage_hpf column
fig = px.scatter_3d(
    morph_df,
    x="UMAP_00_bio_3",
    y="UMAP_01_bio_3",
    z="UMAP_02_bio_3",
    color="predicted_stage_hpf",
)
fig.show()

In [None]:
# fit UMAP to latent cell count data
n_components = 3
# Seeded so the embedding (and every model/plot built on it) is reproducible across kernel
# restarts; 154 is the same value this notebook passes to np.random.seed for the data split.
reducer = umap.UMAP(n_components=n_components, random_state=154)

# Pick the latent columns by name (they were labelled from raw_cell_counts_df.index) rather
# than by position: a positional slice would also pick up the UMAP_hooke_* columns appended
# below whenever this cell is run a second time.
latent_cols = raw_cell_counts_df.index.tolist()
latent_array = hooke_latent_df.loc[:, latent_cols]
embedding = reducer.fit_transform(latent_array)

# add to the data frame
hooke_latent_df["UMAP_hooke_00_3"] = embedding[:, 0]
hooke_latent_df["UMAP_hooke_01_3"] = embedding[:, 1]
hooke_latent_df["UMAP_hooke_02_3"] = embedding[:, 2]

In [None]:
# Sequencing-derived (hooke) UMAP coordinates, colored by the perturbation parsed from embryo names
fig = px.scatter_3d(
    hooke_latent_df,
    x="UMAP_hooke_00_3",
    y="UMAP_hooke_01_3",
    z="UMAP_hooke_02_3",
    color="perturbation",
)
fig.show()

In [None]:
# Sequencing-derived (hooke) UMAP coordinates, colored by the stage_hpf token parsed from embryo names
fig = px.scatter_3d(
    hooke_latent_df,
    x="UMAP_hooke_00_3",
    y="UMAP_hooke_01_3",
    z="UMAP_hooke_02_3",
    color="stage_hpf",
)
fig.show()

In [None]:
# Sequencing-derived (hooke) UMAP coordinates, colored by hash plate
fig = px.scatter_3d(
    hooke_latent_df,
    x="UMAP_hooke_00_3",
    y="UMAP_hooke_01_3",
    z="UMAP_hooke_02_3",
    color="hash_plate",
)
fig.show()

## Load in metadata to link image and sequencing datasets

In [None]:
metadata_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/metadata"

# load master experimental log and keep the experiments flagged with has_sci_data == 1
experiment_log_path = os.path.join(metadata_path, "experiment_metadata.csv")
experiment_df = pd.read_csv(experiment_log_path, index_col=0)
has_sci_data = experiment_df["has_sci_data"] == 1
morphseq_meta_df = experiment_df.loc[has_sci_data, :]

# per-experiment start dates (as strings) and hash plate labels of the form "P" + two-digit number;
# both are used when the per-date well metadata sheets are read
date_vec = morphseq_meta_df["start_date"].astype(str).tolist()
hash_plate_vec = [f"P{int(plate_number):02}" for plate_number in morphseq_meta_df["hash_plate_number"].tolist()]
# generate list to use for indexing: plate well names in row-major order
# ("A01" ... "A12", "B01" ... "H12"), column numbers zero-padded to two digits
col_id_list = list(range(1, 13))
row_letter_list = list("ABCDEFGH")
well_name_list = [
    f"{row_letter}{col_id:02}"
    for row_letter in row_letter_list
    for col_id in col_id_list
]

In [None]:
def _plate_sheet_to_vector(workbook, sheet_name):
    """Return one sheet's plate block as a flat, row-major 1-D array.

    The block is rows 0-7 and columns 1-12 of the parsed sheet (the first column is
    skipped), so the result has at most 96 entries.
    """
    return workbook.parse(sheet_name).iloc[:8, 1:13].to_numpy().ravel()


morphseq_map_list = []
for date, hash_plate in zip(date_vec, hash_plate_vec):
    well_metadata_file = os.path.join(metadata_path, "well_metadata", date + "_well_metadata.xlsx")
    # the context manager closes the workbook handle once all sheets have been read
    with pd.ExcelFile(well_metadata_file) as xl_temp:
        genotype_map = _plate_sheet_to_vector(xl_temp, "genotype")
        hash_to_image_map = _plate_sheet_to_vector(xl_temp, "hash_to_image_map")
        age_map = _plate_sheet_to_vector(xl_temp, "start_age_hpf")
        image_notes_map = _plate_sheet_to_vector(xl_temp, "image_notes")
        qc_map = _plate_sheet_to_vector(xl_temp, "morph_seq_qc")

    # add fields to dataframe (one row per plate well, in well_name_list order)
    morph_map_df = pd.DataFrame(genotype_map[:, np.newaxis], columns=["master_perturbation"])
    morph_map_df["hash_plate"] = hash_plate
    morph_map_df["experiment_date"] = date
    morph_map_df["hash_well"] = hash_to_image_map
    morph_map_df["image_well"] = well_name_list
    morph_map_df["stage_hpf"] = age_map
    morph_map_df["morphseq_qc_flag"] = qc_map
    morph_map_df["notes"] = image_notes_map

    # remove entries with no hash well and clean up variable types/well names
    morph_map_df = morph_map_df.loc[~morph_map_df["hash_well"].isnull(), :]
    morph_map_df.reset_index(inplace=True, drop=True)

    # two-character hash well names get a "0" inserted after the first character
    # ("A1" -> "A01"); longer names are left as they are
    morph_map_df["hash_well"] = morph_map_df["hash_well"].map(
        lambda well: well[0] + "0" + well[1] if len(well) == 2 else well
    )

    morph_map_df["master_perturbation"] = morph_map_df["master_perturbation"].astype(str)

    morphseq_map_list.append(morph_map_df)

morphseq_df = pd.concat(morphseq_map_list, axis=0, ignore_index=True)

## Build linked datasets

In [None]:
# attach the sequencing latents to every mapped well; wells with no sequencing match are
# kept (left join) and carry NaN in the sequencing columns
master_df = morphseq_df.merge(hooke_latent_df, on=["hash_plate", "hash_well"], how="left", copy=False)

# translate sequencing-side perturbation labels; presumably these target names follow the
# master_perturbation convention (see the consistency check below) -- confirm.
# Labels missing from `key` (including NaN from unmatched wells) pass through unchanged
# instead of raising KeyError.
key = dict({"uninj": "wik", "ctrl-inj": "wik-ctrl-inj", "tbxta": "tbxta", "lmx1b":"lmx1b"})
master_df["perturbation"] = master_df["perturbation"].map(lambda seq_lb: key.get(seq_lb, seq_lb))

# check for inconsistencies
# error_indices = np.where(master_df["master_perturbation"] != master_df["perturbation"])

In [None]:
# merge on the morph model data
# image_well is the second "_"-delimited token of snip_id; computed for the whole column at
# once so it does not depend on morph_df having a 0..n-1 integer index
morph_df["image_well"] = morph_df["snip_id"].str.split("_").str[1]
morph_df["experiment_date"] = morph_df["experiment_date"].astype(str)

# columns present on both sides (other than the join keys) come out with _x / _y suffixes
master_df = master_df.merge(morph_df, on=["image_well", "experiment_date"], how="left", copy=False)

In [None]:
# drop problematic entries, rename variables and drop extraneous variables
has_image_match = ~master_df["master_perturbation_y"].isnull()  # removes one 8/30 entry missing from images ("H03")
passes_qc = master_df["morphseq_qc_flag"] == 0

master_df_clean = (
    master_df.loc[has_image_match & passes_qc, :]
    .rename(columns={"master_perturbation_x" : "master_perturbation", "stage_hpf_x": "stage_hpf"})
    .drop(columns=["master_perturbation_y", "stage_hpf_y"])
    .reset_index(drop=True)
)

# persist the linked table next to the image analysis outputs
master_df_clean.to_csv(os.path.join(lmx_image_dir, "lmx_morphseq_df.csv"))

In [None]:
# preview the first five rows of the linked table
master_df_clean.head(n=5)

## Experiment with predicting morphology from sequence space

In [None]:
from sklearn.neural_network import MLPRegressor

# Random train/test split over the row positions of master_df_clean (seeded for repeatability)
n_train = 100
np.random.seed(154)
option_vec = range(master_df_clean.shape[0])
train_indices = np.random.choice(option_vec, n_train, replace=False)
# Everything not drawn for training is test. setdiff1d returns the sorted complement in one
# vectorized pass (the list-comprehension scan was O(n * n_train)) and keeps an integer
# dtype even when the complement is empty.
test_indices = np.setdiff1d(np.arange(master_df_clean.shape[0]), train_indices)

In [None]:
# column groups: image-derived UMAP coordinates (targets) and hooke UMAP coordinates (inputs)
morph_umap_cols = ["UMAP_00_bio_3", "UMAP_01_bio_3", "UMAP_02_bio_3"]
hooke_umap_cols = ["UMAP_hooke_00_3", "UMAP_hooke_01_3", "UMAP_hooke_02_3"]

# training rows
X_train = master_df_clean.loc[train_indices, hooke_umap_cols]
Y_train = master_df_clean.loc[train_indices, morph_umap_cols]

# held-out rows
X_test = master_df_clean.loc[test_indices, hooke_umap_cols]
Y_test = master_df_clean.loc[test_indices, morph_umap_cols]

In [None]:
# fit an MLP from hooke UMAP coordinates to image UMAP coordinates,
# then display its score on the held-out rows
regr = MLPRegressor(random_state=1, max_iter=5000).fit(X_train, Y_train)
regr.score(X_test, Y_test)

In [None]:
# predicted image-UMAP coordinates for the held-out rows, from the first model (regr)
Y_test_pd = regr.predict(X_test)

In [None]:
import plotly.graph_objects as go

# true test coordinates as an array, plus one color per test embryo from its master_perturbation
Y_test.reset_index(inplace=True, drop=True)
Y_test_arr = Y_test.to_numpy()
pert_id_test = master_df_clean.loc[test_indices,"master_perturbation"].to_numpy()
color_dict = {"lmx1b" : "lightskyblue", "wik":"gray", "wik-ctrl-inj":"black", "tbxta":"seagreen"}
color_vec = [color_dict[pert] for pert in pert_id_test]

fig = go.Figure()

# one thin black segment per test embryo, joining its true position to its predicted position
for true_xyz, pred_xyz in zip(Y_test_arr, Y_test_pd):
    data = np.vstack((true_xyz.reshape((1,3)), pred_xyz.reshape((1,3))))
    segment = go.Scatter3d(
        x=data[:, 0], y=data[:, 1], z=data[:, 2],
        mode="lines+markers", line=dict(color="black"), marker=dict(size=1),
    )
    fig.add_trace(segment)

# true positions (default marker symbol) and predicted positions (diamonds), same colors
true_points = go.Scatter3d(
    x=Y_test_arr[:, 0], y=Y_test_arr[:, 1], z=Y_test_arr[:, 2],
    mode="markers", marker=dict(color=color_vec, size=5),
)
predicted_points = go.Scatter3d(
    x=Y_test_pd[:, 0], y=Y_test_pd[:, 1], z=Y_test_pd[:, 2],
    mode="markers", marker=dict(color=color_vec, size=5, symbol="diamond"),
)
fig.add_trace(true_points)
fig.add_trace(predicted_points)

fig.show()

In [None]:
# try removing tbxta embryos from the datasets to see if intra-cluster prediction can be improved
pert_id_train = master_df_clean.loc[train_indices,"master_perturbation"].to_numpy()
pert_id_test = master_df_clean.loc[test_indices,"master_perturbation"].to_numpy()

# boolean keep-masks, aligned by position with the train / test sets
not_tbxta_train = pert_id_train != "tbxta"
not_tbxta_test = pert_id_test != "tbxta"

X_train2, Y_train2 = X_train.loc[not_tbxta_train], Y_train.loc[not_tbxta_train]
X_test2, Y_test2 = X_test.loc[not_tbxta_test], Y_test.loc[not_tbxta_test]

In [None]:
# wider single hidden layer, fit on the tbxta-free training rows
regr2 = MLPRegressor(random_state=1, max_iter=5000, hidden_layer_sizes=(500,))
regr2.fit(X_train2, Y_train2)
reg_score = regr2.score(X_test2, Y_test2)

# predictions must come from regr2 (the model fit just above), not from the earlier `regr`,
# so that they match the model being scored here
Y_test_pd2 = regr2.predict(X_test2)

In [None]:
# regr2 scores: held-out rows first, then the rows it was fit on
reg_score_test2 = regr2.score(X_test2, Y_test2)
reg_score_train2 = regr2.score(X_train2, Y_train2)
print(reg_score_test2, reg_score_train2, sep="\n")

In [None]:
# true coordinates and per-embryo colors for the tbxta-free test rows
Y_test2.reset_index(inplace=True, drop=True)
Y_test_arr2 = Y_test2.to_numpy()
pert_id_test_plot = pert_id_test[not_tbxta_test]
color_dict = {"lmx1b" : "lightskyblue", "wik":"gray", "wik-ctrl-inj":"seagreen"}
color_vec = [color_dict[pert] for pert in pert_id_test_plot]


fig = go.Figure()

# one thin black segment per test embryo, joining its true position to its predicted position
for true_xyz, pred_xyz in zip(Y_test_arr2, Y_test_pd2):
    data = np.vstack((true_xyz.reshape((1,3)), pred_xyz.reshape((1,3))))
    segment = go.Scatter3d(
        x=data[:, 0], y=data[:, 1], z=data[:, 2],
        mode="lines+markers", line=dict(color="black"), marker=dict(size=1),
    )
    fig.add_trace(segment)

# true positions (default marker symbol) and predicted positions (diamonds), same colors
true_points = go.Scatter3d(
    x=Y_test_arr2[:, 0], y=Y_test_arr2[:, 1], z=Y_test_arr2[:, 2],
    mode="markers", marker=dict(color=color_vec, size=5),
)
predicted_points = go.Scatter3d(
    x=Y_test_pd2[:, 0], y=Y_test_pd2[:, 1], z=Y_test_pd2[:, 2],
    mode="markers", marker=dict(color=color_vec, size=5, symbol="diamond"),
)
fig.add_trace(true_points)
fig.add_trace(predicted_points)

fig.show()

## That went...ok. Let's try some basic clustering stuff next

In [None]:
from sklearn.cluster import KMeans

# cluster embryos into 15 groups using their hooke UMAP coordinates
hooke_umap_cols = ["UMAP_hooke_00_3", "UMAP_hooke_01_3", "UMAP_hooke_02_3"]
kmeans = KMeans(n_clusters=15).fit(master_df_clean.loc[:, hooke_umap_cols])

# labels are cast to strings before being passed as the color argument
cluster_labels = kmeans.labels_.astype(str)
fig = px.scatter_3d(
    master_df_clean,
    x="UMAP_hooke_00_3",
    y="UMAP_hooke_01_3",
    z="UMAP_hooke_02_3",
    color=cluster_labels,
)
fig.show()

In [None]:
# the same k-means labels (fit on hooke UMAP coordinates), shown in image UMAP space
cluster_labels = kmeans.labels_.astype(str)
fig = px.scatter_3d(
    master_df_clean,
    x="UMAP_00_bio_3",
    y="UMAP_01_bio_3",
    z="UMAP_02_bio_3",
    color=cluster_labels,
)
fig.show()

### Train multiple models and take the one that performs the best

In [None]:
# try removing tbxta embryos from the datasets to see if intra-cluster prediction can be improved
pert_id_train = master_df_clean.loc[train_indices,"master_perturbation"].to_numpy()
pert_id_test = master_df_clean.loc[test_indices,"master_perturbation"].to_numpy()

# boolean keep-masks, aligned by position with the train / test sets
not_tbxta_train = pert_id_train != "tbxta"
not_tbxta_test = pert_id_test != "tbxta"

# tbxta-free training rows
X_train3 = X_train.loc[not_tbxta_train]
Y_train3 = Y_train.loc[not_tbxta_train]

# tbxta-free held-out rows
X_test3 = X_test.loc[not_tbxta_test]
Y_test3 = Y_test.loc[not_tbxta_test]

In [None]:
# Sweep over random seeds and hidden-layer widths; every fitted model is kept together with
# its settings and score in four parallel lists.
random_state_vec = [1, 2, 3]
hl_size_vec = [250, 500, 1000]
score_vec = []
sz_vec = []
rs_vec = []
mdl_vec = []
for r in random_state_vec:
    for sz in hl_size_vec:
        # use a loop-local name so the sweep does not overwrite the `regr2` model that the
        # earlier scoring and plotting cells refer to
        candidate = MLPRegressor(random_state=r, max_iter=5000, hidden_layer_sizes=(sz,))
        candidate.fit(X_train3, Y_train3)
        # NOTE(review): candidates are ranked by their score on the rows they were fit on,
        # which tends to favor the most over-fit model; consider a validation split or
        # cross-validation on the training rows instead.
        reg_score_train = candidate.score(X_train3, Y_train3)

        score_vec.append(reg_score_train)
        sz_vec.append(sz)
        rs_vec.append(r)
        mdl_vec.append(candidate)



In [None]:
# position and value of the highest score recorded in the sweep
score_arr = np.asarray(score_vec)
best_ind = score_arr.argmax()
best_score = score_arr.max()

print(best_ind, best_score, sep="\n")

In [None]:
# evaluate the selected model on the tbxta-free held-out rows
mdl = mdl_vec[best_ind]
Y_test_pd3 = mdl.predict(X_test3)
reg_score_test3 = mdl.score(X_test3, Y_test3)

print(reg_score_test3)

# hidden-layer size of the selected model (displayed as the cell output)
sz_vec[best_ind]

### What happens if we use the full latent space?

In [None]:
# extract hooke latents: the columns named after the rows of raw_cell_counts_df
latent_cols = raw_cell_counts_df.index.tolist()
hook_latent_array = master_df_clean[latent_cols].to_numpy()

# remove tbxta
pert_id_train = master_df_clean.loc[train_indices,"master_perturbation"].to_numpy()
pert_id_test = master_df_clean.loc[test_indices,"master_perturbation"].to_numpy()
not_tbxta_train = pert_id_train != "tbxta"
not_tbxta_test = pert_id_test != "tbxta"

# row ids that survive the tbxta filter
train_keep = train_indices[not_tbxta_train]
test_keep = test_indices[not_tbxta_test]

# make test and train sets: full latent vectors in, image UMAP coordinates out
morph_umap_cols = ["UMAP_00_bio_3", "UMAP_01_bio_3", "UMAP_02_bio_3"]

X_train4 = hook_latent_array[train_keep, :]
X_test4 = hook_latent_array[test_keep, :]

Y_train4 = master_df_clean.loc[train_keep, morph_umap_cols].to_numpy()
Y_test4 = master_df_clean.loc[test_keep, morph_umap_cols].to_numpy()

In [None]:
# MLP on the full latent vectors; same settings as the 500-unit model fit on UMAP coordinates
regr4_params = dict(random_state=1, max_iter=5000, hidden_layer_sizes=(500,))
regr4 = MLPRegressor(**regr4_params)
regr4.fit(X_train4, Y_train4)

In [None]:
# regr4 scores: the rows it was fit on first, then the held-out rows
reg_score_train = regr4.score(X_train4, Y_train4)
reg_score_test = regr4.score(X_test4, Y_test4)

print(reg_score_train, reg_score_test, sep="\n")

## What about morph-to-seq?

In [None]:
# reverse direction: image UMAP coordinates become the inputs and hooke UMAP coordinates
# the targets (tbxta-free sets, roles swapped)
X_train5, Y_train5 = Y_train3, X_train3
X_test5, Y_test5 = Y_test3, X_test3

# only the first target column is fit here
# NOTE(review): max_iter is left at the library default, unlike the other models in this
# notebook (max_iter=5000) -- confirm this is intentional.
regr5 = MLPRegressor(random_state=1, hidden_layer_sizes=(5000,))
regr5.fit(X_train5, Y_train5.iloc[:, 0])

In [None]:
# regr5 scores against the first target column: fit rows first, then held-out rows
first_target_train = Y_train5.iloc[:, 0]
first_target_test = Y_test5.iloc[:, 0]

reg_score_train = regr5.score(X_train5, first_target_train)
reg_score_test = regr5.score(X_test5, first_target_test)

print(reg_score_train, reg_score_test, sep="\n")

In [None]:
# display (n_rows, n_columns) of the morph-to-seq training inputs
X_train5.shape

In [None]:
# show every linked row whose image well is "D09"
in_well_d09 = master_df_clean["image_well"] == "D09"
master_df_clean.loc[in_well_d09, :]

In [None]:
# note images with potential QC issues; entries are "<experiment date>.<image well>"
# NOTE(review): the seventh entry originally read "20230832.D06", which is not a valid date;
# it sits between 20230831 entries and is assumed to be 20230831 -- confirm.
qc_list = ["20230830.E02", "20230831.A01", "20230831.A06", "20230831.B03", "20230831.C06",
           "20230831.D02", "20230831.D06", "20230831.E06", "20230831.F05",
           "20231208.G07"]
# issue recorded for each entry of qc_list, in the same order
code = ["dorsal", "dorsal", "dorsal", "segmentation", "dorsal",
        "saturation", "dorsal", "saturation", "segmentation (bad)", "dorsal"]

# the two lists are parallel; fail loudly if an edit puts them out of step
assert len(qc_list) == len(code)

In [None]:
# note a few particularly severe lmx embryos
# each pair is ("<experiment date>.<image well>", phenotype score); keeping them side by
# side keeps the two derived sequences aligned
lmx_phenotype_pairs = [
    ("20230830.A01", 0), ("20230830.A02", 2.5), ("20230830.D02", 2.5), ("20230830.E01", 4.5),
    ("20230830.G01", 1), ("20230831.A05", 2.5), ("20230831.B01", 4.5), ("20230831.B05", 3),
    ("20230831.C05", 5), ("20230831.D01", 0.5), ("20230831.H06", 3), ("20231207.G02", 1),
    ("20231208.A02", 3), ("20231208.B09", 3.5), ("20231208.C08", 3), ("20231208.D01", 1.5),
]
lmx_list = [snip_label for snip_label, _ in lmx_phenotype_pairs]
phenotype = np.asarray([score for _, score in lmx_phenotype_pairs])

In [None]:
# For each listed embryo, find its first matching row in master_df_clean.
# ref_ids collects the row positions found; from_ids collects the lmx_list positions that
# had a match (entries without a match are skipped).
ref_ids = []
from_ids = []
for i, snip_label in enumerate(lmx_list):
    meta_vec = snip_label.split(".")
    is_match = (master_df_clean["experiment_date"] == meta_vec[0]) & (master_df_clean["image_well"] == meta_vec[1])
    hit_positions = np.flatnonzero(is_match)
    if hit_positions.size > 0:
        ref_ids.append(hit_positions[0])
        from_ids.append(i)

In [None]:
# faint background: every linked embryo in image UMAP space, colored by perturbation
fig = px.scatter_3d(master_df_clean, x="UMAP_00_bio_3", y="UMAP_01_bio_3", z="UMAP_02_bio_3",
                    color="master_perturbation", opacity=0.2)

# scores of the listed embryos that were found; the explicit integer dtype keeps the
# indexing valid even when nothing matched (an empty list would otherwise become a float array)
phenotype_sub = phenotype[np.asarray(from_ids, dtype=int)]

# ref_ids hold row positions (they came from np.where), so select with .iloc; .assign
# returns a new frame instead of writing a column onto a slice of master_df_clean
mdf_sub = master_df_clean.iloc[ref_ids, :].assign(phenotype_score=phenotype_sub)

# overlay the scored embryos, colored by phenotype score
fig.add_trace(go.Scatter3d(x=mdf_sub.loc[:, "UMAP_00_bio_3"],
                           y=mdf_sub.loc[:, "UMAP_01_bio_3"],
                           z=mdf_sub.loc[:, "UMAP_02_bio_3"],
                           mode="markers", marker=dict(color=phenotype_sub)))
fig.show()

In [None]:
# the scored embryos only, in image UMAP space, one color per snip_id
fig = px.scatter_3d(
    mdf_sub,
    x="UMAP_00_bio_3",
    y="UMAP_01_bio_3",
    z="UMAP_02_bio_3",
    color="snip_id",
    opacity=1,
)
fig.show()

In [None]:
# faint background: every linked embryo in hooke UMAP space, colored by perturbation
fig = px.scatter_3d(master_df_clean, x="UMAP_hooke_00_3", y="UMAP_hooke_01_3", z="UMAP_hooke_02_3",
                    color="master_perturbation", opacity=0.2)

# scores of the listed embryos that were found; the explicit integer dtype keeps the
# indexing valid even when nothing matched (an empty list would otherwise become a float array)
phenotype_sub = phenotype[np.asarray(from_ids, dtype=int)]

# ref_ids hold row positions (they came from np.where), so select with .iloc; .assign
# returns a new frame instead of writing a column onto a slice of master_df_clean
mdf_sub = master_df_clean.iloc[ref_ids, :].assign(phenotype_score=phenotype_sub)

# overlay the scored embryos, colored by phenotype score
fig.add_trace(go.Scatter3d(x=mdf_sub.loc[:, "UMAP_hooke_00_3"],
                           y=mdf_sub.loc[:, "UMAP_hooke_01_3"],
                           z=mdf_sub.loc[:, "UMAP_hooke_02_3"],
                           mode="markers", marker=dict(color=phenotype_sub)))
fig.show()