#### This notebook combines morphological and transcriptional embeddings for all embryos with 1-to-1 pairings from morphseq experiments

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
from glob2 import glob
from sklearn.cross_decomposition import CCA

### Set path variables to datasets

In [None]:
# load embryo_df for our current best model
# root = "/media/nick/hdd02/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
root = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
train_name = "20241107_ds"
model_name = "SeqVAE_z100_ne150_sweep_01_block01_iter030"
train_dir = os.path.join(root, "training_data", train_name, "")
output_dir = os.path.join(train_dir, model_name)

# get path to morph model: use the last entry (in sorted order) found under output_dir
model_run_paths = sorted(glob(os.path.join(output_dir, "*")))
if not model_run_paths:
    # fail with the offending directory instead of a bare IndexError on [-1]
    raise FileNotFoundError(f"No model output found under {output_dir}")
training_path = model_run_paths[-1]
# NOTE(review): dirname() returns the parent directory of training_path, not the
# name of its last path component; basename() may have been intended -- confirm.
training_name = os.path.dirname(training_path)
morph_read_path = os.path.join(training_path, "figures", "")

# set path to hooke projections
# (built from root so that switching root above also moves this path)
hooke_model_name = "bead_expt_linear"
latent_path = os.path.join(root, "seq_data", "emb_projections", "latent_projections", "")
hooke_model_path = os.path.join(latent_path, hooke_model_name, "")

# path to figures and data (created if it does not exist yet)
fig_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/slides/morphseq/20250312/morphseq_cca/"
os.makedirs(fig_path, exist_ok=True)

### Load morphVAE datasets

In [None]:
# read the per-embryo stats table written alongside the morph model figures
embryo_stats_path = os.path.join(morph_read_path, "embryo_stats_df.csv")
morph_df = pd.read_csv(embryo_stats_path, index_col=0)
# morph_umap_df = pd.read_csv(morph_read_path + "umap_df.csv", index_col=0)
# morph_umap_df = morph_umap_df.merge(morph_df.loc[:, ["snip_id", "embryo_id", "experiment_time"]], how="left", on=["snip_id"])
# morph_umap_df.head()

In [None]:
# read the master experiment log (latin1-encoded CSV) and preview its last rows
experiment_log_path = os.path.join(root, "metadata", "experiment_metadata.csv")
im_meta_df = pd.read_csv(experiment_log_path, encoding="latin1")
im_meta_df.tail(3)

### Load Hooke latent projections

In [None]:
# load full counts dataset
# hooke_counts_df = pd.read_csv(hooke_model_path + "combined_counts.csv", index_col=0)
# hooke_metadata_df = pd.read_csv(hooke_model_path + "combined_metadata.csv", index_col=0)
# hooke_latents_df = pd.read_csv(hooke_model_path + "latent_projections.csv", index_col=0)
# hooke_latents_df.head()

# sequencing-side embryo metadata
metadata_dir = os.path.join(root, "metadata")
ccs_meta_df = pd.read_csv(os.path.join(metadata_dir, "seq_embryo_df.csv"), index_col=0)

# experiment key: one row per distinct (expt, target, temp) combination
# NOTE(review): reset_index() without drop=True keeps the previous index as a
# column, so it is written to the CSV as well -- confirm this is intended.
exp_key_cols = ["expt", "target", "temp"]
exp_df = ccs_meta_df[exp_key_cols].drop_duplicates().reset_index()
exp_df.to_csv(os.path.join(metadata_dir, "experiment_key.csv"), index=False)

### Load plate maps to link morph and seq embryo embeddings

In [None]:
# plate layout used for indexing: 8 row letters x 12 column numbers
col_id_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
row_letter_list = ["A", "B", "C", "D", "E", "F", "G", "H"]

# zero-padded well names in row-major order: "A01", "A02", ..., "H12"
well_name_list = [f"{row}{col:02}" for row in row_letter_list for col in col_id_list]


def get_sheet_df(xlsx, sheet_name, colname, col_id_list=col_id_list, row_letter_list=row_letter_list):
    """Flatten one plate-layout sheet of a workbook into a single-column DataFrame.

    Parameters
    ----------
    xlsx : object exposing ``sheet_names`` and ``parse(name, index_col=0)``
        (e.g. a ``pandas.ExcelFile``).
    sheet_name : str
        Name of the sheet to read.
    colname : str
        Name given to the single output column.
    col_id_list, row_letter_list : list
        Column / row labels, used only when a placeholder plate has to be built
        because the sheet is missing or empty.

    Returns
    -------
    pandas.DataFrame
        One column named ``colname`` holding the sheet values in row-major
        order, indexed by row label + column label zero-padded to two
        characters (e.g. "A01").

    Special cases
    -------------
    * "image_to_hash_plate_map": falls back to the sheet
      "image_to_hash_plate_num" when the former is not in the workbook.
    * "temperature": when the sheet is not in the workbook, every well of the
      placeholder plate is set to 28.5.
    * any other sheet that parses to an empty table: a placeholder plate filled
      with NaN is used, so the wells read as missing rather than as arbitrary
      uninitialised numbers.
    """

    def _placeholder_plate(fill_value):
        # full plate with every well set to fill_value
        return pd.DataFrame(
            np.full((len(row_letter_list), len(col_id_list)), fill_value),
            columns=col_id_list,
            index=row_letter_list,
        )

    if sheet_name == "image_to_hash_plate_map":
        if "image_to_hash_plate_map" in xlsx.sheet_names:
            df = xlsx.parse(sheet_name, index_col=0)
        else:
            df = xlsx.parse("image_to_hash_plate_num", index_col=0)
    elif sheet_name == "temperature":
        if "temperature" in xlsx.sheet_names:
            df = xlsx.parse(sheet_name, index_col=0)
        else:
            df = _placeholder_plate(28.5)
    else:
        df = xlsx.parse(sheet_name, index_col=0)
        if df.size == 0:
            df = _placeholder_plate(np.nan)

    # row-major flattening; the labels below are generated in the same order
    data_vec = df.to_numpy().ravel()

    # plain string formatting rather than adding NumPy string arrays, which is
    # not supported by every NumPy version
    new_index = [f"{row}{str(col).zfill(2)}" for row in df.index for col in df.columns]

    sheet_df = pd.DataFrame(data_vec, columns=[colname], index=new_index)

    return sheet_df

In [None]:
from functools import reduce

morph_map_path = os.path.join(root, "metadata", "morphseq_maps", "")
plate_map_list = sorted(glob(morph_map_path + "*morph_well_metadata.xlsx"))
if not plate_map_list:
    # otherwise pd.concat below fails with a message that does not name the folder
    raise FileNotFoundError(f"No '*morph_well_metadata.xlsx' plate maps found in {morph_map_path}")

# read in and combine metadata
# sheet_list[i] is the workbook sheet that populates output column col_list[i]
sheet_list = ["genotype", "image_to_hash_map", "temperature", "image_to_hash_plate_map", "start_age_hpf", "image_notes", "morph_seq_qc"]
col_list =   ["genotype", "hash_well", "temperature", "hash_plate_num", "stage_hpf", "image_notes", "qc_flag"]

# compare dates as strings so a numerically-parsed start_date column still
# matches the experiment name taken from the file name
start_date_str = im_meta_df["start_date"].astype(str)

df_list = []
for map_path in plate_map_list:

    exp_name = os.path.basename(map_path).replace("_morph_well_metadata.xlsx", "")
    if "$" in exp_name:
        # presumably spreadsheet lock/temp files -- skipped, as before
        continue

    # extract one single-column frame per sheet; the workbook is closed on exit
    with pd.ExcelFile(map_path) as xl_temp:
        df_col_list = [get_sheet_df(xl_temp, sheet, col) for sheet, col in zip(sheet_list, col_list)]

    # inner-join all per-sheet frames on their shared well-name index
    df_temp = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), df_col_list)
    df_temp["experiment_date"] = exp_name
    df_temp = df_temp.reset_index(names=["image_well"])

    # add sci experiment name (first match in the experiment log)
    sci_expt = im_meta_df.loc[start_date_str == exp_name, "sci_experiment"]
    if sci_expt.empty:
        raise ValueError(f"No row with start_date == {exp_name!r} in the experiment log")
    df_temp["sci_expt"] = sci_expt.values[0]

    # add to list
    df_list.append(df_temp)

morphseq_df = pd.concat(df_list, axis=0, ignore_index=True)
# missing QC flags count as 0 and missing notes as empty text, so that the
# dropna() below only removes rows missing one of the other fields
morphseq_df["qc_flag"] = morphseq_df["qc_flag"].fillna(0)
morphseq_df.loc[morphseq_df["image_notes"].astype(str)=="nan", "image_notes"] = ""
print(morphseq_df.shape)
morphseq_df = morphseq_df.dropna().reset_index(drop=True)
print(morphseq_df.shape)
morphseq_df.head()

### Standardize a few columns

In [None]:
# cast plate number and QC flag to integers
for int_col in ("hash_plate_num", "qc_flag"):
    morphseq_df[int_col] = morphseq_df[int_col].astype(int)

# standardize control notation: remove the "wik-" and "ab-" substrings
morphseq_df["genotype"] = [
    genotype.replace("wik-", "").replace("ab-", "")
    for genotype in morphseq_df["genotype"].values.tolist()
]

# standardize hash well notation: two-character wells get a 0 inserted ("A1" -> "A01")
padded_wells = [
    well if len(well) != 2 else f"{well[0]}0{well[1]}"
    for well in morphseq_df["hash_well"].values.tolist()
]
morphseq_df["hash_well"] = padded_wells

# the seq data uses the un-padded form, so also keep a version with the 0 removed
morphseq_df["hash_well_seq"] = [
    well if well[1] != '0' else f"{well[0]}{well[-1]}"
    for well in padded_wells
]

# plate string: "P" followed by the plate number padded to two digits
morphseq_df["hash_plate_str"] = [
    f"P{plate_num:02}" for plate_num in morphseq_df["hash_plate_num"].values.tolist()
]

### Create ID variables to link to image and sequencing data

In [None]:
# imaging-side key: "<experiment_date>_<image_well>_e00_t0000"
date_and_well = morphseq_df["experiment_date"].str.cat(morphseq_df["image_well"], sep="_")
morphseq_df["snip_id"] = date_and_well + "_e00_t0000"

In [None]:
# sequencing-side key: "<sci_expt>_<hash_plate_str>_<hash_well_seq>"
morphseq_df["sample"] = morphseq_df["sci_expt"].str.cat(
    [morphseq_df["hash_plate_str"], morphseq_df["hash_well_seq"]],
    sep="_",
)
morphseq_df.head()

### Join on key metadata from each modality--we will not join on latent variables at this stage

In [None]:
# imaging-side metadata columns to attach, keyed on snip_id
morph_meta_cols = ["snip_id", "embryo_id", "short_pert_name", "medium", "phenotype", "train_cat", "predicted_stage_hpf"]
morph_meta_df = morph_df[morph_meta_cols]

# left join; the merge indicator is written straight into "has_morph_data"
morphseq_df = morphseq_df.merge(
    morph_meta_df,
    how="left",
    on="snip_id",
    indicator="has_morph_data",
)

# 1 where the snip_id was found in morph_df, 0 otherwise
morphseq_df["has_morph_data"] = morphseq_df["has_morph_data"].eq("both").astype(int)
morphseq_df.head()

In [None]:
# sequencing-side metadata columns to attach, keyed on sample
seq_meta_cols = ['sample', 'Size_Factor', 'n.umi', 'perc_mitochondrial_umis',
       'top_to_second_best_ratio', 'hash_umis', 'log.n.umi', 'mean_nn_time', 'collection_batch',
       'dis_protocol', 'fix_protocol', 'timepoint', 'drug_addition', 'stage',
       'dose', 'temp', 'cells_per_embryo','perturbation', 'target', 'strain']
seq_meta_df = ccs_meta_df[seq_meta_cols]

# left join; the merge indicator is written straight into "has_sci_data"
morphseq_df = morphseq_df.merge(
    seq_meta_df,
    how="left",
    on="sample",
    indicator="has_sci_data",
)

# 1 where the sample was found in the sequencing metadata, 0 otherwise
morphseq_df["has_sci_data"] = morphseq_df["has_sci_data"].eq("both").astype(int)


In [None]:
# write the combined metadata table (without the row index)
morphseq_meta_path = os.path.join(root, "metadata", "morphseq_metadata.csv")
morphseq_df.to_csv(morphseq_meta_path, index=False)

In [None]:
# np.mean(morphseq_df["_merge"]=="both")
# per experiment date: number of rows that did not match any sequencing sample
lacks_sci_data = morphseq_df["has_sci_data"] != 1
unmatched_dates = morphseq_df.loc[lacks_sci_data, "experiment_date"]
ei, ec = np.unique(unmatched_dates, return_counts=True)
print(ei)
print(ec)

In [None]:
# display the imaging-side key column
morphseq_df["snip_id"]

### Quick sanity check: do the stages match?

In [None]:
# scatter of the plate-map stage (stage_hpf) against the sequencing-side
# timepoint column, colored by experiment date
fig = px.scatter(morphseq_df, x="stage_hpf", y="timepoint", color="experiment_date")
fig.show()

In [None]:
# rows where a timepoint is present but differs from the plate-map stage
timepoint_present = morphseq_df["timepoint"].notna()
stage_differs = morphseq_df["stage_hpf"].ne(morphseq_df["timepoint"])
miss_flags = stage_differs & timepoint_present
morphseq_df[miss_flags]

In [None]:
# rows from experiment 20240510 that matched a sequencing sample
sci_flag = morphseq_df["has_sci_data"].eq(1)
from_20240510 = morphseq_df["experiment_date"].eq("20240510")
morphseq_df[sci_flag & from_20240510]

In [None]:
# list the columns of the combined table
morphseq_df.columns

In [None]:
# names of the sequencing metadata columns that contain "date"
d_cols = list(filter(lambda name: "date" in name, ccs_meta_df.columns))
d_cols

In [None]:
# distinct values of expt_seq_date, compared as strings
np.unique(ccs_meta_df["expt_seq_date"].astype(str))

In [None]:
# total vs. distinct sample ids on the sequencing side
seq_samples = seq_meta_df["sample"]
print(len(seq_samples))
print(len(np.unique(seq_samples)))

In [None]:
# total vs. distinct sample ids in the combined table
combined_samples = morphseq_df["sample"]
print(len(combined_samples))
print(len(np.unique(combined_samples)))

In [None]:
# sample ids that occur more than once, and the experiment dates they come from
si, sc = np.unique(morphseq_df["sample"], return_counts=True)
problem_samples = si[sc > 1]
is_problem_row = morphseq_df["sample"].isin(problem_samples)
np.unique(morphseq_df.loc[is_problem_row, "experiment_date"])

In [None]:
# row counts for the two LMX1B experiments: combined table first, then seq metadata
for frame, expt_col in ((morphseq_df, "sci_expt"), (ccs_meta_df, "expt")):
    for expt_name in ("LMX1B", "LMX1Bearly"):
        print(np.sum(frame[expt_col] == expt_name))

In [None]:
# distinct hash wells among sequencing rows whose target is "tbx6"
np.unique(ccs_meta_df.loc[ccs_meta_df["target"]=="tbx6", "hash_well"])

In [None]:
# hash wells of LMX1Bearly sequencing rows with timepoint >= 36
lmx1bearly_filter = ccs_meta_df["expt"]=="LMX1Bearly"
# NOTE(review): hw_filter is computed but not applied in the selection below -- confirm whether it should be
hw_filter = ccs_meta_df["hash_well"]=="A08"
time_filter = ccs_meta_df["timepoint"] >= 36
ccs_meta_df.loc[lmx1bearly_filter & time_filter, "hash_well"]