#### This notebook adds temperature and PCA fields to the embryo metadata

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
from glob2 import glob

In [None]:
# Root of the morphseq project tree on this machine.
root = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"

# Directory holding the VAE output tables for this run. The trailing "" makes
# the path end with a separator so file names can be appended directly.
read_path = os.path.join(root, "results", "20250312", "vae_output", "")

# Directory for saved figures. This must be defined: the explained-variance
# plot later in the notebook writes "pc_plot.png" into fig_path.
fig_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/slides/morphseq/20250312/morph_metrics/"

In [None]:
# Load the per-embryo stats table and the UMAP table from read_path; in both
# files the first CSV column is used as the row index.
morph_df = pd.read_csv(os.path.join(read_path, "embryo_stats_df.csv"), index_col=0)
umap_df = pd.read_csv(os.path.join(read_path, "umap_df.csv"), index_col=0)

# Preview the first rows (rendered as the cell output in a notebook).
morph_df.head()

### We need to add temperature metadata from well map files

In [None]:
# Build a long-format (well_id, temp, experiment_date) table from the
# "temperature" sheet of each hotfish experiment's well-metadata workbook.

# Subset the embryo table to the hotfish (HF) experiments.
HF_experiments = np.asarray(['20240813_24hpf', '20240813_30hpf', '20240813_36hpf', '20240813_extras'])
hf_morph_df = morph_df.loc[np.isin(morph_df["experiment_date"], HF_experiments), :].reset_index()

# The well is taken as the second-to-last "_"-separated token of embryo_id.
# NOTE(review): assumes IDs end in "..._<well>_<embryo>" - confirm.
hf_morph_df["well_id"] = [eid.split("_")[-2] for eid in hf_morph_df["embryo_id"]]

# Plate row letters, indexed by the row position in the temperature sheet.
letter_index = np.asarray(["A", "B", "C", "D", "E", "F", "G", "H"])

# Load the plate maps, one workbook per experiment.
metadata_dir = os.path.join(root, "metadata", "well_metadata", "")
df_list = []
for exp in HF_experiments:
    # e.g. "<root>/metadata/well_metadata/20240813_24hpf_well_metadata.xlsx"
    file_path = os.path.join(metadata_dir, f"{exp}_well_metadata.xlsx")

    # First sheet column becomes the index, so the frame body is the plate grid.
    # NOTE(review): rows/columns are mapped to wells purely by position
    # (row 0 -> "A", column 0 -> "01"); the sheet's own labels are ignored.
    plate_df = pd.read_excel(file_path, sheet_name="temperature", index_col=0)

    n_rows, n_cols = plate_df.shape
    if n_rows > len(letter_index):
        raise ValueError(
            f"{file_path}: temperature sheet has {n_rows} rows but only "
            f"{len(letter_index)} row letters are defined"
        )

    # Row-major order: A01, A02, ..., B01, ... matching the flattened values.
    plate_wells = [
        f"{letter_index[r]}{c + 1:02}" for r in range(n_rows) for c in range(n_cols)
    ]
    plate_temps = plate_df.to_numpy().ravel()

    df_list.append(
        pd.DataFrame(
            {"well_id": plate_wells, "temp": plate_temps, "experiment_date": exp}
        )
    )

# One row per (experiment, well).
temp_df = pd.concat(df_list, ignore_index=True)

temp_df.head()

In [None]:
# Attach snip IDs to every plate well. This is a left merge, so wells with no
# matching embryo stay in the table with a missing snip_id, and a well with
# several snips is repeated once per snip. The printed shapes show the row
# count before and after the merge.
print(temp_df.shape)
temp_df = temp_df.merge(hf_morph_df.loc[:, ["experiment_date", "well_id", "snip_id"]], how="left", on=["experiment_date", "well_id"])
print(temp_df.shape)

# snip_id -> temperature lookup. Wells without a snip are dropped; if a snip_id
# were to appear more than once, the last occurrence wins.
snip_temps = (
    temp_df.dropna(subset=["snip_id"])
    .drop_duplicates(subset="snip_id", keep="last")
    .set_index("snip_id")["temp"]
)

# Update the main morph df in a single pass. Only rows whose snip_id is in the
# lookup are written, so temperature values already present on other rows are
# left untouched.
if "temperature" not in morph_df.columns:
    morph_df["temperature"] = np.nan
has_temp = morph_df["snip_id"].isin(snip_temps.index).to_numpy()
morph_df.loc[has_temp, "temperature"] = (
    morph_df.loc[has_temp, "snip_id"].map(snip_temps).to_numpy()
)

# umap df: drop any stale temperature column, then pull the refreshed values
# from morph_df by snip_id.
if "temperature" in umap_df.columns:
    umap_df = umap_df.drop(columns=["temperature"])
umap_df = umap_df.merge(morph_df.loc[:, ["snip_id", "temperature"]], how="left", on="snip_id")

#### Calculate PCA to complement UMAP info

In [None]:
from sklearn.decomposition import PCA

# Select the latent-mean columns by name: every "z_mu*" column, and within
# those the "z_mu_b*" and "z_mu_n*" subsets.
morph_cols = list(morph_df.columns)
full_cols = [name for name in morph_cols if "z_mu" in name]
bio_cols = [name for name in full_cols if "z_mu_b" in name]
nbio_cols = [name for name in full_cols if "z_mu_n" in name]

# Number of components to keep. The full and "b" decompositions share
# n_components_b; the "n" decomposition uses n_components_n.
n_components_b = 10
n_components_n = 5

# Fit one PCA per column set (full, then bio, then nbio).
pca_full = PCA(n_components=n_components_b).fit(morph_df[full_cols])
pca_bio = PCA(n_components=n_components_b).fit(morph_df[bio_cols])
pca_nbio = PCA(n_components=n_components_n).fit(morph_df[nbio_cols])

# Project the same rows onto the fitted components; each result has one row
# per morph_df row.
pca_array_full = pca_full.transform(morph_df[full_cols])
pca_array_bio = pca_bio.transform(morph_df[bio_cols])
pca_array_nbio = pca_nbio.transform(morph_df[nbio_cols])

In [None]:
# Plot the cumulative explained variance of pca_bio.
var_exp = pca_bio.explained_variance_ratio_
var_exp_c = np.cumsum(var_exp)

# var_exp_c[k] is the variance explained by the first k + 1 components, so the
# "number of PCs" axis has to start at 1 rather than 0.
n_pcs = np.arange(1, len(var_exp_c) + 1)

fig = px.line(x=n_pcs, y=var_exp_c)
fig.update_layout(xaxis=dict(title="number of PCs"),
                  yaxis=dict(title="total explained variance"))
fig.show()

# Make sure the output directory exists before writing the image.
os.makedirs(fig_path, exist_ok=True)
fig.write_image(os.path.join(fig_path, "pc_plot.png"))

In [None]:
# Column names for the PCA scores: "PCA_<two-digit component index>_<suffix>",
# with suffix all / bio / nbio for the three decompositions.
pca_cols_full = [f"PCA_{p:02}_all" for p in range(n_components_b)]
pca_cols_bio = [f"PCA_{p:02}_bio" for p in range(n_components_b)]
pca_cols_nbio = [f"PCA_{p:02}_nbio" for p in range(n_components_n)]

# Write each score matrix into the UMAP table under its column names.
# NOTE(review): the assignment is positional, so it assumes umap_df has the
# same rows in the same order as the table the PCAs were computed from.
for score_cols, scores in (
    (pca_cols_full, pca_array_full),
    (pca_cols_bio, pca_array_bio),
    (pca_cols_nbio, pca_array_nbio),
):
    umap_df.loc[:, score_cols] = scores

In [None]:
# Write both tables back to read_path under their original file names,
# replacing the CSVs that were loaded at the start of the notebook.
morph_df.to_csv(os.path.join(read_path, "embryo_stats_df.csv"))
umap_df.to_csv(os.path.join(read_path, "umap_df.csv"))