In [None]:
import os
import pandas as pd
import glob as glob

# Root of the morphseq data directory.
# The MORPHSEQ_ROOT environment variable, when set to a non-empty value,
# overrides the machine-specific default below so the notebook can be run on
# another machine without editing this cell.
root = os.environ.get("MORPHSEQ_ROOT") or "/Users/nick/Dropbox (Cole Trapnell's Lab)/Nick/morphseq/"
# Windows example:
# root = "E:\\Nick\\Dropbox (Cole Trapnell's Lab)\\Nick\\morphseq\\"

# read in metadata (first CSV column is used as the index)
metadata_path = os.path.join(root, 'metadata', '')
embryo_metadata_df = pd.read_csv(os.path.join(metadata_path, "embryo_metadata_df.csv"), index_col=0)

# directories holding the image snips and the mask snips
# (the trailing '' makes os.path.join end each path with a separator)
im_snip_dir = os.path.join(root, 'training_data', 'bf_embryo_snips', '')
mask_snip_dir = os.path.join(root, 'training_data', 'bf_embryo_masks', '')

# get sorted lists of snips and masks.
# glob.escape() keeps wildcard characters that may occur in the directory path
# (e.g. "[" or "?") from being interpreted as part of the pattern.
im_snip_files = sorted(glob.glob(glob.escape(im_snip_dir) + "*.tif"))
emb_mask_files = sorted(glob.glob(glob.escape(mask_snip_dir) + "emb*.tif"))

## Plot basic trends in embryo growth over time

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import plotly.offline as pyo
pyo.init_notebook_mode()

# filter out low-quality frames: keep only rows with use_embryo_flag == 1.
# A boolean mask plus a non-inplace reset_index() returns a fresh DataFrame, so
# adding columns to it below does not act on a filtered slice of
# embryo_metadata_df (which can raise SettingWithCopyWarning).
# reset_index() without drop=True keeps the previous index as a column.
use_mask = embryo_metadata_df["use_embryo_flag"] == 1
embryo_metadata_use = embryo_metadata_df.loc[use_mask].reset_index()

# per-frame identifier of the form "<embryo_id>_<time_int>"
embryo_metadata_use["snip_id"] = (
    embryo_metadata_use["embryo_id"] + "_" + embryo_metadata_use["time_int"].astype(str)
)

# plot surface area over time
fig = px.scatter(embryo_metadata_use, x='predicted_stage_hpf', y="surface_area_um", color="fraction_alive",
                 opacity=0.1, title="surface area vs. age")
fig.show()

In [None]:
# scatter of length_um against predicted_stage_hpf, one colour per genotype
fig = px.scatter(
    data_frame=embryo_metadata_use,
    x="predicted_stage_hpf",
    y="length_um",
    color="genotype",
    opacity=0.1,
    title="embryo length vs. age",
)
fig.show()

In [None]:
# scatter of length_um against predicted_stage_hpf, one colour per chem_perturbation
fig = px.scatter(
    data_frame=embryo_metadata_use,
    x="predicted_stage_hpf",
    y="length_um",
    color="chem_perturbation",
    opacity=0.5,
    title="embryo length vs. age",
)
fig.show()

In [None]:
# List the distinct labels in each perturbation column of the full metadata table.
# Series.unique() is used rather than np.unique(): np.unique sorts its input and
# raises TypeError on an object column that mixes strings with NaN (missing
# labels), whereas Series.unique() tolerates NaN and reports it as a value.
# Labels are listed in order of first appearance rather than sorted.
print(embryo_metadata_df["chem_perturbation"].unique())
print(embryo_metadata_df["genotype"].unique())

## Make a combined perturbation column

In [None]:
# Build one combined label per row: the chemical perturbation when one is
# recorded, otherwise the genotype.
embryo_metadata_use["master_perturbation"] = embryo_metadata_use["chem_perturbation"].astype(str)

# Missing chem labels show up as the string "nan" after astype(str); isna() is
# also checked for pandas versions whose string conversion keeps missing values.
chem_missing = (
    embryo_metadata_use["chem_perturbation"].isna()
    | (embryo_metadata_use["master_perturbation"] == "nan")
)

# A single .loc assignment writes into the DataFrame itself. The previous chained
# form (df[col].iloc[rows] = ...) assigns into an intermediate Series, which is
# not guaranteed to propagate back (SettingWithCopyWarning; no effect under
# pandas Copy-on-Write).
embryo_metadata_use.loc[chem_missing, "master_perturbation"] = \
    embryo_metadata_use.loc[chem_missing, "genotype"]

In [None]:
# sorted distinct values of the combined label column, shown as the cell output
np.unique(embryo_metadata_use["master_perturbation"].to_numpy())

In [None]:
# scatter of length_um against predicted_stage_hpf, one colour per master_perturbation
fig = px.scatter(
    data_frame=embryo_metadata_use,
    x="predicted_stage_hpf",
    y="length_um",
    color="master_perturbation",
    opacity=0.25,
    title="embryo length vs. age",
)
fig.show()

## Calculate a moving 2 hr average for each perturbation class

In [None]:
# Bootstrap the mean length_um of each perturbation class inside a window of
# +/- 1 around every integer value of predicted_stage_hpf in time_index.
perturbation_class_vec = np.unique(embryo_metadata_use["master_perturbation"])
time_index = range(8, 48)   # window centres, in units of predicted_stage_hpf
n_boots = 100               # bootstrap replicates per (time, class) cell
min_embryos = 10            # minimum number of rows in a window to bootstrap it
boot_seed = 0               # fixed seed so a re-run reproduces the same estimates
rng = np.random.default_rng(boot_seed)

# axes: (time point, perturbation class, replicate); windows with fewer than
# min_embryos rows are left as NaN
length_array_boot = np.full((len(time_index), len(perturbation_class_vec), n_boots), np.nan)

# calculate bootstrap averages
for p, perturbation in enumerate(perturbation_class_vec):
    # extract the length and stage vectors for this class
    p_indices = np.where(embryo_metadata_use["master_perturbation"] == perturbation)[0]
    length_vec = embryo_metadata_use["length_um"].iloc[p_indices].values
    time_vec = embryo_metadata_use['predicted_stage_hpf'].iloc[p_indices].values
    # iterate through time points
    for t, time in enumerate(time_index):
        in_window = (time_vec >= time - 1) & (time_vec <= time + 1)
        window_lengths = length_vec[in_window]
        if len(window_lengths) >= min_embryos:
            # draw all replicates at once: one row per replicate, each row a
            # with-replacement resample of the window, then average per row
            boot_samples = rng.choice(window_lengths, size=(n_boots, len(window_lengths)), replace=True)
            length_array_boot[t, p, :] = boot_samples.mean(axis=1)
                

In [None]:
# Collapse the replicate axis: mean and standard deviation across bootstrap
# replicates, as wide tables with one column per perturbation class.
avg_len_df = pd.DataFrame(length_array_boot.mean(axis=2), columns=perturbation_class_vec)
std_len_df = pd.DataFrame(length_array_boot.std(axis=2), columns=perturbation_class_vec)
avg_len_df["time"] = time_index
std_len_df["time"] = time_index

# Reshape to long format (columns: time, variable, value) and attach the spread
# to the means; the merge joins on the columns both tables share.
std_len_long = std_len_df.melt(id_vars="time")
avg_len_long = (
    avg_len_df.melt(id_vars="time")
    .rename(columns={"value": "mean_length_um"})
    .merge(std_len_long, how="left")
    .rename(columns={"value": "std_length_um"})
)

# mean length per class over time, with the bootstrap spread as error bars
fig = px.scatter(
    data_frame=avg_len_long,
    x="time",
    y="mean_length_um",
    color="variable",
    error_y="std_length_um",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.show()

**A few notes/observations:** 
- It looks like there is a significant issue with the staging for at least one of the FGF experiments: their lengths rise much faster than even the wck-AB control
- I suspect that the same staging issue is responsible for the early rise of the ethanol control and "WT" trends
- Is "WT" truly WT? Or did I mess up those labels?
- The shh-i treatment has a powerful effect, even at 1/4 dosage
- Meanwhile, fgf-i only seems to have an effect once I get to 1.5x.
- As expected, the gdf3 mutants also exhibit a pronounced decrease in embryo length

In [None]:
# same length-vs-stage scatter per master_perturbation, with a lowess trendline
fig = px.scatter(
    data_frame=embryo_metadata_use,
    x="predicted_stage_hpf",
    y="length_um",
    color="master_perturbation",
    opacity=0.25,
    title="embryo length vs. age",
    trendline="lowess",
)
fig.show()

## Figure out what drives outliers in surface area

In [None]:
# scatter of surface_area_um against predicted_stage_hpf, one colour per embryo_id
fig = px.scatter(
    data_frame=embryo_metadata_use,
    x="predicted_stage_hpf",
    y="surface_area_um",
    color="embryo_id",
    opacity=0.5,
    title="surface area vs. age",
)
fig.show()

In [None]:
from matplotlib import pyplot as plt
import cv2

# row positions with predicted_stage_hpf <= 20 and surface_area_um > 1e6
is_early = embryo_metadata_use["predicted_stage_hpf"].values <= 20
is_large = embryo_metadata_use["surface_area_um"].values > 1e6
sa_outlier_indices = np.flatnonzero(is_early & is_large)

# (re)build the per-frame identifier "<embryo_id>_<time_int>"
embryo_metadata_use["snip_id"] = (
    embryo_metadata_use["embryo_id"] + "_" + embryo_metadata_use["time_int"].astype(str)
)

# metadata rows of the flagged frames, and their snip ids
sa_outlier_df = embryo_metadata_use.iloc[sa_outlier_indices]

print(sa_outlier_df["snip_id"])


In [None]:
# pick the last flagged frame (the variable holds a snip id, not a bare embryo id)
embryo_id = sa_outlier_df["snip_id"].iloc[-1]
print(embryo_id)

# positions in im_snip_files whose path contains that id; load the first match
embryo_snip_indices = [
    file_pos for file_pos, snip_path in enumerate(im_snip_files) if embryo_id in snip_path
]
first_match_path = im_snip_files[embryo_snip_indices[0]]
im_test = cv2.imread(first_match_path)

# display the loaded image, cast to int as before
plt.imshow(im_test.astype(int))
plt.show()

In [None]:
# Largest surface_area_um in the table, floored; rows above 99% of this value
# are the ones sitting at (or right next to) the maximum.
sa_ceil = np.floor(np.max(embryo_metadata_use["surface_area_um"]))

# row positions with predicted_stage_hpf <= 40 and surface area within 1% of the maximum
sa_outlier_indices2 = np.where((embryo_metadata_use["predicted_stage_hpf"].values <= 40) & \
                               (embryo_metadata_use["surface_area_um"].values > sa_ceil*0.99))[0]

sa_outlier_df2 = embryo_metadata_use.iloc[sa_outlier_indices2]

# inspect the fourth such frame; print the id that is actually displayed below
# (this previously printed embryo_id, a stale value left over from an earlier cell)
embryo_id2 = sa_outlier_df2["snip_id"].iloc[3]
print(embryo_id2)

# positions in im_snip_files whose path contains that id; load the first match
embryo_snip_indices = [i for i in range(len(im_snip_files)) if embryo_id2 in im_snip_files[i]]
im_test = cv2.imread(im_snip_files[embryo_snip_indices[0]])

plt.imshow(im_test.astype(int))
plt.show()

In [None]:
# summary values for the fourth row of sa_outlier_df2
# (column names suggest seconds -> hours and um -> mm conversions; confirm units)
rel_time_scaled = sa_outlier_df2["Time Rel (s)"].iloc[3] / 3600
predicted_stage = sa_outlier_df2["predicted_stage_hpf"].iloc[3]
length_scaled = sa_outlier_df2["length_um"].iloc[3] / 1000

print(rel_time_scaled)
print(predicted_stage)
print(length_scaled)

### Let's look at some weirdly small wck-AB embryos

In [None]:
# row positions of "wck-AB" frames with predicted_stage_hpf >= 30 and length_um <= 2000
is_late = embryo_metadata_use["predicted_stage_hpf"].values >= 30
is_wck = embryo_metadata_use["master_perturbation"].values == "wck-AB"
is_short = embryo_metadata_use["length_um"].values <= 2000
wck_outlier_indices = np.flatnonzero(is_late & is_wck & is_short)

wck_outlier_df = embryo_metadata_use.iloc[wck_outlier_indices]

# pick one flagged frame by position (the variable holds a snip id)
embryo_id = wck_outlier_df["snip_id"].iloc[350]
print(embryo_id)

# positions in im_snip_files whose path contains that id; load the first match
embryo_snip_indices = [
    file_pos for file_pos, snip_path in enumerate(im_snip_files) if embryo_id in snip_path
]
first_match_path = im_snip_files[embryo_snip_indices[0]]
im_test = cv2.imread(first_match_path)

plt.imshow(im_test.astype(int))
plt.show()

In [None]:
# inspect a single hand-picked embryo
embryo_id = "20230627_D06_e00"

# positions in im_snip_files whose path contains the embryo id;
# display the match at list position 20
embryo_snip_indices = [
    file_pos for file_pos, snip_path in enumerate(im_snip_files) if embryo_id in snip_path
]
chosen_path = im_snip_files[embryo_snip_indices[20]]
im_test = cv2.imread(chosen_path)

plt.imshow(im_test.astype(int))
plt.show()

In [None]:
# metadata row(s) whose snip_id is the current embryo_id with the suffix "_25"
target_snip_id = embryo_id + "_25"
weird_indices = np.where(embryo_metadata_use["snip_id"].values == target_snip_id)
embryo_metadata_use.iloc[weird_indices]

In [None]:
# names of the metadata columns that contain the substring 'pin'
i = [col_name for col_name in embryo_metadata_use.columns if 'pin' in col_name]
print(i)

**Notes on outliers:**
- The early SA outliers look like they are mostly merged embryo masks
- The current SA cap I have for all snips is too low for later time points
- The fix is to use my data to develop a time-dependent SA bound
- Unusually small wck-AB embryos seem to be either (i) true morphological outliers, (ii) embryos in a curved pose (likely due to well constraints), or (iii) dead/dying embryos