## Comparing VAE architectures
This notebook compares the performance of different VAE architectures. Specifically, we test how model depth (the number of convolutional layers) and the size of the latent space affect:
1. Image reconstruction quality
2. Model generalizability
3. Biological information content of the latent space

In [4]:
import os
from pythae.models import AutoModel
import numpy as np
import glob as glob
from functions.utilities import path_leaf

#### Get paths to data, figures, and latent space outputs

In [5]:
# Root of the morphseq project data. Set the MORPHSEQ_ROOT environment variable to run
# on another machine (e.g. "E:\\Nick\\Dropbox (Cole Trapnell's Lab)\\Nick\\morphseq\\");
# when it is unset, the original hard-coded path is used.
root = os.environ.get("MORPHSEQ_ROOT", "/Users/nick/Dropbox (Cole Trapnell's Lab)/Nick/morphseq/")

train_name = "20230815_vae"
# the trailing '' makes train_dir end with a path separator, so the glob pattern
# below can simply be appended to it
train_dir = os.path.join(root, "training_data", train_name, '')

# One entry per trained model: every directory under train_dir whose name contains "beta".
# Plain files that happen to match the pattern are skipped, because each model path is
# later listed with os.listdir().
model_path_list = sorted(p for p in glob.glob(train_dir + '*beta*') if os.path.isdir(p))
model_name_list = [path_leaf(m) for m in model_path_list]

# Cross-model comparison figures and summary tables are written here.
output_dir = os.path.join(train_dir, "figures_tc")
os.makedirs(output_dir, exist_ok=True)

#### Load the per-model results tables (train, eval, and test partitions)
- "Train" data were used to train the model
- "Eval" data were used to assess model during training
- "Test" data were untouched during training process

In [7]:
import pandas as pd

# Per-model results. Both lists are index-aligned with model_name_list; later cells
# rely on that alignment when they look entries up by position.
model_df_list = []
model_figpath_list = []

for model_name in model_name_list:

    mdir = os.path.join(train_dir, model_name)

    # Use the last run folder in sorted order. Only directories are considered, so a
    # stray file in the model folder cannot be mistaken for a training run.
    # NOTE(review): run folder names appear to end in a timestamp
    # (e.g. "BetaTCVAE_training_2023-09-04_09-25-29"), which would make the last one
    # the most recent run -- confirm.
    run_dirs = sorted(d for d in os.listdir(mdir) if os.path.isdir(os.path.join(mdir, d)))
    if not run_dirs:
        raise FileNotFoundError(f"No training run folders found in {mdir}")
    last_training = run_dirs[-1]

    m_fig_path = os.path.join(mdir, last_training, "figures")
    stats_path = os.path.join(m_fig_path, "embryo_stats_df.csv")

    # load data frame with results; name the offending file if it is empty, since the
    # bare pandas error does not say which model's table could not be parsed
    try:
        morph_df = pd.read_csv(stats_path, index_col=0)
    except pd.errors.EmptyDataError as err:
        raise pd.errors.EmptyDataError(
            f"Results table for model '{model_name}' is empty: {stats_path}"
        ) from err
    morph_df["model_name"] = model_name

    # append to both lists only after a successful load so they stay the same length
    model_figpath_list.append(m_fig_path)
    model_df_list.append(morph_df)

# long-format table with the rows of every model stacked on top of each other
master_df = pd.concat(model_df_list, axis=0, ignore_index=True)

EmptyDataError: No columns to parse from file

In [10]:
# Scratch cell: a hard-coded path to one model's embryo_stats_df.csv. The string is only
# displayed, not assigned or used by any other cell.
# NOTE(review): presumably the file that triggered the EmptyDataError recorded above -- confirm.
"/Users/nick/Dropbox (Cole Trapnell's Lab)/Nick/morphseq/training_data/20230815_vae/z10_bs032_ne100_depth05_beta01/BetaTCVAE_training_2023-09-04_09-25-29/figures/embryo_stats_df.csv"

"/Users/nick/Dropbox (Cole Trapnell's Lab)/Nick/morphseq/training_data/20230815_vae/z10_bs032_ne100_depth05_beta01/BetaTCVAE_training_2023-09-04_09-25-29/figures/embryo_stats_df.csv"

### Question 1: how does model architecture impact image reconstruction accuracy?
Compare the reconstruction MSE for images in the "test" set, which the models have never seen before.

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()

# One histogram of reconstruction error per model, coloured by data partition and
# saved into that model's own figure folder.
hist_labels = {"recon_mse": "reconstruction error (MSE)", "train_cat": "training class"}

for model_name, morph_df, m_fig_path in zip(model_name_list, model_df_list, model_figpath_list):

    fig = px.histogram(
        morph_df,
        x="recon_mse",
        color="train_cat",
        histnorm='probability density',
        title="Comparing image reconstruction quality: " + model_name,
        labels=hist_labels,
        template="plotly",
    )

    # draw the partitions on top of each other, semi-transparent, over a fixed MSE window
    fig.update_layout(barmode='overlay')
    fig.update_traces(opacity=0.5)
    fig.update_xaxes(range=[5000, 10000])

    hist_path = os.path.join(m_fig_path, "image_recon_hist.png")
    fig.write_image(hist_path)
    

Make boxplot that compares reconstruction MSE for each model

In [None]:
# Box plots of reconstruction error for every model side by side, coloured by partition.
box_labels = {
    "recon_mse": "reconstruction error (MSE)",
    "train_cat": "training class",
    "model_name": "model name",
}

fig = px.box(
    master_df,
    x="model_name",
    y="recon_mse",
    color="train_cat",
    title="Comparing image reconstruction quality",
    labels=box_labels,
)

# zoom the y axis in on a fixed MSE window
fig.update_yaxes(range=[6000, 7500])
fig.show()

box_path = os.path.join(output_dir, "image_recon_box.png")
fig.write_image(box_path)

In [None]:
# Mean and standard deviation of reconstruction error for each (model, partition) pair.
# Named aggregation produces the flat "recon_mse_mean" / "recon_mse_std" column names
# directly, so no MultiIndex has to be flattened afterwards.
recon_summary_df = master_df.groupby(['model_name', 'train_cat'], as_index=False).agg(
    recon_mse_mean=('recon_mse', 'mean'),
    recon_mse_std=('recon_mse', 'std'),
)

# abbreviated model label for the x axis, assembled from three slices of the full name
names_short = [
    full_name[0:4] + full_name[-7:-6] + full_name[-2:]
    for full_name in recon_summary_df["model_name"].values
]
recon_summary_df["model_name_short"] = names_short

scatter_labels = {
    "recon_mse_mean": "mean image reconstruction error (MSE)",
    "train_cat": "training class",
    "model_name_short": "model name",
}

fig = px.scatter(
    recon_summary_df,
    x="model_name_short",
    y="recon_mse_mean",
    color="train_cat",
    error_y="recon_mse_std",
    title="Comparing image reconstruction quality",
    labels=scatter_labels,
)

fig.update_traces(marker_size=15)
# place the partitions of one model next to each other instead of on top of each other
fig.update_layout(scattermode="group")
fig.update_yaxes(range=[6400, 7200])
fig.show()

scatter_path = os.path.join(output_dir, "image_recon_scatter.png")
fig.write_image(scatter_path)

In [None]:
# Flatten (name, statistic) MultiIndex column labels into "name_statistic" strings and
# display the summary table.
#
# Labels that are already plain strings are left untouched, so this cell is safe to
# re-run and safe to run after the columns have already been flattened. Without that
# guard, col[1] on a string label is a single character and '_'.join(col) rewrites
# e.g. "train_cat" as "t_r_a_i_n___c_a_t".
new_cols = []
for col in recon_summary_df.columns:
    if isinstance(col, tuple):
        # an empty second level marks a grouping column, which keeps its plain name
        new_cols.append('_'.join(col) if len(col[1]) > 0 else col[0])
    else:
        new_cols.append(col)

recon_summary_df.columns = new_cols
recon_summary_df

### Question 2: how well does image reconstruction generalize?
Now, look at "generalization penalties", i.e., the relative difference between the reconstruction MSE of the test set and that of the train set.

In [None]:
n_pairs = 2500  # number of random (train, test) image pairs drawn per model
gen_df_list = []

np.random.seed(326)  # fixed seed so the resampled pairs are reproducible

for m, model_name in enumerate(model_name_list):

    morph_df = model_df_list[m]

    # get vectors of recon values
    # Rows are selected with boolean masks. Passing positions from np.where() to .loc
    # (which is label-based) is only correct when the frame's index is exactly 0..n-1.
    train_mse_vec = morph_df.loc[morph_df["train_cat"] == "train", "recon_mse"].to_numpy()
    test_mse_vec = morph_df.loc[morph_df["train_cat"] == "test", "recon_mse"].to_numpy()

    # pair randomly chosen train and test images (sampling with replacement)
    train_pair_idx = np.random.choice(len(train_mse_vec), n_pairs, replace=True)
    test_pair_idx = np.random.choice(len(test_mse_vec), n_pairs, replace=True)

    # relative penalty of each pair: (test MSE - train MSE) / train MSE
    paired_train_mse = train_mse_vec[train_pair_idx]
    mse_diff_vec = (test_mse_vec[test_pair_idx] - paired_train_mse) / paired_train_mse

    gen_df_list.append(pd.DataFrame({"model_name": model_name, "mse_diff": mse_diff_vec}))

gen_df_long = pd.concat(gen_df_list, axis=0, ignore_index=True)

# abbreviated model label for the x axis, assembled from three slices of the full name
names_short = [n[0:4] + n[-7:-6] + n[-2:] for n in gen_df_long["model_name"].values]
gen_df_long["model_name_short"] = names_short

# make figure
fig = px.box(gen_df_long, x="model_name_short", y="mse_diff",
             title="Assessing model generalizability",
             labels=dict(mse_diff="(test MSE - train MSE)/(train MSE)",
                         model_name_short="model name"))

fig.update_yaxes(range=[-0.12, 0.18])
fig.show()
fig.write_image(os.path.join(output_dir, "image_recon_generalize_box.png"))

In [None]:
# fix a couple of mislabeled embryos: every row whose snip_id contains one of these
# tags gets its master_perturbation set to 'wck-AB'
gdf3_mislabel_list = ["20230627_A08", "20230627_C08", "20230627_G08"]

# The correction has to be applied to the per-model tables as well as to master_df.
# master_df is a concatenated copy, and the cells below read master_perturbation from
# model_df_list, so relabelling master_df alone would not reach the regression and
# classification analyses.
for results_df in [master_df] + list(model_df_list):
    snip_ids = results_df["snip_id"].astype(str)
    for gid in gdf3_mislabel_list:
        # plain substring match (regex=False), same test as `gid in snip_id`
        is_mislabeled = snip_ids.str.contains(gid, regex=False)
        results_df.loc[is_mislabeled, "master_perturbation"] = 'wck-AB'

### Question 3: how much biological information does the latent space contain?
We can get at this question by trying to predict key biological covariates from the values of each model's latent encodings (the `z_mu_*` columns). We will test both how much information is available overall (using an MLP) and how much of it is linearly decodable.

Let's look at the MLP first. We will use the "train" and "eval" partitions to fit the predictor and evaluate it on the "test" partition. We start with developmental age.

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn import linear_model
import statsmodels.api as sm
from scipy import stats

# test-partition R^2 of the age predictions, one entry per model
age_R2_vec_nonlin = []
age_R2_vec_lin = []

age_df_list = []

# How well can developmental age be predicted from each model's latent means?
for m, model_name in enumerate(tqdm(model_name_list)):

    morph_df = model_df_list[m]
    m_fig_path = model_figpath_list[m]
    partition = morph_df["train_cat"]

    # the regressors are fit on train + eval rows and scored on the test rows
    train_indices = np.where(partition.isin(["train", "eval"]))[0]
    test_indices = np.where(partition == "test")[0]

    # regression target: the predicted_stage_hpf column
    stage_col = morph_df["predicted_stage_hpf"]
    y_train = stage_col.iloc[train_indices].to_numpy().astype(float)
    y_test = stage_col.iloc[test_indices].to_numpy().astype(float)

    # predictors: every column whose name contains "z_mu_"
    mu_indices = [i for i, col_name in enumerate(morph_df.columns) if "z_mu_" in col_name]
    X_train = morph_df.iloc[train_indices, mu_indices].to_numpy().astype(float)
    X_test = morph_df.iloc[test_indices, mu_indices].to_numpy().astype(float)

    # ---- nonlinear: MLP regressor ----
    clf = MLPRegressor(random_state=1, max_iter=5000)
    clf.fit(X_train, y_train)
    y_test_pd = clf.predict(X_test)
    R2_nonlin = clf.score(X_test, y_test)
    age_R2_vec_nonlin.append(R2_nonlin)

    mlp_title = ("Predicted vs. Actual Developmental Ages: " +
                 model_name +
                 f" (MLP R^2={np.round(R2_nonlin,2)})")
    fig = px.scatter(x=y_test, y=y_test_pd, opacity=0.5, title=mlp_title)
    fig.update_xaxes(title_text='actual age (hpf)')
    fig.update_yaxes(title_text='predicted age (hpf)')
    fig.write_image(os.path.join(m_fig_path, "mlp_age_prediction.png"))

    # ---- linear: multivariate linear regression ----
    reg = linear_model.LinearRegression()
    reg.fit(X_train, y_train)
    R2_lin = reg.score(X_test, y_test)
    age_R2_vec_lin.append(R2_lin)
    y_test_pd_lin = reg.predict(X_test)

    mvr_title = ("Predicted vs. Actual Developmental Ages: " +
                 model_name +
                 f" (MVR R^2={np.round(R2_lin,2)})")
    fig = px.scatter(x=y_test, y=y_test_pd_lin, opacity=0.5, title=mvr_title)
    fig.update_xaxes(title_text='actual age (hpf)')
    fig.update_yaxes(title_text='predicted age (hpf)')
    fig.write_image(os.path.join(m_fig_path, "mvr_age_prediction.png"))

    # two-row summary for this model: one row per regressor type
    age_df = pd.DataFrame({
        "model_name": model_name,
        "reg_type": ["MLP", "MVR"],
        "R2": np.asarray([R2_nonlin, R2_lin]),
    })
    age_df_list.append(age_df)
    


In [None]:
# Stack the per-model R^2 summaries and compare the two regressor types in a grouped bar chart.
age_df_long = pd.concat(age_df_list, axis=0, ignore_index=True)

age_bar_labels = {
    "R2": "R squared",
    "model_name": "model name",
    "reg_type": "regression type",
}

fig = px.bar(
    age_df_long,
    x="model_name",
    y="R2",
    color="reg_type",
    barmode="group",
    title="Predicting developmental age from latent space",
    labels=age_bar_labels,
)

fig.update_yaxes(range=[0.3, 1])
fig.show()

age_bar_path = os.path.join(output_dir, "age_regression_bar.png")
fig.write_image(age_bar_path)

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn import linear_model
import statsmodels.api as sm
from scipy import stats


age_df_list_wt = []

# WT-only scores get their own lists so they are not mixed into the all-embryo
# age_R2_vec_nonlin / age_R2_vec_lin lists.
age_R2_vec_nonlin_wt = []
age_R2_vec_lin_wt = []

# Repeat the age regression using only wild-type ('wck-AB') embryos.
for m, model_name in enumerate(tqdm(model_name_list)):

    morph_df = model_df_list[m]
    m_fig_path = model_figpath_list[m]

    is_wck = (morph_df["master_perturbation"] == 'wck-AB').to_numpy()
    is_fit_row = morph_df["train_cat"].isin(["train", "eval"]).to_numpy()
    is_test_row = (morph_df["train_cat"] == "test").to_numpy()

    # Combining boolean masks gives the same row positions, in the same order, as
    # filtering one index list by membership in another, without the quadratic scan.
    train_indices = np.where(is_fit_row & is_wck)[0]
    test_indices = np.where(is_test_row & is_wck)[0]

    # extract target vector
    y_train = morph_df["predicted_stage_hpf"].iloc[train_indices].to_numpy().astype(float)
    y_test = morph_df["predicted_stage_hpf"].iloc[test_indices].to_numpy().astype(float)

    # extract predictor variables: every column whose name contains "z_mu_"
    mu_indices = [i for i in range(len(morph_df.columns)) if "z_mu_" in morph_df.columns[i]]
    X_train = morph_df.iloc[train_indices, mu_indices].to_numpy().astype(float)
    X_test = morph_df.iloc[test_indices, mu_indices].to_numpy().astype(float)

    ###################
    # run MLP regressor
    ###################
    clf = MLPRegressor(random_state=1, max_iter=5000).fit(X_train, y_train)
    y_test_pd = clf.predict(X_test)
    # calculate R^2
    R2_nonlin = clf.score(X_test, y_test)
    age_R2_vec_nonlin_wt.append(R2_nonlin)

    fig = px.scatter(x=y_test, y=y_test_pd, opacity=0.5,
                title="Predicted vs. Actual Developmental Ages (WT only): " +
                      model_name +
                      f" (MLP R^2={np.round(R2_nonlin,2)})")
    fig.update_xaxes(title_text='actual age (hpf)')
    fig.update_yaxes(title_text='predicted age (hpf)')

    # The "_wt_only" suffix keeps these figures from overwriting the all-embryo
    # mlp_age_prediction.png / mvr_age_prediction.png in the same folder.
    fig.write_image(os.path.join(m_fig_path, "mlp_age_prediction_wt_only.png"))

    ###################
    # Run multivariate linear regressor
    ###################

    reg = linear_model.LinearRegression().fit(X_train, y_train)

    R2_lin = reg.score(X_test, y_test)
    age_R2_vec_lin_wt.append(R2_lin)

    y_test_pd_lin = reg.predict(X_test)

    fig = px.scatter(x=y_test, y=y_test_pd_lin, opacity=0.5,
                    title="Predicted vs. Actual Developmental Ages (WT only): " +
                      model_name +
                      f" (MVR R^2={np.round(R2_lin,2)})")
    fig.update_xaxes(title_text='actual age (hpf)')
    fig.update_yaxes(title_text='predicted age (hpf)')
    fig.write_image(os.path.join(m_fig_path, "mvr_age_prediction_wt_only.png"))

    # two-row summary for this model: one row per regressor type
    age_df_wt = pd.DataFrame({
        "model_name": model_name,
        "reg_type": ["MLP", "MVR"],
        "R2": np.asarray([R2_nonlin, R2_lin]),
    })
    age_df_list_wt.append(age_df_wt)
    

In [None]:
# Stack the WT-only R^2 summaries and plot them as a grouped bar chart.
age_df_long_wt = pd.concat(age_df_list_wt, axis=0, ignore_index=True)

# abbreviated model label for the x axis, assembled from three slices of the full name
names_short = [
    full_name[0:4] + full_name[-7:-6] + full_name[-2:]
    for full_name in age_df_long_wt["model_name"].values
]
age_df_long_wt["model_name_short"] = names_short

wt_bar_labels = {
    "R2": "R squared",
    "model_name_short": "model name",
    "reg_type": "regression type",
}

fig = px.bar(
    age_df_long_wt,
    x="model_name_short",
    y="R2",
    color="reg_type",
    barmode="group",
    title="Predicting developmental age from latent space",
    labels=wt_bar_labels,
)

fig.update_yaxes(range=[0.3, 1])
fig.show()

wt_bar_path = os.path.join(output_dir, "age_regression_bar_wt_only.png")
fig.write_image(wt_bar_path)

### Next, examine whether latent space can distinguish between wt and gdf3 embryos

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

gdf3_df_list = []

# dedicated, seeded generator so the class-balancing subsample is reproducible
balance_rng = np.random.RandomState(326)

# For every model: train gdf3-vs-WT classifiers on the latent means and record test accuracy.
for m, model_name in enumerate(tqdm(model_name_list)):

    morph_df = model_df_list[m]

    # Compute the partitions from this model's own table. Reusing train_indices /
    # test_indices left over from an earlier cell would silently apply another
    # analysis's row selection (e.g. wild-type rows only) to every model here.
    train_indices = np.where(morph_df["train_cat"].isin(["train", "eval"]))[0]
    test_indices = np.where(morph_df["train_cat"] == "test")[0]

    # resample to balance the classes: keep every gdf3 training row and draw an
    # equally sized random subset of the wck-AB training rows
    gdf3_class_train = np.asarray(morph_df["master_perturbation"].iloc[train_indices])
    train_gdf3_sub_indices = np.where(gdf3_class_train=="gdf3")[0]
    train_wck_sub_indices = balance_rng.choice(np.where(gdf3_class_train=="wck-AB")[0],
                                               len(train_gdf3_sub_indices), replace=False)
    train_sub_indices = np.concatenate([train_gdf3_sub_indices, train_wck_sub_indices])

    # score on every test row that belongs to one of the two classes
    gdf3_class_test = np.asarray(morph_df["master_perturbation"].iloc[test_indices])
    test_sub_indices = np.where((gdf3_class_test=="wck-AB") | (gdf3_class_test=="gdf3"))[0]

    # extract predictor variables: every column whose name contains "z_mu_"
    mu_indices = [i for i in range(len(morph_df.columns)) if "z_mu_" in morph_df.columns[i]]
    X_train = morph_df.iloc[train_indices, mu_indices].to_numpy().astype(float)
    X_test = morph_df.iloc[test_indices, mu_indices].to_numpy().astype(float)

    ###################
    # run MLP classifier
    ###################
    clf = MLPClassifier(random_state=1, max_iter=5000).fit(X_train[train_sub_indices], gdf3_class_train[train_sub_indices])
    accuracy_nonlin = clf.score(X_test[test_sub_indices], gdf3_class_test[test_sub_indices])

    ###################
    # Run multivariate logistic classifier
    ###################
    clf_lin = LogisticRegression(random_state=0).fit(X_train[train_sub_indices], gdf3_class_train[train_sub_indices])
    accuracy_lin = clf_lin.score(X_test[test_sub_indices], gdf3_class_test[test_sub_indices])

    # two-row summary for this model: one row per classifier type
    gdf3_df = pd.DataFrame({
        "model_name": model_name,
        "reg_type": ["MLP", "MVLR"],
        "accuracy": np.asarray([accuracy_nonlin, accuracy_lin]),
    })
    gdf3_df_list.append(gdf3_df)

In [None]:
# Stack the per-model accuracies and compare the two classifier types in a grouped bar chart.
gdf3_df_long = pd.concat(gdf3_df_list, axis=0, ignore_index=True)

# Derive the short labels from this table's own model_name column. A names_short
# variable left over from another cell only fits if it happens to have the same
# length and row order as this frame.
gdf3_df_long["model_name_short"] = [
    n[0:4] + n[-7:-6] + n[-2:] for n in gdf3_df_long["model_name"].values
]

fig = px.bar(gdf3_df_long, x="model_name_short", y="accuracy", color="reg_type", barmode="group",
             title="Classification accuracy, gdf3 vs WT",
             labels=dict(model_name_short="model name",
                         reg_type="regression type"))


fig.update_yaxes(range=[0.5, 1])
fig.show()
fig.write_image(os.path.join(output_dir, "gdf3_classification_bar.png"))

In [None]:
# Write each summary table to its CSV file in the shared output directory.
summary_tables = {
    "age_prediction_summary.csv": age_df_long,
    "age_prediction_summary_wt_only.csv": age_df_long_wt,
    "gdf3_classification_summary.csv": gdf3_df_long,
}
for csv_name, summary_df in summary_tables.items():
    summary_df.to_csv(os.path.join(output_dir, csv_name))

In [None]:
best_mlp_model = "z50_bs032_ne100_depth06"
best_mvlr_model = "z50_bs032_ne100_depth05"

# age-bin edges (hpf) used by the binned summaries below: 30 bins, each 2 wide
age_vec = np.linspace(0, 60, 31)

############
# look at best nonlinear model first
############
# model_name_list is built from a '*beta*' glob, so every entry carries a beta suffix
# that the identifier above lacks; an exact comparison could never match. Match on the
# name prefix instead and fail with a readable message if nothing is found. If several
# beta variants share the prefix, the first one in model_name_list order is used.
md_ind = [i for i, name in enumerate(model_name_list)
          if name == best_mlp_model or name.startswith(best_mlp_model + "_")]
if len(md_ind) == 0:
    raise ValueError(f"No model matching '{best_mlp_model}' in model_name_list: {list(model_name_list)}")
morph_df = model_df_list[md_ind[0]]

train_indices = np.where((morph_df["train_cat"]=="train") | (morph_df["train_cat"]=="eval"))[0]
test_indices = np.where(morph_df["train_cat"] =="test")[0]

gdf3_class_train = np.asarray(morph_df["master_perturbation"].iloc[train_indices])

# resample to balance the classes (seeded so the subsample is reproducible)
balance_rng = np.random.RandomState(326)
train_gdf3_sub_indices = np.where(gdf3_class_train=="gdf3")[0]
train_wck_sub_indices = balance_rng.choice(np.where(gdf3_class_train=="wck-AB")[0],
                                           len(train_gdf3_sub_indices), replace=False)
train_sub_indices = np.concatenate([train_gdf3_sub_indices, train_wck_sub_indices])

gdf3_class_test = np.asarray(morph_df["master_perturbation"].iloc[test_indices])
test_sub_indices = np.where((gdf3_class_test=="wck-AB") | (gdf3_class_test=="gdf3"))[0]

# test_sub_indices are positions *within the test partition*, not row labels of
# morph_df, so select the test rows first and then index into that subset. This is
# the same two-step indexing used for X_test and gdf3_class_test.
time_vec_test = (morph_df["predicted_stage_hpf"].iloc[test_indices]
                 .to_numpy().astype(float)[test_sub_indices])

# extract predictor variables: every column whose name contains "z_mu_"
mu_indices = [i for i in range(len(morph_df.columns)) if "z_mu_" in morph_df.columns[i]]
X_train = morph_df.iloc[train_indices, mu_indices].to_numpy().astype(float)
X_test = morph_df.iloc[test_indices, mu_indices].to_numpy().astype(float)

###################
# run MLP classifier
###################
clf_mlp = MLPClassifier(random_state=1, max_iter=5000).fit(X_train[train_sub_indices], gdf3_class_train[train_sub_indices])

In [None]:
# sanity check: number of test-partition rows labelled wck-AB or gdf3
len(test_sub_indices)

In [None]:
# Log-probability ratio (true class vs. the other class) for every scored test row,
# plus a 0/1 flag marking rows where the classifier favours the true class.

# add a tiny offset so that a predicted probability of exactly 0 does not give log(0),
# then renormalise each row to sum to 1
test_probs_raw = clf_mlp.predict_proba(X_test[test_sub_indices]) + 1e-18
test_probs_raw = test_probs_raw / test_probs_raw.sum(axis=1, keepdims=True)
test_probs = np.log(test_probs_raw)

# Replace any NaN with the smallest finite log-probability. np.nanmin is required
# here: np.min returns NaN as soon as one NaN is present, which would turn the
# replacement into a no-op.
nan_mask = np.isnan(test_probs)
if nan_mask.any():
    test_probs[nan_mask] = np.nanmin(test_probs)

# look the probability columns up by class label rather than assuming their order
class_labels = list(clf_mlp.classes_)
wck_col = class_labels.index("wck-AB")
gdf3_col = class_labels.index("gdf3")

# true label of every scored row; test_sub_indices only contains wck-AB and gdf3 rows
true_labels = gdf3_class_test[test_sub_indices]
is_wck = true_labels == "wck-AB"

log_p_wck = test_probs[:, wck_col]
log_p_gdf3 = test_probs[:, gdf3_col]

# log P(true class) - log P(other class); positive when the classifier is right
log_ratio_vec = np.where(is_wck, log_p_wck - log_p_gdf3, log_p_gdf3 - log_p_wck)
right_vec = (log_ratio_vec > 0).astype(float)


In [None]:
# Classifier confidence (log-probability ratio) versus developmental age for each
# scored test row, with a binned average trend line on top.
df_logL = pd.DataFrame({"time": time_vec_test, "log_ratio": log_ratio_vec})
# dropna() returns a new frame, so the result must be assigned to take effect
df_logL = df_logL.dropna()

# mean log-ratio per age bin; bins with 5 or fewer rows stay NaN. The last slot is
# never filled because there are len(age_vec) - 1 bins.
logL_vec = np.full(age_vec.shape, np.nan)

for a in range(len(age_vec)-1):
    age_inds = np.where((time_vec_test>=age_vec[a]) & (time_vec_test<age_vec[a+1]))[0]
    if len(age_inds) > 5:
        logL_vec[a] = np.mean(log_ratio_vec[age_inds])

fig = px.scatter(df_logL, x="time", y="log_ratio", opacity=0.5,
                 labels=dict(time="predicted stage (hpf)",
                             log_ratio="log probability ratio (true class vs. other)"))

# age_vec + 1 is the midpoint of each bin when age_vec = np.linspace(0, 60, 31),
# i.e. when the bins are 2 wide
fig.add_trace(go.Scatter(x=age_vec+1, y=logL_vec, mode="lines", name="average trend"))

fig.show()

In [None]:
# Fraction of correctly classified test rows in each age bin; bins with 5 or fewer
# rows (and the unused final slot) are left as NaN.
frac_correct_vec = np.full(age_vec.shape, np.nan)

for a, (bin_start, bin_stop) in enumerate(zip(age_vec[:-1], age_vec[1:])):
    in_bin = (time_vec_test >= bin_start) & (time_vec_test < bin_stop)
    age_inds = np.where(in_bin)[0]
    if len(age_inds) > 5:
        frac_correct_vec[a] = np.mean(right_vec[age_inds])

fig = px.scatter(x=age_vec+1, y=frac_correct_vec, opacity=0.5)

fig.show()

In [None]:
# scratch check: row positions that fell into the last age bin processed by the most
# recent binning loop
age_inds

In [None]:
# scratch check: mean log-ratio per age bin (NaN where a bin had 5 or fewer rows, and
# in the final slot, which the binning loop never fills)
logL_vec