In [16]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import nibabel as nib
import random
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim

import dask.dataframe as dd

sys.path.append("../")
from src.utils import *
from src.LSN_roi import *

from models import dp_loss as dpl
from models import dp_utils as dpu

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Paths

In [17]:
# Locations of every input and output used by this notebook.
# Directory variables end with "/" so that file names can be appended directly.
project_dir = "../"
data_dir = "/home/nikhil/projects/brain_changes/data/ukbb/"

# FreeSurfer phenotype table (a follow-up subset, per the file name)
freesurfer_csv = f"{data_dir}imaging/freesurfer/ukb47552_followup_subset.csv"

# Train / test split metadata
train_csv = f"{project_dir}metadata/metadata_train.csv"
test_csv = f"{project_dir}metadata/metadata_test.csv"

# Field lookup table. project_dir already ends with "/", so no extra separator
# is inserted here (the previous form produced "..//metadata/...").
freesurfer_fields = f"{project_dir}metadata/ukbb_freesurfer_fields.txt"

# Destination for exported summary tables
summary_results_dir = "/home/nikhil/projects/brain_changes/brain-diff/results/summary/"



## Filter follow-up subjects using a Dask dataframe (pandas will crash on the full CSV)

In [18]:
# import dask.dataframe as dd

# # Grab eids with ses-3 data
# freesurfer_df = pd.read_csv(freesurfer_csv,usecols=["eid","26501-3.0"])
# freesurfer_eids = freesurfer_df[~freesurfer_df["26501-3.0"].isna()]["eid"]

# # Read entire CSV using dask
# freesurfer_df = dd.read_csv(freesurfer_csv)
# followup_freesurfer_df = freesurfer_df[freesurfer_df["eid"].isin(freesurfer_eids)].compute()

# # Save filtered df
# followup_freesurfer_df.to_csv(f"{data_dir}imaging/freesurfer/ukb47552_followup_subset.csv")

## Grab phenotype fields (e.g. Thickness, Volume, Area)

In [4]:
# Field lookup table: tab-separated, read for its "Field ID" and "Description" columns.
freesurfer_fields_df = pd.read_csv(freesurfer_fields, sep="\t")

# The first word of each description is used as the phenotype label.
# `n` is passed by keyword: pandas 2.0 made every str.split argument except
# `pat` keyword-only, so the positional form `split(" ", 1, ...)` raises there.
freesurfer_fields_df["phenotype"] = freesurfer_fields_df["Description"].str.split(" ", n=1, expand=True)[0]

# Rows whose first word is "Mean" are relabelled "Mean Thickness".
freesurfer_fields_df["phenotype"] = freesurfer_fields_df["phenotype"].replace({"Mean":"Mean Thickness"})

# Field IDs per phenotype group
CT_fields = freesurfer_fields_df[freesurfer_fields_df["phenotype"]=="Mean Thickness"]["Field ID"]
volume_fields = freesurfer_fields_df[freesurfer_fields_df["phenotype"]=="Volume"]["Field ID"]

print(f"number of CT fields: {len(CT_fields)}, volume fields: {len(volume_fields)}")

number of CT fields: 62, volume fields: 62


## Read DKT phenotypes (cortical thickness; volume fields currently disabled)

In [5]:
# Phenotypes to load: cortical-thickness fields only (volume fields are left out for now).
pheno_fields = CT_fields # + volume_fields

# Column names are "<field id>-2.0" for ses-2 and "<field id>-3.0" for ses-3.
field_ids = pheno_fields.astype(str)
pheno_cols_ses2 = [f"{field_id}-2.0" for field_id in field_ids]
pheno_cols_ses3 = [f"{field_id}-3.0" for field_id in field_ids]
usecols = ["eid", *pheno_cols_ses2, *pheno_cols_ses3]

print(f"reading {len(usecols)} columes")

freesurfer_df = pd.read_csv(freesurfer_csv, usecols=usecols)

# Collect the eids of every row with at least one missing ses-2 / ses-3 value ...
has_missing = freesurfer_df.isna().any(axis=1)
eid_missing_data = freesurfer_df.loc[has_missing, "eid"].values
print(f"number participants missing 2nd or 3rd ses freesurfer data: {len(eid_missing_data)}")

# ... and drop all rows belonging to those eids.
keep_rows = ~freesurfer_df["eid"].isin(eid_missing_data)
freesurfer_df = freesurfer_df[keep_rows]
freesurfer_eids = freesurfer_df["eid"].values

print(f"available freesurfer subjects: {len(freesurfer_eids)}")


# Optional global max scaling (disabled)
# pheno_max_val = np.max(freesurfer_df[pheno_cols_ses2 + pheno_cols_ses3].values)
# print(f"Max pheno val: {pheno_max_val}")
# freesurfer_df[pheno_cols_ses2 + pheno_cols_ses3] = freesurfer_df[pheno_cols_ses2 + pheno_cols_ses3] / pheno_max_val

freesurfer_df.head()

reading 125 columes
number participants missing 2nd or 3rd ses freesurfer data: 63
available freesurfer subjects: 3237


Unnamed: 0,eid,27174-2.0,27174-3.0,27175-2.0,27175-3.0,27176-2.0,27176-3.0,27177-2.0,27177-3.0,27178-2.0,...,27293-2.0,27293-3.0,27294-2.0,27294-3.0,27295-2.0,27295-3.0,27296-2.0,27296-3.0,27297-2.0,27297-3.0
0,1000635,2.786,2.874,2.91,2.852,2.275,2.307,3.389,3.379,2.836,...,2.387,2.435,2.967,2.958,2.707,2.628,2.229,2.142,2.875,2.75
1,1008391,3.191,2.875,3.08,3.037,2.273,2.143,3.004,2.571,2.846,...,2.588,2.592,3.073,2.859,2.839,2.77,3.086,3.322,3.255,3.003
2,1010129,2.329,1.87,2.836,2.798,1.995,1.943,3.302,3.193,2.812,...,2.376,2.395,3.081,3.086,2.993,2.945,3.016,3.032,3.193,3.168
3,1010994,2.785,2.581,2.671,2.603,2.06,1.819,3.144,3.225,2.793,...,2.513,2.416,2.919,2.881,2.654,2.581,2.088,2.161,2.912,2.866
4,1013774,2.963,3.191,2.617,2.856,2.035,2.05,2.751,3.385,2.83,...,2.274,2.403,3.041,3.008,2.635,2.737,2.527,2.696,2.983,3.263


In [6]:
def _load_split(split_csv):
    """Read one split's metadata CSV and inner-join it with `freesurfer_df` on eid.

    Returns the split's eid column, the set of those eids that also appear in
    `freesurfer_eids`, and the merged DataFrame.
    """
    meta_df = pd.read_csv(split_csv)
    split_eids = meta_df["eid"]
    split_eids_avail = set(split_eids).intersection(set(freesurfer_eids))
    merged_df = meta_df.merge(freesurfer_df, on="eid", how="inner")
    return split_eids, split_eids_avail, merged_df


train_eids, train_eids_avail, train_df = _load_split(train_csv)
test_eids, test_eids_avail, test_df = _load_split(test_csv)

print(f"train samples: {len(train_eids)}, freesurfer data available: {len(train_eids_avail)}, overlap: {len(train_df)}")
print(f"test samples: {len(test_eids)}, freesurfer data available: {len(test_eids_avail)}, overlap: {len(test_df)}")

train samples: 2145, freesurfer data available: 1909, overlap: 1909
test samples: 1057, freesurfer data available: 958, overlap: 958


In [15]:
# Mean and standard deviation of age at ses-3, pooled over the train and test splits.
ses3_ages = np.concatenate([train_df["age_at_ses3"].values, test_df["age_at_ses3"].values])
ses3_ages.mean(), ses3_ages.std()

(64.80013951866061, 7.208743672171675)

## Data-loaders

In [None]:
# n_samples = 50

batch_size = 10
transform = "random_swap" #only for training

# Training split: transform enabled, shuffled mini-batches.
train_dataset = UKBB_ROI_Dataset(train_df, pheno_cols_ses2, pheno_cols_ses3, transform=transform)
train_loader_kwargs = dict(batch_size=batch_size, shuffle=True)
train_dataloader = DataLoader(train_dataset, **train_loader_kwargs)

# Test split: no transform, one sample per batch, fixed order.
test_dataset = UKBB_ROI_Dataset(test_df, pheno_cols_ses2, pheno_cols_ses3, transform=None)
test_loader_kwargs = dict(batch_size=1, shuffle=False)
test_dataloader = DataLoader(test_dataset, **test_loader_kwargs)

# next(iter(train_dataloader))  # peek at one batch
print(f"len train dataset: {len(train_dataset)}, test dataset: {len(test_dataset)}")

## CUDA

In [None]:
# Use the first CUDA device when one is visible, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Matching map_location value: a callable that moves storages to CUDA, or the string 'cpu'.
map_location = (lambda storage, loc: storage.cuda()) if use_cuda else 'cpu'

## Train model 

In [None]:
# Hyper-parameters
n_epochs = 10
hidden_size = 10
lr = 0.005
input_size = len(pheno_cols_ses2)  # one input per ses-2 phenotype column

# model = LSN_FF(input_size,hidden_size=hidden_size)
model = LSN_FF_Linear(input_size, hidden_size=hidden_size)
model.train()

# Loss and optimiser (alternatives tried: nn.L1Loss(); optim.SGD(model.parameters(), lr=lr, momentum=0.5))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# NOTE(review): an earlier comment here mentioned a debug subset, but the call
# passes the full `train_dataloader`.
train_outputs = train(model, train_dataloader, optimizer, criterion, n_epochs)
model, batch_loss_df, epoch_loss_df, preds_df = train_outputs

In [None]:

# Draw the per-epoch training loss on the current axes, then show the first rows of the table.
loss_ax = plt.gca()
loss_ax.plot(epoch_loss_df)
epoch_loss_df.head()

## Test perf

In [None]:

# Evaluate on the held-out test split.
# The loader is built from `test_dataset`: it was previously built from
# `train_dataset`, so the "test" numbers below were computed on training data
# (with the training-time transform attached to that dataset).
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

eid_list, y_test_list, y_pred_list, test_loss1, test_loss2 = test(model, test_dataloader)

# Stack the per-sample outputs into 2-D arrays; column 0 and column 1 are the two timepoints.
y_test = np.squeeze(np.vstack(y_test_list))
y_pred = np.squeeze(np.vstack(y_pred_list))

# Pearson correlation between predicted and true values, per timepoint.
# NOTE(review): `stats` is not imported by name in this notebook; presumably it
# arrives through one of the `from src... import *` lines - confirm.
test_r1 = stats.pearsonr(y_pred[:,0],y_test[:,0])[0]
test_r2 = stats.pearsonr(y_pred[:,1],y_test[:,1])[0]

test_age_1 = y_test[:,0]
test_age_2 = y_test[:,1]

test_brainage_1 = y_pred[:,0] # for two timepoints y is a matrix
test_brainage_2 = y_pred[:,1]

# One row per evaluated subject; the scalar correlations are repeated on every row.
df = pd.DataFrame()
df["eid"] = eid_list
df["test_age_1"] = test_age_1
df["test_age_2"] = test_age_2
df["test_brainage_1"] = test_brainage_1
df["test_brainage_2"] = test_brainage_2
df["test_loss1"] = test_loss1
df["test_loss2"] = test_loss2
df["test_r1"] = test_r1
df["test_r2"] = test_r2

# Test loss is L1 not MSE
test_loss = df["test_loss1"].mean()
print(f"test_loss: {test_loss}")

In [None]:
# Show the per-subject evaluation table (ages, predicted brain ages, losses, correlations).
df

## Generate configs for CC runs

In [None]:
# Hyper-parameter grid for the batch runs: one row per
# (hidden_size, transform, phenotype) combination, with hidden_size varying slowest.
hidden_size_list = [10,50,100]
transform_list = [None, "random_swap"]
phenotype_list = ["CT"] #,"volume","both"

# The frame is built in one shot from a comprehension. Comprehension variables
# do not leak, so the notebook-level `hidden_size` and `transform` used by the
# training / data-loader cells are no longer overwritten by running this cell.
config_df = pd.DataFrame(
    [
        [cfg_hidden_size, cfg_transform, cfg_phenotype]
        for cfg_hidden_size in hidden_size_list
        for cfg_transform in transform_list
        for cfg_phenotype in phenotype_list
    ],
    columns=["hidden_size","transform","phenotype"],
)

print(config_df.shape)
config_df

In [None]:
# run_id = "run_1"
# config_path = f"../results/LSN_roi/configs/config_{run_id}.csv"
# config_df.to_csv(config_path)

## Plot batch runs

In [None]:
def collate_results(file_prefix, config_id_list):
    """Stack the per-config result CSVs of one run into a single DataFrame.

    Args:
        file_prefix: Path prefix shared by the files; config ``k`` is read from
            ``f"{file_prefix}_{k}.csv"``.
        config_id_list: Iterable of config ids to load.

    Returns:
        DataFrame holding the rows of every file in the order given, each row
        tagged with its source in a ``config_id`` column. Row indices are kept
        as read, so they repeat across configs. An empty DataFrame is returned
        when ``config_id_list`` is empty.
    """
    frames = []
    for config_id in config_id_list:
        _df = pd.read_csv(f"{file_prefix}_{config_id}.csv")
        _df["config_id"] = config_id
        frames.append(_df)

    # pd.concat raises on an empty list; keep the old "empty in, empty out" behaviour.
    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; one concat also avoids
    # re-copying the accumulated frame on every iteration.
    return pd.concat(frames)


In [None]:
# Which batch run to load, and the config ids it contains (0..5).
run_id = "run_3"
config_id_list = np.arange(6)

# File prefixes; collate_results appends "_<config_id>.csv" to each.
loss_csv = f"../results/LSN_roi/{run_id}/freesurfer_train_loss_config"
perf_csv = f"../results/LSN_roi/{run_id}/freesurfer_perf_config"

# The loss files carry the epoch number in an unnamed first column; give it a name.
loss_df = collate_results(loss_csv, config_id_list).rename(columns={"Unnamed: 0":"epoch"})
perf_df = collate_results(perf_csv, config_id_list)

perf_df.head()

## Plot learning curves

In [None]:
# One learning curve per config: epoch loss against epoch.
g = sns.lineplot(data=loss_df, x="epoch", y="epoch_loss", hue="config_id", palette="Set1")

## Plot prediction perf

In [None]:
plot_df = perf_df.copy()
# Per-row test loss: average of the two per-visit losses.
plot_df["test_loss"] = 0.5 * (plot_df["test_loss1"] + plot_df["test_loss2"])

# Select the column before aggregating. `.mean()` on the whole grouped frame
# averages every column and, from pandas 2.0 on, raises on non-numeric ones.
print(plot_df.groupby(["config_id", "visit_order"])["test_loss"].mean())

# Point plot of the test loss per config, split by visit order.
sns.set(font_scale=2)
with sns.axes_style("whitegrid"):
    g = sns.catplot(y="test_loss",x="config_id",hue="visit_order", height=3, aspect=3, kind="point", 
                    sharey=False, data=plot_df)

In [None]:
# Configs singled out for the detailed comparison below.
best_configs = [2,3]
is_best_config = plot_df["config_id"].isin(best_configs)
plot_df = plot_df.loc[is_best_config]

# Box plot of the test loss for the selected configs, split by visit order.
sns.set(font_scale=2)
box_kwargs = dict(kind="box", height=3, aspect=3, sharey=False)
with sns.axes_style("whitegrid"):
    g = sns.catplot(data=plot_df, x="config_id", y="test_loss", hue="visit_order", **box_kwargs)

In [None]:
# config 0: no data_aug, config 1: data aug
perf_df = perf_df[(perf_df["config_id"].isin(best_configs)) & (perf_df["visit_order"]=="B,F")].copy()
perf_df.loc[perf_df["config_id"]==best_configs[0], "data_aug"] = False
perf_df.loc[perf_df["config_id"]==best_configs[1], "data_aug"] = True

perf_df["model"] = "LSN"
perf_df["baseline_err"] = perf_df["brainage_at_ses2"] - perf_df["age_at_ses2"]
perf_df["followup_err"] = perf_df["brainage_at_ses3"] - perf_df["age_at_ses3"]
perf_df["brainage_delta"] = perf_df["brainage_at_ses3"] - perf_df["brainage_at_ses2"]
perf_df["chronoage_delta"] = perf_df["age_at_ses3"] - perf_df["age_at_ses2"]

perf_df["delta_err"] = perf_df["brainage_delta"] - perf_df["chronoage_delta"]

perf_df["Benjamin_Button"] = perf_df["brainage_at_ses3"] < perf_df["brainage_at_ses2"]

n_BBs = perf_df["Benjamin_Button"].sum()
print(f"n BBs: {n_BBs} ({100*n_BBs/len(plot_df):4.3f}%)")

perf_df_melt = perf_df.melt(id_vars=["eid", "model", "data_aug", "visit_order", "age_at_ses2", "brainage_at_ses2", "age_at_ses3", "brainage_at_ses3", "chronoage_delta", "brainage_delta"], 
              value_vars=['baseline_err', 'followup_err', 'delta_err'],
              var_name='err_type', value_name='error')

perf_df.head()

In [None]:
plot_df = perf_df_melt.copy()

# Magnitude of each signed error
plot_df["abs_error"] = np.abs(plot_df["error"]) 

# Mean absolute error per error type and augmentation setting. The column is
# selected before aggregating: the melted frame carries string columns
# ("model", "visit_order"), and `.mean()` over the whole grouped frame raises
# on those from pandas 2.0 on.
print(plot_df.groupby(["err_type","data_aug"])["abs_error"].mean())

sns.set(font_scale=2)
with sns.axes_style("whitegrid"):
    g = sns.catplot(x="err_type",y="abs_error", hue="data_aug", kind="bar", data=plot_df,aspect=2)

In [None]:
# Opt-in export of the melted error table; set the flag to True to write the CSV.
save_summary_results = False
if save_summary_results:
    summary_csv = f"{summary_results_dir}LSN_model_two_visit_train_two_visit_subset_test_two_visit_subset.csv"
    perf_df_melt.to_csv(summary_csv)

## Legacy plots

In [None]:
# Legacy view: per-subject change in predicted brain age for configs 2 and 3.
plot_df = perf_df.copy()
plot_df = plot_df[plot_df["config_id"].isin([2,3])]

# Average of the two per-visit MAEs. The second term was `test_MAE1` again,
# which made this column a copy of test_MAE1.
# NOTE(review): assumes the legacy result files carry a `test_MAE2` column
# alongside `test_MAE1` - confirm.
plot_df["test_MAE"] = 0.5 * (plot_df["test_MAE1"] + plot_df["test_MAE2"])
plot_df["brainage_diff"] = plot_df["test_brainage_2"] - plot_df["test_brainage_1"]
plot_df["chronoage_diff"] = plot_df["test_age_2"] - plot_df["test_age_1"]

with sns.axes_style("whitegrid"):
    fig, ax = plt.subplots(figsize=(20,10))
    g = sns.scatterplot(x="eid",y="brainage_diff",hue="visit_order", style="config_id",data=plot_df, ax=ax)

In [None]:
# Split the brain-age differences by visit order, then pair them per (eid, config_id).
diff_cols = ["eid","config_id","brainage_diff"]
plot_df1 = plot_df.loc[plot_df["visit_order"]=="B,F", diff_cols]
plot_df2 = plot_df.loc[plot_df["visit_order"]=="F,B", diff_cols]

# The merge suffixes name the two overlapping brainage_diff columns directly.
plot_df_long = plot_df1.merge(plot_df2, on=["eid","config_id"], suffixes=(" for B,F", " for F,B"))

with sns.axes_style("whitegrid"):
    g = sns.jointplot(data=plot_df_long, x="brainage_diff for B,F", y="brainage_diff for F,B", hue="config_id")

# Vector similarity

In [None]:
# Show the first rows of perf_df before computing the similarities below.
perf_df.head()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

# Per-subject 2-vectors: (age at visit 1, age at visit 2), true and predicted.
chrno_age = perf_df[["test_age_1","test_age_2"]].values
brain_age = perf_df[["test_brainage_1","test_brainage_2"]].values

# Cosine similarity (1 - cosine distance) between each subject's true and predicted vectors.
sim_list = [
    1 - distance.cosine(age_vec, brainage_vec)
    for age_vec, brainage_vec in zip(chrno_age, brain_age)
]

In [None]:
# Attach the similarities to a copy of perf_df for plotting.
plot_df = perf_df.assign(cosine_sim=sim_list)

# Point plot of cosine similarity per config, split by visit order.
sns.set(font_scale=2)
point_kwargs = dict(kind="point", height=3, aspect=3, sharey=False)
with sns.axes_style("whitegrid"):
    g = sns.catplot(data=plot_df, x="config_id", y="cosine_sim", hue="visit_order", **point_kwargs)

In [None]:
# Same data as the point plot, drawn as box plots.
sns.set(font_scale=2)
box_kwargs = dict(kind="box", height=3, aspect=3, sharey=False)
with sns.axes_style("whitegrid"):
    g = sns.catplot(data=plot_df, x="config_id", y="cosine_sim", hue="visit_order", **box_kwargs)