# Baseline results: Single visit models  

### Train on the train subset of ses2 data from subjects with two visits. Test on ses2 and ses3 scans from test_metadata subjects.

In [2]:
import sys
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

import seaborn as sns
import matplotlib.pyplot as plt

sys.path.append("../")
from src.utils import *

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Root of the project data tree. It keeps its trailing slash, so every path
# derived from it appends directly (no leading "/"), which avoids "//".
data_dir = "/home/nikhil/projects/brain_changes/data/"
exp_setup_dir = f"{data_dir}exp_setup"

# Per-dataset experiment-setup directories (both end with "/").
ukbb_dir = f"{exp_setup_dir}/ukbb/"
adni_dir = f"{exp_setup_dir}/adni/"

# UKB Paths
ukbb_demographics_csv = f"{ukbb_dir}ukbb_exp_data_demographics.csv"
ukbb_DKT_csv = f"{ukbb_dir}ukbb_exp_data_DKT.csv"
ukbb_exp_setup_csv = f"{ukbb_dir}ukbb_exp_setup_shortterm.csv"

# ADNI Paths
adni_demographics_csv = f"{adni_dir}adni_exp_data_demographics.csv"
adni_DKT_csv = f"{adni_dir}adni_exp_data_DKT.csv"

# Directory holding the UKB region/field-ID lookup tables (ends with "/").
ukbb_field_ids_dir = f"{data_dir}ukbb/region_field_ids/"

# DKT fields and labels
ukbb_dkt_fields = f"{ukbb_field_ids_dir}UKBB_FS_DKT_Fields.csv"
ukbb_dkt_ct_fields = f"{ukbb_field_ids_dir}UKBB_DKT_CT_Fields.csv"

# ASEG fields and labels
ukbb_aseg_fields = f"{ukbb_field_ids_dir}UKBB_FS_ASEG_Fields.csv"
ukbb_aseg_vol_fields = f"{ukbb_field_ids_dir}UKBB_ASEG_vol_Fields.csv"

### UKB short-term data
- labels: "age_at_ses2" --> age rounded to whole years
- labels: "age_at_ses2 (calc)" --> floating-point age calculated from years and months


In [16]:
# Global configs

# Which session(s) supply the input features: "ses-2" or "both".
session = "ses-2"
# Column used as the regression target: "age_at_ses2" or "age_at_ses2 (calc)".
age_col = "age_at_ses2 (calc)"

# When True, "sex" is appended to the feature columns. Doesn't help much.
use_sex_col = False

# Field IDs from the CT fields table, ordered by the "hemi" and "roi"
# columns and cast to str (they are used as column names further down).
DKT_fields_df = pd.read_csv(ukbb_dkt_ct_fields)
_fields_by_region = DKT_fields_df.sort_values(["hemi", "roi"])
_field_ids = _fields_by_region["Field ID"].astype(str)
DKT_fields_sorted = list(_field_ids.values)


In [17]:
# Load the experiment-setup table and pull out the subject IDs assigned to
# the "train" and "val" CV subsets.
ukbb_exp_setup_df = pd.read_csv(ukbb_exp_setup_csv)
_in_train = ukbb_exp_setup_df["CV_subset"] == "train"
_in_val = ukbb_exp_setup_df["CV_subset"] == "val"
train_ids = ukbb_exp_setup_df.loc[_in_train, "participant_id"].values
val_ids = ukbb_exp_setup_df.loc[_in_val, "participant_id"].values
print(f"n_train: {len(train_ids)}, n_val: {len(val_ids)}")

# Slice demographic cols; the ID column is renamed to "subject_id" so it can
# be used as the merge key against the DKT table.
demo_cols = ["participant_id", "sex", age_col]
_id_rename = {"participant_id": "subject_id"}
train_demographics_df = ukbb_exp_setup_df.loc[_in_train, demo_cols]
train_demographics_df = train_demographics_df.rename(columns=_id_rename)
val_demographics_df = ukbb_exp_setup_df.loc[_in_val, demo_cols]
val_demographics_df = val_demographics_df.rename(columns=_id_rename)

# Read DKT and split it by subset membership and session.
DKT_df = pd.read_csv(ukbb_DKT_csv)
_train_rows = DKT_df["subject_id"].isin(train_ids)
_val_rows = DKT_df["subject_id"].isin(val_ids)
_ses2_rows = DKT_df["session"] == "ses-2"
_ses3_rows = DKT_df["session"] == "ses-3"

train_ses2_DKT_df = DKT_df[_train_rows & _ses2_rows]
val_ses2_DKT_df = DKT_df[_val_rows & _ses2_rows]

train_ses3_DKT_df = DKT_df[_train_rows & _ses3_rows]
val_ses3_DKT_df = DKT_df[_val_rows & _ses3_rows]

if session == "ses-2":
    # Single-session features: use the ses-2 rows as they are.
    train_DKT_df, val_DKT_df = train_ses2_DKT_df, val_ses2_DKT_df
    feature_cols = DKT_fields_sorted
else:
    # Two-session features: join ses-2 and ses-3 rows per subject. pandas
    # suffixes the overlapping columns with "_x" (ses-2) and "_y" (ses-3).
    train_DKT_df = train_ses2_DKT_df.merge(train_ses3_DKT_df, on="subject_id")
    val_DKT_df = val_ses2_DKT_df.merge(val_ses3_DKT_df, on="subject_id")
    feature_cols = [
        f"{dkt_field}{suffix}"
        for suffix in ("_x", "_y")
        for dkt_field in DKT_fields_sorted
    ]

if use_sex_col:
    # Build a new list so DKT_fields_sorted itself is never modified.
    feature_cols = [*feature_cols, "sex"]
    print("Using sex column as input")

# merge with demographic df
train_df = train_DKT_df.merge(train_demographics_df, on="subject_id")
val_df = val_DKT_df.merge(val_demographics_df, on="subject_id")

# Feature matrix / target vector for the training subset.
X_train, y_train = train_df[feature_cols].values, train_df[age_col].values

print(f"shapes X_CV: {X_train.shape}, y_CV: {y_train.shape}")

# Feature matrix / target vector for the validation subset.
X_val, y_val = val_df[feature_cols].values, val_df[age_col].values

print(f"shapes  X_val: {X_val.shape}, y_val: {y_val.shape}")

print(f"train shape: {train_df.shape}, val shape: {val_df.shape}")

n_train: 3071, n_val: 342
Using sex column as input
shapes X_CV: (3071, 63), y_CV: (3071,)
shapes  X_val: (342, 63), y_val: (342,)
train shape: (3071, 67), val shape: (342, 67)


### Train models

In [19]:
# Number of parallel workers passed to the random forest.
n_jobs = 4

# Models to evaluate, keyed by the label written to the "model" column.
model_dict = {
    "Ridge": Ridge(),
    "RF": RandomForestRegressor(n_jobs=n_jobs, random_state=1),
}

# Collect one per-model frame and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the accumulated frame on every iteration.
_model_frames = []
for model_name, model_instance in model_dict.items():
    # get_brain_age_perf comes from src.utils; the meaning of each returned
    # value is presumably as the names suggest -- confirm against that module.
    CV_scores, y_pred, test_loss1, test_loss2, test_r1, test_r2 = get_brain_age_perf(
        X_train, y_train, X_val, y_val, model_instance
    )

    # One row per validation sample: target, prediction and error columns.
    df = pd.DataFrame()

    df[age_col] = y_val
    df[f"brainage_at_{session}"] = y_pred
    df["test_sq_err"] = test_loss1
    df["test_abs_err"] = np.abs(y_pred - y_val)
    df["test_r1"] = test_r1
    df["model"] = model_name

    val_mae = df["test_abs_err"].mean()
    print(f"model: {model_name}, val mae: {val_mae:4.3f}, correlation: {test_r1:4.3f}")

    _model_frames.append(df)

# Row indexes are kept as they were (no ignore_index), matching what the
# previous append-based accumulation produced. pd.concat raises on an empty
# list, so fall back to an empty frame when no model was run.
perf_df = pd.concat(_model_frames) if _model_frames else pd.DataFrame()

model: Ridge, val mae: 5.085, correlation: 0.531
model: RF, val mae: 5.458, correlation: 0.436


# features: ses-2 
    # without sex col
    # model: Ridge, val mae: 5.083, correlation: 0.531
    # model: RF, val mae: 5.436, correlation: 0.445

    # with sex col
    # model: Ridge, val mae: 5.085, correlation: 0.531
    # model: RF, val mae: 5.458, correlation: 0.436

# features: ses-2 and 3 
    # without sex col
    # model: Ridge, val mae: 4.826, correlation: 0.575

    # with sex col
    # model: Ridge, val mae: 4.820, correlation: 0.574