# Hypothesis #1

Research Question:
If the distribution of ages in training and test sets is causing the poor generalizability of both complex and simple models, then both simple and complex models trained on datasets with similar age distributions to our original training dataset will perform worse than chance when tested on samples from new datasets that mimic the age distribution of our original test dataset.

## Setup

In [5]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
from datetime import date
import re
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [4]:
# project directory
project_dir = Path('/data/NNDSP')

# NNDSP data directories
nndsp_bids_dir = Path('/data/NNDSP/bids_2017_07_14_generic')
nndsp_fs_dir = Path('/data/NNDSP/derivatives/fs5.3_subj')
nndsp_pheno_file = Path('/data/NNDSP/anal/analysis_notebooks/phenotype_files/NNDSP_famid.csv')

# HCP data directories
hcp_bids_dir = Path('/data/HCP/HCP_900/s3/hcp')
hcp_fs_dir = Path('/data/NNDSP/derivatives/fs_hcp_subj')
hcp_pheno_file = Path('/data/NNDSP/nino/HCP_ages.csv')

# NKI data directories
nki_bids_dir = Path('/data/NNDSP/anal/NKI')
nki_fs_dir = Path('/data/NNDSP/derivatives/fs_nki_subj')
nki_pheno_file = Path('/data/NNDSP/anal/analysis_notebooks/phenotype_files/participants.tsv')

# CoRR data directories
corr_bids_dir = Path('/data/DSST/CoRR/bids_corr')
corr_fs_dir = Path('/data/DSST/CoRR/fs_corr')
corr_pheno_file = Path('/data/DSST/CoRR/phenotype_files/corr_ages.csv')

# SALD data directories
sald_bids_dir = Path('/data/DSST/SALD/bids_sald')
# sald_fs_dir = Path('/data/NNDSP/derivatives/fs5.3_subj')
sald_pheno_file = Path('/data/DSST/SALD/phenotype_files/sub_information.xlsx')

## NNDSP Subject Data

In [45]:
# merge the subjects that we have fs and add subject number
df_nndsp = pd.DataFrame({'subj_paths' : [x.as_posix() for x in nndsp_fs_dir.glob('sub-*')]})
df_nndsp = df_nndsp.assign(MASKID = [int(Path(x).name[4:]) for x in df_nndsp.subj_paths])
df_nndsp.head()

Unnamed: 0,subj_paths,MASKID
0,/data/NNDSP/derivatives/fs5.3_subj/sub-1889,1889
1,/data/NNDSP/derivatives/fs5.3_subj/sub-2011,2011
2,/data/NNDSP/derivatives/fs5.3_subj/sub-1948,1948
3,/data/NNDSP/derivatives/fs5.3_subj/sub-1198,1198
4,/data/NNDSP/derivatives/fs5.3_subj/sub-1371,1371


In [46]:
# merge the subjects we have fs of with their ages
df_nndsp = pd.merge(df_nndsp, pd.read_csv(nndsp_pheno_file.as_posix()), on='MASKID')

In [58]:
# rename columns so that we have standardization across datasets
df_nndsp = df_nndsp.rename(index=str, columns = {'MASKID' : 'subject', 'age_at_scan' : 'age'})
df_nndsp = df_nndsp.drop_duplicates(subset='subject', keep='first')
df_nndsp.head()

Unnamed: 0,subj_paths,subject,MRN,nuclear_fam_id,Sex,age
0,/data/NNDSP/derivatives/fs5.3_subj/sub-1889,1889,7135075,10230,Female,9.667351
1,/data/NNDSP/derivatives/fs5.3_subj/sub-2011,2011,7078997,10109,Male,18.310746
2,/data/NNDSP/derivatives/fs5.3_subj/sub-1948,1948,4571265,1854,Female,14.269678
3,/data/NNDSP/derivatives/fs5.3_subj/sub-1198,1198,4359628,1613,Male,14.390144
4,/data/NNDSP/derivatives/fs5.3_subj/sub-1371,1371,7218874,10286,Male,16.621492


## HCP Subject Data

In [48]:
# merge the subjects that we have fs and add subject number
df_hcp = pd.DataFrame({'subj_paths' : [x.as_posix() for x in hcp_fs_dir.glob('sub-*')]})
df_hcp = df_hcp.assign(Subject = [int(Path(x).name[4:]) for x in df_hcp.subj_paths])
df_hcp.head()

Unnamed: 0,subj_paths,Subject
0,/data/NNDSP/derivatives/fs_hcp_subj/sub-749361,749361
1,/data/NNDSP/derivatives/fs_hcp_subj/sub-191942,191942
2,/data/NNDSP/derivatives/fs_hcp_subj/sub-983773,983773
3,/data/NNDSP/derivatives/fs_hcp_subj/sub-387959,387959
4,/data/NNDSP/derivatives/fs_hcp_subj/sub-193441,193441


In [49]:
# merge the subjects we have fs of with their ages
df_hcp = pd.merge(df_hcp, pd.read_csv(hcp_pheno_file.as_posix()), on='Subject')

In [59]:
# rename columns so that we have standardization across datasets
df_hcp = df_hcp.rename(index=str, columns={'Subject' : 'subject', 'Age_in_Yrs' : 'age'})
df_hcp = df_hcp.drop_duplicates(subset='subject', keep='first')
df_hcp.head()

Unnamed: 0,subj_paths,subject,age,HasGT,ZygositySR,ZygosityGT,Family_ID,Mother_ID,Father_ID,TestRetestInterval,...,SSAGA_Times_Used_Illicits,SSAGA_Times_Used_Cocaine,SSAGA_Times_Used_Hallucinogens,SSAGA_Times_Used_Opiates,SSAGA_Times_Used_Sedatives,SSAGA_Times_Used_Stimulants,SSAGA_Mj_Use,SSAGA_Mj_Ab_Dep,SSAGA_Mj_Age_1st_Use,SSAGA_Mj_Times_Used
0,/data/NNDSP/derivatives/fs_hcp_subj/sub-749361,749361,29,True,NotTwin,,52442_82285,52442,82285,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0
1,/data/NNDSP/derivatives/fs_hcp_subj/sub-191942,191942,27,True,NotTwin,,56029_85850,56029,85850,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,/data/NNDSP/derivatives/fs_hcp_subj/sub-983773,983773,28,True,NotTwin,,52801_82622,52801,82622,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,/data/NNDSP/derivatives/fs_hcp_subj/sub-387959,387959,26,True,NotMZ,,55795_85616,55795,85616,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0
4,/data/NNDSP/derivatives/fs_hcp_subj/sub-193441,193441,28,True,NotTwin,,52875_82697,52875,82697,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


## NKI Subject Data

In [64]:
# merge the subjects that we have fs and add subject number
df_nki = pd.DataFrame({'subj_paths' : [x.as_posix() for x in nki_fs_dir.glob('sub-*')]})
df_nki = df_nki.assign(participant_id = [Path(x).name[4:13] for x in df_nki.subj_paths])
df_nki.head()

Unnamed: 0,subj_paths,participant_id
0,/data/NNDSP/derivatives/fs_nki_subj/sub-A00023510,A00023510
1,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066087
2,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066236
3,/data/NNDSP/derivatives/fs_nki_subj/sub-A00034350,A00034350
4,/data/NNDSP/derivatives/fs_nki_subj/sub-A00063003,A00063003


In [65]:
# merge the subjects that we have fs with age
df_nki = pd.merge(df_nki, pd.read_csv(nki_pheno_file.as_posix(), sep='\t'), on='participant_id')

In [68]:
# rename so that we have standardization across datasets
df_nki = df_nki.rename(index=str, columns={'participant_id' : 'subject', 'age' : 'age'})
df_nki = df_nki.drop_duplicates(subset='subject', keep='first')
df_nki.head()

Unnamed: 0,subj_paths,subject,age,sex,handedness
0,/data/NNDSP/derivatives/fs_nki_subj/sub-A00023510,A00023510,23.0,MALE,RIGHT
1,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066087,22.0,MALE,RIGHT
2,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066087,A00066087,22.0,MALE,RIGHT
3,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066087,22.0,MALE,RIGHT
4,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066236,33.0,MALE,RIGHT


## CoRR Subject Data

In [82]:
# merge the subjects that we have fs and add subject number
df_corr = pd.DataFrame({'subj_paths' : [x.as_posix() for x in corr_fs_dir.glob('sub-*')]})
df_corr = df_corr.assign(SUBID = [int(Path(x).name[4:11]) for x in df_corr.subj_paths])
df_corr.head()

Unnamed: 0,subj_paths,SUBID
0,/data/DSST/CoRR/fs_corr/sub-0003057,3057
1,/data/DSST/CoRR/fs_corr/sub-0026120_ses-2,26120
2,/data/DSST/CoRR/fs_corr/sub-0025052,25052
3,/data/DSST/CoRR/fs_corr/sub-0025288_ses-2,25288
4,/data/DSST/CoRR/fs_corr/sub-0026009_ses-1,26009


In [83]:
df_corr_pheno.head()

Unnamed: 0.1,Unnamed: 0,SUBID,AGE_AT_SCAN_1,SEX
0,0,25921,20.7,1
1,2,25922,21.7,1
2,4,25923,22.5,1
3,6,25924,21.0,1
4,8,25925,21.2,1


In [84]:
# merge subjects with ages 
df_corr = pd.merge(df_corr, pd.read_csv(corr_pheno_file.as_posix()), on='SUBID')

2162

## SALD Subject Data