# Hypothesis 2.2

## Setup

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
from datetime import date
import re
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
# ---------------------------------------------------------------------------
# Filesystem layout. Every dataset exposes the same four names:
#   <name>_bids_dir, <name>_fs_dir, <name>_bar_dir, <name>_pheno_file
# Paths are composed from a few shared roots so that each prefix is written once.
# ---------------------------------------------------------------------------

# project directory
project_dir = Path('/data/NNDSP')

# shared roots (private helpers for this cell only)
_derivatives = project_dir / 'derivatives'
_pheno_files = project_dir / 'anal' / 'analysis_notebooks' / 'phenotype_files'
_corr_root = Path('/data/DSST/CoRR')
_sald_root = Path('/data/DSST/SALD')

# NNDSP data directories
nndsp_bids_dir = project_dir / 'bids_2017_07_14_generic'
nndsp_fs_dir = _derivatives / 'fs5.3_subj'
nndsp_bar_dir = _derivatives / 'bar_subj'
nndsp_pheno_file = _pheno_files / 'NNDSP_famid.csv'

# HCP data directories (raw data lives outside the project tree)
hcp_bids_dir = Path('/data/HCP/HCP_900/s3/hcp')
hcp_fs_dir = _derivatives / 'fs_hcp_subj'
hcp_bar_dir = _derivatives / 'bar_hcp_subj'
hcp_pheno_file = project_dir / 'nino' / 'HCP_ages.csv'

# NKI data directories
nki_bids_dir = project_dir / 'anal' / 'NKI'
nki_fs_dir = _derivatives / 'fs_nki_subj'
nki_bar_dir = _derivatives / 'bar_nki_subj'
nki_pheno_file = _pheno_files / 'participants.tsv'

# CoRR data directories
corr_bids_dir = _corr_root / 'bids_corr'
corr_fs_dir = _corr_root / 'fs_corr'
corr_bar_dir = _corr_root / 'bar_corr' / 'baracus'
corr_pheno_file = _corr_root / 'phenotype_files' / 'corr_ages.csv'

# SALD data directories
sald_bids_dir = _sald_root / 'bids_sald'
sald_fs_dir = _sald_root / 'fs_sald'
sald_bar_dir = _sald_root / 'bar_sald' / 'baracus'
sald_pheno_file = _sald_root / 'phenotype_files' / 'sub_information.xlsx'

## Subject Data

In [3]:
# Execute the shared setup notebook inside this kernel. The *_subjectdata loaders
# called in the cells below are not defined in this notebook, so they presumably
# come from here — TODO confirm.
%run /data/NNDSP/anal/analysis_notebooks/follow_up_analysis/util_setup.ipynb

### NNDSP

In [4]:
# Build the NNDSP subject table from the FreeSurfer subject directory and the
# phenotype file. Per the original note, this keeps the subjects that have
# FreeSurfer output and adds the subject number.
df_nndsp = nndsp_subjectdata(nndsp_fs_dir, nndsp_pheno_file)
# Preview the first rows as a quick sanity check.
df_nndsp.head()

Unnamed: 0,subj_paths,subject,MRN,nuclear_fam_id,Sex,age
0,/data/NNDSP/derivatives/fs5.3_subj/sub-1889,1889,7135075,10230,Female,9.667351
1,/data/NNDSP/derivatives/fs5.3_subj/sub-2011,2011,7078997,10109,Male,18.310746
2,/data/NNDSP/derivatives/fs5.3_subj/sub-1948,1948,4571265,1854,Female,14.269678
3,/data/NNDSP/derivatives/fs5.3_subj/sub-1198,1198,4359628,1613,Male,14.390144
4,/data/NNDSP/derivatives/fs5.3_subj/sub-1371,1371,7218874,10286,Male,16.621492


### HCP

In [5]:
# Build the HCP subject table from the HCP FreeSurfer directory and the HCP ages file.
# hcp_subjectdata is not defined in this notebook (presumably from util_setup — confirm).
df_hcp = hcp_subjectdata(hcp_fs_dir, hcp_pheno_file)
# Preview the first rows as a quick sanity check.
df_hcp.head()

Unnamed: 0,subj_paths,subject,age,HasGT,ZygositySR,ZygosityGT,Family_ID,Mother_ID,Father_ID,TestRetestInterval,...,SSAGA_Times_Used_Illicits,SSAGA_Times_Used_Cocaine,SSAGA_Times_Used_Hallucinogens,SSAGA_Times_Used_Opiates,SSAGA_Times_Used_Sedatives,SSAGA_Times_Used_Stimulants,SSAGA_Mj_Use,SSAGA_Mj_Ab_Dep,SSAGA_Mj_Age_1st_Use,SSAGA_Mj_Times_Used
0,/data/NNDSP/derivatives/fs_hcp_subj/sub-749361,749361,29,True,NotTwin,,52442_82285,52442,82285,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0
1,/data/NNDSP/derivatives/fs_hcp_subj/sub-191942,191942,27,True,NotTwin,,56029_85850,56029,85850,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,/data/NNDSP/derivatives/fs_hcp_subj/sub-983773,983773,28,True,NotTwin,,52801_82622,52801,82622,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,/data/NNDSP/derivatives/fs_hcp_subj/sub-387959,387959,26,True,NotMZ,,55795_85616,55795,85616,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0
4,/data/NNDSP/derivatives/fs_hcp_subj/sub-193441,193441,28,True,NotTwin,,52875_82697,52875,82697,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


### NKI

In [6]:
# Build the NKI subject table from the NKI FreeSurfer directory and participants.tsv.
# nki_subjectdata is not defined in this notebook (presumably from util_setup — confirm).
df_nki = nki_subjectdata(nki_fs_dir, nki_pheno_file)
# Preview the first rows as a quick sanity check.
df_nki.head()

Unnamed: 0,subj_paths,subject,age,sex,handedness
0,/data/NNDSP/derivatives/fs_nki_subj/sub-A00023510,A00023510,23.0,MALE,RIGHT
1,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066087,22.0,MALE,RIGHT
4,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066236,33.0,MALE,RIGHT
7,/data/NNDSP/derivatives/fs_nki_subj/sub-A00034350,A00034350,11.0,MALE,RIGHT
8,/data/NNDSP/derivatives/fs_nki_subj/sub-A00063003,A00063003,13.0,MALE,RIGHT


### CoRR

In [7]:
# Build the CoRR subject table from the CoRR FreeSurfer directory and the CoRR ages file.
# corr_subjectdata is not defined in this notebook (presumably from util_setup — confirm).
df_corr = corr_subjectdata(corr_fs_dir, corr_pheno_file)
# Preview the first rows as a quick sanity check.
df_corr.head()

Unnamed: 0.1,subj_paths,subject,Unnamed: 0,age,SEX
1816,/data/DSST/CoRR/fs_corr/sub-0003001,3001,0,25,2
4159,/data/DSST/CoRR/fs_corr/sub-0003002,3002,2,23,1
4427,/data/DSST/CoRR/fs_corr/sub-0003004,3004,4,31,2
4077,/data/DSST/CoRR/fs_corr/sub-0003006,3006,6,23,1
131,/data/DSST/CoRR/fs_corr/sub-0003007,3007,8,43,2


### SALD

In [8]:
# Build the SALD subject table from the SALD FreeSurfer directory and the
# subject-information spreadsheet.
# sald_subjectdata is not defined in this notebook (presumably from util_setup — confirm).
df_sald = sald_subjectdata(sald_fs_dir, sald_pheno_file)
# Preview the first rows as a quick sanity check.
df_sald.head()

Unnamed: 0,subj_paths,subject,Sex,age,Sex.1,Edinburgh Handedness Inventory (EHI),FunImg,T1Img
98,/data/DSST/SALD/fs_sald/sub-031274,31274,F,72,1,100.0,1,1
28,/data/DSST/SALD/fs_sald/sub-031277,31277,F,60,1,100.0,1,1
95,/data/DSST/SALD/fs_sald/sub-031279,31279,M,62,2,100.0,1,1
29,/data/DSST/SALD/fs_sald/sub-031281,31281,F,65,1,,1,1
108,/data/DSST/SALD/fs_sald/sub-031285,31285,M,54,2,,1,1


# Train/Test Models

In [31]:
# Execute the shared modelling notebook inside this kernel. subject_features,
# complex_model, simple_model, complex_test and simple_test are called below but
# are not defined in this notebook, so they presumably come from here — TODO confirm.
%run /data/NNDSP/anal/analysis_notebooks/follow_up_analysis/util_models.ipynb

In [27]:
# Collect the feature set for every dataset.
#
# Subject folder names (e.g. "sub-1889") are the last component of each FreeSurfer
# subject path. Path(...).name is used because pathlib is imported in the first cell
# of this notebook, whereas `os` is never imported here; relying on os.path.basename
# only worked if a %run utility notebook happened to leave `os` in the namespace.
#
# The first value returned by subject_features is bound to `x` and overwritten on
# every call; only the second value (the features) is kept per dataset.
print("Features for NNDSP")
x, nndsp_features = subject_features([Path(s).name for s in df_nndsp.subj_paths], nndsp_bar_dir)
print("Features for HCP")
x, hcp_features = subject_features([Path(s).name for s in df_hcp.subj_paths], hcp_bar_dir)
print("Features for NKI")
x, nki_features = subject_features([Path(s).name for s in df_nki.subj_paths], nki_bar_dir)
print("Features for CoRR")
# CoRR is the only dataset here that passes an explicit session suffix.
x, corr_features = subject_features([Path(s).name for s in df_corr.subj_paths], corr_bar_dir, session='_ses-1')
print("Features for SALD")
x, sald_features = subject_features([Path(s).name for s in df_sald.subj_paths], sald_bar_dir)

Features for NNDSP
Features for HCP
Features for NKI
Features for CoRR
Features for SALD


In [49]:
# Registry of datasets for the uncorrected train/test runs, keyed by dataset title.
# Each record is a list whose positions are consumed by uncorrected_train_test as:
#   [0] subject table, [1] *_bar_dir directory, [2] features,
#   [3] value forwarded as `is_int`, [4] value forwarded as `session`.
# NOTE(review): `is_int` is False only for NKI, whose subject IDs are alphanumeric;
# presumably it flags integer subject IDs — confirm in util_models.
uncorr_datasets = dict([
    ('NNDSP', [df_nndsp, nndsp_bar_dir, nndsp_features, True, '']),
    ('HCP', [df_hcp, hcp_bar_dir, hcp_features, True, '']),
    ('NKI', [df_nki, nki_bar_dir, nki_features, False, '']),
    ('CoRR', [df_corr, corr_bar_dir, corr_features, True, '_ses-1']),
    ('SALD', [df_sald, sald_bar_dir, sald_features, True, '']),
])

## Uncorrected Train/Test Models

In [50]:
def uncorrected_train_test(uncorr_datasets):
    """Train on each dataset in turn and evaluate on every other dataset.

    For every entry of ``uncorr_datasets`` the complex model is fitted first,
    then the simple model (which is handed the complex model's train/test
    frames). Both fitted models are then applied to all remaining datasets.

    Parameters
    ----------
    uncorr_datasets : dict
        Maps a dataset title to an indexable record. Positions 0 and 1 are
        passed positionally to the model functions, position 2 as
        ``features``, position 3 as ``is_int`` and position 4 as ``session``.

    Returns
    -------
    dict
        ``{'Complex': {...}, 'Simple': {...}}``. Each inner dict is keyed by
        ``'Train <title> ;Test <test_title>'`` and stores whatever
        ``complex_test`` / ``simple_test`` return when called with
        ``pred = True``.
    """
    results = {'Complex': {}, 'Simple': {}}

    for title, data in uncorr_datasets.items():
        print('*****************************************************************')
        print("Training Complex for (Uncorrected)", title)
        # Only the train/test frames and the fitted pipelines are reused below.
        _scores, complex_train_df, complex_test_df, complex_pipes = complex_model(
            data[0],
            data[1],
            features=data[2],
            is_int=data[3],
            session=data[4],
        )

        print("\n\nTraining Simple for (Uncorrected)", title)
        # Only the fitted simple pipeline is reused below.
        _simple_train, _simple_test_df, simple_pipe = simple_model(
            data[0],
            data[1],
            model_train=complex_train_df,
            model_test=complex_test_df,
            is_int=data[3],
            session=data[4],
        )

        # Every dataset except the one that was just used for training.
        held_out = {
            name: record
            for name, record in uncorr_datasets.items()
            if name != title
        }

        for test_title, test_data in held_out.items():
            print("\n\nTesting Complex Model on (Uncorrected)", test_title)
            complex_result = complex_test(
                test_data[0],
                test_data[1],
                complex_pipes,
                data=test_title,
                is_int=test_data[3],
                features=test_data[2],
                pred=True,
                session=test_data[4],
            )
            # Same label is used for the complex and the simple result.
            pair_label = 'Train ' + title + ' ;Test ' + test_title
            results['Complex'][pair_label] = complex_result

            print("\n\nTesting Simple Model on (Uncorrected)", test_title)
            simple_result = simple_test(
                test_data[0],
                test_data[1],
                simple_pipe,
                data=test_title,
                is_int=test_data[3],
                pred=True,
                session=test_data[4],
            )
            results['Simple'][pair_label] = simple_result

    return results

In [51]:
# Run every train-on-one / test-on-the-others combination for the registered datasets.
# The result is a dict with 'Complex' and 'Simple' entries, each keyed by
# 'Train <X> ;Test <Y>'. This call prints a long training/testing log.
uncorrected_preds = uncorrected_train_test(uncorr_datasets)

*****************************************************************
Training Complex for (Uncorrected) NNDSP

Cortical Thickness Pipeline
Mean Absolute Error (Train, thickness): 0.0997523862864
Mean Absolute Error (Test, thickness): 5.65455987755

Subcortival Volumes Pipeline


  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, aseg): 9.28483858889
Mean Absolute Error (Test, aseg): 11.5887880054

Cortical Surface Area Pipeline


  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, area): 0.0988676179526
Mean Absolute Error (Test, area): 13.4042780653

Stacking predictions
Beginning Random Forest
Fitting stacking model
best max_depth: 1
Mean Absolute Error (Train): 11.0316202299
Mean Absolute Error (Test): 11.3006803385
                     r2     rpear    rpear2        mae      medae
aseg          -0.132141  0.111577  0.012449  11.588788  10.040532
ct            -0.847000 -0.035672  0.001272  15.605528  14.706846
ca            -0.344579 -0.036147  0.001307  13.404278  12.701769
mean_pred     -0.137519  0.002355  0.000006  11.955166  10.302539
pred_age_test -0.044269  0.014673  0.000215  11.300680  10.605933


Training Simple for (Uncorrected) NNDSP
Finding Subjects
Extracting Features
431
Training Model
Mean Absolute Error (Train): 6.55030807275
Mean Absolute Error (Test): 7.05906598994


Testing Complex Model on (Uncorrected) HCP
895
895
Predicting from Pipelines
Mean Absolute Error (HCP thickness): 5.65793753715
Mean Absolute Error 

  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, aseg): 2.75420993501
Mean Absolute Error (Test, aseg): 3.24003147958

Cortical Surface Area Pipeline


  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, area): 0.0997363323954
Mean Absolute Error (Test, area): 3.46910450241

Stacking predictions
Beginning Random Forest
Fitting stacking model
best max_depth: 1
Mean Absolute Error (Train): 3.07534116268
Mean Absolute Error (Test): 3.12429489326
                     r2     rpear    rpear2       mae     medae
aseg          -0.214485  0.067091  0.004501  3.240031  2.727892
ct            -0.298265  0.011790  0.000139  3.443540  3.138121
ca            -0.363746  0.048581  0.002360  3.469105  3.033682
mean_pred     -0.090211  0.067993  0.004623  3.161459  2.942363
pred_age_test -0.017934  0.016374  0.000268  3.124295  2.915708


Training Simple for (Uncorrected) HCP
Finding Subjects
Extracting Features
895
Training Model
Mean Absolute Error (Train): 2.77605757944
Mean Absolute Error (Test): 2.91401950543


Testing Complex Model on (Uncorrected) NNDSP
431
431
Predicting from Pipelines
Mean Absolute Error (NNDSP thickness): 12.6232691799
Mean Absolute Error (NNDSP ase

  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, aseg): 16.1635104525
Mean Absolute Error (Test, aseg): 21.3884067961

Cortical Surface Area Pipeline


  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, area): 0.100084250014
Mean Absolute Error (Test, area): 22.3203287882

Stacking predictions
Beginning Random Forest
Fitting stacking model
best max_depth: 1
Mean Absolute Error (Train): 18.26869955
Mean Absolute Error (Test): 20.1324699711
                     r2     rpear    rpear2        mae      medae
aseg          -0.260505 -0.048751  0.002377  21.388407  20.833338
ct            -0.684833  0.025798  0.000666  24.141091  21.500670
ca            -0.343315 -0.047477  0.002254  22.320329  20.430872
mean_pred     -0.120344 -0.026645  0.000710  20.972825  21.053304
pred_age_test  0.000044  0.043662  0.001906  20.132470  19.600161


Training Simple for (Uncorrected) NKI
Finding Subjects
Extracting Features
808
Training Model
Mean Absolute Error (Train): 10.3539657713
Mean Absolute Error (Test): 10.3940912298


Testing Complex Model on (Uncorrected) NNDSP
431
431
Predicting from Pipelines
Mean Absolute Error (NNDSP thickness): 17.4839372918
Mean Absolute Error (

  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, aseg): 5.35291903795
Mean Absolute Error (Test, aseg): 6.41493023691

Cortical Surface Area Pipeline


  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, area): 0.0996968890369
Mean Absolute Error (Test, area): 8.56153247335

Stacking predictions
Beginning Random Forest
Fitting stacking model
best max_depth: 5
Mean Absolute Error (Train): 2.91138321686
Mean Absolute Error (Test): 4.01905352154
                     r2     rpear    rpear2       mae     medae
aseg           0.651659  0.808975  0.654441  6.414930  4.445156
ct             0.523472  0.753066  0.567109  7.021226  5.138927
ca             0.516885  0.726339  0.527568  8.561532  6.750389
mean_pred      0.721805  0.854693  0.730500  5.946439  4.392893
pred_age_test  0.846838  0.922599  0.851190  4.019054  2.730378


Training Simple for (Uncorrected) CoRR
Finding Subjects
Extracting Features
1466
Training Model
Mean Absolute Error (Train): 7.34523905741
Mean Absolute Error (Test): 7.65945036911


Testing Complex Model on (Uncorrected) NNDSP
431
431
Predicting from Pipelines
Mean Absolute Error (NNDSP thickness): 16.7325506795
Mean Absolute Error (NNDSP a

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Mean Absolute Error (Train, area): 0.100013610671
Mean Absolute Error (Test, area): 9.5168510875

Stacking predictions
Beginning Random Forest
Fitting stacking model
best max_depth: 6
Mean Absolute Error (Train): 2.67680315486
Mean Absolute Error (Test): 6.33824334793
                     r2     rpear    rpear2       mae     medae
aseg           0.644973  0.805783  0.649286  8.020946  6.897806
ct             0.582432  0.768595  0.590738  9.433668  9.694668
ca             0.480956  0.700790  0.491107  9.516851  8.235612
mean_pred      0.693034  0.854668  0.730458  7.758230  7.083749
pred_age_test  0.727474  0.859133  0.738109  6.338243  4.435846


Training Simple for (Uncorrected) SALD
Finding Subjects
Extracting Features
195
Training Model
Mean Absolute Error (Train): 9.52413351654
Mean Absolute Error (Test): 9.83357661898


Testing Complex Model on (Uncorrected) NNDSP
431
431
Predicting from Pipelines
Mean Absolute Error (NNDSP thickness): 19.2784741941
Mean Absolute Error (NNDSP aseg