# Hypothesis 2.2

## Setup

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import subprocess
from datetime import date
import re
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
# Path configuration for every dataset used in this notebook.
# The shared roots are named once and each dataset path is derived from them,
# so relocating a tree means editing a single line. Every resulting value is
# identical to the previously hard-coded path.

# project directory
project_dir = Path('/data/NNDSP')
nndsp_derivatives_dir = project_dir / 'derivatives'
nndsp_pheno_dir = project_dir / 'anal' / 'analysis_notebooks' / 'phenotype_files'

# root shared by the CoRR and SALD datasets
dsst_dir = Path('/data/DSST')
corr_root_dir = dsst_dir / 'CoRR'
sald_root_dir = dsst_dir / 'SALD'

# NNDSP data directories
nndsp_bids_dir = project_dir / 'bids_2017_07_14_generic'
nndsp_fs_dir = nndsp_derivatives_dir / 'fs5.3_subj'
nndsp_bar_dir = nndsp_derivatives_dir / 'bar_subj'
nndsp_pheno_file = nndsp_pheno_dir / 'NNDSP_famid.csv'

# HCP data directories (raw data lives outside the project tree)
hcp_bids_dir = Path('/data/HCP/HCP_900/s3/hcp')
hcp_fs_dir = nndsp_derivatives_dir / 'fs_hcp_subj'
hcp_bar_dir = nndsp_derivatives_dir / 'bar_hcp_subj'
hcp_pheno_file = project_dir / 'nino' / 'HCP_ages.csv'

# NKI data directories
nki_bids_dir = project_dir / 'anal' / 'NKI'
nki_fs_dir = nndsp_derivatives_dir / 'fs_nki_subj'
nki_bar_dir = nndsp_derivatives_dir / 'bar_nki_subj'
nki_pheno_file = nndsp_pheno_dir / 'participants.tsv'

# CoRR data directories
corr_bids_dir = corr_root_dir / 'bids_corr'
corr_fs_dir = corr_root_dir / 'fs_corr'
corr_bar_dir = corr_root_dir / 'bar_corr' / 'baracus'
corr_pheno_file = corr_root_dir / 'phenotype_files' / 'corr_ages.csv'

# SALD data directories
sald_bids_dir = sald_root_dir / 'bids_sald'
sald_fs_dir = sald_root_dir / 'fs_sald'
sald_bar_dir = sald_root_dir / 'bar_sald' / 'baracus'
sald_pheno_file = sald_root_dir / 'phenotype_files' / 'sub_information.xlsx'

## Subject Data

In [3]:
# Execute the shared setup notebook. The *_subjectdata loaders called in the
# cells below are not defined anywhere in this notebook, so they presumably
# come from here — confirm against util_setup.ipynb.
%run /data/NNDSP/anal/analysis_notebooks/follow_up_analysis/util_setup.ipynb

### NNDSP

In [4]:
# Merge the subjects for which we have fs output with the phenotype file and
# add the subject number. Per the preview below, each row carries the subject
# path, subject number, MRN, nuclear family id, sex and age.
df_nndsp = nndsp_subjectdata(nndsp_fs_dir, nndsp_pheno_file)
df_nndsp.head()

Unnamed: 0,subj_paths,subject,MRN,nuclear_fam_id,Sex,age
0,/data/NNDSP/derivatives/fs5.3_subj/sub-1889,1889,7135075,10230,Female,9.667351
1,/data/NNDSP/derivatives/fs5.3_subj/sub-2011,2011,7078997,10109,Male,18.310746
2,/data/NNDSP/derivatives/fs5.3_subj/sub-1948,1948,4571265,1854,Female,14.269678
3,/data/NNDSP/derivatives/fs5.3_subj/sub-1198,1198,4359628,1613,Male,14.390144
4,/data/NNDSP/derivatives/fs5.3_subj/sub-1371,1371,7218874,10286,Male,16.621492


### HCP

In [5]:
# Build the HCP subject table from hcp_fs_dir and the HCP phenotype file.
# The preview shows the full phenotype column set alongside subj_paths,
# subject and age.
df_hcp = hcp_subjectdata(hcp_fs_dir, hcp_pheno_file)
df_hcp.head()

Unnamed: 0,subj_paths,subject,age,HasGT,ZygositySR,ZygosityGT,Family_ID,Mother_ID,Father_ID,TestRetestInterval,...,SSAGA_Times_Used_Illicits,SSAGA_Times_Used_Cocaine,SSAGA_Times_Used_Hallucinogens,SSAGA_Times_Used_Opiates,SSAGA_Times_Used_Sedatives,SSAGA_Times_Used_Stimulants,SSAGA_Mj_Use,SSAGA_Mj_Ab_Dep,SSAGA_Mj_Age_1st_Use,SSAGA_Mj_Times_Used
0,/data/NNDSP/derivatives/fs_hcp_subj/sub-749361,749361,29,True,NotTwin,,52442_82285,52442,82285,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0
1,/data/NNDSP/derivatives/fs_hcp_subj/sub-191942,191942,27,True,NotTwin,,56029_85850,56029,85850,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,/data/NNDSP/derivatives/fs_hcp_subj/sub-983773,983773,28,True,NotTwin,,52801_82622,52801,82622,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,/data/NNDSP/derivatives/fs_hcp_subj/sub-387959,387959,26,True,NotMZ,,55795_85616,55795,85616,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0
4,/data/NNDSP/derivatives/fs_hcp_subj/sub-193441,193441,28,True,NotTwin,,52875_82697,52875,82697,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


### NKI

In [6]:
# Build the NKI subject table from nki_fs_dir and the NKI participants file.
# NOTE(review): the preview index skips values (0, 1, 4, 7, 8), so the loader
# apparently drops some rows without resetting the index — confirm.
df_nki = nki_subjectdata(nki_fs_dir, nki_pheno_file)
df_nki.head()

Unnamed: 0,subj_paths,subject,age,sex,handedness
0,/data/NNDSP/derivatives/fs_nki_subj/sub-A00023510,A00023510,23.0,MALE,RIGHT
1,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066087,22.0,MALE,RIGHT
4,/data/NNDSP/derivatives/fs_nki_subj/sub-A00066...,A00066236,33.0,MALE,RIGHT
7,/data/NNDSP/derivatives/fs_nki_subj/sub-A00034350,A00034350,11.0,MALE,RIGHT
8,/data/NNDSP/derivatives/fs_nki_subj/sub-A00063003,A00063003,13.0,MALE,RIGHT


### CoRR

In [7]:
# Build the CoRR subject table from corr_fs_dir and the CoRR ages file.
# NOTE(review): the preview contains an 'Unnamed: 0' data column, which looks
# like an index column read back from the CSV — confirm and drop if unneeded.
df_corr = corr_subjectdata(corr_fs_dir, corr_pheno_file)
df_corr.head()

Unnamed: 0.1,subj_paths,subject,Unnamed: 0,age,SEX
1816,/data/DSST/CoRR/fs_corr/sub-0003001,3001,0,25,2
4159,/data/DSST/CoRR/fs_corr/sub-0003002,3002,2,23,1
4427,/data/DSST/CoRR/fs_corr/sub-0003004,3004,4,31,2
4077,/data/DSST/CoRR/fs_corr/sub-0003006,3006,6,23,1
131,/data/DSST/CoRR/fs_corr/sub-0003007,3007,8,43,2


### SALD

In [8]:
# Build the SALD subject table from sald_fs_dir and the SALD subject sheet.
# NOTE(review): the preview has both 'Sex' (F/M) and 'Sex.1' (1/2) columns;
# presumably the same field in two encodings — confirm which one is used.
df_sald = sald_subjectdata(sald_fs_dir, sald_pheno_file)
df_sald.head()

Unnamed: 0,subj_paths,subject,Sex,age,Sex.1,Edinburgh Handedness Inventory (EHI),FunImg,T1Img
98,/data/DSST/SALD/fs_sald/sub-031274,31274,F,72,1,100.0,1,1
28,/data/DSST/SALD/fs_sald/sub-031277,31277,F,60,1,100.0,1,1
95,/data/DSST/SALD/fs_sald/sub-031279,31279,M,62,2,100.0,1,1
29,/data/DSST/SALD/fs_sald/sub-031281,31281,F,65,1,,1,1
108,/data/DSST/SALD/fs_sald/sub-031285,31285,M,54,2,,1,1


# Train/Test Models

In [9]:
# Execute the shared modelling notebook. subject_features, complex_model,
# simple_model, complex_test and simple_test are used below but not defined in
# this notebook, so they presumably come from here — confirm against
# util_models.ipynb.
%run /data/NNDSP/anal/analysis_notebooks/follow_up_analysis/util_models.ipynb



In [10]:
# One results dict per dataset, all starting empty. Each name is bound to its
# own dict object, so filling one never affects another.
hcp_mae, nki_mae, corr_mae, sald_mae, nndsp_mae = {}, {}, {}, {}, {}

## (Uncorrected) Train NNDSP

In [None]:
# Load the features for every dataset up front. subject_features receives the
# subject directory names (the last component of each subj_paths entry) and
# the dataset's bar directory; only its second return value is kept.
# Path(s).name replaces os.path.basename because this notebook imports pathlib
# but never imports os, so the cell no longer relies on `os` leaking in from a
# %run notebook.
print("Features for NNDSP")
x, nndsp_features = subject_features([Path(s).name for s in df_nndsp.subj_paths], nndsp_bar_dir)
print("Features for HCP")
x, hcp_features = subject_features([Path(s).name for s in df_hcp.subj_paths], hcp_bar_dir)
print("Features for NKI")
x, nki_features = subject_features([Path(s).name for s in df_nki.subj_paths], nki_bar_dir)
print("Features for CoRR")
x, corr_features = subject_features([Path(s).name for s in df_corr.subj_paths], corr_bar_dir)
print("Features for SALD")
x, sald_features = subject_features([Path(s).name for s in df_sald.subj_paths], sald_bar_dir)

In [None]:
# Datasets taking part in the cross-dataset train/test loop, in insertion order.
# Each entry is a positional record read by index in that loop:
#   [0] subject dataframe, [1] bar directory,
#   [2] features (second return value of subject_features), [3] is_int flag.
datasets = {}
datasets['NNDSP'] = [df_nndsp, nndsp_bar_dir, nndsp_features, True]
datasets['HCP'] = [df_hcp, hcp_bar_dir, hcp_features, True]
# Currently disabled datasets:
# datasets['NKI'] = [df_nki, nki_bar_dir, nki_features, False]
# datasets['CoRR'] = [df_corr, corr_bar_dir, corr_features, True]
# datasets['SALD'] = [df_sald, sald_bar_dir, sald_features, True]

In [None]:
# Empty results dict for the uncorrected runs (presumably MAE values, going by
# the name — confirm).
uncorrected_mae = {}

In [None]:
# FIXME(review): this cell reads `data`, `complex_pipes` and `test_title`, none
# of which exist until the training loop in the following cell has run, so it
# raises NameError under Restart & Run All. The result is only displayed, never
# stored. It looks like a scratch copy of the loop's complex_test call — move
# it below the loop or delete it.
complex_test(data[0], 
                                        data[1],
                                        complex_pipes, 
                                        data=test_title,
                                        is_int=data[3], 
                                        features=data[2], pred = True)

In [None]:
for title, data in datasets.items():
   
    print("Training Complex for", title)
    scores, df_complex_train, df_complex_test, complex_pipes = complex_model(data[0],
                                                                       data[1],
                                                                       features = data[2],
                                                                      is_int = data[3])
    
    print("\n\nTraining Simple for", title)
    simple_train, df_simple_test, simple_pipe = simple_model(data[0], data[1], 
                                                          model_train = df_complex_train,
                                                          model_test=df_complex_test,
                                                         is_int = data[3])
    
    test_datasets = datasets.copy()
    del test_datasets[title]
    
    for test_title, test_data in test_datasets.items():
        print("\n\nTesting Complex Model on", test_title)
        mae['Uncorrect'] = {'Complex on ' + title + ' to ' + test_title : complex_test(data[0], 
                                        data[1],
                                        complex_pipes, 
                                        data=test_title,
                                        is_int=data[3], 
                                        features=data[2], pred = True)}
                             
        print("\n\nTesting Simple Model on", test_title)
        mae['Uncorrected']['Simple ' + 'on ' + title + ' to ' + test_title] = simple_test(test_data[0], 
                                                                                          test_data[1],
                                                                                          simple_pipe, 
                                                                                          data=test_title, 
                                                                                          is_int=test_data[3], pred = True)

In [None]:
# Load the NNDSP features (second return value of subject_features) from the
# subject directory names. Path(s).name is used because this notebook imports
# pathlib but never imports os.
x, nndsp_features = subject_features([Path(s).name for s in df_nndsp.subj_paths], nndsp_bar_dir)

In [None]:
# Train the complex model on NNDSP with the features loaded for that dataset.
# is_int is not passed here, so complex_model's own default applies.
(
    scores_nndsp,
    nndsp_complex_train,
    nndsp_complex_test,
    nndsp_complex_pipes,
) = complex_model(
    df_nndsp,
    nndsp_bar_dir,
    features=nndsp_features,
)

In [None]:
# Train the simple model on NNDSP, handing it the train/test frames produced by
# the complex model call.
nndsp_simple_train, nndsp_simple_test, nndsp_simple_pipe = simple_model(
    df_nndsp,
    nndsp_bar_dir,
    model=True,
    model_train=nndsp_complex_train,
    model_test=nndsp_complex_test,
)

### Test on (uncorrected) HCP

In [None]:
# Load the HCP features (second return value of subject_features) from the
# subject directory names. Path(s).name is used because this notebook imports
# pathlib but never imports os.
x, hcp_features = subject_features([Path(s).name for s in df_hcp.subj_paths], hcp_bar_dir)

In [None]:
# Evaluate the NNDSP-trained complex pipelines on HCP. This (re)creates the
# 'Uncorrected' entry of hcp_mae, replacing anything stored there before.
hcp_mae['Uncorrected'] = {
    'Train on NNDSP': {
        'Complex': complex_test(
            df_hcp,
            hcp_bar_dir,
            nndsp_complex_pipes,
            data='HCP',
            features=hcp_features,
            pred=True,
        ),
    },
}

In [None]:
# Evaluate the NNDSP-trained simple pipeline on HCP and store the result next
# to the 'Complex' entry; the nested dicts must already exist.
hcp_mae['Uncorrected']['Train on NNDSP']['Simple'] = simple_test(
    df_hcp,
    hcp_bar_dir,
    nndsp_simple_pipe,
    data='HCP',
    pred=True,
)

### Test on (uncorrected) NKI

In [None]:
# Load the NKI features (second return value of subject_features) from the
# subject directory names. Path(s).name is used because this notebook imports
# pathlib but never imports os.
x, nki_features = subject_features([Path(s).name for s in df_nki.subj_paths], nki_bar_dir)

In [None]:
# Evaluate the NNDSP-trained complex pipelines on NKI (is_int=False for this
# dataset). This (re)creates the 'Uncorrected' entry of nki_mae.
# NOTE(review): the key here is 'Trained on NNDSP' while the HCP cells use
# 'Train on NNDSP' — confirm which spelling downstream cells expect.
nki_mae['Uncorrected'] = {
    'Trained on NNDSP': {
        'Complex': complex_test(
            df_nki,
            nki_bar_dir,
            nndsp_complex_pipes,
            data='NKI',
            is_int=False,
            features=nki_features,
            pred=True,
        ),
    },
}

In [None]:
# Evaluate the NNDSP-trained simple pipeline on NKI (is_int=False) and store it
# next to the 'Complex' entry; the nested dicts must already exist.
nki_mae['Uncorrected']['Trained on NNDSP']['Simple'] = simple_test(
    df_nki,
    nki_bar_dir,
    nndsp_simple_pipe,
    data='NKI',
    is_int=False,
    pred=True,
)

### Test on 