## Driver code comparing outputs of different atlases / parcellations

### Steps
- import data csvs 
- visualize data distributions 
- correlate features across pipelines
- compare performance of machine-learning models (scikit-learn)
- compare performance of statistical models (statsmodels: OLS or logit)

In [1]:
import itertools
import os
import sys

import numpy as np
import pandas as pd
from sklearn import svm

sys.path.append('../')
from lib.data_handling import *
from lib.data_stats import *

### Data paths

In [10]:
# Project root. Defaults to the original author's checkout; set the
# COMPARE_SURF_TOOLS_DIR environment variable to point the notebook at a
# different checkout without editing this cell.
proj_dir = os.environ.get('COMPARE_SURF_TOOLS_DIR',
                          '/Users/nikhil/code/git_repos/compare-surf-tools/')
# Paths are built by plain string concatenation here and in the load cell,
# so make sure the root always ends with a path separator.
proj_dir = os.path.join(proj_dir, '')
data_dir = proj_dir + 'data/'
fs60_dir = data_dir + 'fs60_group_stats/'
# Phenotype / demographics table, read from data_dir
demograph_file = 'ABIDE_Phenotype.csv'

# fs60 group-stats tables (file names only; the thickness tables are read from fs60_dir)
# Thickness -- aparc: 34 ROIs per hemisphere
fs60_aparc_lh_thickness_file = 'lh.aparc.thickness.table.test1'
fs60_aparc_rh_thickness_file = 'rh.aparc.thickness.table.test1'
# Thickness -- aparc.a2009: 74 ROIs per hemisphere
fs60_aparc2009_lh_thickness_file = 'lh.aparc.a2009.thickness.table.test1'
fs60_aparc2009_rh_thickness_file = 'rh.aparc.a2009.thickness.table.test1'
# Surface Area
fs60_aparc_lh_SA_file = 'aparc_lh_SA_table.test1'
fs60_aparc_rh_SA_file = 'aparc_rh_SA_table.test1'
fs60_aparc2009_lh_SA_file = 'lh.aparc.a2009.SA.table.test1'
fs60_aparc2009_rh_SA_file = 'rh.aparc.a2009.SA.table.test1'


### Global Vars

In [11]:
# Name of the subject-identifier column used as the common key throughout the
# notebook: the demographics 'Subject_ID' column is renamed to it, and it is
# passed to combine_processed_data, pd.merge and cross_correlations.
subject_ID_col = 'SubjID'

### Load data

In [12]:
# Demographics and Dx
demograph = pd.read_csv(data_dir + demograph_file)
# Rename the phenotype table's ID column to the shared subject key
demograph = demograph.rename(columns={'Subject_ID':subject_ID_col})

# The fs60 tables are parsed as whitespace-delimited text. `sep=r'\s+'` is the
# documented replacement for `delim_whitespace=True`, which is deprecated since
# pandas 2.2.
fs60_aparc_lh_thickness_data = pd.read_csv(fs60_dir + fs60_aparc_lh_thickness_file, sep=r'\s+')
fs60_aparc_rh_thickness_data = pd.read_csv(fs60_dir + fs60_aparc_rh_thickness_file, sep=r'\s+')

fs60_aparc2009_lh_thickness_data = pd.read_csv(fs60_dir + fs60_aparc2009_lh_thickness_file, sep=r'\s+')
fs60_aparc2009_rh_thickness_data = pd.read_csv(fs60_dir + fs60_aparc2009_rh_thickness_file, sep=r'\s+')

# Sanity check: report (rows, columns) of each hemisphere's table per parcellation
print('shape of fs60 aparc data l: {}, r: {}'.format(fs60_aparc_lh_thickness_data.shape,fs60_aparc_rh_thickness_data.shape))
print('shape of fs60 aparc2009 data l: {}, r: {}'.format(fs60_aparc2009_lh_thickness_data.shape,fs60_aparc2009_rh_thickness_data.shape))

# NOTE(review): standardization is currently disabled. The call below references
# fs60_lh_data / fs60_rh_data, which are not defined in the visible part of this
# notebook, and its message says 'fs51' for fs60 data -- fix both before re-enabling.
# fs60_data_std = standardize_fs60_data(fs60_lh_data, fs60_rh_data, subject_ID_col)
# print('shape of stdized fs51 data {}'.format(fs60_data_std.shape))

shape of fs60 aparc data l: (1047, 36), r: (1047, 36)
shape of fs60 aparc2009 data l: (1047, 76), r: (1047, 76)


In [15]:
# Inspect the column labels of the left-hemisphere aparc thickness table
fs60_aparc_lh_thickness_data.columns

Index(['lh.aparc.thickness', 'lh_bankssts_thickness',
       'lh_caudalanteriorcingulate_thickness',
       'lh_caudalmiddlefrontal_thickness', 'lh_cuneus_thickness',
       'lh_entorhinal_thickness', 'lh_fusiform_thickness',
       'lh_inferiorparietal_thickness', 'lh_inferiortemporal_thickness',
       'lh_isthmuscingulate_thickness', 'lh_lateraloccipital_thickness',
       'lh_lateralorbitofrontal_thickness', 'lh_lingual_thickness',
       'lh_medialorbitofrontal_thickness', 'lh_middletemporal_thickness',
       'lh_parahippocampal_thickness', 'lh_paracentral_thickness',
       'lh_parsopercularis_thickness', 'lh_parsorbitalis_thickness',
       'lh_parstriangularis_thickness', 'lh_pericalcarine_thickness',
       'lh_postcentral_thickness', 'lh_posteriorcingulate_thickness',
       'lh_precentral_thickness', 'lh_precuneus_thickness',
       'lh_rostralanteriorcingulate_thickness',
       'lh_rostralmiddlefrontal_thickness', 'lh_superiorfrontal_thickness',
       'lh_superiorparieta

In [14]:
# Preview the first rows of the right-hemisphere aparc.a2009 thickness table
fs60_aparc2009_rh_thickness_data.head()

Unnamed: 0,rh.aparc.a2009s.thickness,rh_G&S_frontomargin_thickness,rh_G&S_occipital_inf_thickness,rh_G&S_paracentral_thickness,rh_G&S_subcentral_thickness,rh_G&S_transv_frontopol_thickness,rh_G&S_cingul-Ant_thickness,rh_G&S_cingul-Mid-Ant_thickness,rh_G&S_cingul-Mid-Post_thickness,rh_G_cingul-Post-dorsal_thickness,...,rh_S_pericallosal_thickness,rh_S_postcentral_thickness,rh_S_precentral-inf-part_thickness,rh_S_precentral-sup-part_thickness,rh_S_suborbital_thickness,rh_S_subparietal_thickness,rh_S_temporal_inf_thickness,rh_S_temporal_sup_thickness,rh_S_temporal_transverse_thickness,rh_MeanThickness_thickness
0,sub-0050002,2.445,2.588,2.558,2.927,2.609,2.871,2.95,2.86,3.06,...,1.748,2.197,2.653,2.586,2.244,2.615,2.819,2.602,2.77,2.6639
1,sub-0050003,2.359,2.444,2.096,2.452,2.684,2.955,3.065,2.848,3.101,...,2.144,2.246,2.435,2.452,3.14,2.86,2.281,2.497,2.704,2.49413
2,sub-0050004,2.563,2.789,2.5,2.699,2.489,2.718,3.046,2.959,3.044,...,1.687,2.325,2.476,2.679,2.44,2.658,2.578,2.62,2.216,2.63963
3,sub-0050006,2.709,2.473,2.622,2.711,2.873,2.917,2.895,2.941,3.129,...,1.551,2.503,2.766,2.804,2.997,2.694,2.57,2.919,3.12,2.79477
4,sub-0050007,2.462,2.336,2.243,2.527,2.767,2.415,2.782,2.589,2.972,...,2.058,2.079,2.475,2.402,1.932,2.246,2.427,2.427,2.527,2.42719


### Create master dataframe

In [9]:
# Standardized per-pipeline tables, keyed by pipeline name.
# NOTE(review): none of the *_data_std variables is created by a cell visible in
# this notebook (the fs60 standardization call is commented out) -- confirm
# where they come from before relying on Restart & Run All.
data_dict = dict(
    ants=ants_data_std,
    fs60=fs60_data_std,
    fs53=fs53_data_std,
    fs51=fs51_data_std,
)

# options: ignore, drop; anything else will not use the dataframe for analysis.
na_action = 'drop'
master_df, common_subs, common_roi_cols = combine_processed_data(data_dict, subject_ID_col, na_action)

# Attach the demographic columns needed later; the left join on the subject key
# keeps every row of master_df.
demograph_cols = [subject_ID_col, 'SEX', 'AGE_AT_SCAN', 'DX_GROUP']
useful_demograph = demograph[demograph_cols]
master_df = master_df.merge(useful_demograph, on=subject_ID_col, how='left')
print('\nmaster df shape after adding demographic info {}'.format(master_df.shape))

Number of datasets: 4
Finding common subject and columns
Number of common subjects and columns: 760, 63

checking ants dataframe
Shape of the dataframe based on common cols and subs (760, 63)
Basic data check passed
Shape of the concat dataframe (760, 64)

checking fs60 dataframe
Shape of the dataframe based on common cols and subs (760, 63)
Basic data check passed
Shape of the concat dataframe (1520, 64)

checking fs53 dataframe
Shape of the dataframe based on common cols and subs (760, 63)
Basic data check passed
Shape of the concat dataframe (2280, 64)

checking fs51 dataframe
Shape of the dataframe based on common cols and subs (760, 63)
Basic data check passed
Shape of the concat dataframe (3040, 64)

master df shape after adding demographic info (3040, 67)


### Correlation between pipelines

In [10]:
# Every unordered pair of pipeline names present in data_dict
possible_pairs = list(itertools.combinations(data_dict.keys(), 2))
# Columns handed to cross_correlations: the subject key plus the shared ROI columns
xcorr_cols = [subject_ID_col] + common_roi_cols

for pipe1, pipe2 in possible_pairs:
    # Rows of the stacked master frame belonging to each pipeline of the pair
    in_pipe1 = master_df['pipeline'] == pipe1
    in_pipe2 = master_df['pipeline'] == pipe2
    pipe1_df = master_df.loc[in_pipe1, xcorr_cols]
    pipe2_df = master_df.loc[in_pipe2, xcorr_cols]

    # Report the mean of whatever cross_correlations returns for this pair
    xcorr = cross_correlations(pipe1_df, pipe2_df, subject_ID_col)
    print('Avg cross correlation between {} & {} = {:4.2f}\n'.format(pipe1,pipe2,np.mean(xcorr)))

Avg cross correlation between ants & fs60 = 0.43

Avg cross correlation between ants & fs53 = 0.47

Avg cross correlation between ants & fs51 = 0.43

Avg cross correlation between fs60 & fs53 = 0.91

Avg cross correlation between fs60 & fs51 = 0.84

Avg cross correlation between fs53 & fs51 = 0.88



### Compare ML performance 

In [11]:
# Features: the ROI columns returned by combine_processed_data as common to all pipelines
input_cols = common_roi_cols
# Classification target, taken from the merged demographic columns
outcome_col = 'DX_GROUP'
# Linear-kernel support vector classifier from scikit-learn
clf = svm.SVC(kernel='linear')
# getClassiferPerf is not defined in this notebook (presumably it comes from the
# wildcard `lib` imports); its printed output reports mean and sd of accuracy per pipeline.
# NOTE(review): how the data is split and whether the split is seeded is not
# visible here -- confirm in lib before comparing runs.
ml_perf = getClassiferPerf(master_df,input_cols,outcome_col,clf)

Running ML classifer on 4 pipelines
Pipeline ants,  Accuracy mean:0.582, sd:0.058
Pipeline fs60,  Accuracy mean:0.551, sd:0.060
Pipeline fs53,  Accuracy mean:0.564, sd:0.057
Pipeline fs51,  Accuracy mean:0.544, sd:0.048



### Compare statsmodels performance 

In [12]:
# ROI columns passed to getStatModelPerf; the printed output reports
# "mass-univariate" models, one set per pipeline
roi_cols = common_roi_cols
# Demographic columns passed as covariates
covar_cols = ['SEX','AGE_AT_SCAN']
# Outcome column (same as in the ML comparison cell)
outcome_col = 'DX_GROUP'
# Model family selector; the notebook intro lists 'ols' or 'logit' as the options
stat_model = 'logit'
# getStatModelPerf is not defined in this notebook (presumably it comes from the
# wildcard `lib` imports). NOTE(review): the schema of the returned frame is not
# visible here; only its shape is printed.
sm_perf = getStatModelPerf(master_df,roi_cols,covar_cols,outcome_col,stat_model)
print('Shape of the stats_models results df {}'.format(sm_perf.shape))

Running 62 mass-univariate logit statsmodels on 4 pipelines
Shape of the stats_models results df (248, 4)
