In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp
import random, sys, os, multiprocessing
from itertools import chain
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr, sem
from functools import partial

In [6]:
# function for computing cohen's D 
def _cohenD(df, parcel_name):
    dat = list(df[parcel_name])
    d = np.mean(dat)/np.std(dat, ddof = 1)  
    return d

# 10-fold cross-validated linear regression for a single parcel
def _simple_linear_cv(merged_df, lab_type, parcel):

    """
    Measure how well a single parcel predicts a behavioral score, using
    10-fold cross-validated linear regression.

    In every fold a StandardScaler + LinearRegression pipeline is fit on the
    training rows, and the Pearson correlation between the observed and the
    predicted scores of the held-out rows is recorded.

    Parameters:
    merged_df: a pandas dataframe that includes neural and behavioral data.
               It must contain the columns named by `parcel` and `lab_type`.
    lab_type: a string naming the behavioral column to predict. The calling
              script in this notebook uses the following 5 strings:
              1) WM_Task_2bk_Acc 2) ListSort_AgeAdj 3) PMAT24_A_CR 4) PicVocab_AgeAdj 5) ReadEng_AgeAdj
    parcel: a string naming the single parcel column used as the only feature.

    Return:
    pred_acc: A single number, the mean of the per-fold Pearson r values.

    """

    kf = KFold(n_splits=10, shuffle = False) # 10 folds, rows are not shuffled
    # scikit-learn estimators expect a 2-D (n_samples, n_features) matrix,
    # so the single feature column is reshaped once, up front.
    feature = merged_df[parcel].to_numpy().reshape(-1, 1)
    label = merged_df[lab_type].to_numpy() # beh score

    pred_acc_list = []
    # Split the dataframe that was passed in. The previous version split the
    # notebook-level global `merge_df`, which breaks (or silently uses the
    # wrong row count) whenever the argument is a different frame.
    for train_index, test_index in kf.split(merged_df):

        train_feature, train_label = feature[train_index], label[train_index] # training feature and label
        test_feature, test_label = feature[test_index], label[test_index] # testing feature and label

        # A fresh pipeline per fold, so the scaler is fit on training rows only
        linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())
        linear_pipeline.fit(train_feature, train_label)  # train the model
        pred_test_label = linear_pipeline.predict(test_feature) # test the model
        pred_acc = pearsonr(test_label, pred_test_label)[0] # Pearson r; the p-value is discarded
        pred_acc_list.append(pred_acc)

    pred_acc = np.mean(pred_acc_list) # mean of 10 folds
    return pred_acc

# Load the neural table and the full behavioral table from CSV
neural_data = pd.read_csv('/home/peetal/hulacon/nested_permutation/schaefer400_cope11_rm_outlier.csv') # neural
full_beh = pd.read_csv('/home/peetal/hulacon/nested_permutation/HCP_behavioral_data.csv') # behavioral

# Keep only the subject ID plus the in-scanner and out-of-scanner task scores
beh = full_beh[['Subject', 'WM_Task_2bk_Acc', 'ListSort_AgeAdj',
                'PMAT24_A_CR', 'PicVocab_AgeAdj', 'ReadEng_AgeAdj']]

# Attach the behavioral scores to the neural rows by subject ID, then drop
# every row that still has a missing value
merge_df = neural_data.merge(beh, how='left', on=['Subject']).dropna()

# Effect size (Cohen's D) per parcel; every column after the first one of
# neural_data is treated as a parcel
parcel_columns = list(neural_data.columns[1:])
parcel_es = pd.DataFrame(columns = ['parcel_name', 'es'])
parcel_es['parcel_name'] = parcel_columns
parcel_es['es'] = [_cohenD(neural_data, parcel_name) for parcel_name in parcel_columns]
parcel_es['abs_es'] = parcel_es['es'].abs()
# Order parcels from largest to smallest absolute effect size
parcel_es = parcel_es.sort_values(by=['abs_es'], ascending = False)
all_parcels = parcel_es['parcel_name'].tolist()

# Univariate predictive accuracy: one result list per behavioral measure
univariate_pred_acc_wm = []
univariate_pred_acc_ls = []
univariate_pred_acc_pmat = []
univariate_pred_acc_pic = []
univariate_pred_acc_read = []

# Behavioral column -> the list that collects its per-parcel accuracies
acc_lists_by_measure = {
    'WM_Task_2bk_Acc': univariate_pred_acc_wm,
    'ListSort_AgeAdj': univariate_pred_acc_ls,
    'PMAT24_A_CR': univariate_pred_acc_pmat,
    'PicVocab_AgeAdj': univariate_pred_acc_pic,
    'ReadEng_AgeAdj': univariate_pred_acc_read,
}

# Parcels are visited in effect-size order, so every result list lines up
# with all_parcels
for parcel in all_parcels:
    for lab_type, acc_list in acc_lists_by_measure.items():
        acc_list.append(_simple_linear_cv(merge_df, lab_type, parcel))
    

In [4]:
# Collect the per-parcel accuracies into one table and save it as the gordon results CSV (kept commented out)
#gordon_univariate_pred_acc = pd.DataFrame(data = {"parcel_name": all_parcels,
#                                                    "pred_wm": univariate_pred_acc_wm, 
#                                                    "pred_ls": univariate_pred_acc_ls,
#                                                    "pred_pmat": univariate_pred_acc_pmat,
#                                                    "pred_pic": univariate_pred_acc_pic,
#                                                    "pred_read": univariate_pred_acc_read})
#gordon_univariate_pred_acc.to_csv('/this/dir/gordon_univariate_pred_acc_all_parcel.csv')

In [11]:
# Collect the per-parcel accuracies into one table and save it as the schaefer results CSV (kept commented out)
#schaefer_univariate_pred_acc = pd.DataFrame(data = {"parcel_name": all_parcels,
#                                                    "pred_wm": univariate_pred_acc_wm, 
#                                                    "pred_ls": univariate_pred_acc_ls,
#                                                    "pred_pmat": univariate_pred_acc_pmat,
#                                                    "pred_pic": univariate_pred_acc_pic,
#                                                    "pred_read": univariate_pred_acc_read})
#schaefer_univariate_pred_acc.to_csv('/this/dir/schaefer_univariate_pred_acc_all_parcel.csv')