In [8]:
import numpy as np
import mat73
import scipy.io as sio
import pandas as pd
import itertools as it
import os
from tqdm.auto import tqdm


from sklearn.linear_model import Ridge, ElasticNet, LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd

In [9]:
# mat73 reads through an HDF5 reader; file locking is switched off so that loading works
os.environ.update({"HDF5_USE_FILE_LOCKING": "FALSE"})

# Inputs are read from, and outputs written to, the same data directory
load_path = save_path = '../data/'

# Set to True to save a .csv file for each of the 120 combinations
save_each_pred = False

In [10]:
############ Loading data ############

def _normalize_subject_id(raw_id):
    """Return 'sub-NDARINV' followed by everything after the first 8 characters of raw_id."""
    return 'sub-NDARINV' + raw_id[8:]


# Functional connectivity data (.mat file read with mat73)
imaging_dat = mat73.loadmat(os.path.join(load_path, 'lowest_motion_rsfMRI.mat'))

# One row per subject: ID, motion value, and that subject's connectome vector
# (the connectome matrix is transposed so that iterating yields one vector per subject
#  -- presumably stored as edges x subjects; verify against the .mat file)
df_imaging = pd.DataFrame({
    'src_subject_id': [entry[0] for entry in imaging_dat['mat_sub_ids_final']],
    'lowest_motion_vals': imaging_dat['lowest_motion_vals'].ravel(),
    'lowest_motion_connectomes': list(imaging_dat['lowest_motion_connectomes'].T),
})

# Behavioral measures, with subject IDs rewritten to the imaging ID format
df_behavior = pd.read_csv(os.path.join(load_path, 'combined_genon_behaviors.csv'))
df_behavior['src_subject_id'] = df_behavior['src_subject_id'].map(_normalize_subject_id)

# Covariates, with the same ID rewrite
df_covar = pd.read_csv(os.path.join(load_path, 'abcd_covariates.csv'))
df_covar['src_subject_id'] = df_covar['src_subject_id'].map(_normalize_subject_id)

# The site code is reduced to the integer formed by its last two characters
df_covar['site'] = df_covar['site'].map(lambda code: int(code[-2:]))

# Lookup table from site number to site cluster
site_cluster_dict = {
    1: 1, 2: 8, 3: 5, 4: 2, 5: 9, 6: 7, 7: 9, 8: 3, 9: 9, 10: 3, 11: 5,
    12: 8, 13: 3, 14: 6, 15: 7, 16: 5, 17: 1, 18: 3, 19: 4, 20: 6, 21: 10, 22: 3,
}

# A site missing from the table raises KeyError rather than being silently dropped
df_covar['site_cluster'] = [site_cluster_dict[site] for site in df_covar['site']]

In [11]:
# Rich preview of the three input tables: imaging, covariates, behavior
display(df_imaging, df_covar, df_behavior)

Unnamed: 0,src_subject_id,lowest_motion_vals,lowest_motion_connectomes
0,sub-NDARINV003RTV85,0.107034,"[0.5606011945696006, 0.7212088477897042, 0.488..."
1,sub-NDARINV007W6H7B,0.038635,"[0.0976396269791431, 1.2069281791640374, 0.129..."
2,sub-NDARINV00BD7VDC,0.071244,"[0.8459719958908221, 1.3294982196537812, 0.841..."
3,sub-NDARINV00CY2MDM,0.139307,"[0.18972710609307056, -0.033165649409561494, 0..."
4,sub-NDARINV00HEV6HB,0.101255,"[0.22358413617035067, 0.5409492657889924, 0.12..."
...,...,...,...
6959,sub-NDARINVZZ81LEEV,0.044081,"[-0.010722287012410315, 0.42799088400713436, 0..."
6960,sub-NDARINVZZL0VA2F,0.075359,"[0.4254030854684528, 0.6511216746879125, 0.130..."
6961,sub-NDARINVZZNX6W2P,0.106594,"[0.1421542154079856, 0.5161761333175069, 0.292..."
6962,sub-NDARINVZZPKBDAC,0.083939,"[0.199873706473604, 0.6059864431258375, 0.1179..."


Unnamed: 0,src_subject_id,family_id,sex,site,age,demo_comb_income_v2,site_cluster
0,sub-NDARINV003RTV85,8781,1,6,131.0,8.0,7
1,sub-NDARINV005V6D2C,10210,1,10,121.0,,3
2,sub-NDARINV007W6H7B,4722,0,22,126.0,10.0,3
3,sub-NDARINV00BD7VDC,3810,0,7,112.0,10.0,9
4,sub-NDARINV00CY2MDM,5355,0,20,130.0,6.0,6
...,...,...,...,...,...,...,...
11863,sub-NDARINVZZNX6W2P,3797,0,14,131.0,9.0,6
11864,sub-NDARINVZZPKBDAC,2445,1,12,113.0,10.0,8
11865,sub-NDARINVZZZ2ALR6,7032,1,8,121.0,10.0,3
11866,sub-NDARINVZZZNB0XC,6676,1,3,108.0,3.0,5


Unnamed: 0,src_subject_id,Race,pea_ravlt_sd_trial_vi_tc,pea_ravlt_ld_trial_vii_tc,pea_wiscv_trs,nihtbx_flanker_uncorrected,nihtbx_list_uncorrected,nihtbx_cardsort_uncorrected,nihtbx_reading_uncorrected,nihtbx_pattern_uncorrected,...,pps_y_ss_severity_score,upps_y_ss_negative_urgency,upps_y_ss_positive_urgency,upps_y_ss_lack_of_planning,upps_y_ss_lack_of_perseverance,upps_y_ss_sensation_seeking,bis_y_ss_bis_sum,bis_y_ss_bas_rr,bis_y_ss_bas_drive,bis_y_ss_bas_fs
0,sub-NDARINV00LJVZK2,Black,1.0,1.0,10.0,97.0,94.0,81.0,90.0,94.0,...,2.0,5.0,4.0,10.0,6.0,4.0,17.0,13.0,1.0,4.0
1,sub-NDARINV052HU3CU,Black,13.0,12.0,23.0,107.0,105.0,91.0,103.0,82.0,...,0.0,8.0,4.0,7.0,5.0,7.0,17.0,15.0,10.0,4.0
2,sub-NDARINV05ATJ1V1,Black,14.0,14.0,15.0,84.0,109.0,94.0,90.0,92.0,...,8.0,12.0,8.0,9.0,4.0,11.0,9.0,10.0,3.0,5.0
3,sub-NDARINV0889M0JE,Black,13.0,14.0,20.0,105.0,120.0,104.0,99.0,111.0,...,59.0,11.0,13.0,8.0,8.0,10.0,7.0,8.0,3.0,6.0
4,sub-NDARINV08FUB58A,Black,6.0,6.0,14.0,97.0,74.0,92.0,89.0,94.0,...,0.0,4.0,4.0,11.0,4.0,7.0,3.0,14.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9006,sub-NDARINVZZL0VA2F,Black,14.0,14.0,24.0,84.0,109.0,88.0,95.0,62.0,...,6.0,5.0,12.0,12.0,6.0,15.0,5.0,13.0,5.0,7.0
9007,sub-NDARINVZZLZCKAY,White,7.0,7.0,15.0,92.0,97.0,86.0,91.0,82.0,...,14.0,9.0,10.0,9.0,5.0,6.0,17.0,10.0,3.0,5.0
9008,sub-NDARINVZZNX6W2P,White,14.0,14.0,18.0,106.0,90.0,102.0,90.0,107.0,...,0.0,7.0,10.0,8.0,5.0,13.0,10.0,11.0,8.0,6.0
9009,sub-NDARINVZZPKBDAC,White,8.0,8.0,18.0,98.0,97.0,91.0,91.0,74.0,...,5.0,8.0,10.0,6.0,7.0,12.0,12.0,14.0,4.0,3.0


In [12]:
# Preview only: covariate rows with no missing values. The result is not assigned,
# so df_covar itself keeps its rows with missing entries.
df_covar.dropna(axis=0, how="any")

Unnamed: 0,src_subject_id,family_id,sex,site,age,demo_comb_income_v2,site_cluster
0,sub-NDARINV003RTV85,8781,1,6,131.0,8.0,7
2,sub-NDARINV007W6H7B,4722,0,22,126.0,10.0,3
3,sub-NDARINV00BD7VDC,3810,0,7,112.0,10.0,9
4,sub-NDARINV00CY2MDM,5355,0,20,130.0,6.0,6
6,sub-NDARINV00J52GPG,4151,0,17,110.0,6.0,1
...,...,...,...,...,...,...,...
11863,sub-NDARINVZZNX6W2P,3797,0,14,131.0,9.0,6
11864,sub-NDARINVZZPKBDAC,2445,1,12,113.0,10.0,8
11865,sub-NDARINVZZZ2ALR6,7032,1,8,121.0,10.0,3
11866,sub-NDARINVZZZNB0XC,6676,1,3,108.0,3.0,5


In [13]:
# Attach connectomes and covariates to the behavioral table, then keep only
# subjects with a complete row (the left joins followed by dropna discard anyone
# missing imaging, covariates, or any behavioral value).
df_merged = (
    df_behavior
    .merge(
        df_imaging[['src_subject_id', 'lowest_motion_connectomes']],
        how='left',
        on='src_subject_id',
    )
    .merge(df_covar, how='left', on='src_subject_id')
    .dropna()
    .reset_index(drop=True)
)
df_merged

Unnamed: 0,src_subject_id,Race,pea_ravlt_sd_trial_vi_tc,pea_ravlt_ld_trial_vii_tc,pea_wiscv_trs,nihtbx_flanker_uncorrected,nihtbx_list_uncorrected,nihtbx_cardsort_uncorrected,nihtbx_reading_uncorrected,nihtbx_pattern_uncorrected,...,bis_y_ss_bas_rr,bis_y_ss_bas_drive,bis_y_ss_bas_fs,lowest_motion_connectomes,family_id,sex,site,age,demo_comb_income_v2,site_cluster
0,sub-NDARINV052HU3CU,Black,13.0,12.0,23.0,107.0,105.0,91.0,103.0,82.0,...,15.0,10.0,4.0,"[0.11681183750730145, 0.3168612347340638, 0.13...",14,0,15,123.0,7.0,7
1,sub-NDARINV08FUB58A,Black,6.0,6.0,14.0,97.0,74.0,92.0,89.0,94.0,...,14.0,0.0,3.0,"[-0.04830105229297071, 0.13237761095926748, 0....",2468,1,4,118.0,1.0,2
2,sub-NDARINV0D5J9T8P,Black,7.0,7.0,17.0,94.0,94.0,91.0,88.0,99.0,...,9.0,6.0,6.0,"[0.20385988778199252, 0.15394168107384226, 0.1...",5343,0,20,128.0,2.0,6
3,sub-NDARINV0DVK13LU,Black,9.0,9.0,18.0,92.0,97.0,91.0,80.0,82.0,...,15.0,10.0,8.0,"[0.26458490612684127, 0.25291413669924734, 0.3...",2496,1,4,128.0,1.0,2
4,sub-NDARINV0E4CT74P,Black,12.0,11.0,22.0,99.0,101.0,99.0,97.0,69.0,...,11.0,2.0,11.0,"[0.6228513039680226, 1.3187762718318077, 0.697...",11314,0,2,120.0,9.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4731,sub-NDARINVZZ6ZJ2KY,White,13.0,15.0,12.0,99.0,82.0,102.0,91.0,115.0,...,12.0,12.0,6.0,"[0.173489705711735, 0.7554215095552818, 0.3608...",9345,1,6,124.0,9.0,7
4732,sub-NDARINVZZ81LEEV,Black,14.0,11.0,20.0,97.0,101.0,83.0,90.0,82.0,...,14.0,5.0,8.0,"[-0.010722287012410315, 0.42799088400713436, 0...",8433,0,11,108.0,5.0,5
4733,sub-NDARINVZZL0VA2F,Black,14.0,14.0,24.0,84.0,109.0,88.0,95.0,62.0,...,13.0,5.0,7.0,"[0.4254030854684528, 0.6511216746879125, 0.130...",9942,0,22,129.0,10.0,3
4734,sub-NDARINVZZNX6W2P,White,14.0,14.0,18.0,106.0,90.0,102.0,90.0,107.0,...,11.0,8.0,6.0,"[0.1421542154079856, 0.5161761333175069, 0.292...",3797,0,14,131.0,9.0,6


In [14]:
# Covariates taken from the merged (complete-case) table
add_ft = df_merged[["sex", "site", "age", "demo_comb_income_v2"]]

# Expand every covariate into 0/1 indicator columns, one per observed value.
# NOTE(review): age and income are dummy-coded as categories here rather than
# kept as numeric columns -- confirm this is intended.
encoded_add_ft = pd.get_dummies(add_ft, columns=add_ft.columns.tolist()).astype(int)

In [16]:
# NOTE FROM MATT: one-hot encoding here?

Index(['sex_0', 'sex_1', 'site_1', 'site_2', 'site_3', 'site_4', 'site_5',
       'site_6', 'site_7', 'site_8', 'site_9', 'site_10', 'site_11', 'site_12',
       'site_13', 'site_14', 'site_15', 'site_16', 'site_17', 'site_18',
       'site_19', 'site_20', 'site_21', 'site_22', 'age_107.0', 'age_108.0',
       'age_109.0', 'age_110.0', 'age_111.0', 'age_112.0', 'age_113.0',
       'age_114.0', 'age_115.0', 'age_116.0', 'age_117.0', 'age_118.0',
       'age_119.0', 'age_120.0', 'age_121.0', 'age_122.0', 'age_123.0',
       'age_124.0', 'age_125.0', 'age_126.0', 'age_127.0', 'age_128.0',
       'age_129.0', 'age_130.0', 'age_131.0', 'age_132.0', 'age_133.0',
       'demo_comb_income_v2_1.0', 'demo_comb_income_v2_2.0',
       'demo_comb_income_v2_3.0', 'demo_comb_income_v2_4.0',
       'demo_comb_income_v2_5.0', 'demo_comb_income_v2_6.0',
       'demo_comb_income_v2_7.0', 'demo_comb_income_v2_8.0',
       'demo_comb_income_v2_9.0', 'demo_comb_income_v2_10.0'],
      dtype='object')

### CPM with Covar

In [None]:
from sklearn.preprocessing import StandardScaler

# Fraction of all features (connectome edges + covariate dummies) kept per phenotype
TOP_FEATURE_FRACTION = 0.1

# Stack the per-subject connectome vectors into one (subjects x edges) matrix
connectomes = np.vstack(df_merged["lowest_motion_connectomes"].values)

# One-hot encoded covariates used as extra features
additional_features = encoded_add_ft.values

# Combine the connectomes with additional features
all_features = np.hstack([connectomes, additional_features])

# z-score every feature column (zero mean, unit variance across subjects)
scaler = StandardScaler()
all_features_standardized = scaler.fit_transform(all_features)

n_subjects, n_total_features = all_features_standardized.shape
n_features = round(n_total_features * TOP_FEATURE_FRACTION)

# Phenotype names come from df_behavior (everything after the ID and Race columns),
# not from a positional slice of df_merged: df_merged gains CPM columns below, so a
# positional slice would pick up non-behavior columns whenever a CPM cell is re-run.
behavior_columns = df_behavior.columns[2:]

# Pearson r between every feature and every phenotype in one matrix product.
# The features are already z-scored, so r = X_z.T @ y_z / n once the phenotypes are
# z-scored the same way (population standard deviation).
behavior_values = df_merged[behavior_columns].to_numpy(dtype=float)
behavior_sd = behavior_values.std(axis=0)
behavior_sd[behavior_sd == 0] = 1.0  # constant phenotype -> all correlations 0, nothing selected
behavior_z = (behavior_values - behavior_values.mean(axis=0)) / behavior_sd
all_correlations = all_features_standardized.T @ behavior_z / n_subjects

# NOTE(review): features are selected using all subjects, so these scores are
# in-sample. If they feed a predictive model, the selection presumably belongs
# inside the cross-validation folds -- confirm.
cpm_scores = {}
for pheno_idx, pheno in enumerate(behavior_columns):
    print(f"Calculating CPM for {pheno}")

    correlations = all_correlations[:, pheno_idx]

    # Top features by absolute correlation; the explicit start index keeps the
    # selection empty when n_features is 0 (a bare [-0:] slice would select everything)
    ranked = np.argsort(np.abs(correlations))
    top_indices = ranked[ranked.size - n_features:]

    # Split the selected features by the sign of their correlation
    positive_indices = top_indices[correlations[top_indices] > 0]
    negative_indices = top_indices[correlations[top_indices] < 0]

    # Per-subject sums over the positively / negatively correlated features
    positive_cpm_score = all_features_standardized[:, positive_indices].sum(axis=1)
    negative_cpm_score = all_features_standardized[:, negative_indices].sum(axis=1)
    total_cpm_score = positive_cpm_score - negative_cpm_score

    cpm_scores[f"{pheno}_positive_cpm"] = positive_cpm_score
    cpm_scores[f"{pheno}_negative_cpm"] = negative_cpm_score
    cpm_scores[f"{pheno}_total_cpm"] = total_cpm_score

# Attach all score columns in one step (avoids fragmenting the frame with many
# single-column inserts). Columns of the same name from an earlier run are replaced;
# the "without covar" CPM cell writes the same column names, so whichever cell runs
# last determines the stored scores.
cpm_scores = pd.DataFrame(cpm_scores, index=df_merged.index)
df_merged = pd.concat(
    [df_merged.drop(columns=cpm_scores.columns, errors="ignore"), cpm_scores],
    axis=1,
)

### CPM without Covar

In [None]:
from sklearn.preprocessing import StandardScaler

# Fraction of connectome edges kept per phenotype
TOP_FEATURE_FRACTION = 0.1

# Stack the per-subject connectome vectors into one (subjects x edges) matrix
connectomes = np.vstack(df_merged["lowest_motion_connectomes"].values)

# z-score every edge (zero mean, unit variance across subjects); no covariates here
scaler = StandardScaler()
all_features_standardized = scaler.fit_transform(connectomes)

n_subjects, n_total_features = all_features_standardized.shape
n_features = round(n_total_features * TOP_FEATURE_FRACTION)

# Phenotype names come from df_behavior (everything after the ID and Race columns),
# not from a positional slice of df_merged: df_merged gains CPM columns below, so a
# positional slice would pick up non-behavior columns whenever a CPM cell is re-run.
behavior_columns = df_behavior.columns[2:]

# Pearson r between every edge and every phenotype in one matrix product.
# The edges are already z-scored, so r = X_z.T @ y_z / n once the phenotypes are
# z-scored the same way (population standard deviation).
behavior_values = df_merged[behavior_columns].to_numpy(dtype=float)
behavior_sd = behavior_values.std(axis=0)
behavior_sd[behavior_sd == 0] = 1.0  # constant phenotype -> all correlations 0, nothing selected
behavior_z = (behavior_values - behavior_values.mean(axis=0)) / behavior_sd
all_correlations = all_features_standardized.T @ behavior_z / n_subjects

# NOTE(review): edges are selected using all subjects, so these scores are
# in-sample. If they feed a predictive model, the selection presumably belongs
# inside the cross-validation folds -- confirm.
cpm_scores = {}
for pheno_idx, pheno in enumerate(behavior_columns):
    print(f"Calculating CPM for {pheno}")

    correlations = all_correlations[:, pheno_idx]

    # Top edges by absolute correlation; the explicit start index keeps the
    # selection empty when n_features is 0 (a bare [-0:] slice would select everything)
    ranked = np.argsort(np.abs(correlations))
    top_indices = ranked[ranked.size - n_features:]

    # Split the selected edges by the sign of their correlation
    positive_indices = top_indices[correlations[top_indices] > 0]
    negative_indices = top_indices[correlations[top_indices] < 0]

    # Per-subject sums over the positively / negatively correlated edges
    positive_cpm_score = all_features_standardized[:, positive_indices].sum(axis=1)
    negative_cpm_score = all_features_standardized[:, negative_indices].sum(axis=1)
    total_cpm_score = positive_cpm_score - negative_cpm_score

    cpm_scores[f"{pheno}_positive_cpm"] = positive_cpm_score
    cpm_scores[f"{pheno}_negative_cpm"] = negative_cpm_score
    cpm_scores[f"{pheno}_total_cpm"] = total_cpm_score

# Attach all score columns in one step (avoids fragmenting the frame with many
# single-column inserts). Columns of the same name from an earlier run are replaced;
# the "with covar" CPM cell writes the same column names, so whichever cell runs
# last determines the stored scores.
cpm_scores = pd.DataFrame(cpm_scores, index=df_merged.index)
df_merged = pd.concat(
    [df_merged.drop(columns=cpm_scores.columns, errors="ignore"), cpm_scores],
    axis=1,
)