We want to load the .npz files for each training dataset / test dataset / phenotype combination. Then, we will aggregate across all datasets/phenotypes and save several different .csv files:

- within.csv
- within_grouped.csv
- performance_grouped.csv
- power_grouped.csv
- effect_size.csv
- full_test.csv


In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from scipy import stats
from scipy.stats import norm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
'''
Select dataset type as one of the following options:

main: primary results for developmental functional connectivity analysis (HBN, ABCD, HCPD, PNC)

dev_sc: results using developmental structural connectivity (HBN, HCPD, QTAB)

adult_fc: results using adult functional connectivity (CHCP, HCP)

adult_sc: results using adult structural connectivity (CHCP, HCP)

data_amount_train: results varying the scan length of the training data

data_amount_test: results varying the scan length of the external/test data
'''

dataset_type = 'main'

# every results / sample size file lives under this project root
project_root = '/data_dustin/store3/training/matt/repro_data_final/updates_nat_hum_behav'
results_root = project_root + '/results/'
sample_size_root = project_root + '/sample_size/'

# Settings for each dataset type:
#   load_path          : folder holding the aggregated .npz results
#   save_dir           : folder (inside load_path) where the processed .csv files are saved
#   sample_size_csv    : sample size table (inside sample_size_root)
#   keep_datasets      : optional, keep only these datasets in the sample size table
#   drop_datasets      : optional, remove these datasets from the sample size table
#   drop_scaled_phenos : optional, remove the scaled metrics (mr_scaled, wm_corrected)
dataset_configs = {
    'main': {
        'load_path': results_root,
        'save_dir': 'processed_csv_files',
        'sample_size_csv': 'pheno_dataset_sample_size.csv',
    },
    'dev_sc': {
        'load_path': results_root + 'dev_sc/',
        'save_dir': 'processed_csv_files',
        'sample_size_csv': 'dev_sc_pheno_dataset_sample_size.csv',
    },
    'adult_fc': {
        'load_path': results_root + 'adult/',
        'save_dir': 'adult_fc_processed_csv_files',
        'sample_size_csv': 'adult_pheno_dataset_sample_size.csv',
        'keep_datasets': ['chcp', 'hcp'],
    },
    'adult_sc': {
        'load_path': results_root + 'adult/',
        'save_dir': 'adult_sc_processed_csv_files',
        'sample_size_csv': 'adult_pheno_dataset_sample_size.csv',
        'keep_datasets': ['chcp_sc', 'hcp_sc'],
    },
    'data_amount_train': {
        'load_path': results_root + 'data_amount/',
        'save_dir': 'data_amount_train_processed_csv_files',
        'sample_size_csv': 'data_amount_pheno_dataset_sample_size.csv',
        'drop_scaled_phenos': True,
    },
    'data_amount_test': {
        'load_path': results_root + 'data_amount/',
        'save_dir': 'data_amount_test_processed_csv_files',
        'sample_size_csv': 'pheno_dataset_sample_size.csv',
        'drop_datasets': ['abcd'],
        'drop_scaled_phenos': True,
    },
}

# fail loudly on a typo: otherwise the paths / sample sizes left over from a
# previous run of this cell would silently be reused by the cells below
if dataset_type not in dataset_configs:
    raise ValueError("Unknown dataset_type {!r}; expected one of: {}".format(
        dataset_type, ', '.join(dataset_configs)))
config = dataset_configs[dataset_type]

# set load and save paths
load_path = config['load_path']
save_path = load_path + config['save_dir']

# load in sample size information
df_sample_size = pd.read_csv(sample_size_root + config['sample_size_csv'])
if 'keep_datasets' in config:
    df_sample_size = df_sample_size[df_sample_size.dataset.isin(config['keep_datasets'])].reset_index(drop=True)
if 'drop_datasets' in config:
    df_sample_size = df_sample_size[~df_sample_size.dataset.isin(config['drop_datasets'])].reset_index(drop=True)

# phenotypes to process
pheno_all = list(df_sample_size.pheno.unique())
if config.get('drop_scaled_phenos', False):
    pheno_all = [p for p in pheno_all if p not in ('mr_scaled', 'wm_corrected')]  # remove scaled metrics

df_sample_size.head()

Unnamed: 0,dataset,pheno,n,n_heldout,n_train,percent_heldout
0,abcd,age,7996,1600,6396,0.2001
1,hbn,age,1201,200,1001,0.166528
2,hcpd,age,599,100,499,0.166945
3,pnc,age,1179,200,979,0.169635
4,abcd,mr,7846,1600,6246,0.203926


In [4]:
# held-out sample size for each dataset, keyed by dataset name
heldout_size_dict = {
    # developmental functional connectivity
    'abcd': 1600, 'hbn': 200, 'pnc': 200, 'hcpd': 100,
    # developmental structural connectivity
    'hbn_sc': 500, 'hcpd_sc': 100, 'qtab_sc': 100,
    # adult functional / structural connectivity
    'hcp': 200, 'hcp_sc': 200, 'chcp': 75, 'chcp_sc': 75,
    # ABCD with varying number of scans
    'abcd_1_scans': 800, 'abcd_2_scans': 800, 'abcd_3_scans': 800, 'abcd_4_scans': 800,
}

In [5]:
# define various functions

def r_to_p(r, n):
    '''
    Convert a correlation r, computed from n samples, into a p value.

    One-sided positive test only: the p value is the upper-tail probability
    of the corresponding t statistic with n-2 degrees of freedom.
    '''
    dof = n - 2
    t_stat = r / np.sqrt((1 - r**2) / dof)

    # survival function = upper tail, i.e. positive direction only
    return stats.t.sf(t_stat, df=dof)

def p_to_r(p, n):
    '''
    Convert a one-sided (positive test) p value back into a correlation r.

    Undoes r_to_p: the p value is mapped to the t statistic with n-2 degrees
    of freedom, which is then converted to r = t / sqrt(n - 2 + t**2).

    p values above 0.5 correspond to a negative t statistic and therefore
    return a negative r (taking the square root of t**2 would drop the sign
    and the function would no longer invert r_to_p for negative correlations).
    '''
    dof = n - 2
    t_inv = stats.t.isf(p, df=dof)

    # keep t unsquared in the numerator so that the sign of r is preserved
    r = t_inv / np.sqrt(dof + t_inv**2)
    return r

# functions for finding quantiles from dataframes
def qlower(x):
    '''2.5% quantile of x (lower end of the central 95% interval).'''
    lower_bound = x.quantile(q=0.025)
    return lower_bound
def qupper(x):
    '''97.5% quantile of x (upper end of the central 95% interval).'''
    upper_bound = x.quantile(q=0.975)
    return upper_bound


def power_curve_1t(r, N):
    '''
    Power curve for a correlation (one-tailed, testing r > 0).

    r: true correlation
    N: sample size (scalar or array-like, so several sizes can be evaluated at once)

    The test statistic is modelled as normal with mean arctanh(r)*sqrt(N-3)
    and unit variance; power is the probability that it exceeds 1.645.
    '''
    # Adapted from MATLAB: power1 = 1 - cdf('Normal',1.96,atanh(r_true)*sqrt(N-3), 1);
    # note that the critical value used here is 1.645, not the 1.96 of the MATLAB line
    sample_sizes = np.asarray(N)  # allows for multiple N
    z_mean = np.arctanh(r) * np.sqrt(sample_sizes - 3)
    prob_below_crit = norm.cdf(x=1.645, loc=z_mean, scale=1)
    return 1 - prob_below_crit

# Process .npz files and save as .csv files

In [5]:
'''
This cell loads the .npz files for each training dataset / test dataset / phenotype combination
Then, it saves a .csv file for each .npz file
'''

# number of permutations we used
num_perm_external = 500
num_perm_internal = 10000

# significance threshold applied to every p value below
pthresh = 0.05

# create the output folders up front, since to_csv does not create missing directories
for out_folder in ['within', 'within_grouped', 'performance_grouped',
                   'power_grouped', 'effect_size', 'full_test']:
    os.makedirs(os.path.join(save_path, out_folder), exist_ok=True)

# grouping columns shared by all cross-dataset summaries
group_cols = ['train_dataset', 'test_dataset', 'pheno', 'num_train', 'num_test']

# aggregations shared by the two effect size summaries (grouped by r / MAE significance)
effect_size_aggs = dict(r_med=('r_external', 'median'),
                        r_ground_truth=('r_gt', 'max'),
                        r_sig_ground_truth=('sig_gt', 'max'),
                        mae_med=('mae_external', 'median'),
                        mae_ground_truth=('mae_gt', 'max'),
                        mae_sig_ground_truth=('sig_gt_mae', 'max'))

# loop over all possible phenotypes
for pheno in tqdm(pheno_all):

    # get all datasets
    if dataset_type != 'main':
        all_datasets = list(df_sample_size.dataset.unique())
    elif (pheno == 'mr_scaled') or (pheno == 'wm_corrected'):  # no scaled phenotypes for PNC
        all_datasets = ['abcd', 'hbn', 'hcpd']
    else:
        all_datasets = ['abcd', 'hbn', 'hcpd', 'pnc']

    # loop over all training datasets
    for train_dataset in all_datasets:

        # get corresponding test datasets and loop over
        if 'data_amount' not in dataset_type:
            all_test_datasets = [t for t in all_datasets if t != train_dataset]
        elif dataset_type == 'data_amount_train':
            all_test_datasets = ['hbn', 'hcpd', 'pnc']
        elif dataset_type == 'data_amount_test':
            all_test_datasets = ['abcd_1_scans', 'abcd_2_scans', 'abcd_3_scans', 'abcd_4_scans']

        # loop over all test datasets
        for within_idx, test_dataset in enumerate(all_test_datasets):

            #################### loading and make initial dataframe ####################
            npz_file = os.path.join(load_path,
                                    'across_aggregated/results_train_' + train_dataset + '_test_' +
                                    test_dataset + '_pheno_' + pheno + '.npz')
            df_results = pd.DataFrame()
            # the context manager closes the .npz file once its arrays are copied,
            # so file handles do not pile up over the many loop iterations
            with np.load(npz_file) as dat:
                for k in dat.files:  # read in all saved variables into dataframe columns
                    df_results[k] = dat[k]

            #################### add new columns ####################
            # first get heldout size for within dataset internal validation
            df_results['heldout_size'] = np.array([heldout_size_dict[d] for d in list(df_results.train_dataset)])

            # now get p values for internal and external validation
            df_results['p_internal'] = r_to_p(np.array(df_results.r_internal), np.array(df_results.heldout_size))
            df_results['p_external'] = r_to_p(np.array(df_results.r_external), np.array(df_results.num_test))

            # add column for combination of dataset and phenotype
            df_results['test_dataset_pheno'] = df_results['test_dataset'] + '_' + df_results['pheno']
            df_results['train_dataset_pheno'] = df_results['train_dataset'] + '_' + df_results['pheno']

            # add in max training/test sizes
            df_results['ntest_max'] = df_results.num_test.max()
            df_results['ntrain_max'] = df_results.num_train.max()

            # adjust permutation test p values by 1/num_perm
            df_results['p_perm_mae_internal'] = df_results['p_perm_mae_internal'] + 1/num_perm_internal
            df_results['p_perm_mae_external'] = df_results['p_perm_mae_external'] + 1/num_perm_external
            df_results['p_perm_r_internal'] = df_results['p_perm_r_internal'] + 1/num_perm_internal
            df_results['p_perm_r_external'] = df_results['p_perm_r_external'] + 1/num_perm_external

            # add ground truth performance: median over the rows with the largest
            # training size and the largest test size (internal: largest training size only)
            df_results['train_test_pheno'] = df_results['train_dataset'] + '_' + df_results['test_dataset'] + '_' + df_results['pheno']
            at_max_train = df_results.num_train == df_results.num_train.max()
            at_max_test = df_results.num_test == df_results.num_test.max()
            df_gt = df_results[at_max_train & at_max_test]
            df_results['r_gt'] = df_gt.r_external.median()
            df_results['p_gt'] = df_gt.p_external.median()
            df_results['mae_gt'] = df_gt.mae_external.median()
            df_results['p_mae_gt'] = df_gt.p_perm_mae_external.median()
            df_results['p_mae_gt_internal'] = df_results[at_max_train].p_perm_mae_internal.median()

            # add in columns for significance (stored as 0.0 / 1.0 so that means give rates)
            df_results['sig_internal'] = 1.0*(df_results['p_internal'] < pthresh)
            df_results['sig_external'] = 1.0*(df_results['p_external'] < pthresh)
            df_results['sig_gt'] = 1.0*(df_results['p_gt'] < pthresh)
            df_results['sig_internal_mae'] = 1.0*(df_results['p_perm_mae_internal'] < pthresh)
            df_results['sig_external_mae'] = 1.0*(df_results['p_perm_mae_external'] < pthresh)
            df_results['sig_gt_mae'] = 1.0*(df_results['p_mae_gt'] < pthresh)

            if within_idx == 0:  # only want within dataset on first loop iteration
                #################### within ####################
                # get within-dataset dataframe by collapsing over test seed and test size
                df_within = df_results.groupby(['train_dataset', 'pheno', 'num_train', 'train_seed'],
                                               as_index=False).agg({'r_internal': 'median',
                                                                    'mae_internal': 'median',
                                                                    'p_mae_gt_internal': 'median',
                                                                    'q2_internal': 'median'})
                df_within.to_csv(os.path.join(save_path, 'within',
                                              'within_{:s}_{:s}.csv'.format(train_dataset, pheno)),
                                 index=False)

                #################### within_grouped ####################
                # get median within datasets performance (with 2.5% / 97.5% quantiles)
                df_within_grouped = df_within.groupby(['train_dataset', 'pheno', 'num_train'],
                                                      as_index=False).agg(r_med=('r_internal', 'median'),
                                                                          r_lower=('r_internal', qlower),
                                                                          r_upper=('r_internal', qupper),
                                                                          mae_med=('mae_internal', 'median'),
                                                                          mae_lower=('mae_internal', qlower),
                                                                          mae_upper=('mae_internal', qupper),
                                                                          p_mae_gt=('p_mae_gt_internal', 'median'))
                df_within_grouped.to_csv(os.path.join(save_path, 'within_grouped',
                                                      'within_grouped_{:s}_{:s}.csv'.format(train_dataset, pheno)),
                                         index=False)

            #################### performance_grouped ####################
            # r_gt / mae_gt are constant within this dataframe, so their 'min' and 'max'
            # aggregations give the same value; both column names are kept in the output
            df_performance_grouped = df_results.groupby(group_cols,
                                                        as_index=False).agg(r_med=('r_external', 'median'),
                                                                            r_lower=('r_external', qlower),
                                                                            r_upper=('r_external', qupper),
                                                                            mae_med=('mae_external', 'median'),
                                                                            mae_lower=('mae_external', qlower),
                                                                            mae_upper=('mae_external', qupper),
                                                                            r_gt=('r_gt', 'min'),
                                                                            mae_gt=('mae_gt', 'min'),
                                                                            r_pos_rate=('sig_external', 'mean'),
                                                                            mae_pos_rate=('sig_external_mae', 'mean'),
                                                                            r_sig_ground_truth=('sig_gt', 'mean'),
                                                                            mae_sig_ground_truth=('sig_gt_mae', 'mean'),
                                                                            r_ground_truth=('r_gt', 'max'),
                                                                            mae_ground_truth=('mae_gt', 'max'))
            df_performance_grouped.to_csv(os.path.join(save_path, 'performance_grouped',
                                                       'performance_grouped_{:s}_{:s}_{:s}.csv'.format(train_dataset,
                                                                                                       test_dataset,
                                                                                                       pheno)),
                                          index=False)

            #################### power_grouped ####################
            # calculate power as rate of significant results (for significant ground truth results only)
            df_power_grouped = df_results.groupby(group_cols,
                                                  as_index=False).agg(r_pos_rate=('sig_external', 'mean'),
                                                                      r_sig_ground_truth=('sig_gt', 'mean'),
                                                                      r_ground_truth=('r_gt', 'max'),
                                                                      mae_pos_rate=('sig_external_mae', 'mean'),
                                                                      mae_sig_ground_truth=('sig_gt_mae', 'mean'),
                                                                      mae_ground_truth=('mae_gt', 'max'))
            df_power_grouped.to_csv(os.path.join(save_path, 'power_grouped',
                                                 'power_grouped_{:s}_{:s}_{:s}.csv'.format(train_dataset,
                                                                                           test_dataset,
                                                                                           pheno)),
                                    index=False)

            #################### effect_size ####################
            # calculate median performance, split by significance of r
            df_effect_size = df_results.groupby(group_cols + ['sig_external'],
                                                as_index=False).agg(**effect_size_aggs)

            # add in column for inflation of effects relative to ground truth
            df_effect_size['r_infl_med'] = df_effect_size['r_med'] - df_effect_size['r_ground_truth']  # difference between test and ground truth
            df_effect_size['mae_infl_med'] = df_effect_size['mae_med'] - df_effect_size['mae_ground_truth']  # difference between test and ground truth

            # do another one where grouping by MAE significance
            df_effect_size_mae = df_results.groupby(group_cols + ['sig_external_mae'],
                                                    as_index=False).agg(**effect_size_aggs)

            # add in column for inflation of effects relative to ground truth
            df_effect_size_mae['mae_infl_med'] = df_effect_size_mae['mae_med'] - df_effect_size_mae['mae_ground_truth']  # difference between test and ground truth
            df_effect_size_mae['r_infl_med'] = df_effect_size_mae['r_med'] - df_effect_size_mae['r_ground_truth']  # difference between test and ground truth

            # combine r and MAE inflation dataframes with new column
            df_effect_size['eval_metric'] = 'r'
            df_effect_size_mae['eval_metric'] = 'mae'
            df_effect_size = pd.concat([df_effect_size, df_effect_size_mae]).reset_index(drop=True)

            df_effect_size.to_csv(os.path.join(save_path, 'effect_size',
                                               'effect_size_{:s}_{:s}_{:s}.csv'.format(train_dataset,
                                                                                       test_dataset,
                                                                                       pheno)),
                                  index=False)

            #################### full_test ####################

            # calculate difference between internal and external performance
            df_results['r_internal_external_diff'] = df_results.r_external - df_results.r_internal
            df_results['mae_internal_external_diff'] = df_results.mae_external - df_results.mae_internal

            # take only data at full test sample size (and the first test seed)
            df_full_test = df_results[(df_results.num_test == df_results.ntest_max) &
                                      (df_results.test_seed == 0)].reset_index(drop=True)
            df_full_test.to_csv(os.path.join(save_path, 'full_test',
                                             'full_test_{:s}_{:s}_{:s}.csv'.format(train_dataset,
                                                                                   test_dataset,
                                                                                   pheno)),
                                index=False)

100%|████████████████████████████████████████████| 6/6 [30:45<00:00, 307.56s/it]


# Combine all processed .csv files into larger files 

In [6]:
'''
This cell aggregates all the saved .csv files
'''

# loop over possible folder names
folder_name_all = ['within', 'within_grouped', 'performance_grouped',
                   'power_grouped', 'effect_size', 'full_test']
for folder_name in folder_name_all:

    # within-dataset files are named by training dataset and phenotype only
    # (no test dataset), so there is one file per training dataset / phenotype
    is_within = 'within' in folder_name

    # collect the individual dataframes and concatenate them once at the end
    # (concatenating inside the loop copies the growing dataframe every time)
    df_list = []

    # loop over all possible phenotypes
    for pheno in pheno_all:

        # get relevant datasets for each phenotype
        if dataset_type != 'main':  # every analysis other than the main one
            all_datasets = list(df_sample_size.dataset.unique())
        elif (pheno == 'mr_scaled') or (pheno == 'wm_corrected'):  # no pnc in scaled matrix reasoning or scaled working memory
            all_datasets = ['abcd', 'hbn', 'hcpd']
        else:
            all_datasets = ['abcd', 'hbn', 'hcpd', 'pnc']

        # loop over all training datasets
        for train_dataset in all_datasets:

            if is_within:
                # one file for the combination of .csv type, training dataset, phenotype
                file_names = ['{:s}_{:s}_{:s}.csv'.format(folder_name, train_dataset, pheno)]
            else:
                # get corresponding test datasets
                if dataset_type == 'data_amount_train':
                    all_test_datasets = ['hbn', 'hcpd', 'pnc']
                elif dataset_type == 'data_amount_test':
                    all_test_datasets = ['abcd_1_scans', 'abcd_2_scans', 'abcd_3_scans', 'abcd_4_scans']
                else:
                    all_test_datasets = [t for t in all_datasets if t != train_dataset]

                # one file per combination of .csv type, training dataset, test dataset, phenotype
                file_names = ['{:s}_{:s}_{:s}_{:s}.csv'.format(folder_name, train_dataset, test_dataset, pheno)
                              for test_dataset in all_test_datasets]

            # load each dataframe
            for file_name in file_names:
                df_list.append(pd.read_csv(os.path.join(save_path, folder_name, file_name)))

    # combine dataframes into larger one (empty dataframe if nothing was loaded)
    if df_list:
        df_combined = pd.concat(df_list).reset_index(drop=True)
    else:
        df_combined = pd.DataFrame()

    # save dataframe
    df_combined.to_csv(os.path.join(save_path, folder_name + '.csv'), index=False)