In [1]:
import pandas as pd
pd.set_option('max_colwidth', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
import itertools
import scipy.stats as st
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_selection import mutual_info_classif
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
import rpy2
import pingouin as pg

In [None]:
# ICC - https://www.uvm.edu/~statdhtx/methods8/Supplements/icc/More%20on%20ICCs.pdf
# ICC(2,1) - Each subject is measured by each rater, and raters are considered representative
#            of a larger population of similar raters. 
#            Reliability calculated from a single measurement.

# I found two python implementations:
#     1 - pinguoin 
#     https://pingouin-stats.org/build/html/generated/pingouin.intraclass_corr.html
#     2 - R thorugh rpy2
#     https://www.r-bloggers.com/2021/06/intraclass-correlation-coefficient-in-r-quick-guide/
#     also:
#     https://stackoverflow.com/questions/40965579/intraclass-correlation-in-python-module

# ICC(2,1) for : 
# total score of HITOP
# total score of BAARS
# each subscale of HITOP
# each subscale of BAARS-IV
# GAD-7
# PHQ-8

# Functions

In [93]:
def do_icc(data, measure, show = True):
    # just the measure of interest
    data_measure = data[['Subject', measure, measure+'_recontact']]
    data_measure = data_measure.rename(columns={measure: "Original", measure+'_recontact': "Recontact"})
    data_measure_melted = pd.melt(data_measure, id_vars='Subject', value_vars=['Original','Recontact'], value_name='Score')
    data_measure_melted = data_measure_melted.rename(columns={"variable": "Session"})
    # adding Measure - don't really need to do it but eh
    shape = data_measure_melted.shape
    data_measure_melted['Measure'] = [measure] * shape[0]
    #print(measure)
    icc = pg.intraclass_corr(data=data_measure_melted, targets='Subject', raters='Session', ratings='Score').round(4)
    icc.set_index("Type")
    #print(icc)
    if show:
        plt = sns.jointplot(data=data_measure, x='Original', y='Recontact')
        plt.fig.suptitle(measure)
        #plt = plt.set(title=measure)
        #plt.subtitle(measure)
        #plt.show()
        #print(icc)
    return (icc)

def create_table (data, measures):
    df_init = pd.DataFrame()
    for measure in measures:
        if 'bothered' not in measure:
            # print(measure)
            new_icc = do_icc(data, measure, show=False)
            new_icc_row = new_icc.iloc[['1']].copy()
            new_icc_row['Measure'] = measure
            df_init = pd.concat([df_init, new_icc_row], axis = 0)
    df_result = df_init
    return (df_result)

def run_icc_analysis(data, measures):
    for measure in measures:
        if 'bothered' not in measure:
            do_icc(data, measure, show = False) # set show = "True" (default setting) if want to see the individual plots\   
    # organize results neatly
    results_table = create_table(data, measures)
    # put measure first; shift column 'Name' to first position 
    first_column = results_table.pop('Measure') 
    # insert column using insert(position,column_name,first_column) function 
    results_table.insert(0, 'Measure', first_column)
    return (results_table)

# Analysis ICC 

In [94]:
# this will be the same for any data (genpop/enriched/combined), so we can go ahead and run it
measures_for_icc = ['hitop_sum', 'baars_sum', 'phq_sum', 'gad_sum', \
                   'baars_inattention_sum', 'baars_hyperactivity_sum', 'baars_impulsivity_sum', 'baars_sct_sum',\
                   'hitop_anhedonic_depression', 'hitop_anxious_worry', 'hitop_appetite_gain',\
                   'hitop_appetite_loss', 'hitop_cognitive_problems', 'hitop_hyposomnia', 'hitop_indecisiveness',\
                   'hitop_insomnia', 'hitop_panic', 'hitop_separation_insecurity', 'hitop_shame_guilt',\
                   'hitop_situational_phobia', 'hitop_social_anxiety', 'hitop_well_being']

## Analysis original data (not invariant cores)

### Analysis NOT INV CORES - GenPop only

In [95]:
# load data
data_path_genpop = '../../data/mydata_1general_forICC_removedGRIDchecks.csv'
mydata_genpop = pd.read_csv(data_path_genpop)
mydata_genpop = mydata_genpop.rename(columns={'Unnamed: 0': 'Subject'})

# specify path where to save the results table
path_to_save_genpop = '../../results/ICC_final/ICC_fullscales_genpop_removedGRIDchecks.csv'

'''
DONT analyze data with grid checks removed on 1st visit only. ICC analysis compares responses on 1st visit to responses on 
recontact visit, obviously if we only remove failed checks on 1st visit, we will get gibberish results!
'''

'\nDONT analyze data with grid checks removed on 1st visit only. ICC analysis compares responses on 1st visit to responses on \nrecontact visit, obviously if we only remove failed checks on 1st visit, we will get gibberish results!\n'

In [96]:
# run icc
icc_results = run_icc_analysis(mydata_genpop, measures_for_icc)

In [97]:
# save results 
icc_results.to_csv(path_to_save_genpop, index=False)

In [98]:
# calculate Cronbachs alpha
pg.cronbach_alpha(data=mydata_genpop)

(np.float64(0.7695938388151335), array([0.736, 0.801]))

### Analysis NOT INV CORES - Enriched only

In [86]:
# load data
data_path_enriched = '../../data/mydata_1highrisk_forICC_removedGRIDchecks.csv'
mydata_enriched = pd.read_csv(data_path_enriched)
mydata_enriched = mydata_enriched.rename(columns={'Unnamed: 0': 'Subject'})

# specify path where to save the results table
path_to_save_enriched = '../../results/ICC_final/ICC_fullscales_enriched_removedGRIDchecks.csv'

# run icc
icc_results = run_icc_analysis(mydata_enriched, measures_for_icc)

# save results 
icc_results.to_csv(path_to_save_enriched, index=False)

# calculate Cronbachs alpha
pg.cronbach_alpha(data=mydata_enriched)

(np.float64(0.8177508535982394), array([0.785, 0.848]))

### Analysis NOT INV CORES - GenPop and Enriched COMBINED

In [90]:
# combine data
mydata_combined = pd.concat([mydata_genpop, mydata_enriched])
mydata_combined = mydata_combined.rename(columns={'Unnamed: 0': 'Subject'})

# path to save
path_to_save_combined = '../../results/ICC_final/ICC_fullscales_genpopANDenriched_removedGRIDchecks.csv'

# do the usual:
# run icc
icc_results = run_icc_analysis(mydata_combined, measures_for_icc)

# save results 
icc_results.to_csv(path_to_save_combined, index=False)

# calculate Cronbachs alpha
pg.cronbach_alpha(data=mydata_combined)

(np.float64(0.7995446332093369), array([0.777, 0.821]))

## Analysis invariant cores

In [107]:
# invariant cores that ensure at least metric invariance between validation data and GenPop/Enriched data
# I don't need to load data anymore. Data is loaded in the previous steps (not inv cores).
# I am now editing the data so that it only includes inv cores 

In [106]:
# anh depression - combined, GenPop and Enriched
mydata_combined['hitop_anhedonic_depression_invcore'] = mydata_combined['hitop39'] + mydata_combined['hitop77'] + mydata_combined['hitop84'] + mydata_combined['hitop93'] + mydata_combined['hitop182'] + mydata_combined['hitop230'] + mydata_combined['hitop246']
mydata_genpop['hitop_anhedonic_depression_invcore'] = mydata_genpop['hitop39'] + mydata_genpop['hitop77'] + mydata_genpop['hitop84'] + mydata_genpop['hitop93'] + mydata_genpop['hitop182'] + mydata_genpop['hitop230'] + mydata_genpop['hitop246']
mydata_enriched['hitop_anhedonic_depression_invcore'] = mydata_enriched['hitop39'] + mydata_enriched['hitop77'] + mydata_enriched['hitop84'] + mydata_enriched['hitop93'] + mydata_enriched['hitop182'] + mydata_enriched['hitop230'] + mydata_enriched['hitop246']

# anx worry
mydata_combined['hitop_anxious_worry_invcore'] = mydata_combined['hitop20'] + mydata_combined['hitop34'] + mydata_combined['hitop89'] + mydata_combined['hitop203'] + mydata_combined['hitop248'] 
mydata_genpop['hitop_anxious_worry_invcore'] = mydata_genpop['hitop20'] + mydata_genpop['hitop34'] + mydata_genpop['hitop89'] + mydata_genpop['hitop203'] + mydata_genpop['hitop248'] 
mydata_enriched['hitop_anxious_worry_invcore'] = mydata_enriched['hitop20'] + mydata_enriched['hitop34'] + mydata_enriched['hitop89'] + mydata_enriched['hitop203'] + mydata_enriched['hitop248'] 

# appetite gain
mydata_combined['hitop_appetite gain_invcore'] = mydata_combined['hitop120'] + mydata_combined['hitop243'] + mydata_combined['hitop275'] 
mydata_genpop['hitop_appetite gain_invcore'] = mydata_genpop['hitop120'] + mydata_genpop['hitop243'] + mydata_genpop['hitop275']
mydata_enriched['hitop_appetite gain_invcore'] = mydata_enriched['hitop120'] + mydata_enriched['hitop243'] + mydata_enriched['hitop275']

# separation insecurity
mydata_combined['hitop_separation insecurity_invcore'] = mydata_combined['hitop50'] + mydata_combined['hitop69'] + mydata_combined['hitop81'] + mydata_combined['hitop136'] + mydata_combined['hitop151'] + mydata_combined['hitop197'] 
mydata_genpop['hitop_separation insecurity_invcore'] = mydata_genpop['hitop50'] + mydata_genpop['hitop69'] + mydata_genpop['hitop81'] + mydata_genpop['hitop136'] + mydata_genpop['hitop151'] + mydata_genpop['hitop197']
mydata_enriched['hitop_separation insecurity_invcore'] = mydata_enriched['hitop50'] + mydata_enriched['hitop69'] + mydata_enriched['hitop81'] + mydata_enriched['hitop136'] + mydata_enriched['hitop151'] + mydata_enriched['hitop197']

# social anxiety
mydata_combined['hitop_social_anxiety_invcore'] = mydata_combined['hitop17'] + mydata_combined['hitop117'] + mydata_combined['hitop124'] + mydata_combined['hitop129'] + mydata_combined['hitop204'] + mydata_combined['hitop258']
mydata_genpop['hitop_social_anxiety_invcore'] = mydata_genpop['hitop17'] + mydata_genpop['hitop117'] + mydata_genpop['hitop124'] + mydata_genpop['hitop129'] + mydata_genpop['hitop204'] + mydata_genpop['hitop258']
mydata_enriched['hitop_social_anxiety_invcore'] = mydata_enriched['hitop17'] + mydata_enriched['hitop117'] + mydata_enriched['hitop124'] + mydata_enriched['hitop129'] + mydata_enriched['hitop204'] + mydata_enriched['hitop258']

# well-being
mydata_combined['hitop_well_being_invcore'] = mydata_combined['hitop9'] + mydata_combined['hitop23'] + mydata_combined['hitop106'] + mydata_combined['hitop149'] + mydata_combined['hitop200'] + mydata_combined['hitop244'] + mydata_combined['hitop250'] + mydata_combined['hitop281']
mydata_genpop['hitop_well_being_invcore'] = mydata_genpop['hitop9'] + mydata_genpop['hitop23'] + mydata_genpop['hitop106'] + mydata_genpop['hitop149'] + mydata_genpop['hitop200'] + mydata_genpop['hitop244'] + mydata_genpop['hitop250'] + mydata_genpop['hitop281']
mydata_enriched['hitop_well_being_invcore'] = mydata_enriched['hitop9'] + mydata_enriched['hitop23'] + mydata_enriched['hitop106'] + mydata_enriched['hitop149'] + mydata_enriched['hitop200'] + mydata_enriched['hitop244'] + mydata_enriched['hitop250'] + mydata_enriched['hitop281']


# BELOW IS JUST A PLACEHOLDER, I DIDNT DO INVARIANT CORE FOR RECONTACT
# THIS IS JUST TO MAME THE LOOP LOOPING CORRECTLY

mydata_combined['hitop_anhedonic_depression_invcore_recontact'] = mydata_combined['hitop39_recontact'] + mydata_combined['hitop77_recontact'] + mydata_combined['hitop84_recontact'] + mydata_combined['hitop93_recontact'] + mydata_combined['hitop182_recontact'] + mydata_combined['hitop230_recontact'] + mydata_combined['hitop246_recontact']
mydata_genpop['hitop_anhedonic_depression_invcore_recontact'] = mydata_genpop['hitop39_recontact'] + mydata_genpop['hitop77_recontact'] + mydata_genpop['hitop84_recontact'] + mydata_genpop['hitop93_recontact'] + mydata_genpop['hitop182_recontact'] + mydata_genpop['hitop230_recontact'] + mydata_genpop['hitop246_recontact']
mydata_enriched['hitop_anhedonic_depression_invcore_recontact'] = mydata_enriched['hitop39_recontact'] + mydata_enriched['hitop77_recontact'] + mydata_enriched['hitop84_recontact'] + mydata_enriched['hitop93_recontact'] + mydata_enriched['hitop182_recontact'] + mydata_enriched['hitop230_recontact'] + mydata_enriched['hitop246_recontact']

mydata_combined['hitop_anxious_worry_invcore_recontact'] = mydata_combined['hitop20_recontact'] + mydata_combined['hitop34_recontact'] + mydata_combined['hitop89_recontact'] + mydata_combined['hitop203_recontact'] + mydata_combined['hitop248_recontact'] 
mydata_genpop['hitop_anxious_worry_invcore_recontact'] = mydata_genpop['hitop20_recontact'] + mydata_genpop['hitop34_recontact'] + mydata_genpop['hitop89_recontact'] + mydata_genpop['hitop203_recontact'] + mydata_genpop['hitop248_recontact'] 
mydata_enriched['hitop_anxious_worry_invcore_recontact'] = mydata_enriched['hitop20_recontact'] + mydata_enriched['hitop34_recontact'] + mydata_enriched['hitop89_recontact'] + mydata_enriched['hitop203_recontact'] + mydata_enriched['hitop248_recontact'] 

# appetite gain
mydata_combined['hitop_appetite gain_invcore_recontact'] = mydata_combined['hitop120_recontact'] + mydata_combined['hitop243_recontact'] + mydata_combined['hitop275_recontact'] 
mydata_genpop['hitop_appetite gain_invcore_recontact'] = mydata_genpop['hitop120_recontact'] + mydata_genpop['hitop243_recontact'] + mydata_genpop['hitop275_recontact']
mydata_enriched['hitop_appetite gain_invcore_recontact'] = mydata_enriched['hitop120_recontact'] + mydata_enriched['hitop243_recontact'] + mydata_enriched['hitop275_recontact']

# separation insecurity
mydata_combined['hitop_separation insecurity_invcore_recontact'] = mydata_combined['hitop50_recontact'] + mydata_combined['hitop69_recontact'] + mydata_combined['hitop81_recontact'] + mydata_combined['hitop136_recontact'] + mydata_combined['hitop151_recontact'] + mydata_combined['hitop197_recontact'] 
mydata_genpop['hitop_separation insecurity_invcore_recontact'] = mydata_genpop['hitop50_recontact'] + mydata_genpop['hitop69_recontact'] + mydata_genpop['hitop81_recontact'] + mydata_genpop['hitop136_recontact'] + mydata_genpop['hitop151_recontact'] + mydata_genpop['hitop197_recontact']
mydata_enriched['hitop_separation insecurity_invcore_recontact'] = mydata_enriched['hitop50_recontact'] + mydata_enriched['hitop69_recontact'] + mydata_enriched['hitop81_recontact'] + mydata_enriched['hitop136_recontact'] + mydata_enriched['hitop151_recontact'] + mydata_enriched['hitop197_recontact']

# social anxiety
mydata_combined['hitop_social_anxiety_invcore_recontact'] = mydata_combined['hitop17_recontact'] + mydata_combined['hitop117_recontact'] + mydata_combined['hitop124_recontact'] + mydata_combined['hitop129_recontact'] + mydata_combined['hitop204_recontact'] + mydata_combined['hitop258_recontact']
mydata_genpop['hitop_social_anxiety_invcore_recontact'] = mydata_genpop['hitop17_recontact'] + mydata_genpop['hitop117_recontact'] + mydata_genpop['hitop124_recontact'] + mydata_genpop['hitop129_recontact'] + mydata_genpop['hitop204_recontact'] + mydata_genpop['hitop258_recontact']
mydata_enriched['hitop_social_anxiety_invcore_recontact'] = mydata_enriched['hitop17_recontact'] + mydata_enriched['hitop117_recontact'] + mydata_enriched['hitop124_recontact'] + mydata_enriched['hitop129_recontact'] + mydata_enriched['hitop204_recontact'] + mydata_enriched['hitop258_recontact']

# well-being
mydata_combined['hitop_well_being_invcore_recontact'] = mydata_combined['hitop9_recontact'] + mydata_combined['hitop23_recontact'] + mydata_combined['hitop106_recontact'] + mydata_combined['hitop149_recontact'] + mydata_combined['hitop200_recontact'] + mydata_combined['hitop244_recontact'] + mydata_combined['hitop250_recontact'] + mydata_combined['hitop281_recontact']
mydata_genpop['hitop_well_being_invcore_recontact'] = mydata_genpop['hitop9_recontact'] + mydata_genpop['hitop23_recontact'] + mydata_genpop['hitop106_recontact'] + mydata_genpop['hitop149_recontact'] + mydata_genpop['hitop200_recontact'] + mydata_genpop['hitop244_recontact'] + mydata_genpop['hitop250_recontact'] + mydata_genpop['hitop281_recontact']
mydata_enriched['hitop_well_being_invcore_recontact'] = mydata_enriched['hitop9_recontact'] + mydata_enriched['hitop23_recontact'] + mydata_enriched['hitop106_recontact'] + mydata_enriched['hitop149_recontact'] + mydata_enriched['hitop200_recontact'] + mydata_enriched['hitop244_recontact'] + mydata_enriched['hitop250_recontact'] + mydata_enriched['hitop281_recontact']


### Analysis ENV CORES - GenPop

In [108]:
# specify path where to save the results table
path_to_save_genpop_inv = '../../results/ICC_final/ICC_invcores_genpop_removedGRIDchecks.csv'

# run icc
icc_results = run_icc_analysis(mydata_genpop, measures_for_icc)

# save results 
icc_results.to_csv(path_to_save_genpop_inv, index=False)

# calculate Cronbachs alpha
pg.cronbach_alpha(data=mydata_genpop)

(np.float64(0.8003779990037636), array([0.772, 0.827]))

### Analysis ENV CORES - Enriched

In [109]:
# specify path where to save the results table
path_to_save_enriched_inv = '../../results/ICC_final/ICC_invcores_enriched_removedGRIDchecks.csv'

# run icc
icc_results = run_icc_analysis(mydata_enriched, measures_for_icc)

# save results 
icc_results.to_csv(path_to_save_enriched_inv, index=False)

# calculate Cronbachs alpha
pg.cronbach_alpha(data=mydata_enriched)

(np.float64(0.8445291688895238), array([0.816, 0.87 ]))

### Analysis ENV CORES - Combined

In [110]:
# specify path where to save the results table
path_to_save_combined_inv = '../../results/ICC_final/ICC_invcores_genpopANDenriched_removedGRIDchecks.csv'

# run icc
icc_results = run_icc_analysis(mydata_combined, measures_for_icc)

# save results 
icc_results.to_csv(path_to_save_combined_inv, index=False)

# calculate Cronbachs alpha
pg.cronbach_alpha(data=mydata_combined)

(np.float64(0.8297883924548175), array([0.811, 0.848]))