In [91]:
import pandas as pd
pd.set_option('max_colwidth', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
import itertools
import scipy.stats as st
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_selection import mutual_info_classif
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
import rpy2
import pingouin as pg

In [92]:
def preprocess_codesheet(path_to_codesheet):
    """Read the tab-separated codesheet and derive two lookup tables from it.

    Parameters
    ----------
    path_to_codesheet : str or file-like
        Handed unchanged to ``pd.read_csv(..., sep='\\t')``. The sheet must
        provide the columns ``var_name``, ``ename`` and ``subscale``.

    Returns
    -------
    scale_lut : dict
        Subscale name -> array of item enames, built from rows whose ename
        contains 'hitop' but not 'today'. Spaces, '/' and '-' in the subscale
        name become '_'. Each subscale is immediately followed by a
        ``<name>_recontact`` entry holding the same items with '_recontact'
        appended.
    ename_lut : dict
        ``var_name -> ename`` for every row with a non-null ename, plus the
        same mapping with '_recontact' appended to both key and value.
    """
    code_book = pd.read_csv(path_to_codesheet, sep='\t')
    named_rows = code_book.loc[code_book['ename'].notnull()]

    # First pass: plain names. Second pass: their '_recontact' twins (added
    # afterwards, so a twin wins if it collides with a plain name).
    name_pairs = named_rows[['var_name', 'ename']].values
    ename_lut = {var_name: ename for var_name, ename in name_pairs}
    for var_name, ename in name_pairs:
        ename_lut[var_name + '_recontact'] = ename + '_recontact'

    # Group the hitop items (excluding the 'today' variants) by subscale.
    is_hitop = named_rows['ename'].str.contains('hitop')
    is_today = named_rows['ename'].str.contains('today')
    scale_lut = {}
    for subscale, rows in named_rows.loc[is_hitop & ~is_today].groupby('subscale'):
        scale_name = subscale
        for separator in (" ", "/", "-"):
            scale_name = scale_name.replace(separator, "_")
        items = rows['ename'].values
        scale_lut[scale_name] = items
        scale_lut[scale_name + '_recontact'] = np.array(
            [item + '_recontact' for item in items], dtype=object)
    return scale_lut, ename_lut

def clean_mood_diagnosis(dat):
    """Add summary mood-diagnosis columns for the initial wave; edits ``dat`` in place.

    Adds ``mood_dx`` (a single label per row), ``n_mood_dx`` (how many of the
    six specific FNM_Q25_1..6 flags equal 1) and ``mood_bothered`` (True where
    ``mood_bothered_orig`` equals 1). Also replaces the value 999 in
    ``mood_years`` with NaN.

    Returns the same DataFrame object that was passed in.
    """
    # Applied in this order: a later flag overwrites an earlier label.
    specific_dx = {
        'FNM_Q25_1': 'mdd',
        'FNM_Q25_2': 'persistent',
        'FNM_Q25_3': 'premenstrual',
        'FNM_Q25_4': 'bipolarI',
        'FNM_Q25_5': 'bipolarII',
        'FNM_Q25_6': 'cyclothymic',
    }

    dat['mood_dx'] = ''
    dat['n_mood_dx'] = (dat[list(specific_dx)] == 1).sum(axis=1)

    # 'other' goes first so that any specific diagnosis replaces it.
    dat.loc[dat['FNM_Q25_955'] == 1, 'mood_dx'] = 'other'
    for flag_column, label in specific_dx.items():
        dat.loc[dat[flag_column] == 1, 'mood_dx'] = label
    # More than one specific flag trumps every single label.
    dat.loc[dat['n_mood_dx'] > 1, 'mood_dx'] = 'multiple'

    # Remaining mood columns: 999 in the years column is replaced by NaN.
    dat.loc[dat['mood_years'] == 999, 'mood_years'] = np.nan
    dat['mood_bothered'] = False
    dat.loc[dat['mood_bothered_orig'] == 1, 'mood_bothered'] = True
    return dat

def clean_mood_diagnosis_recontact(dat):
    """Add summary mood-diagnosis columns for the recontact wave; edits ``dat`` in place.

    Adds ``mood_dx_recontact`` (a single label per row),
    ``n_mood_dx_recontact`` (how many of the six specific
    FNM_Q25_1..6_recontact flags equal 1) and ``mood_bothered_recontact``
    (True where ``mood_bothered_recontact_orig`` equals 1). Also replaces the
    value 999 in ``mood_years_recontact`` with NaN.

    Returns the same DataFrame object that was passed in.
    """
    # (flag column, label) in application order: later entries overwrite earlier ones.
    specific_dx = [
        ('FNM_Q25_1_recontact', 'mdd'),
        ('FNM_Q25_2_recontact', 'persistent'),
        ('FNM_Q25_3_recontact', 'premenstrual'),
        ('FNM_Q25_4_recontact', 'bipolarI'),
        ('FNM_Q25_5_recontact', 'bipolarII'),
        ('FNM_Q25_6_recontact', 'cyclothymic'),
    ]
    flag_columns = [flag_column for flag_column, _ in specific_dx]

    dat['mood_dx_recontact'] = ''
    dat['n_mood_dx_recontact'] = (dat[flag_columns] == 1).sum(axis=1)

    # 'other' is the weakest label: any specific diagnosis replaces it.
    dat.loc[dat['FNM_Q25_955_recontact'] == 1, 'mood_dx_recontact'] = 'other'
    for flag_column, label in specific_dx:
        dat.loc[dat[flag_column] == 1, 'mood_dx_recontact'] = label
    dat.loc[dat['n_mood_dx_recontact'] > 1, 'mood_dx_recontact'] = 'multiple'

    # Remaining recontact mood columns: 999 in the years column is replaced by NaN.
    dat.loc[dat['mood_years_recontact'] == 999, 'mood_years_recontact'] = np.nan
    dat['mood_bothered_recontact'] = False
    dat.loc[dat['mood_bothered_recontact_orig'] == 1, 'mood_bothered_recontact'] = True
    return dat

def clean_anxiety_diagnosis(dat):
    """Add summary anxiety-diagnosis columns for the initial wave; edits ``dat`` in place.

    Adds ``anxiety_dx`` (a single label per row), ``n_anxiety_dx`` (how many
    of the eight specific FNM_Q30_m_1..8 flags equal 1) and
    ``anxiety_bothered`` (True where ``anxiety_bothered_orig`` equals 1).
    Also replaces the value 999 in ``anxiety_years`` with NaN.

    Returns the same DataFrame object that was passed in.
    """
    # Applied in this order: a later flag overwrites an earlier label.
    specific_dx = {
        'FNM_Q30_m_1': 'gad',
        'FNM_Q30_m_2': 'separation',
        'FNM_Q30_m_3': 'agoraphobia',
        'FNM_Q30_m_4': 'phobia',
        'FNM_Q30_m_5': 'social',
        'FNM_Q30_m_6': 'panic_disorder',
        'FNM_Q30_m_7': 'panic_attack',
        'FNM_Q30_m_8': 'mutism',
    }

    dat['anxiety_dx'] = ''
    dat['n_anxiety_dx'] = (dat[list(specific_dx)] == 1).sum(axis=1)

    # 'other' goes first so that any specific diagnosis replaces it.
    dat.loc[dat['FNM_Q30_m_955'] == 1, 'anxiety_dx'] = 'other'
    for flag_column, label in specific_dx.items():
        dat.loc[dat[flag_column] == 1, 'anxiety_dx'] = label
    # More than one specific flag trumps every single label.
    dat.loc[dat['n_anxiety_dx'] > 1, 'anxiety_dx'] = 'multiple'

    # Remaining anxiety columns: 999 in the years column is replaced by NaN.
    dat.loc[dat['anxiety_years'] == 999, 'anxiety_years'] = np.nan
    dat['anxiety_bothered'] = False
    dat.loc[dat['anxiety_bothered_orig'] == 1, 'anxiety_bothered'] = True
    return dat

def clean_anxiety_diagnosis_recontact(dat):
    """Add summary anxiety-diagnosis columns for the recontact wave; edits ``dat`` in place.

    Adds ``anxiety_dx_recontact`` (a single label per row),
    ``n_anxiety_dx_recontact`` (how many of the eight specific
    FNM_Q30_m_1..8_recontact flags equal 1) and ``anxiety_bothered_recontact``
    (True where ``anxiety_bothered_recontact_orig`` equals 1). Also replaces
    the value 999 in ``anxiety_years_recontact`` with NaN.

    Returns the same DataFrame object that was passed in.
    """
    # (flag column, label) in application order: later entries overwrite earlier ones.
    specific_dx = [
        ('FNM_Q30_m_1_recontact', 'gad'),
        ('FNM_Q30_m_2_recontact', 'separation'),
        ('FNM_Q30_m_3_recontact', 'agoraphobia'),
        ('FNM_Q30_m_4_recontact', 'phobia'),
        ('FNM_Q30_m_5_recontact', 'social'),
        ('FNM_Q30_m_6_recontact', 'panic_disorder'),
        ('FNM_Q30_m_7_recontact', 'panic_attack'),
        ('FNM_Q30_m_8_recontact', 'mutism'),
    ]
    flag_columns = [flag_column for flag_column, _ in specific_dx]

    dat['anxiety_dx_recontact'] = ''
    dat['n_anxiety_dx_recontact'] = (dat[flag_columns] == 1).sum(axis=1)

    # 'other' is the weakest label: any specific diagnosis replaces it.
    dat.loc[dat['FNM_Q30_m_955_recontact'] == 1, 'anxiety_dx_recontact'] = 'other'
    for flag_column, label in specific_dx:
        dat.loc[dat[flag_column] == 1, 'anxiety_dx_recontact'] = label
    dat.loc[dat['n_anxiety_dx_recontact'] > 1, 'anxiety_dx_recontact'] = 'multiple'

    # Remaining recontact anxiety columns: 999 in the years column is replaced by NaN.
    dat.loc[dat['anxiety_years_recontact'] == 999, 'anxiety_years_recontact'] = np.nan
    dat['anxiety_bothered_recontact'] = False
    dat.loc[dat['anxiety_bothered_recontact_orig'] == 1, 'anxiety_bothered_recontact'] = True
    return dat

def clean_attnt_dx(dat):
    """Add summary attention-diagnosis columns for the initial wave; edits ``dat`` in place.

    Adds ``attention_dx`` ('adhd' where FNM_Q35_m_1 or FNM_Q35_m_2 equals 1,
    otherwise 'other' where FNM_Q35_m_3 equals 1, otherwise ''),
    ``n_attention_dx`` (how many of FNM_Q35_m_1 / FNM_Q35_m_2 equal 1) and
    ``attention_bothered`` (True where ``attention_bothered_orig`` equals 1).
    Also replaces the value 999 in ``attention_years`` with NaN.

    Returns the same DataFrame object that was passed in.
    """
    adhd_flags = ['FNM_Q35_m_1', 'FNM_Q35_m_2']

    dat['attention_dx'] = ''
    dat['n_attention_dx'] = (dat[adhd_flags] == 1).sum(axis=1)

    # 'other' is written first; either ADHD flag then overrides it.
    dat.loc[dat['FNM_Q35_m_3'] == 1, 'attention_dx'] = 'other'
    for flag_column in adhd_flags:
        dat.loc[dat[flag_column] == 1, 'attention_dx'] = 'adhd'

    # Remaining attention columns: 999 in the years column is replaced by NaN.
    dat.loc[dat['attention_years'] == 999, 'attention_years'] = np.nan
    dat['attention_bothered'] = False
    dat.loc[dat['attention_bothered_orig'] == 1, 'attention_bothered'] = True
    return dat

def clean_attnt_dx_recontact(dat):
    """Add summary attention-diagnosis columns for the recontact wave; edits ``dat`` in place.

    Adds ``attention_dx_recontact`` ('adhd' where FNM_Q35_m_1_recontact or
    FNM_Q35_m_2_recontact equals 1, otherwise 'other' where
    FNM_Q35_m_3_recontact equals 1, otherwise ''),
    ``n_attention_dx_recontact`` (how many of the two ADHD flags equal 1) and
    ``attention_bothered_recontact`` (True where
    ``attention_bothered_recontact_orig`` equals 1). Also replaces the value
    999 in ``attention_years_recontact`` with NaN.

    Returns the same DataFrame object that was passed in.
    """
    adhd_flags = ['FNM_Q35_m_1_recontact', 'FNM_Q35_m_2_recontact']

    dat['attention_dx_recontact'] = ''
    dat['n_attention_dx_recontact'] = (dat[adhd_flags] == 1).sum(axis=1)

    # 'other' is written first; either ADHD flag then overrides it.
    dat.loc[dat['FNM_Q35_m_3_recontact'] == 1, 'attention_dx_recontact'] = 'other'
    for flag_column in adhd_flags:
        dat.loc[dat[flag_column] == 1, 'attention_dx_recontact'] = 'adhd'

    # Remaining recontact attention columns: 999 in the years column is replaced by NaN.
    dat.loc[dat['attention_years_recontact'] == 999, 'attention_years_recontact'] = np.nan
    dat['attention_bothered_recontact'] = False
    dat.loc[dat['attention_bothered_recontact_orig'] == 1, 'attention_bothered_recontact'] = True
    return dat


def define_attnt_checks(dat):
    """Flag which respondents passed the attention checks, for both waves; edits ``dat`` in place.

    For the initial wave (no suffix) and the recontact wave (``_recontact``
    suffix) three boolean columns are added, in this order:

    * ``passed_checks`` - passed the grid checks AND the list checks
    * ``passed_grid``   - ``check_moderately == 3`` and ``check_notatall == 1``
    * ``passed_list``   - ``todaycheck_1 == 1`` and ``todaycheck_2 == 1``

    Any other value counts as a failed check; with the NaN-based missing
    values of a plain ``read_csv`` frame that includes missing answers.

    Prints the proportion of respondents who FAILED each flag (``1 - mean``)
    and returns the same DataFrame object that was passed in.
    """
    flag_names = ['passed_checks', 'passed_grid', 'passed_list']

    for suffix in ('', '_recontact'):
        passed_grid = (dat['check_moderately' + suffix] == 3) & (dat['check_notatall' + suffix] == 1)
        passed_list = (dat['todaycheck_1' + suffix] == 1) & (dat['todaycheck_2' + suffix] == 1)
        # Same column order as before: overall flag first, then grid, then list.
        dat['passed_checks' + suffix] = passed_grid & passed_list
        dat['passed_grid' + suffix] = passed_grid
        dat['passed_list' + suffix] = passed_list

    # The numbers below are 1 - mean(passed), i.e. FAILURE rates. The heading
    # used to say "passed", which contradicted the values printed under it.
    print('\nThis is the proportion of ppl who FAILED attention checks:')
    print('Calculated as 1 - dat.loc[:, [\'passed_checks\', \'passed_grid\', \'passed_list\']].mean()')
    print(1 - dat.loc[:, flag_names].mean())
    print('Same for recontact:')
    print(1 - dat.loc[:, [name + '_recontact' for name in flag_names]].mean())
    return dat

def do_remove_checks(mydat, whichchecks):
    """Keep only the rows that passed the chosen attention checks in both waves.

    ``whichchecks='all'`` filters on ``passed_checks`` and
    ``passed_checks_recontact``; ``'grid'`` filters on ``passed_grid`` and
    ``passed_grid_recontact``. Any other value leaves the rows untouched.
    The shape before and after is printed, and the (possibly filtered)
    DataFrame is returned.
    """
    print('\nShape of initial data:')
    print(mydat.shape)

    flag_by_mode = {'all': 'passed_checks', 'grid': 'passed_grid'}
    if whichchecks in flag_by_mode:
        flag = flag_by_mode[whichchecks]
        # Initial wave first, then the recontact wave on the remaining rows.
        for column in (flag, flag + '_recontact'):
            mydat = mydat.loc[mydat[column] == True]

    print('\nShape of data after removing '+ whichchecks +' checks:')
    print(mydat.shape)
    return mydat

def do_chronbachs_alpha(dat, m1_stems, scale_lut):
    """Print Cronbach's alpha (via ``pg.cronbach_alpha``) for several item sets.

    Three groups are reported, each as a label line followed by the result:

    1. One set per stem in ``m1_stems``: every column whose name contains the
       stem, excluding names containing 'today' or 'sum'. A stem containing
       'recontact' has its last 10 characters (the '_recontact' suffix)
       stripped and is matched against recontact columns only; other stems
       are matched against non-recontact columns only.
    2. One set per entry of ``scale_lut`` (scale name -> item columns).
    3. The 18 BAARS items (inattention 1-9, hyperactivity 1-5,
       impulsivity 1-4), first for the initial wave, then for recontact.

    Nothing is returned.
    """
    print('\n\n-----------------||--------------------')
    print('Doing Chronbach\'s alpha...')

    # 1) stem-based item sets (all hitop, gad, phq, ... plus recontact)
    for stem in m1_stems:
        print(stem)
        names = dat.columns
        is_recontact = names.str.contains('recontact')
        if "recontact" in stem:
            pattern, right_wave = stem[:-10], is_recontact
        else:
            pattern, right_wave = stem, ~is_recontact
        keep = (names.str.contains(pattern)
                & ~names.str.contains('today')
                & ~names.str.contains('sum')
                & right_wave)
        stem_columns = list(names[keep].values)
        print(pg.cronbach_alpha(data=dat[stem_columns]))

    # 2) every hitop scale
    for scale_name, scale_columns in scale_lut.items():
        print(scale_name)
        print(pg.cronbach_alpha(data=dat[scale_columns]))

    # 3) all BAARS items, initial wave then recontact
    baars_items = ([f'inattention_{i}' for i in range(1, 10)]
                   + [f'hyperactivity_{i}' for i in range(1, 6)]
                   + [f'impulsivity_{i}' for i in range(1, 5)])
    alpha_baars = pg.cronbach_alpha(data=dat[baars_items])
    print('All baars')
    print(alpha_baars)
    print('All baars recontact')
    baars_items_recontact = [item + '_recontact' for item in baars_items]
    print(pg.cronbach_alpha(data=dat[baars_items_recontact]))

    print('\n... finished Chronbach\'s alpha')
    print('-----------------||--------------------')
    

In [98]:
def _output_paths(genpop_or_enriched, remove_checks):
    """Return ``(datapath, descriptive, ICC, CFA, CORR)`` file paths for one run."""
    raw_file = {'genpop': 'NIMH0007_genpop_num_OUTPUT.csv',
                'enriched': 'NIMH0007_mental_health_num_OUTPUT.csv'}[genpop_or_enriched]
    sample_tag = {'genpop': '1general', 'enriched': '1highrisk'}[genpop_or_enriched]
    checks_tag = remove_checks.upper()  # 'no' -> 'NO', 'grid' -> 'GRID', 'all' -> 'ALL'
    outputs = tuple(
        f'../data/mydata_{sample_tag}_for{analysis}_removed{checks_tag}checks.csv'
        for analysis in ('descriptive', 'ICC', 'CFA', 'CORR'))
    return ('../data/' + raw_file,) + outputs


def _stem_item_columns(dat, stem):
    """List the item-level columns of ``dat`` that belong to a questionnaire stem.

    A column qualifies when its name contains the stem and contains neither
    'today' nor 'sum'. A '<base>_recontact' stem is matched on '<base>' among
    recontact columns only; any other stem among non-recontact columns only.
    """
    names = dat.columns
    is_recontact = names.str.contains('recontact')
    if 'recontact' in stem:
        base, right_wave = stem[:-len('_recontact')], is_recontact
    else:
        base, right_wave = stem, ~is_recontact
    keep = (names.str.contains(base)
            & ~names.str.contains('today')
            & ~names.str.contains('sum')
            & right_wave)
    return list(names[keep].values)


def _subset_for_export(dat, columns, label):
    """Return a new frame with exactly ``columns``, warning about any that ``dat`` lacks.

    Missing columns are still created (filled with NaN), exactly as
    ``pd.DataFrame(dat, columns=...)`` does; the warning only makes that visible.
    """
    missing = [column for column in columns if column not in dat.columns]
    if missing:
        print(f'WARNING: {len(missing)} column(s) requested for {label} are not in the data '
              f'and will be saved as all-NaN: {missing}')
    return pd.DataFrame(dat, columns=columns)


def _add_bothered_combinations(corr):
    """Add the combined "bothered" flags to the correlation frame, in place.

    First the any-of combinations (initial wave and recontact interleaved),
    then the seven exclusive mood/anxiety/attention patterns for the initial
    wave, then the same seven for recontact.
    """
    def bothered(domain, suffix):
        return corr[domain + '_bothered' + suffix]

    any_of = [('moodanxiety', ('mood', 'anxiety')),
              ('attentionanxiety', ('attention', 'anxiety')),
              ('moodattention', ('mood', 'attention')),
              ('moodattentionanxiety', ('mood', 'anxiety', 'attention'))]
    for name, domains in any_of:
        for suffix in ('', '_recontact'):
            combined = bothered(domains[0], suffix)
            for domain in domains[1:]:
                combined = combined | bothered(domain, suffix)
            corr[name + '_bothered' + suffix] = combined

    # (mood, anxiety, attention): True -> must be bothered, False -> must not be.
    patterns = [(True, False, False), (False, True, False), (False, False, True),
                (True, True, False), (True, False, True), (False, True, True),
                (True, True, True)]
    for suffix in ('', '_recontact'):
        mood = bothered('mood', suffix)
        anx = bothered('anxiety', suffix)
        attnt = bothered('attention', suffix)
        for want_mood, want_anx, want_attnt in patterns:
            name = 'mood{}_anx{}_attnt{}{}'.format(
                'YES' if want_mood else 'NO',
                'YES' if want_anx else 'NO',
                'YES' if want_attnt else 'NO',
                suffix)
            corr[name] = ((mood if want_mood else ~mood)
                          & (anx if want_anx else ~anx)
                          & (attnt if want_attnt else ~attnt))


def preprocess_data(genpop_or_enriched, remove_checks, do_chronbachs):
    """Clean one raw survey export and write the four analysis CSVs.

    Parameters
    ----------
    genpop_or_enriched : {'genpop', 'enriched'}
        Which raw export under ``../data`` to read; also selects the
        '1general' / '1highrisk' tag used in the output file names.
    remove_checks : {'no', 'grid', 'all'}
        Which attention-check failures to drop (``do_remove_checks``);
        'no' keeps every row.
    do_chronbachs : bool
        When true, print Cronbach's alpha via ``do_chronbachs_alpha``.

    Side effects
    ------------
    Reads the codesheet and the raw CSV, prints progress, and writes four
    files: the full frame ("descriptive") plus the column subsets used for
    ICC, CFA and correlation analyses. Returns None.

    Raises
    ------
    AssertionError
        If an argument is outside the accepted values listed above.
    """
    assert(genpop_or_enriched in ['genpop', 'enriched'])
    assert(remove_checks in ['no', 'grid', 'all'])
    assert(do_chronbachs in [True, False])

    print('\n\nGENPOP OR ENRICHED: ' + genpop_or_enriched)
    print('REMOVE CHECKS: ' + remove_checks)

    path_to_codesheet = '../dylan_github/yougov_codesheet_alignment.tsv'
    (datapath, path_to_write, path_to_write_ICC,
     path_to_write_CFA, path_to_write_CORR) = _output_paths(genpop_or_enriched, remove_checks)

    # preprocess codesheet file
    scale_lut, ename_lut = preprocess_codesheet(path_to_codesheet)

    # now opening the actual data
    dat = pd.read_csv(datapath, dtype={'caseid': str}, engine='python')
    # Drop a trailing '.0' from the case ids. Only an actual '.0' suffix is
    # removed, so ids without it no longer lose their last two characters.
    dat['caseid'] = dat.caseid.str.replace(r'\.0$', '', regex=True)
    # Rename raw variable names to the enames from the codesheet.
    dat = dat.rename(ename_lut, axis=1)

    # Keep the raw "bothered" answers under *_orig; the clean_* functions
    # below recreate the plain names as booleans.
    dat = dat.rename(columns={
        'mood_bothered': 'mood_bothered_orig',
        'mood_bothered_recontact': 'mood_bothered_recontact_orig',
        'anxiety_bothered': 'anxiety_bothered_orig',
        'anxiety_bothered_recontact': 'anxiety_bothered_recontact_orig',
        'attention_bothered': 'attention_bothered_orig',
        'attention_bothered_recontact': 'attention_bothered_recontact_orig'})

    # clean diagnoses
    for clean in (clean_mood_diagnosis, clean_mood_diagnosis_recontact,
                  clean_anxiety_diagnosis, clean_anxiety_diagnosis_recontact,
                  clean_attnt_dx, clean_attnt_dx_recontact):
        dat = clean(dat)

    # deal with attention checks
    dat = define_attnt_checks(dat)
    if remove_checks in ('grid', 'all'):
        # .copy() so the later in-place edits work on an independent frame
        # rather than on a filtered selection of the original.
        dat = do_remove_checks(mydat=dat, whichchecks=remove_checks).copy()

    # Item columns of every questionnaire stem, for both waves.
    m1_stems = ['inattention', 'hyperactivity', 'impulsivity', 'sct', 'gad', 'phq', 'hitop',
                'inattention_recontact', 'hyperactivity_recontact', 'impulsivity_recontact',
                'sct_recontact', 'gad_recontact', 'phq_recontact', 'hitop_recontact']
    minus_1_cols = []
    for stem in m1_stems:
        minus_1_cols.extend(_stem_item_columns(dat, stem))

    # Shift every item response down by 1.
    # NOTE(review): presumably the raw responses are coded from 1 and the
    # scales are meant to start at 0 - confirm against the codesheet.
    dat.loc[:, minus_1_cols] -= 1

    if do_chronbachs:
        do_chronbachs_alpha(dat, m1_stems, scale_lut)

    # Sum scores for every non-hitop stem: '<stem>_sum' / '<stem>_sum_recontact'.
    for stem in m1_stems:
        if 'hitop' in stem:
            continue
        cols = _stem_item_columns(dat, stem)
        if 'recontact' in stem:
            sum_name = stem[:-len('_recontact')] + '_sum_recontact'
        else:
            sum_name = stem + '_sum'
        dat[sum_name] = dat.loc[:, cols].sum(1)

    # One sum column per hitop subscale (scale_lut alternates
    # scale, scale_recontact, scale, scale_recontact, ...).
    hitop_sums = []
    for scale_name, items in scale_lut.items():
        dat[scale_name] = dat.loc[:, items].sum(1)
        hitop_sums.append(scale_name)

    # Hitop totals leave out the LAST subscale of each wave.
    # NOTE(review): this is positional - it assumes the well-being scale is
    # the last entry of scale_lut for each wave. Verify against the codesheet.
    initial_scales = hitop_sums[::2]
    recontact_scales = hitop_sums[1::2]
    dat['hitop_sum'] = dat.loc[:, initial_scales[:-1]].sum(1)
    dat['hitop_sum_recontact'] = dat.loc[:, recontact_scales[:-1]].sum(1)
    dat['baars_sum'] = dat.inattention_sum + dat.hyperactivity_sum + dat.impulsivity_sum
    dat['baars_sum_recontact'] = dat.inattention_sum_recontact + dat.hyperactivity_sum_recontact + dat.impulsivity_sum_recontact
    dat['moodanxiety_bothered'] = dat.mood_bothered | dat.anxiety_bothered
    dat['moodanxiety_bothered_recontact'] = dat.mood_bothered_recontact | dat.anxiety_bothered_recontact

    # Save the full frame for descriptives (before the analysis renames below).
    dat.to_csv(path_to_write)

    # Rename phq/gad sums for consistency in naming. With the sum names
    # created above this is normally a no-op; kept for backward compatibility.
    dat = dat.rename(columns={"gad_recontact_sum": "gad_sum_recontact", "phq_recontact_sum": "phq_sum_recontact"})

    # rename baars subscales
    dat = dat.rename(columns={"inattention_sum": "baars_inattention_sum",
                              "inattention_sum_recontact": "baars_inattention_sum_recontact",
                              "hyperactivity_sum": "baars_hyperactivity_sum",
                              "hyperactivity_sum_recontact": "baars_hyperactivity_sum_recontact",
                              "impulsivity_sum": "baars_impulsivity_sum",
                              "impulsivity_sum_recontact": "baars_impulsivity_sum_recontact",
                              "sct_sum": "baars_sct_sum",
                              "sct_sum_recontact": "baars_sct_sum_recontact"})

    # rename hitop subscales
    dat = dat.rename(columns={scale: 'hitop_' + scale for scale in hitop_sums})

    # Column subsets are taken AFTER removing checks, so every analysis file
    # holds the same respondents.
    my_columns_for_icc = []
    for item in ['hitop_sum', 'baars_sum', 'phq_sum', 'gad_sum',  # all sums
                 'mood_bothered', 'anxiety_bothered', 'attention_bothered', 'moodanxiety_bothered',  # bothered
                 'baars_inattention_sum', 'baars_hyperactivity_sum', 'baars_impulsivity_sum', 'baars_sct_sum']:  # baars subscales
        my_columns_for_icc.append(item)
        my_columns_for_icc.append(item + '_recontact')
    my_columns_for_icc.extend('hitop_' + scale for scale in hitop_sums)

    # Item-level columns for the CFA: all BAARS items, then gad / phq / sct
    # items, each list followed by its recontact twin.
    baars_items = ([f'inattention_{i}' for i in range(1, 10)]
                   + [f'hyperactivity_{i}' for i in range(1, 6)]
                   + [f'impulsivity_{i}' for i in range(1, 5)])
    other_items = ([f'gad_{i}' for i in range(1, 8)]
                   + [f'phq_{i}' for i in range(1, 9)]
                   + [f'sct_{i}' for i in range(1, 10)])
    my_columns_for_cfa = (my_columns_for_icc
                          + baars_items + [item + '_recontact' for item in baars_items]
                          + other_items + [item + '_recontact' for item in other_items])

    # Extra raw items for the correlations:
    # FNM_Q6_5     --> PHQ-8 poor appetite or overeating
    # FNM_Q41_m_12 --> TODAY PHQ-8 poor appetite or overeating
    # FNM_Q6_7     --> Trouble concentrating on things, such as school work, reading or watching TV
    # FNM_Q41_m_14 --> TODAY trouble concentrating on things
    # FNM_Q6_3     --> Trouble falling or staying asleep, or sleeping too much
    # FNM_Q41_m_10 --> TODAY trouble falling or staying asleep, or sleeping too much
    my_columns_for_corr = my_columns_for_cfa + ['FNM_Q6_5', 'FNM_Q41_m_12', 'FNM_Q6_7',
                                                'FNM_Q41_m_14', 'FNM_Q6_3', 'FNM_Q41_m_10']

    dat_to_save_for_icc = _subset_for_export(dat, my_columns_for_icc, 'ICC')
    dat_to_save_for_cfa = _subset_for_export(dat, my_columns_for_cfa, 'CFA')
    dat_to_save_for_corr = _subset_for_export(dat, my_columns_for_corr, 'CORR')

    # NOTE: rows with negative item values are NOT removed from any of the
    # three frames; an earlier filter doing that was disabled.

    # some more preprocessing for corr
    _add_bothered_combinations(dat_to_save_for_corr)

    # let's save
    dat_to_save_for_icc.to_csv(path_to_write_ICC)
    dat_to_save_for_cfa.to_csv(path_to_write_CFA)
    dat_to_save_for_corr.to_csv(path_to_write_CORR)

    print(genpop_or_enriched + ', ' + remove_checks + ': SAVED')

In [99]:
# Run the preprocessing for every attention-check setting (outer) and both
# samples (inner), printing Cronbach's alpha for each run.
do_chronbachs = True
for remove_checks, genpop_or_enriched in itertools.product(['no', 'grid', 'all'],
                                                           ['genpop', 'enriched']):
    preprocess_data(genpop_or_enriched, remove_checks, do_chronbachs)

print('ALL SAVED')



GENPOP OR ENRICHED: genpop
REMOVE CHECKS: no

This is how many ppl passed attention checks:
Calculated as 1 - dat.loc[:, ['passed_checks', 'passed_grid', 'passed_list']].mean()
passed_checks    0.462
passed_grid      0.006
passed_list      0.458
dtype: float64
Same for recontact:
passed_checks_recontact    0.548
passed_grid_recontact      0.200
passed_list_recontact      0.548
dtype: float64


-----------------||--------------------
Doing Chronbach's alpha...
inattention
(0.920690566334131, array([0.91 , 0.931]))
hyperactivity
(0.7966861592799785, array([0.767, 0.824]))
impulsivity
(0.8083734923149037, array([0.779, 0.834]))
sct
(0.9038882529820571, array([0.891, 0.916]))
gad
(0.9308788922150162, array([0.921, 0.94 ]))
phq
(0.8978058255721241, array([0.884, 0.911]))
hitop
(0.9640204607873396, array([0.959, 0.968]))
inattention_recontact
(0.9822234049888874, array([0.98 , 0.984]))
hyperactivity_recontact
(0.9602928074809189, array([0.955, 0.966]))
impulsivity_recontact
(0.965361609624