In [1]:
import pandas as pd
pd.set_option('max_colwidth', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
import itertools
import scipy.stats as st
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_selection import mutual_info_classif
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
import rpy2
import pingouin as pg

In [2]:
def preprocess_codesheet(path_to_codesheet):
    # load the codesheet file
    # wherever the emane column exists, creates a dict variablename:ename
    code_book = pd.read_csv(path_to_codesheet, sep='\t')
    ename_lut = {vn:en for vn, en in code_book.loc[code_book.ename.notnull(), ['var_name', 'ename']].values}  
    for vn, en in code_book.loc[code_book.ename.notnull(), ['var_name', 'ename']].values:
        ename_lut[vn+'_recontact'] = en+'_recontact'
    # this groups hitop items by subscales    
    tmp = code_book.query('ename.notnull()')
    scale_lut = {}
    for ss, df in tmp.loc[tmp.ename.str.contains('hitop') & ~tmp.ename.str.contains('today')].groupby('subscale'):
        scale_name = ss.replace(" ", "_").replace("/", "_").replace("-", "_")
        items = df.ename.values
        scale_lut[scale_name] = items
        #scale_name_initial = scale_name+'_initial'
        scale_name_recontact = scale_name+'_recontact'
        items_recontact = []
        for item in items:
            items_recontact.append(item+'_recontact')
        scale_lut[scale_name_recontact] = np.array(items_recontact, dtype=object) 
    return scale_lut, ename_lut

def clean_mood_diagnosis(dat):
    # clean mood diagnosis
    dat['mood_dx'] = ''
    dat['n_mood_dx'] = (dat.loc[:, ['FNM_Q25_1', 'FNM_Q25_2', 'FNM_Q25_3', 'FNM_Q25_4', 'FNM_Q25_5', 'FNM_Q25_6']]==1).sum(1)
    dat.loc[dat.FNM_Q25_955 == 1, 'mood_dx'] = 'other'
    dat.loc[dat.FNM_Q25_1 == 1, 'mood_dx'] = 'mdd'
    dat.loc[dat.FNM_Q25_2 == 1, 'mood_dx'] = 'persistent'
    dat.loc[dat.FNM_Q25_3 == 1, 'mood_dx'] = 'premenstrual'
    dat.loc[dat.FNM_Q25_4 == 1, 'mood_dx'] = 'bipolarI'
    dat.loc[dat.FNM_Q25_5 == 1, 'mood_dx'] = 'bipolarII'
    dat.loc[dat.FNM_Q25_6 == 1, 'mood_dx'] = 'cyclothymic'
    dat.loc[dat.n_mood_dx > 1, 'mood_dx'] = 'multiple'
    # clean other mood columns
    dat.loc[dat.mood_years == 999, 'mood_years'] = np.nan
    dat['mood_bothered']=False
    dat.loc[dat.mood_bothered_orig == 1, 'mood_bothered'] = True
    return(dat)

def clean_mood_diagnosis_recontact(dat):
    # clean mood diagnosis - recontact
    dat['mood_dx_recontact'] = ''
    dat['n_mood_dx_recontact'] = (dat.loc[:, ['FNM_Q25_1_recontact', 'FNM_Q25_2_recontact', 'FNM_Q25_3_recontact', 'FNM_Q25_4_recontact', 'FNM_Q25_5_recontact', 'FNM_Q25_6_recontact']]==1).sum(1)
    dat.loc[dat.FNM_Q25_955_recontact == 1, 'mood_dx_recontact'] = 'other'
    dat.loc[dat.FNM_Q25_1_recontact == 1, 'mood_dx_recontact'] = 'mdd'
    dat.loc[dat.FNM_Q25_2_recontact == 1, 'mood_dx_recontact'] = 'persistent'
    dat.loc[dat.FNM_Q25_3_recontact == 1, 'mood_dx_recontact'] = 'premenstrual'
    dat.loc[dat.FNM_Q25_4_recontact == 1, 'mood_dx_recontact'] = 'bipolarI'
    dat.loc[dat.FNM_Q25_5_recontact == 1, 'mood_dx_recontact'] = 'bipolarII'
    dat.loc[dat.FNM_Q25_6_recontact == 1, 'mood_dx_recontact'] = 'cyclothymic'
    dat.loc[dat.n_mood_dx_recontact > 1, 'mood_dx_recontact'] = 'multiple'
    # clean other mood columns - recontact
    # clean other mood columns
    dat.loc[dat.mood_years_recontact == 999, 'mood_years_recontact'] = np.nan
    dat['mood_bothered_recontact']=False
    dat.loc[dat.mood_bothered_recontact_orig == 1, 'mood_bothered_recontact'] = True
    return(dat)

def clean_anxiety_diagnosis(dat):
    # clean anxiety diagnosis
    dat['anxiety_dx'] = ''
    dat['n_anxiety_dx'] = (dat.loc[:, ['FNM_Q30_m_1',
                                       'FNM_Q30_m_2',
                                       'FNM_Q30_m_3', 
                                       'FNM_Q30_m_4', 
                                       'FNM_Q30_m_5',
                                       'FNM_Q30_m_6',
                                       'FNM_Q30_m_7',
                                       'FNM_Q30_m_8',]]==1).sum(1)
    dat.loc[dat.FNM_Q30_m_955 == 1, 'anxiety_dx'] = 'other'
    dat.loc[dat.FNM_Q30_m_1 == 1, 'anxiety_dx'] = 'gad'
    dat.loc[dat.FNM_Q30_m_2 == 1, 'anxiety_dx'] = 'separation'
    dat.loc[dat.FNM_Q30_m_3 == 1, 'anxiety_dx'] = 'agoraphobia'
    dat.loc[dat.FNM_Q30_m_4 == 1, 'anxiety_dx'] = 'phobia'
    dat.loc[dat.FNM_Q30_m_5 == 1, 'anxiety_dx'] = 'social'
    dat.loc[dat.FNM_Q30_m_6 == 1, 'anxiety_dx'] = 'panic_disorder'
    dat.loc[dat.FNM_Q30_m_7 == 1, 'anxiety_dx'] = 'panic_attack'
    dat.loc[dat.FNM_Q30_m_8 == 1, 'anxiety_dx'] = 'mutism'
    dat.loc[dat.n_anxiety_dx > 1, 'anxiety_dx'] = 'multiple'
    # clean other anxiety columns
    dat.loc[dat.anxiety_years == 999, 'anxiety_years'] = np.nan
    dat['anxiety_bothered']=False
    dat.loc[dat.anxiety_bothered_orig == 1, 'anxiety_bothered'] = True
    return(dat)

def clean_anxiety_diagnosis_recontact(dat):
    # clean anxiety diagnosis - recontact
    dat['anxiety_dx_recontact'] = ''
    dat['n_anxiety_dx_recontact'] = (dat.loc[:, ['FNM_Q30_m_1_recontact',
                                       'FNM_Q30_m_2_recontact',
                                       'FNM_Q30_m_3_recontact', 
                                       'FNM_Q30_m_4_recontact', 
                                       'FNM_Q30_m_5_recontact',
                                       'FNM_Q30_m_6_recontact',
                                       'FNM_Q30_m_7_recontact',
                                       'FNM_Q30_m_8_recontact',]]==1).sum(1)
    dat.loc[dat.FNM_Q30_m_955_recontact == 1, 'anxiety_dx_recontact'] = 'other'
    dat.loc[dat.FNM_Q30_m_1_recontact == 1, 'anxiety_dx_recontact'] = 'gad'
    dat.loc[dat.FNM_Q30_m_2_recontact == 1, 'anxiety_dx_recontact'] = 'separation'
    dat.loc[dat.FNM_Q30_m_3_recontact == 1, 'anxiety_dx_recontact'] = 'agoraphobia'
    dat.loc[dat.FNM_Q30_m_4_recontact == 1, 'anxiety_dx_recontact'] = 'phobia'
    dat.loc[dat.FNM_Q30_m_5_recontact == 1, 'anxiety_dx_recontact'] = 'social'
    dat.loc[dat.FNM_Q30_m_6_recontact == 1, 'anxiety_dx_recontact'] = 'panic_disorder'
    dat.loc[dat.FNM_Q30_m_7_recontact == 1, 'anxiety_dx_recontact'] = 'panic_attack'
    dat.loc[dat.FNM_Q30_m_8_recontact == 1, 'anxiety_dx_recontact'] = 'mutism'
    
    dat.loc[dat.n_anxiety_dx_recontact > 1, 'anxiety_dx_recontact'] = 'multiple'
    # clean other anxiety columns - recontact
    dat.loc[dat.anxiety_years_recontact == 999, 'anxiety_years_recontact'] = np.nan
    dat['anxiety_bothered_recontact']=False
    dat.loc[dat.anxiety_bothered_recontact_orig == 1, 'anxiety_bothered_recontact'] = True
    return(dat)

def clean_attnt_dx(dat):
    # clean attention diagnosis
    dat['attention_dx'] = ''
    dat['n_attention_dx'] = (dat.loc[:, ['FNM_Q35_m_1',
                                       'FNM_Q35_m_2',
                                      ]]==1).sum(1)
    dat.loc[dat.FNM_Q35_m_3 == 1, 'attention_dx'] = 'other'
    dat.loc[dat.FNM_Q35_m_1 == 1, 'attention_dx'] = 'adhd'
    dat.loc[dat.FNM_Q35_m_2 == 1, 'attention_dx'] = 'adhd'
    
    # clean other attention columns
    dat.loc[dat.attention_years == 999, 'attention_years'] = np.nan
    dat['attention_bothered']=False
    dat.loc[dat.attention_bothered_orig == 1, 'attention_bothered'] = True
    return(dat)

def clean_attnt_dx_recontact(dat):
    # clean attention diagnosis - recontact
    dat['attention_dx_recontact'] = ''
    dat['n_attention_dx_recontact'] = (dat.loc[:, ['FNM_Q35_m_1_recontact',
                                       'FNM_Q35_m_2_recontact',
                                      ]]==1).sum(1)
    dat.loc[dat.FNM_Q35_m_3_recontact == 1, 'attention_dx_recontact'] = 'other'
    dat.loc[dat.FNM_Q35_m_1_recontact == 1, 'attention_dx_recontact'] = 'adhd'
    dat.loc[dat.FNM_Q35_m_2_recontact == 1, 'attention_dx_recontact'] = 'adhd'
    
    # clean other attention columns - recontact
    dat.loc[dat.attention_years_recontact == 999, 'attention_years_recontact'] = np.nan
    dat['attention_bothered_recontact']=False
    dat.loc[dat.attention_bothered_recontact_orig == 1, 'attention_bothered_recontact'] = True
    return(dat)


def define_attnt_checks(dat):
    # deal with attention checks 
    dat['passed_checks'] = True
    dat['passed_grid'] = True
    dat['passed_list'] = True
    dat.loc[dat.check_moderately != 3, 'passed_checks'] = False
    dat.loc[dat.check_notatall != 1, 'passed_checks'] = False
    dat.loc[dat.check_moderately != 3, 'passed_grid'] = False
    dat.loc[dat.check_notatall != 1, 'passed_grid'] = False
    dat.loc[dat.todaycheck_1 != 1, 'passed_checks'] = False
    dat.loc[dat.todaycheck_2 != 1, 'passed_checks'] = False
    dat.loc[dat.todaycheck_1 != 1, 'passed_list'] = False
    dat.loc[dat.todaycheck_2 != 1, 'passed_list'] = False 
    # deal with attention checks - recontact
    dat['passed_checks_recontact'] = True
    dat['passed_grid_recontact'] = True
    dat['passed_list_recontact'] = True
    dat.loc[dat.check_moderately_recontact != 3, 'passed_checks_recontact'] = False
    dat.loc[dat.check_notatall_recontact != 1, 'passed_checks_recontact'] = False
    dat.loc[dat.check_moderately_recontact != 3, 'passed_grid_recontact'] = False
    dat.loc[dat.check_notatall_recontact != 1, 'passed_grid_recontact'] = False
    dat.loc[dat.todaycheck_1_recontact != 1, 'passed_checks_recontact'] = False
    dat.loc[dat.todaycheck_2_recontact != 1, 'passed_checks_recontact'] = False
    dat.loc[dat.todaycheck_1_recontact != 1, 'passed_list_recontact'] = False
    dat.loc[dat.todaycheck_2_recontact != 1, 'passed_list_recontact'] = False
    # how many passed checks?
    print('\nThis is how many ppl passed attention checks:')
    print('Calculated as 1 - dat.loc[:, [\'passed_checks\', \'passed_grid\', \'passed_list\']].mean()')
    print(1 - dat.loc[:, ['passed_checks', 'passed_grid', 'passed_list']].mean())
    print('Same for recontact:')
    print(1 - dat.loc[:, ['passed_checks_recontact', 'passed_grid_recontact', 'passed_list_recontact']].mean())
    return(dat)

def do_remove_checks(mydat, whichchecks):
    print('\nShape of initial data:')
    print(mydat.shape)
    if whichchecks == 'all':
        mydat = mydat.loc[mydat['passed_checks'] == True]
        mydat = mydat.loc[mydat['passed_checks_recontact'] == True]
    elif whichchecks == 'grid':
        mydat = mydat.loc[mydat['passed_grid'] == True]
        mydat = mydat.loc[mydat['passed_grid_recontact'] == True]
    print('\nShape of data after removing '+ whichchecks +' checks:')
    print(mydat.shape)  
    return(mydat)

def print_cr(cronbachs):
    cr = cronbachs[0]
    cr = round(cr,3)
    cr_ci = cronbachs[1]
    print(str(cr) + '(' + str(cr_ci) + ')')

def do_chronbachs_alpha(dat, m1_stems, scale_lut):
    print('\n\n-----------------||--------------------')
    print('Doing Chronbach\'s alpha...')
    # general columns (all hitop, gad, phq, plus recontact)
    for ms in m1_stems:
        print(ms)
        if "recontact" not in ms:
            desired_columns = list(dat.columns[dat.columns.str.contains(ms) & ~dat.columns.str.contains('today') & ~dat.columns.str.contains('sum') & ~dat.columns.str.contains('recontact')].values)
        else:
            desired_columns = list(dat.columns[dat.columns.str.contains(ms[:-10]) & ~dat.columns.str.contains('today') & ~dat.columns.str.contains('sum') & dat.columns.str.contains('recontact')].values)
        desired_df=dat[desired_columns]
        cr_alpha = pg.cronbach_alpha(data=desired_df)
        #print(cr_alpha)
        print_cr(cr_alpha)
    # every hitop scale
    for item in scale_lut:
        desired_columns = scale_lut[item]
        print(item)
        desired_df=dat[desired_columns]
        cr_alpha = pg.cronbach_alpha(data=desired_df)
        #print(cr_alpha)
        print_cr(cr_alpha)
    # ALL baars
    desired_columns_baars = ['inattention_1', 'inattention_2', 'inattention_3', 'inattention_4', 'inattention_5', 'inattention_6', 'inattention_7', 'inattention_8', 'inattention_9',
     'hyperactivity_1', 'hyperactivity_2', 'hyperactivity_3', 'hyperactivity_4', 'hyperactivity_5',
     'impulsivity_1', 'impulsivity_2', 'impulsivity_3', 'impulsivity_4']
    desired_df_baars=dat[desired_columns_baars]
    cr_alpha_baars = pg.cronbach_alpha(data=desired_df_baars)
    print('All baars')
    #print(cr_alpha_baars)
    print_cr(cr_alpha_baars)
    print('All baars recontact')
    desired_columns_baars_recontact = ['inattention_1_recontact', 'inattention_2_recontact', 'inattention_3_recontact', 'inattention_4_recontact', 'inattention_5_recontact', 'inattention_6_recontact', 'inattention_7_recontact', 'inattention_8_recontact', 'inattention_9_recontact',
     'hyperactivity_1_recontact', 'hyperactivity_2_recontact', 'hyperactivity_3_recontact', 'hyperactivity_4_recontact', 'hyperactivity_5_recontact',
     'impulsivity_1_recontact', 'impulsivity_2_recontact', 'impulsivity_3_recontact', 'impulsivity_4_recontact']
    desired_df_baars_recontact=dat[desired_columns_baars_recontact]
    cr_alpha_baars_recontact = pg.cronbach_alpha(data=desired_df_baars_recontact)
    #print(cr_alpha_baars_recontact)
    print_cr(cr_alpha_baars_recontact)
    print('\n... finished Chronbach\'s alpha')    
    print('-----------------||--------------------')
    

In [3]:
def preprocess_data(genpop_or_enriched, remove_checks, do_chronbachs):
    
    assert(genpop_or_enriched in ['genpop', 'enriched'])
    assert(remove_checks in ['no', 'grid', 'all'])
    assert(do_chronbachs in [True, False])

    print('\n\nGENPOP OR ENRICHED: ' + genpop_or_enriched)
    print('REMOVE CHECKS: ' + remove_checks)
    
    path_to_codesheet = '../dylan_github/yougov_codesheet_alignment.tsv'
    
    if remove_checks == 'no':
        if genpop_or_enriched == 'genpop':
            datapath = '../data/NIMH0007_genpop_num_OUTPUT.csv'
            path_to_write = '../data/mydata_1general_fordescriptive_removedNOchecks.csv'
            path_to_write_ICC = '../data/mydata_1general_forICC_removedNOchecks.csv'
            path_to_write_CFA = '../data/mydata_1general_forCFA_removedNOchecks.csv'
            path_to_write_CORR = '../data/mydata_1general_forCORR_removedNOchecks.csv'   
        elif genpop_or_enriched == 'enriched':
            datapath = '../data/NIMH0007_mental_health_num_OUTPUT.csv'
            path_to_write = '../data/mydata_1highrisk_fordescriptive_removedNOchecks.csv'
            path_to_write_ICC = '../data/mydata_1highrisk_forICC_removedNOchecks.csv'
            path_to_write_CFA = '../data/mydata_1highrisk_forCFA_removedNOchecks.csv'
            path_to_write_CORR = '../data/mydata_1highrisk_forCORR_removedNOchecks.csv' 
    elif remove_checks == 'grid':
        if genpop_or_enriched == 'genpop':
            datapath = '../data/NIMH0007_genpop_num_OUTPUT.csv'
            path_to_write = '../data/mydata_1general_fordescriptive_removedGRIDchecks.csv'
            path_to_write_ICC = '../data/mydata_1general_forICC_removedGRIDchecks.csv'
            path_to_write_CFA = '../data/mydata_1general_forCFA_removedGRIDchecks.csv'
            path_to_write_CORR = '../data/mydata_1general_forCORR_removedGRIDchecks.csv'   
        elif genpop_or_enriched == 'enriched':
            datapath = '../data/NIMH0007_mental_health_num_OUTPUT.csv'
            path_to_write = '../data/mydata_1highrisk_fordescriptive_removedGRIDchecks.csv'
            path_to_write_ICC = '../data/mydata_1highrisk_forICC_removedGRIDchecks.csv'
            path_to_write_CFA = '../data/mydata_1highrisk_forCFA_removedGRIDchecks.csv'
            path_to_write_CORR = '../data/mydata_1highrisk_forCORR_removedGRIDchecks.csv'         
    elif remove_checks == 'all':
        if genpop_or_enriched == 'genpop':
            datapath = '../data/NIMH0007_genpop_num_OUTPUT.csv'
            path_to_write = '../data/mydata_1general_fordescriptive_removedALLchecks.csv'
            path_to_write_ICC = '../data/mydata_1general_forICC_removedALLchecks.csv'
            path_to_write_CFA = '../data/mydata_1general_forCFA_removedALLchecks.csv'
            path_to_write_CORR = '../data/mydata_1general_forCORR_removedALLchecks.csv'   
        elif genpop_or_enriched == 'enriched':
            datapath = '../data/NIMH0007_mental_health_num_OUTPUT.csv'
            path_to_write = '../data/mydata_1highrisk_fordescriptive_removedALLchecks.csv'
            path_to_write_ICC = '../data/mydata_1highrisk_forICC_removedALLchecks.csv'
            path_to_write_CFA = '../data/mydata_1highrisk_forCFA_removedALLchecks.csv'
            path_to_write_CORR = '../data/mydata_1highrisk_forCORR_removedALLchecks.csv'             
    
    # preprocess codesheet file
    scale_lut, ename_lut = preprocess_codesheet(path_to_codesheet)

    # now opening the actual data
    dat = pd.read_csv(datapath, dtype={'caseid':str}, engine='python')
    
    # drop the .0 that pandas appends for some reason
    dat['caseid'] = dat.caseid.str[:-2]
    
    # i'm not sure what this does
    dat = dat.rename(ename_lut, axis=1)

    # renaming columns
    dat = dat.rename(columns={
        'mood_bothered': 'mood_bothered_orig',
        'mood_bothered_recontact': 'mood_bothered_recontact_orig',
        'anxiety_bothered': 'anxiety_bothered_orig',
        'anxiety_bothered_recontact': 'anxiety_bothered_recontact_orig',
        'attention_bothered': 'attention_bothered_orig',
        'attention_bothered_recontact': 'attention_bothered_recontact_orig'})    

    # clean diagnoses
    dat = clean_mood_diagnosis(dat)
    dat = clean_mood_diagnosis_recontact(dat)
    dat = clean_anxiety_diagnosis(dat)
    dat = clean_anxiety_diagnosis_recontact(dat)
    dat = clean_attnt_dx(dat)
    dat = clean_attnt_dx_recontact(dat)

    # deal with attention checks
    dat = define_attnt_checks(dat)
    if remove_checks == 'grid':
        dat = do_remove_checks(mydat = dat, whichchecks = 'grid')
    elif remove_checks == 'all':
        dat = do_remove_checks(mydat = dat, whichchecks = 'all')        
    else:
        pass

    # [QUAESTION] the what
    minus_1_cols = []
    m1_stems = ['inattention', 'hyperactivity', 'impulsivity', 'sct', 'gad', 'phq', 'hitop',
           'inattention_recontact', 'hyperactivity_recontact', 'impulsivity_recontact', 'sct_recontact', 'gad_recontact', 'phq_recontact', 'hitop_recontact']
    for ms in m1_stems:
        #print(ms)
        if "recontact" not in ms:
            cols = list(dat.columns[dat.columns.str.contains(ms) & ~dat.columns.str.contains('today') & ~dat.columns.str.contains('sum') & ~dat.columns.str.contains('recontact')].values)
        else:
            cols = list(dat.columns[dat.columns.str.contains(ms[:-10]) & ~dat.columns.str.contains('today') & ~dat.columns.str.contains('sum') & dat.columns.str.contains('recontact')].values)
        #print(cols)
        minus_1_cols.extend(cols)

    # why are we doing this
    # subtract 1 from responses
    dat.loc[:, minus_1_cols] -= 1

    if do_chronbachs:
        do_chronbachs_alpha(dat, m1_stems, scale_lut)

    # summing up the values
    sum_cols = []
    for ms in m1_stems:
        if 'hitop' not in ms:
            if "recontact" not in ms:
                cols = list(dat.columns[dat.columns.str.contains(ms) & ~dat.columns.str.contains('today') & ~dat.columns.str.contains('sum') & ~dat.columns.str.contains('recontact')].values)
                dat[f'{ms}_sum'] = dat.loc[:, cols].sum(1)
                sum_cols.append(ms + '_sum')
            else:
                cols = list(dat.columns[dat.columns.str.contains(ms[:-10]) & ~dat.columns.str.contains('today') & ~dat.columns.str.contains('sum') & dat.columns.str.contains('recontact')].values)
                dat[f'{ms[:-10]}_sum_recontact'] = dat.loc[:, cols].sum(1)
                sum_cols.append(ms[:-10] + '_sum_recontact')    
    hitop_sums = []
    for scale_name, items in scale_lut.items():
        dat[scale_name] = dat.loc[:, items].sum(1) # adding a dat[scale_name] with a sum of all values ("items)"
        hitop_sums.append(scale_name)   

    sum = 0
    for scale_name, items in scale_lut.items():
        if "well_being" not in scale_name:
            sum += len(items)

    # !!!!!!!!!! double-check this
    dat['hitop_sum'] = dat.loc[:, hitop_sums[::2][:-1]].sum(1)  # [:-1] because we don't include the well-being scale
    dat['hitop_sum_recontact'] = dat.loc[:, hitop_sums[1::2][:-1]].sum(1)
    dat['baars_sum'] = dat.inattention_sum + dat.hyperactivity_sum + dat.impulsivity_sum
    dat['baars_sum_recontact'] = dat.inattention_sum_recontact + dat.hyperactivity_sum_recontact + dat.impulsivity_sum_recontact
    dat['moodanxiety_bothered'] = dat.mood_bothered | dat.anxiety_bothered
    dat['moodanxiety_bothered_recontact'] = dat.mood_bothered_recontact | dat.anxiety_bothered_recontact

    my_columns = []
    for item in ['hitop_sum', 'baars_sum', 'phq_sum', 'gad_sum', # all sums
                 'mood_bothered','anxiety_bothered', 'attention_bothered', 'moodanxiety_bothered', # bothered
                 'inattention_sum', 'hyperactivity_sum', 'impulsivity_sum', 'sct_sum']: # each subscale of baars
        my_columns.append(item)
        my_columns.append(item+'_recontact')
    my_columns.extend(hitop_sums) #each subscale of hitop
        
    # let's save descriptive
    dat_to_save_for_descr = dat.copy(deep = True)

    # rename columns with phq and gad sums for cnsistency in naming
    dat = dat.rename(columns={"gad_recontact_sum": "gad_sum_recontact", "phq_recontact_sum": "phq_sum_recontact"})

    # rename baars subscales
    dat = dat.rename(columns={"inattention_sum": "baars_inattention_sum", 
                          "inattention_sum_recontact": "baars_inattention_sum_recontact",
                          "hyperactivity_sum": "baars_hyperactivity_sum", 
                          "hyperactivity_sum_recontact": "baars_hyperactivity_sum_recontact",
                          "impulsivity_sum": "baars_impulsivity_sum", 
                          "impulsivity_sum_recontact": "baars_impulsivity_sum_recontact",
                          "sct_sum": "baars_sct_sum",
                          "sct_sum_recontact": "baars_sct_sum_recontact"})

    # rename hitops subscales
    rename_dict = {}
    for hitop_item in hitop_sums:
        rename_dict[hitop_item] = 'hitop_' + hitop_item
    dat = dat.rename(columns=rename_dict)

    # this is the data we will save for different types of analysis
    # we do this AFTER removing checks so we have the same data for each analysis

    # which columns to save for each analysis
    my_columns_for_icc = []
    for item in ['hitop_sum', 'baars_sum', 'phq_sum', 'gad_sum', # all sums
                 'mood_bothered','anxiety_bothered', 'attention_bothered', 'moodanxiety_bothered', # bothered
                 'baars_inattention_sum', 'baars_hyperactivity_sum', 'baars_impulsivity_sum', 'baars_sct_sum']: # each subscale of baars
        my_columns_for_icc.append(item)
        my_columns_for_icc.append(item+'_recontact')
    for hitop_item in hitop_sums:
        my_columns_for_icc.append('hitop_' + hitop_item)
    for c5 in dat.columns:
        if "hitop" in c5:
            if '_' not in c5:
                my_columns_for_icc.append(c5)
                my_columns_for_icc.append(c5 + '_recontact') 

    # ALL baars
    desired_columns_baars = ['inattention_1', 'inattention_2', 'inattention_3', 'inattention_4', 'inattention_5', 'inattention_6', 'inattention_7', 'inattention_8', 'inattention_9',
     'hyperactivity_1', 'hyperactivity_2', 'hyperactivity_3', 'hyperactivity_4', 'hyperactivity_5',
     'impulsivity_1', 'impulsivity_2', 'impulsivity_3', 'impulsivity_4']
    desired_columns_baars_recontact = ['inattention_1_recontact', 'inattention_2_recontact', 'inattention_3_recontact', 'inattention_4_recontact', 'inattention_5_recontact', 'inattention_6_recontact', 'inattention_7_recontact', 'inattention_8_recontact', 'inattention_9_recontact',
     'hyperactivity_1_recontact', 'hyperactivity_2_recontact', 'hyperactivity_3_recontact', 'hyperactivity_4_recontact', 'hyperactivity_5_recontact',
     'impulsivity_1_recontact', 'impulsivity_2_recontact', 'impulsivity_3_recontact', 'impulsivity_4_recontact']
    c1 = ['gad_1', 'gad_2', 'gad_3', 'gad_4', 'gad_5', 'gad_6', 'gad_7','phq_1', 'phq_2', 'phq_3', 'phq_4', 'phq_5', 'phq_6', 'phq_7', 'phq_8', 'sct_1', 'sct_2', 'sct_3', 'sct_4', 'sct_5', 'sct_6', 'sct_7', 'sct_8', 'sct_9']
    c2 = []
    for c in c1:
        c_recontact = c + '_recontact'
        c2.append(c_recontact)
    c3 = []
    for c4 in dat.columns:
        if "hitop" in c4:
            if '_' not in c4:
                c3.append(c4)
                c3.append(c4 + '_recontact')
    my_columns_for_cfa = my_columns_for_icc + desired_columns_baars + desired_columns_baars_recontact + c1 + c2 + c3

    # which columns I want for corrs
    # FNM_Q6_5 ---> phq_5 ---> Phq-8 poor appetite or overeating
    # FNM_Q41_m_12 --> todayphq_5 --> TODAY phq-8 poor appetite or overeating
    # FNM_Q6_7 --> phq_7 --> Trouble concentrating on things, such as school work, reading or watching TV
    # FNM_Q41_m_14 --> todayphq7 --> TODAY Trouble concentrating on things, such as school work, reading or watching TV
    # FNM_Q6_3 --> phq_3 --> Trouble falling or staying asleep, or sleeping too much
    # FNM_Q41_m_10 --> todayphq_3 --> TODAY Trouble falling or staying asleep, or sleeping too much 
    '''my_columns_for_corr = my_columns_for_cfa + ['FNM_Q6_5', 'FNM_Q41_m_12', 'FNM_Q6_7', 'FNM_Q41_m_14', 'FNM_Q6_3', 'FNM_Q41_m_10', 
                                                     'mood_bothered', 'mood_bothered_recontact', 
                                                     'anxiety_bothered', 'anxiety_bothered_recontact',
                                                     'attention_bothered', 'attention_bothered_recontact']'''
    my_columns_for_corr = my_columns_for_cfa + ['phq_5', 'todayphq_5', 'phq_7', 'todayphq_7', 'phq_3', 'todayphq_3']  
          
    dat_to_save_for_icc = pd.DataFrame(dat, columns=my_columns_for_icc)
    dat_to_save_for_cfa = pd.DataFrame(dat, columns=my_columns_for_cfa)
    dat_to_save_for_corr = pd.DataFrame(dat, columns=my_columns_for_corr)

    '''# QRemove all negative items
    dat_to_save_for_icc = dat_to_save_for_icc[(dat_to_save_for_icc >= 0).all(axis=1)]
    dat_to_save_for_cfa = dat_to_save_for_cfa[(dat_to_save_for_cfa >= 0).all(axis=1)]'''

    # some more preprocessing for corr
    dat_to_save_for_corr['moodanxiety_bothered'] = dat_to_save_for_corr.mood_bothered | dat_to_save_for_corr.anxiety_bothered
    dat_to_save_for_corr['moodanxiety_bothered_recontact'] = dat_to_save_for_corr.mood_bothered_recontact | dat_to_save_for_corr.anxiety_bothered_recontact
    dat_to_save_for_corr['attentionanxiety_bothered'] = dat_to_save_for_corr.attention_bothered | dat_to_save_for_corr.anxiety_bothered
    dat_to_save_for_corr['attentionanxiety_bothered_recontact'] = dat_to_save_for_corr.attention_bothered_recontact | dat_to_save_for_corr.anxiety_bothered_recontact
    dat_to_save_for_corr['moodattention_bothered'] = dat_to_save_for_corr.mood_bothered | dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodattention_bothered_recontact'] = dat_to_save_for_corr.mood_bothered_recontact | dat_to_save_for_corr.attention_bothered_recontact
    dat_to_save_for_corr['moodattentionanxiety_bothered'] = dat_to_save_for_corr.mood_bothered | dat_to_save_for_corr.anxiety_bothered | dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodattentionanxiety_bothered_recontact'] = dat_to_save_for_corr.mood_bothered_recontact | dat_to_save_for_corr.anxiety_bothered_recontact | dat_to_save_for_corr.attention_bothered_recontact    
    
    dat_to_save_for_corr['moodYES_anxNO_attntNO'] =  dat_to_save_for_corr.mood_bothered & ~dat_to_save_for_corr.anxiety_bothered & ~dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodNO_anxYES_attntNO'] = ~dat_to_save_for_corr.mood_bothered & dat_to_save_for_corr.anxiety_bothered & ~dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodNO_anxNO_attntYES'] = ~dat_to_save_for_corr.mood_bothered & ~dat_to_save_for_corr.anxiety_bothered & dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodYES_anxYES_attntNO'] = dat_to_save_for_corr.mood_bothered & dat_to_save_for_corr.anxiety_bothered & ~dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodYES_anxNO_attntYES'] = dat_to_save_for_corr.mood_bothered & ~dat_to_save_for_corr.anxiety_bothered & dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodNO_anxYES_attntYES'] = ~dat_to_save_for_corr.mood_bothered & dat_to_save_for_corr.anxiety_bothered & dat_to_save_for_corr.attention_bothered
    dat_to_save_for_corr['moodYES_anxYES_attntYES'] = dat_to_save_for_corr.mood_bothered & dat_to_save_for_corr.anxiety_bothered & dat_to_save_for_corr.attention_bothered
    
    dat_to_save_for_corr['moodYES_anxNO_attntNO_recontact'] =  dat_to_save_for_corr.mood_bothered_recontact & ~dat_to_save_for_corr.anxiety_bothered_recontact & ~dat_to_save_for_corr.attention_bothered_recontact
    dat_to_save_for_corr['moodNO_anxYES_attntNO_recontact'] = ~dat_to_save_for_corr.mood_bothered_recontact & dat_to_save_for_corr.anxiety_bothered_recontact & ~dat_to_save_for_corr.attention_bothered_recontact
    dat_to_save_for_corr['moodNO_anxNO_attntYES_recontact'] = ~dat_to_save_for_corr.mood_bothered_recontact & ~dat_to_save_for_corr.anxiety_bothered_recontact & dat_to_save_for_corr.attention_bothered_recontact
    dat_to_save_for_corr['moodYES_anxYES_attntNO_recontact'] = dat_to_save_for_corr.mood_bothered_recontact & dat_to_save_for_corr.anxiety_bothered_recontact & ~dat_to_save_for_corr.attention_bothered_recontact
    dat_to_save_for_corr['moodYES_anxNO_attntYES_recontact'] = dat_to_save_for_corr.mood_bothered_recontact & ~dat_to_save_for_corr.anxiety_bothered_recontact & dat_to_save_for_corr.attention_bothered_recontact
    dat_to_save_for_corr['moodNO_anxYES_attntYES_recontact'] = ~dat_to_save_for_corr.mood_bothered_recontact & dat_to_save_for_corr.anxiety_bothered_recontact & dat_to_save_for_corr.attention_bothered_recontact
    dat_to_save_for_corr['moodYES_anxYES_attntYES_recontact'] = dat_to_save_for_corr.mood_bothered_recontact & dat_to_save_for_corr.anxiety_bothered_recontact & dat_to_save_for_corr.attention_bothered_recontact    

    '''# QRemove all negative items
    dat_to_save_for_corr = dat_to_save_for_corr[(dat_to_save_for_corr >= 0).all(axis=1)]'''

    # remove nans

    # let's save
    dat_to_save_for_descr.to_csv(path_to_write)
    dat_to_save_for_icc.to_csv(path_to_write_ICC)
    dat_to_save_for_cfa.to_csv(path_to_write_CFA)
    dat_to_save_for_corr.to_csv(path_to_write_CORR)  

    # the max amount of data is in the CORR file, let's check it for nans
    print('checking for nans')
    print(dat_to_save_for_corr.isnull().values.any())
    print('checking for nans - ROWS')
    print('original rows')
    print(len(dat_to_save_for_corr))
    print('rows without nan')
    print(len(dat_to_save_for_corr.dropna()))
    print(genpop_or_enriched + ', ' + remove_checks + ': SAVED')

In [4]:
do_chronbachs = True
for remove_checks in ['no', 'grid', 'all']:
    for genpop_or_enriched in ['genpop', 'enriched']:
        preprocess_data(genpop_or_enriched, remove_checks, do_chronbachs)

print('ALL SAVED')



GENPOP OR ENRICHED: genpop
REMOVE CHECKS: no

This is how many ppl passed attention checks:
Calculated as 1 - dat.loc[:, ['passed_checks', 'passed_grid', 'passed_list']].mean()
passed_checks    0.462
passed_grid      0.006
passed_list      0.458
dtype: float64
Same for recontact:
passed_checks_recontact    0.548
passed_grid_recontact      0.200
passed_list_recontact      0.548
dtype: float64


-----------------||--------------------
Doing Chronbach's alpha...
inattention
0.921([0.91  0.931])
hyperactivity
0.797([0.767 0.824])
impulsivity
0.808([0.779 0.834])
sct
0.904([0.891 0.916])
gad
0.931([0.921 0.94 ])
phq
0.898([0.884 0.911])
hitop
0.964([0.959 0.968])
inattention_recontact
0.982([0.98  0.984])
hyperactivity_recontact
0.96([0.955 0.966])
impulsivity_recontact
0.965([0.96 0.97])
sct_recontact
0.979([0.976 0.982])
gad_recontact
0.98([0.977 0.983])
phq_recontact
0.975([0.972 0.978])
hitop_recontact
0.995([0.995 0.996])
anhedonic_depression
0.927([0.917 0.936])
anhedonic_depression

In [5]:
# doing only grid, just to double-check cronbachs 

do_chronbachs = True
for remove_checks in ['grid']:
    for genpop_or_enriched in ['genpop', 'enriched']:
        preprocess_data(genpop_or_enriched, remove_checks, do_chronbachs)

print('DONE')



GENPOP OR ENRICHED: genpop
REMOVE CHECKS: grid

This is how many ppl passed attention checks:
Calculated as 1 - dat.loc[:, ['passed_checks', 'passed_grid', 'passed_list']].mean()
passed_checks    0.462
passed_grid      0.006
passed_list      0.458
dtype: float64
Same for recontact:
passed_checks_recontact    0.548
passed_grid_recontact      0.200
passed_list_recontact      0.548
dtype: float64

Shape of initial data:
(500, 752)

Shape of data after removing grid checks:
(398, 752)


-----------------||--------------------
Doing Chronbach's alpha...
inattention
0.919([0.907 0.931])
hyperactivity
0.801([0.768 0.83 ])
impulsivity
0.82([0.79  0.848])
sct
0.906([0.892 0.919])
gad
0.927([0.915 0.937])
phq
0.892([0.876 0.908])
hitop
0.964([0.959 0.969])
inattention_recontact
0.923([0.911 0.933])
hyperactivity_recontact
0.829([0.801 0.854])
impulsivity_recontact
0.851([0.826 0.874])
sct_recontact
0.916([0.903 0.928])
gad_recontact
0.932([0.921 0.942])
phq_recontact
0.909([0.895 0.922])
hitop

In [6]:
# check the nans situation
path_to_write_CORR = '../data/mydata_1general_forCORR_removedNOchecks.csv'
my_corr = pd.read_csv(path_to_write_CORR)
my_corr.head()

Unnamed: 0.1,Unnamed: 0,hitop_sum,hitop_sum_recontact,baars_sum,baars_sum_recontact,phq_sum,phq_sum_recontact,gad_sum,gad_sum_recontact,mood_bothered,mood_bothered_recontact,anxiety_bothered,anxiety_bothered_recontact,attention_bothered,attention_bothered_recontact,moodanxiety_bothered,moodanxiety_bothered_recontact,baars_inattention_sum,baars_inattention_sum_recontact,baars_hyperactivity_sum,baars_hyperactivity_sum_recontact,baars_impulsivity_sum,baars_impulsivity_sum_recontact,baars_sct_sum,baars_sct_sum_recontact,hitop_anhedonic_depression,hitop_anhedonic_depression_recontact,hitop_anxious_worry,hitop_anxious_worry_recontact,hitop_appetite_gain,hitop_appetite_gain_recontact,hitop_appetite_loss,hitop_appetite_loss_recontact,hitop_cognitive_problems,hitop_cognitive_problems_recontact,hitop_hyposomnia,hitop_hyposomnia_recontact,hitop_indecisiveness,hitop_indecisiveness_recontact,hitop_insomnia,hitop_insomnia_recontact,hitop_panic,hitop_panic_recontact,hitop_separation_insecurity,hitop_separation_insecurity_recontact,hitop_shame_guilt,hitop_shame_guilt_recontact,hitop_situational_phobia,hitop_situational_phobia_recontact,hitop_social_anxiety,hitop_social_anxiety_recontact,hitop_well_being,hitop_well_being_recontact,hitop157,hitop157_recontact,hitop81,hitop81_recontact,hitop34,hitop34_recontact,hitop54,hitop54_recontact,hitop243,hitop243_recontact,hitop182,hitop182_recontact,hitop69,hitop69_recontact,hitop89,hitop89_recontact,hitop50,hitop50_recontact,hitop129,hitop129_recontact,hitop265,hitop265_recontact,hitop124,hitop124_recontact,hitop231,hitop231_recontact,hitop93,hitop93_recontact,hitop67,hitop67_recontact,hitop245,hitop245_recontact,hitop281,hitop281_recontact,hitop141,hitop141_recontact,hitop40,hitop40_recontact,hitop204,hitop204_recontact,hitop21,hitop21_recontact,hitop236,hitop236_recontact,hitop280,hitop280_recontact,hitop84,hitop84_recontact,hitop120,hitop120_recontact,hitop77,hitop77_recontact,hitop92,hitop92_recontact,hitop258,hitop258_recontact,hitop39,hitop39_recontact,hitop254,hitop254_recontact,hitop215,hitop215_recontact,hitop95,hitop95_recontact,hitop106,hitop106_recontact,hitop283,hitop283_recontact,hitop16,hitop16_recontact,hitop20,hitop20_recontact,hitop189,hitop189_recontact,hitop1,hitop1_recontact,hitop136,hitop136_recontact,hitop246,hitop246_recontact,hitop248,hitop248_recontact,hitop257,hitop257_recontact,hitop114,hitop114_recontact,hitop117,hitop117_recontact,hitop250,hitop250_recontact,hitop200,hitop200_recontact,hitop160,hitop160_recontact,hitop23,hitop23_recontact,hitop165,hitop165_recontact,hitop244,hitop244_recontact,hitop9,hitop9_recontact,hitop142,hitop142_recontact,hitop230,hitop230_recontact,hitop149,hitop149_recontact,hitop247,hitop247_recontact,hitop99,hitop99_recontact,hitop66,hitop66_recontact,hitop240,hitop240_recontact,hitop222,hitop222_recontact,hitop90,hitop90_recontact,hitop113,hitop113_recontact,hitop278,hitop278_recontact,hitop203,hitop203_recontact,hitop159,hitop159_recontact,hitop123,hitop123_recontact,hitop275,hitop275_recontact,hitop268,hitop268_recontact,hitop225,hitop225_recontact,hitop143,hitop143_recontact,hitop151,hitop151_recontact,hitop181,hitop181_recontact,hitop211,hitop211_recontact,hitop17,hitop17_recontact,hitop126,hitop126_recontact,hitop5,hitop5_recontact,hitop261,hitop261_recontact,hitop220,hitop220_recontact,hitop15,hitop15_recontact,hitop72,hitop72_recontact,hitop140,hitop140_recontact,hitop109,hitop109_recontact,hitop197,hitop197_recontact,hitop104,hitop104_recontact,todayhitop157,todayhitop157_recontact,todayhitop81,todayhitop81_recontact,todayhitop34,todayhitop34_recontact,todayhitop54,todayhitop54_recontact,todayhitop243,todayhitop243_recontact,todayhitop182,todayhitop182_recontact,todayhitop69,todayhitop69_recontact,todayhitop89,todayhitop89_recontact,todayhitop50,todayhitop50_recontact,todayhitop129,todayhitop129_recontact,todayhitop265,todayhitop265_recontact,todayhitop124,todayhitop124_recontact,todayhitop231,todayhitop231_recontact,todayhitop93,todayhitop93_recontact,todayhitop67,todayhitop67_recontact,todayhitop245,...,hitop247.1,hitop247_recontact.1,hitop99.1,hitop99_recontact.1,hitop66.1,hitop66_recontact.1,hitop240.1,hitop240_recontact.1,hitop222.1,hitop222_recontact.1,hitop90.1,hitop90_recontact.1,hitop113.1,hitop113_recontact.1,hitop278.1,hitop278_recontact.1,hitop203.1,hitop203_recontact.1,hitop159.1,hitop159_recontact.1,hitop123.1,hitop123_recontact.1,hitop275.1,hitop275_recontact.1,hitop268.1,hitop268_recontact.1,hitop225.1,hitop225_recontact.1,hitop143.1,hitop143_recontact.1,hitop151.1,hitop151_recontact.1,hitop181.1,hitop181_recontact.1,hitop211.1,hitop211_recontact.1,hitop17.1,hitop17_recontact.1,hitop126.1,hitop126_recontact.1,hitop5.1,hitop5_recontact.1,hitop261.1,hitop261_recontact.1,hitop220.1,hitop220_recontact.1,hitop15.1,hitop15_recontact.1,hitop72.1,hitop72_recontact.1,hitop140.1,hitop140_recontact.1,hitop109.1,hitop109_recontact.1,hitop197.1,hitop197_recontact.1,hitop104.1,hitop104_recontact.1,todayhitop157.1,todayhitop157_recontact.1,todayhitop81.1,todayhitop81_recontact.1,todayhitop34.1,todayhitop34_recontact.1,todayhitop54.1,todayhitop54_recontact.1,todayhitop243.1,todayhitop243_recontact.1,todayhitop182.1,todayhitop182_recontact.1,todayhitop69.1,todayhitop69_recontact.1,todayhitop89.1,todayhitop89_recontact.1,todayhitop50.1,todayhitop50_recontact.1,todayhitop129.1,todayhitop129_recontact.1,todayhitop265.1,todayhitop265_recontact.1,todayhitop124.1,todayhitop124_recontact.1,todayhitop231.1,todayhitop231_recontact.1,todayhitop93.1,todayhitop93_recontact.1,todayhitop67.1,todayhitop67_recontact.1,todayhitop245.1,todayhitop245_recontact.1,todayhitop281.1,todayhitop281_recontact.1,todayhitop141.1,todayhitop141_recontact.1,todayhitop40.1,todayhitop40_recontact.1,todayhitop204.1,todayhitop204_recontact.1,todayhitop21.1,todayhitop21_recontact.1,todayhitop236.1,todayhitop236_recontact.1,todayhitop280.1,todayhitop280_recontact.1,todayhitop84.1,todayhitop84_recontact.1,todayhitop120.1,todayhitop120_recontact.1,todayhitop77.1,todayhitop77_recontact.1,todayhitop92.1,todayhitop92_recontact.1,todayhitop258.1,todayhitop258_recontact.1,todayhitop39.1,todayhitop39_recontact.1,todayhitop254.1,todayhitop254_recontact.1,todayhitop215.1,todayhitop215_recontact.1,todayhitop95.1,todayhitop95_recontact.1,todayhitop106.1,todayhitop106_recontact.1,todayhitop283.1,todayhitop283_recontact.1,todayhitop16.1,todayhitop16_recontact.1,todayhitop20.1,todayhitop20_recontact.1,todayhitop189.1,todayhitop189_recontact.1,todayhitop1.1,todayhitop1_recontact.1,todayhitop136.1,todayhitop136_recontact.1,todayhitop246.1,todayhitop246_recontact.1,todayhitop248.1,todayhitop248_recontact.1,todayhitop257.1,todayhitop257_recontact.1,todayhitop114.1,todayhitop114_recontact.1,todayhitop117.1,todayhitop117_recontact.1,todayhitop250.1,todayhitop250_recontact.1,todayhitop200.1,todayhitop200_recontact.1,todayhitop160.1,todayhitop160_recontact.1,todayhitop23.1,todayhitop23_recontact.1,todayhitop165.1,todayhitop165_recontact.1,todayhitop244.1,todayhitop244_recontact.1,todayhitop9.1,todayhitop9_recontact.1,todayhitop142.1,todayhitop142_recontact.1,todayhitop230.1,todayhitop230_recontact.1,todayhitop149.1,todayhitop149_recontact.1,todayhitop247.1,todayhitop247_recontact.1,todayhitop99.1,todayhitop99_recontact.1,todayhitop66.1,todayhitop66_recontact.1,todayhitop240.1,todayhitop240_recontact.1,todayhitop222.1,todayhitop222_recontact.1,todayhitop90.1,todayhitop90_recontact.1,todayhitop113.1,todayhitop113_recontact.1,todayhitop278.1,todayhitop278_recontact.1,todayhitop203.1,todayhitop203_recontact.1,todayhitop159.1,todayhitop159_recontact.1,todayhitop123.1,todayhitop123_recontact.1,todayhitop275.1,todayhitop275_recontact.1,todayhitop268.1,todayhitop268_recontact.1,todayhitop225.1,todayhitop225_recontact.1,todayhitop143.1,todayhitop143_recontact.1,todayhitop151.1,todayhitop151_recontact.1,todayhitop181.1,todayhitop181_recontact.1,todayhitop211.1,todayhitop211_recontact.1,todayhitop17.1,todayhitop17_recontact.1,todayhitop126.1,todayhitop126_recontact.1,todayhitop5.1,todayhitop5_recontact.1,todayhitop261.1,todayhitop261_recontact.1,todayhitop220.1,todayhitop220_recontact.1,todayhitop15.1,todayhitop15_recontact.1,todayhitop72.1,todayhitop72_recontact.1,todayhitop140.1,todayhitop140_recontact.1,todayhitop109.1,todayhitop109_recontact.1,todayhitop197.1,todayhitop197_recontact.1,todayhitop104.1,todayhitop104_recontact.1,phq_5.1,todayphq_5,phq_7.1,todayphq_7,phq_3.1,todayphq_3,attentionanxiety_bothered,attentionanxiety_bothered_recontact,moodattention_bothered,moodattention_bothered_recontact,moodattentionanxiety_bothered,moodattentionanxiety_bothered_recontact,moodYES_anxNO_attntNO,moodNO_anxYES_attntNO,moodNO_anxNO_attntYES,moodYES_anxYES_attntNO,moodYES_anxNO_attntYES,moodNO_anxYES_attntYES,moodYES_anxYES_attntYES,moodYES_anxNO_attntNO_recontact,moodNO_anxYES_attntNO_recontact,moodNO_anxNO_attntYES_recontact,moodYES_anxYES_attntNO_recontact,moodYES_anxNO_attntYES_recontact,moodNO_anxYES_attntYES_recontact,moodYES_anxYES_attntYES_recontact
0,0,39,22,16,11,7,3,10,6,False,False,False,False,False,False,False,False,7,4,7,7,2,0,15,9,2,3,11,6,5,4,0,0,6,2,0,0,3,1,1,2,0,0,4,2,2,0,0,0,5,2,28,26,1,2,1,0,2,1,3,3,2,2,1,0,1,1,1,1,0,1,1,1,2,1,0,0,0,0,0,0,1,0,3,3,3,3,2,2,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2,3,0,0,0,0,1,1,3,1,1,0,1,0,0,0,1,1,0,0,0,0,1,0,3,0,2,2,0,0,3,3,0,0,3,3,3,3,1,1,0,0,3,3,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,3,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,3,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,1,2,0,2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,93,77,12,11,4,6,6,4,False,False,False,False,False,False,False,False,6,5,3,4,3,2,11,9,10,10,13,9,5,4,2,4,6,4,5,6,3,4,4,8,12,5,3,0,3,0,10,10,17,13,27,26,1,1,0,0,3,2,3,3,0,0,2,1,0,0,2,1,0,0,1,2,3,3,2,1,0,1,1,1,2,0,3,3,3,3,3,3,1,0,3,2,1,2,2,1,1,1,0,0,1,0,3,3,1,1,1,1,0,1,1,3,0,0,1,1,3,3,0,2,2,3,3,1,1,2,2,1,0,0,1,1,1,1,3,1,1,0,1,1,0,1,3,1,2,2,3,3,3,3,3,3,3,3,1,1,1,1,3,3,3,3,1,2,2,1,1,1,3,3,1,1,1,0,1,0,0,0,2,1,0,0,1,1,0,1,1,1,1,0,1,0,1,1,3,1,1,1,2,1,1,1,1,2,1,0,2,1,0,0,1,0,1,1,0,0,2,1,2,1,2,2,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,1,...,3,3,1,2,2,1,1,1,3,3,1,1,1,0,1,0,0,0,2,1,0,0,1,1,0,1,1,1,1,0,1,0,1,1,3,1,1,1,2,1,1,1,1,2,1,0,2,1,0,0,1,0,1,1,0,0,2,1,2,1,2,2,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,1,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,1,2,2,2,2,2,1,1,2,2,1,1,1,2,1,2,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,1,2,1,2,1,2,1,2,1,1,2,2,2,2,2,1,2,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,2,1,2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2,108,-146,23,-36,14,-16,5,-14,False,False,False,False,False,False,False,False,10,-18,7,-10,6,-8,13,-18,15,-20,12,-14,6,-8,6,-6,5,-8,5,-10,4,-6,5,-8,8,-12,8,-16,7,-8,9,-10,18,-20,16,-20,1,-2,1,-2,2,-2,2,-2,2,-2,1,-2,1,-2,2,-2,1,-2,1,-2,1,-2,3,-2,1,-2,2,-2,1,-2,1,-2,2,-2,1,-2,2,-2,1,-2,1,-2,2,-2,2,-2,2,-2,1,-2,2,-2,1,-2,2,-2,1,-2,1,-2,2,-2,1,-2,3,-2,3,-2,2,-2,1,-2,1,-2,2,-2,0,-2,1,-2,2,-2,1,-2,1,-2,3,-2,2,-2,1,-2,2,-2,1,-2,3,-2,2,-2,1,-2,2,-2,2,-2,1,-2,1,-2,1,-2,1,-2,2,-2,1,-2,2,-2,2,-2,1,-2,2,-2,1,-2,2,-2,2,-2,1,-2,2,-2,1,-2,1,-2,1,-2,2,-2,2,-2,1,-2,1,-2,1,-2,3,-2,2,-2,1,-2,2,-2,1,-2,0,-2,0,-2,2,-1,2,-1,1,-1,2,-1,2,-1,2,-1,1,-1,2,-1,2,-1,2,-1,2,-1,1,-1,2,-1,2,-1,1,-1,2,...,1,-2,1,-2,1,-2,2,-2,1,-2,2,-2,2,-2,1,-2,2,-2,1,-2,2,-2,2,-2,1,-2,2,-2,1,-2,1,-2,1,-2,2,-2,2,-2,1,-2,1,-2,1,-2,3,-2,2,-2,1,-2,2,-2,1,-2,0,-2,0,-2,2,-1,2,-1,1,-1,2,-1,2,-1,2,-1,1,-1,2,-1,2,-1,2,-1,2,-1,1,-1,2,-1,2,-1,1,-1,2,-1,1,-1,2,-1,2,-1,1,-1,2,-1,1,-1,2,-1,2,-1,1,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,1,-1,2,-1,2,-1,2,-1,1,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,1,-1,1,-1,2,-1,1,-1,1,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,1,-1,2,-1,1,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,2,-1,1,-1,2,-1,1,-1,2,-1,2,2,2,1,2,2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,3,17,29,4,3,0,0,0,0,False,False,False,False,False,False,False,False,4,2,0,1,0,0,3,1,1,4,0,5,3,0,0,6,1,1,0,0,0,1,6,4,1,3,0,1,0,1,4,3,1,0,13,11,1,1,0,0,0,1,2,3,1,0,0,0,0,0,0,1,0,1,0,0,0,3,0,0,0,0,0,0,0,1,2,1,0,0,1,0,0,0,0,0,0,1,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,3,2,1,3,3,3,3,2,1,1,0,0,3,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,0,3,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,0,3,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,0,2,0,2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,4,92,77,16,18,13,9,4,7,False,False,False,False,False,False,False,False,8,10,3,4,5,4,11,11,16,15,6,10,12,7,0,4,9,8,2,2,3,4,10,7,9,5,8,4,4,4,2,0,11,7,8,4,3,1,0,0,1,1,3,0,3,2,3,2,3,1,0,0,1,1,2,0,0,1,1,0,0,0,2,2,2,2,1,0,0,1,3,0,2,1,1,1,1,1,2,1,0,1,1,2,3,2,2,2,1,1,1,1,3,2,3,2,1,0,1,2,0,0,0,1,0,0,1,2,2,2,1,1,0,0,1,1,2,2,3,1,1,2,1,1,0,2,2,0,3,2,1,1,0,0,0,0,1,0,3,2,0,1,0,0,1,0,2,1,0,1,0,2,0,0,1,1,0,0,0,0,2,2,2,2,0,1,3,3,1,1,1,0,1,1,2,1,0,0,1,1,1,0,1,0,0,0,3,2,1,1,1,1,1,1,1,1,0,2,0,0,2,2,1,1,2,2,1,1,2,2,2,2,1,1,1,2,2,2,1,1,2,2,2,2,2,2,2,2,1,1,2,1,2,...,1,0,2,1,0,1,0,2,0,0,1,1,0,0,0,0,2,2,2,2,0,1,3,3,1,1,1,0,1,1,2,1,0,0,1,1,1,0,1,0,0,0,3,2,1,1,1,1,1,1,1,1,0,2,0,0,2,2,1,1,2,2,1,1,2,2,2,2,1,1,1,2,2,2,1,1,2,2,2,2,2,2,2,2,1,1,2,1,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,1,1,1,2,2,2,1,2,1,1,2,2,2,1,2,2,2,1,2,2,2,1,1,1,2,2,2,2,1,2,2,1,1,2,2,1,1,1,2,2,2,2,1,2,2,2,2,2,2,2,2,2,1,1,2,2,2,2,2,2,2,2,2,2,2,1,2,2,1,1,2,2,2,2,1,1,2,1,2,2,1,1,1,2,2,2,2,1,2,2,2,2,1,2,2,2,2,2,2,2,1,1,2,1,2,2,2,2,2,1,1,1,2,2,1,1,3,1,3,1,2,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
#myd = pd.read_csv('../data/NIMH0007_genpop_num_OUTPUT.csv')
myd = pd.read_csv('../data/NIMH0007_mental_health_num_OUTPUT.csv', dtype={'caseid':str}, engine='python')
#myd
myd.FNM_Q6_5.value_counts()

4    96
1    76
2    73
3    65
Name: FNM_Q6_5, dtype: int64

In [8]:
    # which columns I want for corrs
    # FNM_Q6_5 ---> Phq-8 poor appetite or overeating
    # FNM_Q41_m_12 --> TODAY phq-8 poor appetite or overeating
    # FNM_Q6_7 --> Trouble concentrating on things, such as school work, reading or watching TV
    # FNM_Q41_m_14 --> TODAY Trouble concentrating on things, such as school work, reading or watching TV
    # FNM_Q6_3 --> Trouble falling or staying asleep, or sleeping too much
    # FNM_Q41_m_10 --> TODAY Trouble falling or staying asleep, or sleeping too much 

In [9]:
a = 0.923229558777681
output = round(a,3)
output

0.923