In [1]:
import pandas as pd
pd.set_option('max_colwidth', 100)
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import itertools
import scipy.stats as st
import seaborn as sns
import scipy

In [2]:
import os

# Root folder that contains the `abcd-data-release-5.1/` directory read by load_ksads.
# Set the ABCD_DATA_DIR environment variable to point at your own copy of the data;
# when it is unset, the original author's local path is used as the fallback.
path_to_data = os.environ.get(
    'ABCD_DATA_DIR',
    '/Users/zeleninam2/Documents/projects/cbcl_ksads_work/revision_2023/data',
)

In [9]:
def load_ksads(path_to_mydata, reporter, wave, do_print = False):
    """Load KSADS diagnosis scores for one reporter and wave and derive binary flags.

    Reads ``mh_y_ksads_ss.csv`` (child) or ``mh_p_ksads_ss.csv`` (parent) from
    ``<path_to_mydata>/abcd-data-release-5.1/core/mental-health/``, keeps the rows
    of the requested event, turns every value above 1 into NaN (the original
    author's note says missing data may be coded as 555 or 888), drops every
    subject with a NaN in any retained column, and collapses the depression,
    anxiety and ADHD columns into per-subject 0/1 flags.

    Parameters
    ----------
    path_to_mydata : str or path-like
        Folder that contains the ``abcd-data-release-5.1`` directory.
    reporter : {'parent', 'child'}
        Whose report to load; selects both the file and the column suffix
        (``_p`` for parent, ``_t`` for child).
    wave : {'baseline', 'optimal'}
        ``'baseline'`` keeps rows with eventname ``baseline_year_1_arm_1``;
        ``'optimal'`` keeps rows with eventname ``2_year_follow_up_y_arm_1``.
    do_print : bool, default False
        When True, also print intermediate shapes and counts.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``src_subject_id`` with integer 0/1 columns
        ``ksads_DEPRESSION``, ``ksads_ANXIETY``, ``ksads_ADHD``,
        ``ksads_ADHD_OR_ANXIETY``, ``ksads_DEPyes_ANXADHDno`` and
        ``ksads_DEPno_ANXADHDyes``.

    Raises
    ------
    AssertionError
        If ``reporter`` or ``wave`` is not one of the accepted values.
    KeyError
        If an expected KSADS column is absent from the file.

    Notes
    -----
    For ``reporter='child'`` every ADHD column and six of the ten anxiety columns
    are excluded (the original author found them to be entirely NaN in the child
    file), so ``ksads_ADHD`` is always 0 and ``ksads_ADHD_OR_ANXIETY`` equals
    ``ksads_ANXIETY``. The same exclusion is applied for both waves.
    """
    print('BEGIN LOADING KSADS...')
    assert reporter in ['parent','child'], "wrong reporter" # reporter has to be either parent or child
    assert wave in ['baseline', 'optimal'], "wrong wave" # wave has to be either baseline or optimal

    # The file name and the column suffix are the only things that differ by reporter.
    file_tag, suffix = ('y', 't') if reporter == 'child' else ('p', 'p')
    # An f-string (rather than `+`) lets path_to_mydata be a str or a pathlib.Path.
    path_to_ksads_data = f'{path_to_mydata}/abcd-data-release-5.1/core/mental-health/mh_{file_tag}_ksads_ss.csv'
    ksads_data = pd.read_csv(path_to_ksads_data, sep=',', header=0, low_memory=False)

    # Keep only the rows of the requested wave.
    eventname_by_wave = {
        'baseline': 'baseline_year_1_arm_1',
        'optimal': '2_year_follow_up_y_arm_1',
    }
    ksads_wave = ksads_data.loc[ksads_data.eventname == eventname_by_wave[wave]].copy()
    print('ORIGINAL DATA SIZE:')
    print(ksads_wave.shape)

    # Diagnosis columns, grouped by disorder.
    depression_column_list = [f'ksads_1_{x}_{suffix}' for x in [840, 841, 843, 844, 846]]
    adhd_column_list = [f'ksads_14_{x}_{suffix}' for x in [856, 855, 853]]
    anxiety_column_list = [
        f'ksads_{x}_{suffix}'
        for x in ['5_906', '5_857', '6_908', '7_861', '7_909', '8_863', '8_911', '10_913', '10_869', '9_867']
    ]

    # Selecting the full list first makes a missing column fail loudly (KeyError).
    all_columns = depression_column_list + adhd_column_list + anxiety_column_list
    scores = (
        ksads_wave.loc[:, ['src_subject_id'] + all_columns]
        .set_index('src_subject_id')
        .astype(float)
    )

    if reporter == 'child':
        # In the child data the ADHD columns and most anxiety columns are fully NaN
        # (per the original author's check); keeping them would make dropna() below
        # remove every subject, so only the populated columns are retained.
        adhd_column_list = []
        anxiety_column_list = [f'ksads_{x}_t' for x in ['8_863', '8_911', '10_913', '10_869']]
        scores = scores.loc[:, depression_column_list + adhd_column_list + anxiety_column_list]
    anxiety_or_adhd_column_list = anxiety_column_list + adhd_column_list

    # Anything above 1 is treated as a missing-data code and becomes NaN,
    # then every subject with at least one NaN is dropped.
    scores = scores.mask(scores > 1.0).dropna()
    if do_print:
        # Both labels report the same post-dropna shape; both are kept so the
        # printed output stays identical to previously recorded runs.
        print('\nWITHOUT NANS:')
        print(scores.shape)
        print('\n shape after droping nans')
        print(scores.shape)

    # A subject is positive for a disorder when any of its columns is positive.
    # With an empty column list (child ADHD) the row sum is 0, so the flag is False.
    has_depression = scores.loc[:, depression_column_list].sum(1) > 0
    has_anxiety = scores.loc[:, anxiety_column_list].sum(1) > 0
    has_adhd = scores.loc[:, adhd_column_list].sum(1) > 0
    has_anxiety_or_adhd = scores.loc[:, anxiety_or_adhd_column_list].sum(1) > 0

    if do_print:
        print('check1')
        print(has_depression.astype(int).sum())

    ksads_dat = pd.DataFrame({
        'ksads_DEPRESSION': has_depression.astype(int),
        'ksads_ANXIETY': has_anxiety.astype(int),
        'ksads_ADHD': has_adhd.astype(int),
        'ksads_ADHD_OR_ANXIETY': has_anxiety_or_adhd.astype(int),
        # comorbidity combinations: depression only, and anxiety/ADHD only
        'ksads_DEPyes_ANXADHDno': (has_depression & ~has_anxiety_or_adhd).astype(int),
        'ksads_DEPno_ANXADHDyes': (~has_depression & has_anxiety_or_adhd).astype(int),
    })

    if do_print:
        print('\nksads DEPRESSION value counts')
        print(ksads_dat['ksads_DEPRESSION'].value_counts())
        print('\nfinal sample size')
        print(ksads_dat.shape)
    print('\n...FINISH LOADING KSADS')
    return ksads_dat


In [11]:
# Child-reported KSADS at baseline; do_print=True also prints the intermediate
# shapes and counts produced while loading.
data_ksads = load_ksads(
    path_to_data,
    reporter='child',
    wave='baseline',
    do_print=True,
)

BEGIN LOADING KSADS...
ORIGINAL DATA SIZE:
(11812, 1912)

WITHOUT NANS:
(11792, 9)

 shape after droping nans
(11792, 9)
check1
119

ksads DEPRESSION value counts
ksads_DEPRESSION
0    11673
1      119
Name: count, dtype: int64

final sample size
(11792, 6)

...FINISH LOADING KSADS
