In [16]:
import pandas as pd
pd.set_option('max_colwidth', 100)
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import itertools
import scipy.stats as st

In [17]:
# Root data directory; the loader functions append
# '/abcd-data-release-5.1/core/mental-health/<file>.csv' to this string.
# NOTE(review): this is an absolute, machine-specific path — change it before running elsewhere.
path_to_data = '/Users/zeleninam2/Documents/projects/cbcl_ksads_work/revision_2023/data'

In [22]:
# FUNCTION TO LOAD KSADS DATA, WHICH WE USE FOR _ALL_ ANALYSES

def load_ksads(path_to_mydata, reporter):
    """Load baseline KSADS depression diagnoses and collapse them into one binary flag.

    Parameters
    ----------
    path_to_mydata : str or path-like
        Directory that contains the ``abcd-data-release-5.1`` folder.
    reporter : {'parent', 'child'}
        ``'child'`` reads ``mh_y_ksads_ss.csv`` and the ``ksads_1_*_t`` columns;
        ``'parent'`` reads ``mh_p_ksads_ss.csv`` and the ``ksads_1_*_p`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``src_subject_id`` with a single int column
        ``ksads_dep_binary``: 1 when at least one of the five depression
        diagnosis columns is positive, otherwise 0. Rows with a missing value
        in any of the five columns are excluded.

    Raises
    ------
    AssertionError
        If ``reporter`` is neither ``'parent'`` nor ``'child'``.
    """
    assert reporter in ['parent','child'], "wrong reporter" # reporter has to be either parent or child

    # One lookup instead of two parallel if/elif chains: (file name, column suffix).
    sources = {
        'child': ('mh_y_ksads_ss.csv', 't'),
        'parent': ('mh_p_ksads_ss.csv', 'p'),
    }
    file_name, suffix = sources[reporter]
    # f-string formatting accepts both str and pathlib.Path for path_to_mydata.
    path_to_ksads_data = f'{path_to_mydata}/abcd-data-release-5.1/core/mental-health/{file_name}'

    ksads_data = pd.read_csv(path_to_ksads_data, sep=',', header=0, low_memory=False)
    # Select only baseline rows
    ksads_data_bsl = ksads_data.loc[ksads_data.eventname == 'baseline_year_1_arm_1'].copy()
    print('shape at loading, baseline only')
    print(ksads_data_bsl.shape)

    # Keep only the subject id and the five depression diagnosis columns.
    depression_column_list = [f'ksads_1_{x}_{suffix}' for x in [840, 841, 843, 844, 846]]
    diagnoses = (
        ksads_data_bsl.loc[:, ['src_subject_id'] + depression_column_list]
        .set_index('src_subject_id')
        .astype(float)
    )
    # Anything above 1 is a missing-value code (could be encoded as 555 or 888).
    diagnoses = diagnoses.mask(diagnoses > 1.0)

    # Keep only subjects with all five diagnoses present. A positional boolean
    # mask is used (rather than re-selecting by index label) so that a repeated
    # src_subject_id cannot multiply rows.
    missing_per_subject = diagnoses.isnull().sum(axis=1)
    complete_rows = missing_per_subject.eq(0).to_numpy()
    ksads_dat = (diagnoses.loc[complete_rows].sum(axis=1) > 0).astype(int)
    print('good id sum')
    print(ksads_dat.sum())

    # Wrap the per-subject flag in a one-column DataFrame.
    ksads_bsl_id_dep = ksads_dat.to_frame(name='ksads_dep_binary')
    print('final number of subjects overall')
    print(ksads_bsl_id_dep.shape[0])
    return ksads_bsl_id_dep

# FUNCTION TO LOAD CBCL DATA

def load_cbcl(path_to_mydata):
    """Load the baseline CBCL DSM-5 depression raw score for every subject.

    Parameters
    ----------
    path_to_mydata : str or path-like
        Directory that contains the ``abcd-data-release-5.1`` folder.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``src_subject_id`` with the single column
        ``cbcl_scr_dsm5_depress_r``, restricted to rows whose ``eventname`` is
        ``baseline_year_1_arm_1``.
    """
    # Build the path from the function argument (not from a notebook-level
    # global) so the function works on a fresh kernel and honours its parameter.
    path_to_cbcl = f'{path_to_mydata}/abcd-data-release-5.1/core/mental-health/mh_p_cbcl.csv'
    cbcl_data = pd.read_csv(path_to_cbcl, sep=',', header=0, low_memory=False)
    cbcl_data_bsl = cbcl_data.loc[cbcl_data.eventname == 'baseline_year_1_arm_1'].copy()
    print('cbcl shape at loading, baseline only')
    print(cbcl_data_bsl.shape)
    # only leave columns with id and depression diagnosis
    # use RAW SCORE for depression diagnosis, previously used cbcl_scr_dsm5_depress_t
    col_list_cbcl = ['src_subject_id', 'cbcl_scr_dsm5_depress_r']
    cbcl_bsl_id_dep = cbcl_data_bsl.loc[:, col_list_cbcl].set_index('src_subject_id')
    return cbcl_bsl_id_dep

In [23]:
# Binary KSADS depression flag per subject at baseline, child report.
data_ksads_child = load_ksads(path_to_data, reporter = 'child')

shape at loading, baseline only
(11812, 1912)
good id sum
119
final number of subjects overall
11808


In [24]:
# Binary KSADS depression flag per subject at baseline, parent report.
data_ksads_parent = load_ksads(path_to_data, reporter = 'parent')

shape at loading, baseline only
(11747, 1911)
good id sum
54
final number of subjects overall
11726


In [None]:
# Full path to the parent CBCL CSV under the data root.
# NOTE(review): load_cbcl() assembles this same relative file path internally — confirm this cell is still needed.
path_to_cbcl = path_to_data+'/abcd-data-release-5.1/core/mental-health/mh_p_cbcl.csv'