In [16]:
import itertools
import os
from itertools import cycle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

pd.set_option('max_colwidth', 100)

In [17]:
# Root folder that contains the `abcd-data-release-5.1` directory read by the loaders.
# The location is machine-specific, so it can be overridden with the ABCD_DATA_DIR
# environment variable; the previously hard-coded path is kept as the default.
path_to_data = os.environ.get(
    'ABCD_DATA_DIR',
    '/Users/zeleninam2/Documents/projects/cbcl_ksads_work/revision_2023/data',
)

In [72]:
# FUNCTION TO LOAD KSADS DATA, WHICH WE USE FOR _ALL_ ANALYSES

def load_ksads(path_to_mydata, reporter):
    """Load baseline KSADS depression diagnoses and collapse them into one binary label.

    Reads the KSADS summary-score CSV for the requested reporter, keeps only the
    `baseline_year_1_arm_1` rows, and reduces the five depression diagnosis
    columns (`ksads_1_840/841/843/844/846`) to a single gold-standard column.

    Parameters
    ----------
    path_to_mydata : str
        Directory that contains the `abcd-data-release-5.1` folder.
    reporter : {'parent', 'child'}
        'child' reads `mh_y_ksads_ss.csv` and the `_t` columns,
        'parent' reads `mh_p_ksads_ss.csv` and the `_p` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by `src_subject_id`, with one int column `ksads_dep_binary`:
        1 if at least one of the five diagnoses is present, otherwise 0.
        Subjects with a missing value in any of the five columns are dropped.

    Raises
    ------
    AssertionError
        If `reporter` is neither 'parent' nor 'child'.
    """
    print('BEGIN LOADING KSADS...')
    assert reporter in ['parent','child'], "wrong reporter" # reporter has to be either parent or child
    # Each reporter has its own file name and its own column suffix.
    ksads_file, suffix = {
        'child': ('mh_y_ksads_ss.csv', 't'),
        'parent': ('mh_p_ksads_ss.csv', 'p'),
    }[reporter]
    path_to_ksads_data = path_to_mydata+'/abcd-data-release-5.1/core/mental-health/'+ksads_file
    ksads_data = pd.read_csv(path_to_ksads_data, sep=',', header=0, low_memory=False)
    # Select only baseline rows
    ksads_data_bsl = ksads_data.loc[ksads_data.eventname == 'baseline_year_1_arm_1'].copy()
    print('\nshape at loading, baseline only')
    print(ksads_data_bsl.shape)
    # Keep only the subject id and the depression diagnosis columns
    depression_column_list = [f'ksads_1_{x}_{suffix}' for x in [840, 841, 843, 844, 846]]
    col_list = ['src_subject_id'] + depression_column_list
    ksads_bsl_id_alldep = ksads_data_bsl.loc[:, col_list].set_index('src_subject_id')
    # Cast to float so that missing-value codes can be replaced with NaN below
    ksads_bsl_id_alldep = ksads_bsl_id_alldep.astype(float)
    # Show the raw value distribution per column (missing data may be coded as 555 or 888)
    print('\nhow many nans:')
    for c in ksads_bsl_id_alldep.columns:
        print(ksads_bsl_id_alldep[c].value_counts())
    # Anything above 1 is a missing-value code, not a diagnosis flag
    ksads_bsl_id_alldep[ksads_bsl_id_alldep > 1.0] = np.nan
    # Drop every subject that has a missing value in any diagnosis column
    ksads_bsl_id_alldep = ksads_bsl_id_alldep.dropna()
    print('\n shape after droping nans')
    print(ksads_bsl_id_alldep.shape)
    # Gold standard: subject is positive if ANY of the diagnoses is present.
    # Thresholding the row sum keeps the label in {0, 1} even when a subject
    # carries more than one diagnosis (a plain sum would yield 2, 3, ...).
    ksads_bsl_id_dep = (ksads_bsl_id_alldep.sum(1) > 0).astype(int)
    # The unnamed Series becomes column 0; give it its final name
    ksads_bsl_id_dep = pd.DataFrame(ksads_bsl_id_dep).rename(columns = {0:'ksads_dep_binary'})
    print('\nfinal number of subjects overall')
    print(ksads_bsl_id_dep.shape[0])
    print('\n...FINISH LOADING KSADS')
    return ksads_bsl_id_dep

# FUNCTION TO LOAD CBCL DATA

def load_cbcl(path_to_mydata):
    """Load the baseline CBCL DSM-5 depression raw score for every subject.

    Reads `mh_p_cbcl.csv`, keeps only the `baseline_year_1_arm_1` rows and the
    `cbcl_scr_dsm5_depress_r` column, and drops subjects whose score is missing.

    Parameters
    ----------
    path_to_mydata : str
        Directory that contains the `abcd-data-release-5.1` folder.

    Returns
    -------
    pandas.DataFrame
        Indexed by `src_subject_id`, with the single column
        `cbcl_scr_dsm5_depress_r` and no missing values.
    """
    print('BEGIN LOADING CBCL...')
    # Build the path from the argument, not from the notebook-level global,
    # so the function reads the folder the caller actually asked for.
    path_to_cbcl = path_to_mydata+'/abcd-data-release-5.1/core/mental-health/mh_p_cbcl.csv'
    cbcl_data = pd.read_csv(path_to_cbcl, sep=',', header=0, low_memory=False)
    cbcl_data_bsl = cbcl_data.loc[cbcl_data.eventname == 'baseline_year_1_arm_1'].copy()
    print('\ncbcl shape at loading, baseline only')
    print(cbcl_data_bsl.shape)
    # Keep only the subject id and the depression score.
    # The RAW score is used (the T score cbcl_scr_dsm5_depress_t was used previously).
    col_list_cbcl = ['src_subject_id', 'cbcl_scr_dsm5_depress_r']
    cbcl_bsl_id_dep = cbcl_data_bsl.loc[:, col_list_cbcl].set_index('src_subject_id')
    print('\ncbcl values')
    print(cbcl_bsl_id_dep.cbcl_scr_dsm5_depress_r.value_counts())
    print ('\nshape after selecting columns')
    print(cbcl_bsl_id_dep.shape)
    # Remove subjects without a score
    cbcl_bsl_id_dep = cbcl_bsl_id_dep.dropna()
    print ('\nshape after checking for nans')
    print(cbcl_bsl_id_dep.shape)
    print('...FINISH LOADING CBCL')
    return cbcl_bsl_id_dep

# FUNCTION TO MERGE DATASETS

def merge_my_data(measure_data, ksads_data):
    """Attach a measure (e.g. CBCL) to the KSADS gold standard, keeping shared subjects only.

    The two frames are joined on their indexes with the KSADS frame on the left,
    because every subject in the result must have a gold-standard value. KSADS
    subjects without a matching measure row are reported and then removed.

    Parameters
    ----------
    measure_data : pandas.DataFrame
        Measure scores; joined on its index.
    ksads_data : pandas.DataFrame
        Gold-standard labels; joined on its index.

    Returns
    -------
    pandas.DataFrame
        Columns of `ksads_data` followed by columns of `measure_data`, restricted
        to index values present in both frames and with no missing values.
    """
    print('BEGIN MERGING...')
    # Report the input sizes so a mismatch is visible in the output
    print('samples in measure (cbcl,...) data:')
    print(measure_data.shape)
    print('samples in gold standard (ksads) data:')
    print(ksads_data.shape)
    # Left join on KSADS: we need all subjects to have a gold standard ksads number
    data_ksads_and_measure = pd.merge(ksads_data, measure_data, how='left', left_index = True, right_index = True, indicator=True)
    # Report rows that are present in only one of the two frames
    if len(data_ksads_and_measure.query('_merge != "both"')) != 0:
        print('merge indicator:')
        print(data_ksads_and_measure._merge.value_counts())
        print(data_ksads_and_measure.loc[data_ksads_and_measure._merge == 'left_only'])
    # Only keep rows that have data from *both* frames, then drop the helper column
    data_ksads_and_measure = data_ksads_and_measure.loc[data_ksads_and_measure._merge == 'both'].drop(columns=['_merge'])
    print('\nshape after merging')
    print(data_ksads_and_measure.shape)
    # Zero is a meaningful score (cbcl_r ranges from 0 to 19), so zeros are NOT
    # treated as missing here; only real NaNs are removed.
    # We shouldn't have any NaNs at this point, but check to be extra careful.
    data_ksads_and_measure = data_ksads_and_measure.dropna()
    print ('\nshape after checking for nans')
    print(data_ksads_and_measure.shape)
    print('...FINISH MERGING')
    return data_ksads_and_measure

# -----------------
# ANALYSIS FUNCTIONS
# -----------------



    

In [73]:
# Build the child-reported KSADS gold-standard labels and preview the first rows.
data_ksads_child = load_ksads(path_to_data, reporter = 'child')
data_ksads_child.head()

BEGIN LOADING KSADS...

shape at loading, baseline only
(11812, 1912)

how many nans:
ksads_1_840_t
0.0      11742
1.0         66
888.0        1
Name: count, dtype: int64
ksads_1_841_t
0.0      11782
1.0         26
888.0        1
Name: count, dtype: int64
ksads_1_843_t
0.0      11808
888.0        1
Name: count, dtype: int64
ksads_1_844_t
0.0      11808
888.0        1
Name: count, dtype: int64
ksads_1_846_t
0.0      11780
1.0         28
888.0        1
Name: count, dtype: int64

 shape after droping nans
(11808, 5)

final number of subjects overall
11808

...FINISH LOADING KSADS


Unnamed: 0_level_0,ksads_dep_binary
src_subject_id,Unnamed: 1_level_1
NDAR_INV003RTV85,0
NDAR_INV005V6D2C,0
NDAR_INV007W6H7B,0
NDAR_INV00BD7VDC,0
NDAR_INV00CY2MDM,0


In [74]:
# Build the parent-reported KSADS gold-standard labels (same loader, `_p` columns).
data_ksads_parent = load_ksads(path_to_data, reporter = 'parent')

BEGIN LOADING KSADS...

shape at loading, baseline only
(11747, 1911)

how many nans:
ksads_1_840_p
0.0      11703
1.0         23
888.0        3
Name: count, dtype: int64
ksads_1_841_p
0.0      11707
1.0         19
888.0        3
Name: count, dtype: int64
ksads_1_843_p
0.0      11726
888.0        3
Name: count, dtype: int64
ksads_1_844_p
0.0      11726
888.0        3
Name: count, dtype: int64
ksads_1_846_p
0.0      11712
1.0         14
888.0        3
Name: count, dtype: int64

 shape after droping nans
(11726, 5)

final number of subjects overall
11726

...FINISH LOADING KSADS


In [75]:
# Load the baseline CBCL depression raw scores and preview the first rows.
cbcl_data = load_cbcl(path_to_data)
cbcl_data.head()

BEGIN LOADING CBCL...

cbcl shape at loading, baseline only
(11862, 203)
\cbcl values
cbcl_scr_dsm5_depress_r
0.0     6085
1.0     2390
2.0     1317
3.0      731
4.0      480
5.0      290
6.0      201
7.0      124
8.0       81
9.0       53
10.0      42
11.0      20
12.0      19
13.0      10
14.0       8
16.0       3
15.0       3
17.0       1
18.0       1
19.0       1
Name: count, dtype: int64

shape after merging
(11862, 1)

shape after checking for nans
(11860, 1)
...FINISH LOADING CBCL


Unnamed: 0_level_0,cbcl_scr_dsm5_depress_r
src_subject_id,Unnamed: 1_level_1
NDAR_INV003RTV85,0.0
NDAR_INV005V6D2C,0.0
NDAR_INV007W6H7B,0.0
NDAR_INV00BD7VDC,1.0
NDAR_INV00CY2MDM,1.0


In [76]:
# Join CBCL scores onto the parent-reported KSADS labels; only subjects present in both are kept.
dat_cbcl_ksads_p = merge_my_data(cbcl_data, data_ksads_parent)

BEGIN MERGING...
samples in measure (cbcl,...) data:
(11860, 1)
samples in gold standard (ksads) data:
(11726, 1)
merge indicator:
_merge
both          11719
left_only         7
right_only        0
Name: count, dtype: int64
                  ksads_dep_binary  cbcl_scr_dsm5_depress_r     _merge
src_subject_id                                                        
NDAR_INV6PEZ2Z0K                 0                      NaN  left_only
NDAR_INV9PVR76W7                 0                      NaN  left_only
NDAR_INVC2ZW147D                 0                      NaN  left_only
NDAR_INVFZWKRBW7                 0                      NaN  left_only
NDAR_INVJHJDGEFN                 0                      NaN  left_only
NDAR_INVL9NUBDAN                 0                      NaN  left_only
NDAR_INVNR9XMXRM                 0                      NaN  left_only
\shape after merging
(11719, 2)

shape after checking for nans
(11719, 2)
...FINISH MERGING


In [77]:
# Join CBCL scores onto the child-reported KSADS labels; only subjects present in both are kept.
dat_cbcl_ksads_c = merge_my_data(cbcl_data, data_ksads_child)

BEGIN MERGING...
samples in measure (cbcl,...) data:
(11860, 1)
samples in gold standard (ksads) data:
(11808, 1)
merge indicator:
_merge
both          11801
left_only         7
right_only        0
Name: count, dtype: int64
                  ksads_dep_binary  cbcl_scr_dsm5_depress_r     _merge
src_subject_id                                                        
NDAR_INV9PVR76W7                 0                      NaN  left_only
NDAR_INVC2ZW147D                 0                      NaN  left_only
NDAR_INVFZWKRBW7                 0                      NaN  left_only
NDAR_INVJHJDGEFN                 0                      NaN  left_only
NDAR_INVL9NUBDAN                 0                      NaN  left_only
NDAR_INVNR9XMXRM                 0                      NaN  left_only
NDAR_INVTRG5GX9T                 0                      NaN  left_only
\shape after merging
(11801, 2)

shape after checking for nans
(11801, 2)
...FINISH MERGING
