In [3]:
import pandas as pd
pd.set_option('max_colwidth', 100)
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import itertools
import scipy.stats as st

In [None]:
# looking at what waves exist at YSR, 
# which of these waves also exist in KSADS,
# and which of these waves, existing in both dataframes (or all three), have the max amount of depressed

From paper comments:

"The elephant in the room is ignored, which is that CBCL is completed by parents, and there is a self-report version too (YRS). It might partly explain the discrepancy across KSADS informants in the ABCD dataset, which could be tested if the YRS is available. I am aware this would not be preregistered but neither it was the use of self-report KSADS. I am not sure whether there would be enough participants to test it, though."

From https://wiki.abcdstudy.org/release-notes/non-imaging/mental-health.html:

Brief Problem Monitor (ASEBA)
Release 5.0 Data Table: mh_y_bpm

Measure Description: Normed multi-informant monitoring of children’s functioning. This is the youth self-report. The BPM’s Internalizing (INT), Attention Problems (ATT), Externalizing (EXT), and Total Problems (TOT) scales comprise items from the ASEBA Youth Self-Report (YSR). The items, scales, and norms are based on decades of research and practical experience, as summarized in the BPM Manual (Achenbach, McConaughy, Ivanova, & Rescorla, 2017).

ABCD Subdomain: Broad Psychopathology

Number of Variables: 48

Summary Score(s): Yes

Measurement Waves Administered: Annually since the 1-year follow-up

Modifications since initial administration: None

Notes and special considerations: None

Reference: Achenbach, T. M. (2009). The Achenbach System of Empirically Based Assessment (ASEBA): Development, Findings, Theory, and Applications. Burlington, VT: University of Vermont Research Center for Children, Youth, & Families. Find here

In [14]:
path_to_data = '/Users/zeleninam2/Documents/projects/cbcl_ksads_work/revision_2023/data'
path_to_cbcl = path_to_data+'/abcd-data-release-5.1/core/mental-health/mh_p_cbcl.csv'
path_to_ysr = path_to_data+'/abcd-data-release-5.1/core/mental-health/mh_y_bpm.csv'
path_to_ksads_c = path_to_data+'/abcd-data-release-5.1/core/mental-health/mh_y_ksads_ss.csv'
path_to_ksads_p = path_to_data+'/abcd-data-release-5.1/core/mental-health/mh_p_ksads_ss.csv'

In [5]:
ysr_data = pd.read_csv(path_to_ysr, sep=',', header=0, low_memory=False)
ysr_data.head()

Unnamed: 0,src_subject_id,eventname,bpm_1_y,bpm_2_y,bpm_3_y,bpm_4_y,bpm_5_y,bpm_6_y,bpm_7_y,bpm_8_y,...,bpm_y_ss_external_mean,bpm_y_ss_external_nm,bpm_y_ss_external_nt,bpm_y_scr_totalprob_r,bpm_y_scr_totalprob_t,bpm_y_scr_totalprob_nm,bpm_y_scr_totalprob_nt,bpm_y_ss_totalprob_mean,bpm_y_ss_totalprob_nm,bpm_y_ss_totalprob_nt
0,NDAR_INV003RTV85,6_month_follow_up_arm_1,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,...,0.428571,0.0,7.0,9.0,53.0,0.0,19.0,0.473684,0.0,19.0
1,NDAR_INV003RTV85,1_year_follow_up_y_arm_1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.285714,0.0,7.0,5.0,50.0,0.0,19.0,0.263158,0.0,19.0
2,NDAR_INV003RTV85,18_month_follow_up_arm_1,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.285714,0.0,7.0,10.0,55.0,0.0,19.0,0.526316,0.0,19.0
3,NDAR_INV003RTV85,2_year_follow_up_y_arm_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,7.0,0.0,50.0,0.0,19.0,0.0,0.0,19.0
4,NDAR_INV003RTV85,30_month_follow_up_arm_1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,7.0,1.0,50.0,0.0,19.0,0.052632,0.0,19.0


I'm assuming we either need Internal-Raw score or Internal-T score (bpm_y_scr_internal_r or bpm_y_scr_internal_t)

Checking sample size

In [6]:
ysr_data_bsl = ysr_data.loc[ysr_data.eventname == 'baseline_year_1_arm_1'].copy()
ysr_data_bsl.head()

Unnamed: 0,src_subject_id,eventname,bpm_1_y,bpm_2_y,bpm_3_y,bpm_4_y,bpm_5_y,bpm_6_y,bpm_7_y,bpm_8_y,...,bpm_y_ss_external_mean,bpm_y_ss_external_nm,bpm_y_ss_external_nt,bpm_y_scr_totalprob_r,bpm_y_scr_totalprob_t,bpm_y_scr_totalprob_nm,bpm_y_scr_totalprob_nt,bpm_y_ss_totalprob_mean,bpm_y_ss_totalprob_nm,bpm_y_ss_totalprob_nt


In [7]:
ysr_data.eventname.value_counts()

eventname
6_month_follow_up_arm_1     11389
1_year_follow_up_y_arm_1    11220
18_month_follow_up_arm_1    11084
2_year_follow_up_y_arm_1    10973
3_year_follow_up_y_arm_1    10336
30_month_follow_up_arm_1    10256
42_month_follow_up_arm_1     8566
4_year_follow_up_y_arm_1     4754
Name: count, dtype: int64

In [9]:
# internalizing problems
for c in ysr_data.columns:
    print(c)

src_subject_id
eventname
bpm_1_y
bpm_2_y
bpm_3_y
bpm_4_y
bpm_5_y
bpm_6_y
bpm_7_y
bpm_8_y
bpm_9_y
bpm_10_y
bpm_11_y
bpm_12_y
bpm_13_y
bpm_14_y
bpm_15_y
bpm_16_y
bpm_17_y
bpm_18_y
bpm_19_y
bpm_admin
bpm_y_scr_attention_r
bpm_y_scr_attention_t
bpm_y_scr_attention_nm
bpm_y_scr_attention_nt
bpm_y_ss_attention_mean
bpm_y_ss_attention_nm
bpm_y_ss_attention_nt
bpm_y_scr_internal_r
bpm_y_scr_internal_t
bpm_y_scr_internal_nm
bpm_y_scr_internal_nt
bpm_y_ss_internal_mean
bpm_y_ss_internal_nm
bpm_y_ss_internal_nt
bpm_y_scr_external_r
bpm_y_scr_external_t
bpm_y_scr_external_nm
bpm_y_scr_external_nt
bpm_y_ss_external_mean
bpm_y_ss_external_nm
bpm_y_ss_external_nt
bpm_y_scr_totalprob_r
bpm_y_scr_totalprob_t
bpm_y_scr_totalprob_nm
bpm_y_scr_totalprob_nt
bpm_y_ss_totalprob_mean
bpm_y_ss_totalprob_nm
bpm_y_ss_totalprob_nt


In [10]:
ysr_data.bpm_y_scr_internal_r.value_counts()

bpm_y_scr_internal_r
0.0     25731
1.0     14180
2.0      9993
3.0      6533
4.0      4497
5.0      2813
6.0      2101
7.0      1265
8.0       815
9.0       546
10.0      353
11.0      195
12.0      135
Name: count, dtype: int64

In [11]:
ysr_data.eventname.value_counts()

eventname
6_month_follow_up_arm_1     11389
1_year_follow_up_y_arm_1    11220
18_month_follow_up_arm_1    11084
2_year_follow_up_y_arm_1    10973
3_year_follow_up_y_arm_1    10336
30_month_follow_up_arm_1    10256
42_month_follow_up_arm_1     8566
4_year_follow_up_y_arm_1     4754
Name: count, dtype: int64

eventname
6_month_follow_up_arm_1     11389 <br>
1_year_follow_up_y_arm_1    11220 <br>
18_month_follow_up_arm_1    11084 <br>
2_year_follow_up_y_arm_1    10973 <br>
3_year_follow_up_y_arm_1    10336 <br>
30_month_follow_up_arm_1    10256 <br>
42_month_follow_up_arm_1     8566 <br>
4_year_follow_up_y_arm_1     4754

In [15]:
ksads_data_c = pd.read_csv(path_to_ksads_c, sep=',', header=0, low_memory=False)
ksads_data_p = pd.read_csv(path_to_ksads_p, sep=',', header=0, low_memory=False)

In [16]:
ksads_data_c.eventname.value_counts()

eventname
baseline_year_1_arm_1       11812
1_year_follow_up_y_arm_1    11095
2_year_follow_up_y_arm_1    10883
3_year_follow_up_y_arm_1    10326
4_year_follow_up_y_arm_1     4674
Name: count, dtype: int64

In [17]:
ksads_data_p.eventname.value_counts()

eventname
baseline_year_1_arm_1       11747
1_year_follow_up_y_arm_1    11103
2_year_follow_up_y_arm_1    10756
3_year_follow_up_y_arm_1    10330
4_year_follow_up_y_arm_1     4754
Name: count, dtype: int64

Only 1, 2, 3, and 4 year follow-ups exist in both YSR and KSADS
Let's see how many depressed kids there are in each wave (intuitively 1 yr follow-up should have most becaus eit has most data)

In [36]:
def count_depressed_in_ksads(ksads_dat, reporter, wave):
    print('\n WAVE: %s' % wave)
    ksads_data_wave = ksads_dat.loc[ksads_dat.eventname == wave].copy()
    if reporter == 'child':
        depression_column_list = [f'ksads_1_{x}_t' for x in [840, 841, 843, 844, 846]]
    elif reporter == 'parent':
        depression_column_list = [f'ksads_1_{x}_p' for x in [840, 841, 843, 844, 846]]
    col_list = ['src_subject_id']
    col_list += depression_column_list
    ksads_data_wave = ksads_data_wave.loc[:, col_list] 
    # change types to float so I can work with them 
    ksads_data_wave = ksads_data_wave.set_index('src_subject_id')
    ksads_data_wave = ksads_data_wave.astype(float)
    #print(ksads_data_wave.head())
    ksads_data_wave[ksads_data_wave > 1.0] = np.nan
    ksads_data_wave = ksads_data_wave.dropna()
    ksads_data_wave = (ksads_data_wave.sum(1)).astype(int)
    ksads_data_wave = pd.DataFrame(ksads_data_wave).rename(columns = {0:'ksads_dep_binary'})
    #print(ksads_data_wave.head())
    print('shape: %d' % ksads_data_wave.shape[0])
    depressed = ksads_data_wave.sum(0)
    print('depressed: %d' % depressed)

In [38]:
# child-report ksads

for wave in ['baseline_year_1_arm_1', '1_year_follow_up_y_arm_1', '2_year_follow_up_y_arm_1', '3_year_follow_up_y_arm_1', '4_year_follow_up_y_arm_1']:
    count_depressed_in_ksads(ksads_data_c, 'child', wave)


 WAVE: baseline_year_1_arm_1
shape: 11808
depressed: 120

 WAVE: 1_year_follow_up_y_arm_1
shape: 0
depressed: 0

 WAVE: 2_year_follow_up_y_arm_1


  print('depressed: %d' % depressed)
  print('depressed: %d' % depressed)
  print('depressed: %d' % depressed)


shape: 10871
depressed: 97

 WAVE: 3_year_follow_up_y_arm_1
shape: 3
depressed: 0

 WAVE: 4_year_follow_up_y_arm_1
shape: 0
depressed: 0


  print('depressed: %d' % depressed)
  print('depressed: %d' % depressed)


I checked and these weird numbers are correct. The data is basicalle all NaN, except for the 2 year follow up.

In [39]:
# parent-report ksads

for wave in ['baseline_year_1_arm_1', '1_year_follow_up_y_arm_1', '2_year_follow_up_y_arm_1', '3_year_follow_up_y_arm_1', '4_year_follow_up_y_arm_1']:
    count_depressed_in_ksads(ksads_data_p, 'parent', wave)


 WAVE: baseline_year_1_arm_1
shape: 11726
depressed: 56

 WAVE: 1_year_follow_up_y_arm_1
shape: 0
depressed: 0

 WAVE: 2_year_follow_up_y_arm_1


  print('depressed: %d' % depressed)
  print('depressed: %d' % depressed)
  print('depressed: %d' % depressed)


shape: 10752
depressed: 85

 WAVE: 3_year_follow_up_y_arm_1
shape: 1
depressed: 0

 WAVE: 4_year_follow_up_y_arm_1
shape: 0
depressed: 0


  print('depressed: %d' % depressed)
  print('depressed: %d' % depressed)
