In [6]:
import pandas as pd
pd.set_option('max_colwidth', 100)
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import itertools
import scipy.stats as st

In [82]:
path_to_data = '/Users/zeleninam2/Documents/projects/cbcl_ksads_work/revision_2023/data'
path_to_ksads5_child = path_to_data+'/abcd-data-release-5.0/core/mental-health/mh_y_ksads_ss.csv' 
path_to_ksads4_child = path_to_data+'/abcd-data-release-4/abcd_ksad501.txt'
path_to_ksads3_child = path_to_data+'/abcd-data-release-3/abcd_ksad501.txt'

path_to_ksads5_parent = path_to_data+'/abcd-data-release-5.0/core/mental-health/mh_p_ksads_ss.csv' 
path_to_ksads4_parent = path_to_data+'/abcd-data-release-4/abcd_ksad01.txt'
path_to_ksads3_parent = path_to_data+'/abcd-data-release-3/abcd_ksad01.txt'

depression_column_list = [f'ksads_1_{x}_t' for x in [840, 841, 843, 844, 846]]

# first observation - besides the diagnoses in depression_column_list 
# (which is identical to the list of diagnosis in release 3),
# release 5 has some "ksads2" diagnoses 
# they seem to be coming from another table from the previous releases 
# the table used to be called ksads2_daic_use_only_01
# I think DAIC stands for Distress Analysis Interview Corpus, and this is some other data we are not using
# in any case, these ksads2 diagnoses are not new 

In [90]:
# LOADING KSADS DATA

def load_ksads_simple(release):

    if release == '3':
        path_to_ksads = path_to_ksads3_child
        separator = '\t'
    elif release == '4':
        path_to_ksads = path_to_ksads4_child
        separator = '\t'
    elif release == '5':
        path_to_ksads = path_to_ksads5_child
        separator = ','        
    
    #load the initial table
    ksads_data = pd.read_csv(path_to_ksads, sep=separator, header=0, low_memory=False)
    ksads_data_bsl = ksads_data.loc[ksads_data.eventname == 'baseline_year_1_arm_1'].copy()
    
    return ksads_data_bsl

In [91]:
data3 = load_ksads_simple(release='3')
data4 = load_ksads_simple(release='4')
data5 = load_ksads_simple(release='5')

In [116]:
# ------
tmp3 = data3.loc[:,['src_subject_id']+depression_column_list]
tmp5 = data5.loc[:,['src_subject_id']+depression_column_list]
dep35 = tmp5.merge(tmp3, how='inner', on='src_subject_id', suffixes=['_5', '_3'])


In [120]:
dep35.dtypes

src_subject_id      object
ksads_1_840_t_5    float64
ksads_1_841_t_5    float64
ksads_1_843_t_5    float64
ksads_1_844_t_5    float64
ksads_1_846_t_5    float64
ksads_1_840_t_3     object
ksads_1_841_t_3     object
ksads_1_843_t_3     object
ksads_1_844_t_3     object
ksads_1_846_t_3     object
dtype: object

In [124]:
for diagn in depression_column_list:
    print(diagn)
    dep35[f"{diagn}_3"] = dep35[f"{diagn}_3"].astype(float)
    print((dep35[f"{diagn}_3"]==dep35[f"{diagn}_5"]).mean())

ksads_1_840_t
0.9999152255001695
ksads_1_841_t
0.9999152255001695
ksads_1_843_t
0.9999152255001695
ksads_1_844_t
0.9999152255001695
ksads_1_846_t
0.9999152255001695


In [125]:
dep35['mismatch'] = False 
for diagn in depression_column_list:
    dep35.loc[dep35[f"{diagn}_3"]!=dep35[f"{diagn}_5"],'mismatch']=True

In [126]:
dep35.query('mismatch')

Unnamed: 0,src_subject_id,ksads_1_840_t_5,ksads_1_841_t_5,ksads_1_843_t_5,ksads_1_844_t_5,ksads_1_846_t_5,ksads_1_840_t_3,ksads_1_841_t_3,ksads_1_843_t_3,ksads_1_844_t_3,ksads_1_846_t_3,mismatch
10202,NDAR_INVVGEWB754,0.0,0.0,0.0,0.0,0.0,555.0,555.0,555.0,555.0,555.0,True


In [95]:
# we previously used ksads3 data. Ksads4 data is all nans:
# data4.head(20)

print('\n\nDEPRESSION COLUMNS:')
for depression_diag in depression_column_list:
    print('\n')
    print(data4[depression_diag].value_counts())



DEPRESSION COLUMNS:


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


In [94]:
#data3['ksads_dep_binary'].equals(data5['ksads_dep_binary'])

In [106]:
# pull only positive cases from both datasets

def pull_positives(dataset, depression_column_list):
    # KSADS - only leave columns with name id and depression (anxiety, adhd)  diagnoses
    col_list = ['src_subject_id']
    col_list += depression_column_list
    ksads_bsl_id_alldepandanx = dataset.loc[:, col_list]
    print('\n\nDEPRESSION COLUMNS:')
    for depression_diag in depression_column_list:
        print('\n')
        print(ksads_bsl_id_alldepandanx[depression_diag].value_counts())
    #print(ksads_bsl_id_alldepandanx.head())
    ksads_bsl_id_alldepandanx = ksads_bsl_id_alldepandanx.set_index('src_subject_id')
    ksads_bsl_id_alldepandanx = ksads_bsl_id_alldepandanx.astype(int)
    ksads_bsl_id_alldepandanx['any_depression'] = (ksads_bsl_id_alldepandanx.loc[:, depression_column_list] > 0).sum(1) > 0
    is_pos =  ksads_bsl_id_alldepandanx['any_depression']==1
    pos_ksads_all = ksads_bsl_id_alldepandanx[is_pos]
    return pos_ksads_all

data5_posonly = pull_positives(data5, depression_column_list)
data3_posonly = pull_positives(data3, depression_column_list)



DEPRESSION COLUMNS:


ksads_1_840_t
0.0    11730
1.0       66
Name: count, dtype: int64


ksads_1_841_t
0.0    11769
1.0       27
Name: count, dtype: int64


ksads_1_843_t
0.0    11796
Name: count, dtype: int64


ksads_1_844_t
0.0    11796
Name: count, dtype: int64


ksads_1_846_t
0.0    11768
1.0       28
Name: count, dtype: int64


DEPRESSION COLUMNS:


ksads_1_840_t
0      11739
555       73
1         66
Name: count, dtype: int64


ksads_1_841_t
0      11778
555       73
1         27
Name: count, dtype: int64


ksads_1_843_t
0      11805
555       73
Name: count, dtype: int64


ksads_1_844_t
0      11805
555       73
Name: count, dtype: int64


ksads_1_846_t
0      11777
555       73
1         28
Name: count, dtype: int64


In [97]:
print(data5_posonly.shape)
print(data3_posonly.shape)

(120, 6)
(193, 6)


In [98]:
# but I notice that data3 has nans while data5 doesn't
# so I'm cleaning the nans next

In [99]:
# clean nans in data3

def clean_nans(dataset):
    dataset[dataset > 1] = np.nan
    nnans = dataset.isnull().sum(1)
    bad_ids = nnans[nnans > 1].index
    good_ids = nnans[nnans == 0].index
    #print(good_ids)
    ksads_dat = (dataset.loc[good_ids]) 
    return ksads_dat

In [100]:
data3_posonly_nonans = clean_nans(data3_posonly)

In [101]:
mydata3 = data3_posonly_nonans
mydata5 = data5_posonly

In [102]:
# compare shapes
print(mydata3.shape)
print(mydata5.shape)

# they are equal

(120, 6)
(120, 6)


In [103]:
# compare subjects ids
mydata3_ids = mydata3.index.values.tolist()
mydata5_ids = mydata5.index.values.tolist()

def compare(s, t):
    return sorted(s) == sorted(t)

compare(mydata3_ids, mydata5_ids)

True

In [104]:
# -------------------------------------------
# now iterate through each specific diagnosis

def is_pos(dataset, diagnosis):
    is_pos = dataset[diagnosis] == 1
    pos_all = dataset[is_pos]
    return pos_all

depression_column_list = [f'ksads_1_{x}_t' for x in [840, 841, 843, 844, 846]]
for depression_diagnosis in depression_column_list:
    print(depression_diagnosis)
    diagnosis_pos_3 = is_pos(mydata3, depression_diagnosis)
    diagnosis_pos_5 = is_pos(mydata5, depression_diagnosis)
    mydata3_ids_diagnosis = diagnosis_pos_3.index.values.tolist()
    mydata5_ids_diagnosis = diagnosis_pos_5.index.values.tolist()
    print(compare(mydata3_ids_diagnosis, mydata5_ids_diagnosis))

ksads_1_840_t
True
ksads_1_841_t
True
ksads_1_843_t
True
ksads_1_844_t
True
ksads_1_846_t
True


In [None]:
#--------------- ksads2 diagnoses --------------#
'''
• DEPRESSION
1. Major Depressive Disorder Present
        abcd_ksad501: ksads_1_840_t
        ksads2daic_use_only01: ksads2_1_790_t
2. Major Depressive Disorder, Current, in Partial Remission
        abcd_ksad501: ksads_1_841_t
        ksads2daic_use_only01: ksads2_1_791_t
3. Persistent Depressive Disorder (Dysthymia), Present
        abcd_ksad501: ksads_1_843_t
        ksads2daic_use_only01: ksads2_1_793_t
4. Persistent Depressive Disorder (Dysthymia), In Partial Remission
        abcd_ksad501: ksads_1_844_t
        ksads2daic_use_only01: ksads2_1_794_t
5. Unspecified Depressive Disorder, Current
        abcd_ksad501: ksads_1_846_t
        ksads2daic_use_only01: ksads2_1_796_t
''''''

In [114]:
diagnoses_ksads2 = ['ksads2_1_790_t', 'ksads2_1_791_t', 'ksads2_1_793_t', 'ksads2_1_794_t', 'ksads2_1_796_t']
#diagnoses_of_interest = ['ksads_1_840_t', 'ksads2_1_790_t'] # MDD, Present

def load_ksads2(dataset, diagnoses):
    col_list = ['src_subject_id']
    col_list += diagnoses
    dataset1 = dataset.loc[:, col_list]
    print('\n\nDEPRESSION COLUMNS:')
    for depression_diag in diagnoses:
        print('\n')
        print(dataset1[depression_diag].value_counts())
    return dataset1

data_ksads2_all = load_ksads2(data5, diagnoses_ksads2)



DEPRESSION COLUMNS:


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


Series([], Name: count, dtype: int64)


In [115]:
data5.loc[:, ['ksads2_1_790_t', 'ksads2_1_791_t', 'ksads2_1_793_t', 'ksads2_1_794_t', 'ksads2_1_796_t']].head(5)

Unnamed: 0,ksads2_1_790_t,ksads2_1_791_t,ksads2_1_793_t,ksads2_1_794_t,ksads2_1_796_t
0,,,,,
4,,,,,
8,,,,,
11,,,,,
14,,,,,


In [None]:
['ksads2_1_790_p', 'ksads2_1_791_p', 'ksads2_1_793_p', 'ksads2_1_794_p', 'ksads2_1_796_p']