Initial data exploration script for ABCD Dataset.

ABCD Youth Diagnostic Interview for DSM-5 5 (KSADS-5) (abcd_ksad501): mental health diagnosis - youth questions

In [120]:
import pandas as pd
import os

In [121]:
# Directory holding the ABCD data files. Set the ABCD_DATA_DIR environment
# variable to point at a different location; when it is unset, the original
# local path is used so existing runs behave exactly as before.
data_dir = os.environ.get('ABCD_DATA_DIR', '/Users/margotwagner/ucsd/research/ABCD/data')
# KSADS-5 youth interview table (tab-separated text file)
ksads_file = 'abcd_ksad501.txt'

In [122]:
def _split_ksads_line(raw_line):
    """Strip newlines from both ends, drop every double quote, split on tabs."""
    return raw_line.strip("\n").replace('"', "").split("\t")


with open(os.path.join(data_dir, ksads_file)) as file:

    # line 0: element names
    header_line = next(file, None)
    if header_line is None:
        # fail with a clear message instead of a NameError further down
        raise ValueError('KSADS file is empty: no element-name header line found')
    elem_name = _split_ksads_line(header_line)

    # keep every column up to and including past depressed mood (ksads_1_2_t)
    depression_i = elem_name.index('ksads_1_2_t')
    n_keep = depression_i + 1

    # line 1: element descriptions
    description_line = next(file, None)
    if description_line is not None:
        about_elem = _split_ksads_line(description_line)
        print(about_elem[:n_keep])

    # lines 2-end: one subject entry per line. Blank lines (e.g. a trailing
    # newline at end of file) are skipped so they do not become bogus rows.
    data_list = [
        _split_ksads_line(line)[:n_keep]
        for line in file
        if line.strip()
    ]

    # convert to dataframe
    data = pd.DataFrame(data_list, columns=elem_name[:n_keep])

['collection_id', 'abcd_ksad501_id', 'dataset_id', 'The NDAR Global Unique Identifier (GUID) for research subject', "Subject ID how it's defined in lab/project", 'Date on which the interview/genetic test/sampling/imaging/biospecimen was completed. MM/DD/YYYY', 'Age in months at the time of the interview/test/sampling/imaging.', 'Sex of subject at birth', 'The event name for which the data was collected', 'Unique ID provided by KSADS for dataset', 'Symptom - Depressed Mood, Present', 'Symptom - Depressed Mood, Past']


In [190]:
# zero-row slice: displays only the column names of the loaded table
data.iloc[:0]

Unnamed: 0,collection_id,abcd_ksad501_id,dataset_id,subjectkey,src_subject_id,interview_date,interview_age,sex,eventname,ksads_import_id_t,ksads_1_1_t,ksads_1_2_t


We start with subjects reporting present depressed mood (`ksads_1_1_t`) and look for trends in age, sex, and past depressed mood (`ksads_1_2_t`). Comorbidities are a natural next step, using the other KSADS categories.

In [124]:
# total number of rows loaded, across all events
print('Number of entries:', len(data))

Number of entries: 33515


Split the data by event name (baseline, 1-year, 2-year)

In [135]:
# One frame per assessment wave, selected on the `eventname` column.
event_names = data['eventname']

# baseline visit
baseline = data[event_names.eq('baseline_year_1_arm_1')]

# 1-year follow-up visit
year_1 = data[event_names.eq('1_year_follow_up_y_arm_1')]

# 2-year follow-up visit
year_2 = data[event_names.eq('2_year_follow_up_y_arm_1')]

In [136]:
# row count for each assessment wave
print('Number of baseline entries:', len(baseline))
print('Number of year 1 entries:', len(year_1))
print('Number of year 2 entries:', len(year_2))

Number of baseline entries: 11876
Number of year 1 entries: 11225
Number of year 2 entries: 10414


In [140]:
# distinct values of the present depressed-mood item among year-1 rows
year_1_present_values = year_1['ksads_1_1_t'].unique()
print('KSADS Inclusion status (year 1):', year_1_present_values, '- not administered in the assessment')

KSADS Inclusion status (year 1): ['555'] - not administered in the assessment


In [130]:
# distinct codes found in the present / past depressed-mood columns (all events)
present_values = data['ksads_1_1_t'].unique()
past_values = data['ksads_1_2_t'].unique()
print('KSADS Present Values:', present_values)
print('KSADS Past Values:', past_values)

# legend for the non-binary codes
print('555 is "Not administered in the assessment"')
print('888 is "Question not asked due to primary question response (branching logic)"')

KSADS Present Values: ['0' '555' '1' '' '888']
KSADS Past Values: ['0' '555' '1' '' '888']
555 is "Not administered in the assessment"
888 is "Question not asked due to primary question response (branching logic)"


Split by depressed and not depressed (present value)

In [141]:
# Partition each wave on the present depressed-mood item (ksads_1_1_t):
# '1' -> depressed group, '0' -> control group. Rows with any other code
# ('555', '888', '') end up in neither group.

# BASELINE
baseline_present = baseline['ksads_1_1_t']

depressed_baseline = baseline[baseline_present.eq('1')]
print('Number of depressed subjects (baseline):', len(depressed_baseline))

control_baseline = baseline[baseline_present.eq('0')]
print('Number of control subjects (baseline):', len(control_baseline))


# YEAR 2
year_2_present = year_2['ksads_1_1_t']

depressed_year_2 = year_2[year_2_present.eq('1')]
print('Number of depressed subjects (year 2):', len(depressed_year_2))

control_year_2 = year_2[year_2_present.eq('0')]
print('Number of control subjects (year 2):', len(control_year_2))


Number of depressed subjects (baseline): 288
Number of control subjects (baseline): 11516
Number of depressed subjects (year 2): 210
Number of control subjects (year 2): 10102


Sex distribution (sex at birth) for depressed and non-depressed subjects

In [187]:
def gender_distribution(data_input, title="DATA"):
    """Print the percentage of rows per ``sex`` value for three cohorts.

    Parameters
    ----------
    data_input : sequence of three DataFrames
        ``(total, depressed, control)``, in that order. Each frame is grouped
        on its ``sex`` column.
    title : str
        Label placed in the header line.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    def _percent_by_sex(frame):
        # rows per sex value, as a percentage of all rows in this frame
        counts = frame.groupby('sex').size()
        return 100 * counts / frame.shape[0]

    all_subjects, with_depression, without_depression = data_input

    print('\t', title, 'SEX DATA')
    print('---------------------------------')
    print("TOTAL DISTRIBUTION:", _percent_by_sex(all_subjects))
    print("DEPRESSED DISTRIBUTION:", _percent_by_sex(with_depression))
    print("CONTROL DISTRIBUTION:", _percent_by_sex(without_depression))

Baseline

In [188]:
# sex breakdown at baseline: all subjects vs. depressed vs. control
gender_distribution([baseline, depressed_baseline, control_baseline], title="BASELINE")

	 BASELINE SEX DATA
---------------------------------
TOTAL DISTRIBUTION: sex
F    47.827551
M    52.172449
dtype: float64
DEPRESSED DISTRIBUTION: sex
F    46.875
M    53.125
dtype: float64
CONTROL DISTRIBUTION: sex
F    47.81174
M    52.18826
dtype: float64


Year 2

In [189]:
# sex breakdown at the 2-year follow-up: all subjects vs. depressed vs. control
gender_distribution([year_2, depressed_year_2, control_year_2], title="YEAR 2")

	 YEAR 2 SEX DATA
---------------------------------
TOTAL DISTRIBUTION: sex
F    47.647398
M    52.352602
dtype: float64
DEPRESSED DISTRIBUTION: sex
F    66.666667
M    33.333333
dtype: float64
CONTROL DISTRIBUTION: sex
F    47.238171
M    52.761829
dtype: float64


Age stats

In [176]:
def age_stats(data_input, title='DATA'):
    """Print summary statistics for the ``interview_age`` column (age in months).

    Parameters
    ----------
    data_input : pandas.DataFrame
        Needs an ``interview_age`` column whose values convert to ``int``.
    title : str
        Label placed in the header line.

    Returns
    -------
    None
        Everything is written to stdout.

    Raises
    ------
    ValueError
        If a value in ``interview_age`` cannot be converted to ``int``.
    """
    print('\t\t', title, 'AGE DATA')
    print('---------------------------------------------------------------')

    # convert once instead of re-casting the column for every statistic
    ages = data_input.interview_age.astype(int)

    print('SUMMARY STATS (months):', ages.describe())
    print('RANGE (months):', ages.max() - ages.min())
    print('MEDIAN (months):', ages.median())
    print('MODE (months):', ages.mode())

    if ages.empty:
        # no rows, so there is no mean to split into years and months
        print('MEAN:', 'n/a')
        return

    # Round to whole months BEFORE splitting into years and months, so a
    # remainder that rounds up to 12 carries into the years instead of
    # printing an impossible "12 months".
    mean_months = round(float(ages.mean()))
    years, months = divmod(mean_months, 12)
    print('MEAN:', years, 'years', months, 'months')

Baseline

In [177]:
# age summary for all baseline rows
age_stats(baseline, title='BASELINE')

		 BASELINE AGE DATA
---------------------------------------------------------------
SUMMARY STATS (months): count    11876.000000
mean       118.979791
std          7.495937
min        107.000000
25%        112.000000
50%        119.000000
75%        126.000000
max        133.000000
Name: interview_age, dtype: float64
RANGE (months): 26
MEDIAN (months): 119.0
MODE (months): 0    108
dtype: int64
MEAN: 9.0 years 11.0 months


Year 2

In [178]:
# age summary for all 2-year follow-up rows
age_stats(year_2, title='YEAR 2')

		 YEAR 2 AGE DATA
---------------------------------------------------------------
SUMMARY STATS (months): count    10414.000000
mean       144.038890
std          7.950735
min        127.000000
25%        137.000000
50%        144.000000
75%        151.000000
max        168.000000
Name: interview_age, dtype: float64
RANGE (months): 41
MEDIAN (months): 144.0
MODE (months): 0    138
dtype: int64
MEAN: 12.0 years 0.0 months


Interview dates

In [147]:
# interview dates of the baseline rows (strings, as read from the file)
baseline['interview_date']

0        02/21/2017
2        01/29/2018
4        04/08/2018
7        06/25/2018
11       12/03/2017
            ...    
33496    07/24/2017
33501    03/29/2017
33508    06/24/2017
33513    06/21/2017
33514    08/16/2018
Name: interview_date, Length: 11876, dtype: object

In [146]:
# interview dates of the 2-year follow-up rows (strings, as read from the file)
year_2['interview_date']

5        03/11/2019
10       08/14/2019
14       08/10/2020
15       06/29/2019
16       04/16/2019
            ...    
33495    07/22/2019
33498    05/28/2019
33503    11/19/2018
33507    12/19/2020
33511    02/16/2020
Name: interview_date, Length: 10414, dtype: object

Comorbidities