In [2]:
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

In [3]:
cbcl = pd.read_csv('cbcl_1_5-2023-07-21.csv')
iq = pd.read_csv("iq-2023-07-21.csv")

# ADOS focused

## Data Loading

Which datasets are relevant?

- CBCL1-5
- CBCL6-18
- IQ
- Autism Diagnostic Observation Schedule (ADOS)

In [5]:
ados1 = pd.read_csv('ados_original_module_1.csv')
ados2 = pd.read_csv('ados_original_module_2.csv')
ados3 = pd.read_csv('ados_original_module_3.csv')
ados4 = pd.read_csv('ados_original_module_4.csv')

ados2_1 = pd.read_csv('ados_2_module_1.csv')
ados2_2 = pd.read_csv('ados_2_module_2.csv')
ados2_3 = pd.read_csv('ados_2_module_3.csv')
ados2_4 = pd.read_csv('ados_2_module_4.csv')

ados_toddler = pd.read_csv('ados_2_toddler.csv')

In [20]:
# Printing the shape of each DataFrame

print("Shape of CBCL:", cbcl.shape)
print("Shape of IQ:", iq.shape)

print("\nShape of ados1:", ados1.shape)
print("Shape of ados2:", ados2.shape)
print("Shape of ados3:", ados3.shape)
print("Shape of ados4:", ados4.shape)

print("\nShape of ados2_1:", ados2_1.shape)
print("Shape of ados2_2:", ados2_2.shape)
print("Shape of ados2_3:", ados2_3.shape)
print("Shape of ados2_4:", ados2_4.shape)

print("\nShape of ados_toddler:", ados_toddler.shape)

Shape of CBCL: (2835, 175)
Shape of IQ: (5966, 26)

Shape of ados1: (90, 53)
Shape of ados2: (28, 51)
Shape of ados3: (28, 50)
Shape of ados4: (4, 49)

Shape of ados2_1: (521, 55)
Shape of ados2_2: (259, 50)
Shape of ados2_3: (405, 49)
Shape of ados2_4: (20, 57)

Shape of ados_toddler: (355, 62)


## Check for subject ID overlaps between datasets 

Does ADOS files fill in the gaps of diagnostic results in CBCL?

In [21]:
ados_dataframes = [
    'ados1', 'ados2', 'ados3', 'ados4', 'ados2_1', 'ados2_2', 'ados2_3', 'ados2_4', 'ados_toddler'
]

In [26]:
for ados_df in ados_dataframes:
    current_df = globals()[ados_df]
    print(current_df.shape)
    
    merged_df = pd.merge(cbcl, current_df, on="subject_sp_id", how="inner")
    print(f"Shape after merging with {ados_df}: {merged_df.shape}")

(90, 53)
Shape after merging with ados1: (0, 227)
(28, 51)
Shape after merging with ados2: (0, 225)
(28, 50)
Shape after merging with ados3: (0, 224)
(4, 49)
Shape after merging with ados4: (0, 223)
(521, 55)
Shape after merging with ados2_1: (5, 229)
(259, 50)
Shape after merging with ados2_2: (5, 224)
(405, 49)
Shape after merging with ados2_3: (0, 223)
(20, 57)
Shape after merging with ados2_4: (0, 231)
(355, 62)
Shape after merging with ados_toddler: (13, 236)


In [28]:
merged_df

Unnamed: 0,subject_sp_id,respondent_sp_id,family_sf_id,biomother_sp_id,biofather_sp_id,sex_x,current_depend_adult_x,asd_x,age_at_eval_months_x,age_at_eval_years_x,...,e4_anxiety,ados_2_T_algorithm,ados_2_social_affect_total,ados_2_restricted_repetitive_total,ados_2_SA_RRB_total,sa_css,rrb_css,total_css,range_of_concern,eval_year
0,SP0388241,SP0021095,SF0021113,SP0021095,SP0021345,Female,,True,49,4,...,0,21-30 Months some words,11.0,3.0,14.0,,,,moderate-to-severe,2020
1,SP0360269,SP0041857,SF0041858,SP0041857,SP0041861,Female,,True,53,4,...,1,,,,,,,,,2019
2,SP0250852,SP0250851,SF0250852,SP0250851,,Male,,True,58,4,...,0,,,,,,,,,2018
3,SP0292661,SP0292660,SF0292661,SP0292660,SP0292686,Male,,True,58,4,...,0,12-20 Months or 21-30 Months few-no words,17.0,4.0,21.0,9.0,7.0,8.0,moderate-to-severe,2019
4,SP0326399,SP0326398,SF0326399,SP0326398,SP0326476,Male,,True,58,4,...,0,21-30 Months some words,19.0,5.0,24.0,,,,moderate-to-severe,2019
5,SP0336034,SP0336033,SF0336034,SP0336033,SP0339957,Male,,True,63,5,...,0,12-20 Months or 21-30 Months few-no words,18.0,4.0,22.0,9.0,7.0,9.0,moderate-to-severe,2019
6,SP0341871,SP0341868,SF0341871,SP0341868,SP0396338,Female,,True,52,4,...,0,12-20 Months or 21-30 Months few-no words,12.0,4.0,16.0,6.0,7.0,6.0,moderate-to-severe,2019
7,SP0357816,SP0357815,SF0357816,SP0357815,SP0387745,Male,,True,46,3,...,0,12-20 Months or 21-30 Months few-no words,14.0,7.0,21.0,,,,moderate-to-severe,2019
8,SP0369166,SP0369165,SF0369166,SP0369165,SP0369788,Male,,True,55,4,...,0,12-20 Months or 21-30 Months few-no words,13.0,6.0,19.0,,,,moderate-to-severe,2019
9,SP0370091,SP0370090,SF0370091,SP0370090,,Female,,True,45,3,...,0,,,,,,,,,2019


- Only 13 entries are common between CBCL1-5 and ADOS_2_toddler (insufficient)
- So the prospect of observing 

# Demographic analysis

variable-      | 40          |


In [18]:
sex_counts = cbcl['sex'].value_counts(normalize=False)
sex_percentage = cbcl['sex'].value_counts(normalize=True) * 100

In [19]:
sex_counts

sex
Male      2087
Female     748
Name: count, dtype: int64

In [5]:
sex_percentage

sex
Male      73.61552
Female    26.38448
Name: proportion, dtype: float64

In [21]:
male_df = cbcl[cbcl['sex'] == 'Male']
female_df = cbcl[cbcl['sex'] == 'Female']

In [25]:
male_asd_range_percent = male_df['dsm5_autism_spectrum_problems_problems_range'].value_counts(normalize=True) * 100
male_asd_range_counts = male_df['dsm5_autism_spectrum_problems_problems_range'].value_counts(normalize=False)

In [26]:
male_asd_range_counts

dsm5_autism_spectrum_problems_problems_range
clinical      1271
normal         546
borderline     160
Name: count, dtype: int64

In [17]:
male_asd_range_percent

dsm5_autism_spectrum_problems_problems_range
clinical      64.352370
normal        27.136991
borderline     8.510638
Name: proportion, dtype: float64

In [22]:
female_asd_range_percent = female_df['dsm5_autism_spectrum_problems_problems_range'].value_counts(normalize=True) * 100
female_asd_range_counts = female_df['dsm5_autism_spectrum_problems_problems_range'].value_counts(normalize=False)

In [23]:
female_asd_range_counts

dsm5_autism_spectrum_problems_problems_range
clinical      453
normal        181
borderline     68
Name: count, dtype: int64

In [28]:
female_asd_range_percent

dsm5_autism_spectrum_problems_problems_range
clinical      64.529915
normal        25.783476
borderline     9.686610
Name: proportion, dtype: float64

In [31]:
female_depressive_counts = female_df['dsm5_depressive_problems_range'].value_counts(normalize=False)
female_depressive_percent = female_df['dsm5_depressive_problems_range'].value_counts(normalize=True) * 100

In [33]:
print(female_depressive_counts)
print(female_depressive_percent)

dsm5_depressive_problems_range
normal        367
clinical      262
borderline     73
Name: count, dtype: int64
dsm5_depressive_problems_range
normal        52.279202
clinical      37.321937
borderline    10.398860
Name: proportion, dtype: float64


In [37]:
male_anxiety_counts = male_df['dsm5_anxiety_problems_range'].value_counts(normalize=False)
male_anxiety_percent = male_df['dsm5_anxiety_problems_range'].value_counts(normalize=True) * 100

female_anxiety_counts = female_df['dsm5_anxiety_problems_range'].value_counts(normalize=False)
female_anxiety_percent = female_df['dsm5_anxiety_problems_range'].value_counts(normalize=True) * 100

In [38]:
print(female_anxiety_counts)
print(female_anxiety_percent)

dsm5_anxiety_problems_range
normal        455
clinical      196
borderline     51
Name: count, dtype: int64
dsm5_anxiety_problems_range
normal        64.814815
clinical      27.920228
borderline     7.264957
Name: proportion, dtype: float64


In [39]:
print(male_anxiety_counts)
print(male_anxiety_percent)

dsm5_anxiety_problems_range
normal        1405
clinical       454
borderline     118
Name: count, dtype: int64
dsm5_anxiety_problems_range
normal        71.067274
clinical      22.964087
borderline     5.968639
Name: proportion, dtype: float64


In [40]:
male_ADHD_counts = male_df['dsm5_attention_deficit_hyperactivity_range'].value_counts(normalize=False)
male_ADHD_percent = male_df['dsm5_attention_deficit_hyperactivity_range'].value_counts(normalize=True) * 100

female_ADHD_counts = female_df['dsm5_attention_deficit_hyperactivity_range'].value_counts(normalize=False)
female_ADHD_percent = female_df['dsm5_attention_deficit_hyperactivity_range'].value_counts(normalize=True) * 100

In [41]:
print(male_ADHD_counts)
print(male_ADHD_percent)

print(female_ADHD_counts)
print(female_ADHD_percent)

dsm5_attention_deficit_hyperactivity_range
normal        1292
clinical       434
borderline     251
Name: count, dtype: int64
dsm5_attention_deficit_hyperactivity_range
normal        65.351543
clinical      21.952453
borderline    12.696004
Name: proportion, dtype: float64
dsm5_attention_deficit_hyperactivity_range
normal        423
clinical      181
borderline     98
Name: count, dtype: int64
dsm5_attention_deficit_hyperactivity_range
normal        60.256410
clinical      25.783476
borderline    13.960114
Name: proportion, dtype: float64


In [42]:
ADHD_counts = cbcl['dsm5_attention_deficit_hyperactivity_range'].value_counts(normalize=False)
ADHD_percent = cbcl['c'].value_counts(normalize=True) * 100

print(ADHD_counts)
print(ADHD_percent)

dsm5_attention_deficit_hyperactivity_range
normal        1715
clinical       615
borderline     349
Name: count, dtype: int64
dsm5_attention_deficit_hyperactivity_range
normal        64.016424
clinical      22.956327
borderline    13.027249
Name: proportion, dtype: float64


In [43]:
anxiety_counts = cbcl['dsm5_anxiety_problems_range'].value_counts(normalize=False)
anxiety_percent = cbcl['dsm5_anxiety_problems_range'].value_counts(normalize=True) * 100

print(anxiety_counts)
print(anxiety_percent)

dsm5_anxiety_problems_range
normal        1860
clinical       650
borderline     169
Name: count, dtype: int64
dsm5_anxiety_problems_range
normal        69.428891
clinical      24.262785
borderline     6.308324
Name: proportion, dtype: float64


In [45]:
# Create a 3x3 matrix for intersections of ADHD and ASD
adhd_categories = ['normal', 'borderline', 'clinical']
asd_categories = ['normal', 'borderline', 'clinical']

matrix_data = []

for adhd_category in adhd_categories:
    row_data = []
    for asd_category in asd_categories:
        adhd_col = f'{adhd_category}_adhd'
        asd_col = f'{asd_category}_asd'
        
        intersection_count = cbcl[adhd_col] & cbcl[asd_col]
        total_count = v[adhd_col] | cbcl[asd_col]
        
        row_data.append(f"{intersection_count.sum()} / {total_count.sum()}")

    matrix_data.append(row_data)

# Create a DataFrame for the intersection matrix
intersection_matrix = pd.DataFrame(matrix_data, index=adhd_categories, columns=asd_categories)

print(intersection_matrix)

KeyError: 'normal_adhd'