In [23]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from src.pipeline.load import filter_subjects_with_two_timepoints
from src.pipeline.connectomes import (
    merge_features_and_connectomes,
    enforce_same_subjects
)
from src.pipeline.clean import (
    handle_missing_values,
    merge_family_ids,
    enforce_common_subjects,
    drop_siblings,
    link_with_g_scores,
    save_aligned_features

)
from src.pipeline.preprocess import (
    preprocess_features,
    filter_sites_by_threshold,
    add_income_group,
    print_data_summary
)
from src.analysis.eda import (
    list_columns_with_missing_values,
    count_columns_with_missing_values,
    percentage_missing_values,
    divide_features,
    categorical_summary_stats
)

In [2]:
# Read the g-factor scores table.
g_factor = pd.read_csv('../data/raw/ABCD_new_G_all.csv')
# Read the demographics table and discard "Task" right away: the column is
# constant, so it carries no information for later steps.
features = pd.read_csv('../data/raw/Demographics.csv').drop(columns=["Task"])

# Features description
The dataset we are working with consists of 13274 rows and 20 columns. Each row describes one subject at one timepoint (see `eventname`), so a subject scanned at both baseline and follow-up contributes two rows. Let us look at the description of the features:
* src_subject_id - subject identifier
* eventname - timepoint of data
* site_id_l - data collection site (may want to include this in models as a grouping factor as there are likely site effects in the data, we will sometimes model subject nested in site as random effects)
* rel_family_id - family identifier. There are some siblings, twins, etc in the data. Not necessarily an issue sometimes, but when we're doing any kind of cross validation or anything we make sure we don't have family members in different folds.
* interview_age - age of the subject in months
* Subject/Session are just recoded versions of subject ID and eventname as they match with the naming of the imaging data better
* Task - this should all be rest
* GoodRun_5 - number of good runs in the included data
* censor_5 - number of censored timepoints in the included data
* TRs - number of included timepoints in the data
* confounds_nocensor - number of confounds in the run-level nuisance correction model (other than censored timepoints). Most of these quality metrics you probably don't need to worry about
* meanFD - the mean motion of the subject during their included scans. We usually use linear and quadratic terms of this as a confound at the group level
* race.4level - subject reported race
* hisp - subject reported hispanic yes/no
* demo_sex_v2 - sex at birth 1=male, 2=female (I think I dropped any other responses, but if there are others we might need to exclude just because there are so few we can't get good estimates of effects)
* EdYearsHighest - parental years of education (highest among parents, I think I need to double check we might actually use average elsewhere, but this variable is likely not immediately relevant for now)
* IncCombinedMidpoint - combined income of parents (midpoint of a bin, because it's only reported in 10 bins)
* Income2Needs - calculated income to needs metric based on parental income and number of people in the household
* Married - parents currently married or not

In [3]:
# Restrict the cohort to participants observed at both baseline and follow-up.
df_filtered = filter_subjects_with_two_timepoints(features)

# One frame per visit, selected on the "eventname" label.
df_baseline = df_filtered.loc[df_filtered["eventname"].eq("baseline_year_1_arm_1")]
df_followup = df_filtered.loc[df_filtered["eventname"].eq("2_year_follow_up_y_arm_1")]

In [4]:
# Remove incomplete rows from both visits; per the original note, missing
# family IDs are tolerated at this stage.
df_baseline, df_followup = handle_missing_values(
    df_baseline,
    df_followup,
)

# Carry the baseline family IDs over to the follow-up frame.
df_followup = merge_family_ids(
    df_baseline,
    df_followup,
)

In [5]:
# Re-align the two visits on a common set of subjects after the row removals.
df_baseline, df_followup = enforce_common_subjects(
    df_baseline,
    df_followup,
)

In [6]:
# Apply the sibling filter to each visit.
# NOTE(review): the helper runs on each frame independently — confirm it keeps
# the same sibling in both, otherwise the visits stop sharing subjects.
df_baseline, df_followup = (
    drop_siblings(frame) for frame in (df_baseline, df_followup)
)


In [7]:
# Attach the g-factor scores to the baseline and follow-up frames.
(
    merged_df_baseline_A,
    merged_df_followup_A,
) = link_with_g_scores(df_baseline, df_followup, g_factor)

In [8]:
# Column harmonisation spec.
# Both visit-specific g-score columns map to the same name so the two frames
# end up with identical column names.
rename_map = {
    "G_lavaan.baseline": "g_lavaan",
    "G_lavaan.2Year": "g_lavaan",
    "demo_sex_v2": "sex",
    "interview_age": "age"
}

# Identifier and scan-quality columns removed from both frames.
drop_cols = ["src_subject_id", "eventname", "rel_family_id",
             "Session", "GoodRun_5", "censor_5", "TRs", "confounds_nocensor"]

# Rename first, then drop; none of the dropped names are affected by the rename.
merged_df_baseline_A = (
    merged_df_baseline_A
    .rename(columns=rename_map)
    .drop(columns=drop_cols)
)
merged_df_followup_A = (
    merged_df_followup_A
    .rename(columns=rename_map)
    .drop(columns=drop_cols)
)

In [9]:
# Add the income_group column to both Approach A frames using the same two
# cut-offs (t1=50000, t2=100000).
merged_df_baseline_A, merged_df_followup_A = (
    add_income_group(frame, t1=50000, t2=100000)
    for frame in (merged_df_baseline_A, merged_df_followup_A)
)

In [10]:
# Approach A, baseline: print subject count, age statistics and categorical breakdowns.
print_data_summary(merged_df_baseline_A)

Number of subjects: 2480
Age mean: 9.96 years
Age std: 0.63 years

sex:
  1.0: 1258 (50.73%)
  2.0: 1222 (49.27%)

race.4level:
  White: 1816 (73.23%)
  Other/Mixed: 375 (15.12%)
  Black: 245 (9.88%)
  Asian: 44 (1.77%)

hisp:
  No: 2032 (81.94%)
  Yes: 448 (18.06%)

income_group:
  high: 1088 (43.87%)
  medium: 742 (29.92%)
  low: 650 (26.21%)


In [11]:
# Approach A, follow-up: same summary, for comparison with the baseline cohort.
print_data_summary(merged_df_followup_A)

Number of subjects: 2480
Age mean: 11.96 years
Age std: 0.65 years

sex:
  1.0: 1258 (50.73%)
  2.0: 1222 (49.27%)

race.4level:
  White: 1816 (73.23%)
  Other/Mixed: 375 (15.12%)
  Black: 245 (9.88%)
  Asian: 44 (1.77%)

hisp:
  No: 2032 (81.94%)
  Yes: 448 (18.06%)

income_group:
  high: 1244 (50.16%)
  medium: 699 (28.19%)
  low: 537 (21.65%)


In [12]:
# Inspect the columns left after renaming, dropping and adding income_group.
merged_df_baseline_A.columns

Index(['g_lavaan', 'site_id_l', 'age', 'Subject', 'meanFD', 'race.4level',
       'hisp', 'sex', 'EdYearsHighest', 'IncCombinedMidpoint', 'Income2Needs',
       'Married', 'income_group'],
      dtype='object')

In [13]:
# Approach B: run the packaged preprocessing once per visit on the full
# feature table (baseline first, then follow-up).
merged_df_baseline_B, merged_df_followup_B = (
    preprocess_features(features, g_factor, event)
    for event in ('baseline_year_1_arm_1', '2_year_follow_up_y_arm_1')
)

In [14]:
# Approach B, baseline: print subject count, age statistics and categorical breakdowns.
print_data_summary(merged_df_baseline_B)

Number of subjects: 4321
Age mean: 9.98 years
Age std: 0.63 years

sex:
  2.0: 2217 (51.31%)
  1.0: 2104 (48.69%)

race.4level:
  White: 3071 (71.07%)
  Other/Mixed: 631 (14.60%)
  Black: 529 (12.24%)
  Asian: 90 (2.08%)

hisp:
  No: 3591 (83.11%)
  Yes: 730 (16.89%)

income_group:
  high: 1919 (44.41%)
  medium: 1244 (28.79%)
  low: 1158 (26.80%)


In [15]:
# Approach B, follow-up: same summary; the visits are not restricted to shared subjects here.
print_data_summary(merged_df_followup_B)

Number of subjects: 2756
Age mean: 11.92 years
Age std: 0.65 years

sex:
  1.0: 1399 (50.76%)
  2.0: 1357 (49.24%)

race.4level:
  White: 1923 (69.78%)
  Other/Mixed: 460 (16.69%)
  Black: 310 (11.25%)
  Asian: 63 (2.29%)

hisp:
  No: 2234 (81.06%)
  Yes: 522 (18.94%)

income_group:
  high: 1270 (46.08%)
  medium: 790 (28.66%)
  low: 696 (25.25%)


In [17]:
# Persist the four feature tables (approaches A and B, baseline and follow-up).
save_aligned_features(
    merged_df_baseline_A,
    merged_df_followup_A,
    merged_df_baseline_B,
    merged_df_followup_B,
)

Saved features_A_baseline.csv with 2480 subjects and 13 features
Saved features_A_followup.csv with 2480 subjects and 13 features
Saved features_B_baseline.csv with 4321 subjects and 13 features
Saved features_B_followup.csv with 2756 subjects and 13 features


Two different data-preprocessing approaches were used.
* Approach A: we consider only subjects with two timepoints and keep only one sibling per family.
* Approach B: follows the method described in the original article as closely as possible.

In [16]:
# Lookup from the "eventname" labels used in the feature table to the session
# labels passed as `visit` when loading connectomes.
EVENT_TO_SESSION = {
    "baseline_year_1_arm_1": "baselineYear1Arm1",
    "2_year_follow_up_y_arm_1": "2YearFollowUpYArm1"
}

# Approach A, baseline: pair the feature rows with their connectomes.
(
    features_A_baseline_aligned,
    conn_A_baseline,
    subs_A_baseline,
) = merge_features_and_connectomes(
    merged_df_baseline_A, visit=EVENT_TO_SESSION["baseline_year_1_arm_1"]
)

# Report feature shape, connectome shape and subject count, one per line.
print(
    features_A_baseline_aligned.shape,
    conn_A_baseline.shape,
    len(subs_A_baseline),
    sep="\n",
)

[skip] NDARINVYXRGTMYM: Failed to convert matrix to array: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (341,) + inhomogeneous part.
Loaded 2479 subjects
Flattened shape: (2479, 87153)  (n_subjects x n_edges)
Subjects in df1 but not in df2: ['NDARINVYXRGTMYM']
Keeping 2479 common subjects.
(2479, 13)
(2479, 87153)
2479


In [17]:
# Approach A, follow-up: pair the feature rows with their connectomes.
(
    features_A_followup_aligned,
    conn_A_followup,
    subs_A_followup,
) = merge_features_and_connectomes(
    merged_df_followup_A, visit=EVENT_TO_SESSION["2_year_follow_up_y_arm_1"]
)

# Report feature shape, connectome shape and subject count, one per line.
print(
    features_A_followup_aligned.shape,
    conn_A_followup.shape,
    len(subs_A_followup),
    sep="\n",
)

Loaded 2480 subjects
Flattened shape: (2480, 87153)  (n_subjects x n_edges)
Keeping 2480 common subjects.
(2480, 13)
(2480, 87153)
2480


In [19]:
# Reconcile the two visits after connectome loading: baseline kept 2479
# subjects and follow-up 2480, so the two sets have to be matched up again.
(
    features_A_baseline_final,
    conn_A_baseline_final,
    features_A_followup_final,
    conn_A_followup_final,
    subs_A_final,
) = enforce_same_subjects(
    features_A_baseline_aligned, conn_A_baseline, subs_A_baseline,
    features_A_followup_aligned, conn_A_followup, subs_A_followup,
)

In [21]:
# Destination for the Approach A artefacts.
# NOTE(review): the raw inputs are read from '../data/raw', whereas this path is
# relative to the current working directory (no '..'), so the outputs land in a
# separate 'data/' tree — confirm this is intended.
output_dir = Path("data/processed/version_A")
output_dir.mkdir(parents=True, exist_ok=True)

# The features, connectomes and subject list are written to separate files and
# can only be re-joined by row position, so refuse to save anything if their
# row counts disagree.
assert (
    len(features_A_baseline_final)
    == len(features_A_followup_final)
    == len(conn_A_baseline_final)
    == len(conn_A_followup_final)
    == len(subs_A_final)
), "features, connectomes and subject list must all have one row per subject"

# Save features
features_A_baseline_final.to_csv(output_dir / "features_baseline.csv", index=False)
features_A_followup_final.to_csv(output_dir / "features_followup.csv", index=False)

# Save connectomes
np.save(output_dir / "connectomes_baseline.npy", conn_A_baseline_final)
np.save(output_dir / "connectomes_followup.npy", conn_A_followup_final)

# Save subject IDs in the same row order as the arrays above
pd.DataFrame({"Subject": subs_A_final}).to_csv(output_dir / "subjects.csv", index=False)


In [24]:
# Confirm each expected artefact was written and report its size on disk.
expected_outputs = (
    "features_baseline.csv", "features_followup.csv",
    "connectomes_baseline.npy", "connectomes_followup.npy",
    "subjects.csv",
)
for fname in expected_outputs:
    fpath = output_dir / fname
    is_present = fpath.exists()
    size_on_disk = os.path.getsize(fpath) if is_present else 'N/A'
    print(f"{fname}: exists={is_present}, size={size_on_disk} bytes")


features_baseline.csv: exists=True, size=332579 bytes
features_followup.csv: exists=True, size=332715 bytes
connectomes_baseline.npy: exists=True, size=1728418424 bytes
connectomes_followup.npy: exists=True, size=1728418424 bytes
subjects.csv: exists=True, size=39672 bytes
