# Data Preparation Pipeline

## Pipeline Overview

**Stage 1: Environment Setup**
- Initialize reproducible environment with fixed random seeds
- Load and validate all configuration files

**Stage 2: Data Loading & Timepoint Extraction**
- Load and merge neuroimaging and behavioral datasets
- Extract baseline timepoint data for cross-sectional analysis

**Stage 3: Feature Engineering**
- Apply sex coding transformations (1→male, 2→female)
- Create anxiety group binning from t-scores (Control ≤59, Subclinical 60-64, Clinical 65+)

**Stage 4: Quality Control**
- Apply surface holes QC policy (≤62 defects) to flag poor-quality scans (QC-fail participants are filtered out when the modeling splits are created)
- Generate QC pass/fail masks and summary statistics

**Stage 5: Missing Data Handling** 
- Drop columns with >30% missing values (excluding protected metadata)
- Require complete DTI data and essential metadata
- Generate before/after missing data summaries

**Stage 6: Modeling Splits**
- Create stratified 80/10/10 train/val/test splits
- Stratify by anxiety groups to ensure balanced representation
- Generate split assignments and verify stratification

In [None]:
%load_ext autoreload
%autoreload 2

from core.config import initialize_notebook
#from core.preprocessing.pipeline import preprocess_abcd_data

# Build the shared notebook environment that every later cell receives as `env`.
# Per the original author's notes: initialize_notebook() accepts the notebook
# name (default "anxiety"), and regenerate_run_id=True creates a new run id.
env = initialize_notebook()
# Convenience alias for env.configs.
configs = env.configs

In [None]:
from core.preprocessing.pipeline import preprocess_abcd_data
# Run the full pipeline in a single call. The cells below call the individual
# stage functions one at a time, and the final splits cell rebinds
# train/val/test.
train, val, test = preprocess_abcd_data(env)

In [None]:
from IPython.display import display

In [None]:
from core.preprocessing.ingest import load_and_merge

# Load the merged dataset through load_and_merge and preview its first rows.
df = load_and_merge(env)
display(df.head())

In [None]:
from core.preprocessing.splits import timepoint_split

# Split the merged data into its baseline and longitudinal parts, then preview
# the first rows of each.
baseline, longitudinal = timepoint_split(env, df)
display(baseline.head())
display(longitudinal.head())

In [None]:
from core.preprocessing.transforms import recode, binning

# Recode the baseline data, then bin the recoded result. Both previews are
# shown only after binning has run, so `recoded` is displayed in whatever state
# binning() leaves it.
recoded = recode(env, baseline)
binned = binning(env, recoded)
display(recoded.head())
display(binned.head())



In [None]:
from core.preprocessing.qc import quality_control

# Run quality control on the binned data and preview both returned objects.
qc_df, qc_mask = quality_control(env, binned)
for preview in (qc_df, qc_mask):
    display(preview.head())

# Tally passes and failures from the "qc_pass" column of the mask.
passed = qc_mask["qc_pass"]
failed = ~passed
total_pass = int(passed.sum())
total_fail = int(failed.sum())

print(f"QC pass: {total_pass}")
print(f"QC fail: {total_fail}")
if total_fail:
    # Break the failures down by their recorded "qc_reason".
    reason_counts = qc_mask.loc[failed, "qc_reason"].value_counts()
    print("Fail reasons:\n" + reason_counts.to_string())

In [None]:
from core.preprocessing.missing import (
    summarize_missing,
    handle_missing,
)

def imaging_columns(env, df):
    """Return the sorted, de-duplicated imaging columns present in ``df``.

    Every entry under ``env.configs.data["columns"]["imaging"]`` may carry a
    ``"prefixes"`` value. A column is selected when its name starts with any
    configured prefix.

    Args:
        env: Object exposing ``configs.data`` as a nested mapping.
        df: Object exposing ``columns`` (an iterable of column labels).

    Returns:
        A sorted list of the unique matching column names; empty when no
        prefixes are configured or nothing matches.
    """
    imaging_cfg = env.configs.data["columns"]["imaging"]

    prefixes = []
    for cfg in imaging_cfg.values():
        # A missing key or an explicit null both mean "no prefixes".
        configured = cfg.get("prefixes") or ()
        if isinstance(configured, str):
            # A bare string is a single prefix; iterating it would match
            # columns on each individual character instead.
            configured = (configured,)
        prefixes.extend(configured)

    if not prefixes:
        return []

    # str.startswith accepts a tuple, so one call covers every prefix.
    prefix_options = tuple(prefixes)
    return sorted(
        {
            col
            for col in df.columns
            # Non-string labels cannot carry a string prefix; skip them.
            if isinstance(col, str) and col.startswith(prefix_options)
        }
    )

def show_missing_summary(env, df, label):
    """Print a labelled header and display two missing-data summaries.

    The first summary covers the columns listed under
    ``env.configs.data["columns"]["metadata"]``; the second covers the columns
    returned by ``imaging_columns(env, df)``. Only the first ten rows of each
    ``summarize_missing`` result are displayed.
    """
    metadata_names = env.configs.data["columns"]["metadata"]
    imaging_names = imaging_columns(env, df)

    reports = (
        summarize_missing(env, df[metadata_names]),
        summarize_missing(env, df[imaging_names]),
    )

    print(f"=== {label} ===")
    for report in reports:
        display(report.head(10))

# Missing-data summary before any cleanup.
show_missing_summary(env, qc_df, "Before Cleanup")

# Apply missing-data handling with row dropping enabled.
clean_df = handle_missing(env, qc_df, drop_rows=True)

# Missing-data summary after cleanup, for comparison with the one above.
show_missing_summary(env, clean_df, "After Cleanup")

# Report how many rows and columns the cleanup removed.
rows_removed = len(qc_df) - len(clean_df)
columns_removed = len(qc_df.columns) - len(clean_df.columns)

print("=== Summary ===")  # plain string: there is nothing to interpolate
print(f"Total rows removed: {rows_removed:,}")
print(f"Total columns removed: {columns_removed:,}")
# Per the original note, this shape is taken before filtering for QC-pass
# participants.
print(clean_df.shape)

In [None]:
from core.preprocessing.splits import create_modeling_splits

# Build the modeling splits (the original note says this step filters for
# QC-pass participants), then preview each result followed by its shape.
train, val, test, split_map = create_modeling_splits(env, clean_df)
for part in (train, val, test, split_map):
    display(part.head())
    print(part.shape)

In [None]:
# Report the size of each split and the combined total.
print("=== Modeling Splits Summary ===")
print(f"Train: {len(train):,}")
print(f"Val  : {len(val):,}")
print(f"Test : {len(test):,}")
print(f"Total: {len(train) + len(val) + len(test):,}")

# Show per-split group counts so the stratification can be checked by eye.
print("\n=== Stratification Check ===")
for split_name, split_df in [("Train", train), ("Val", val), ("Test", test)]:
    # to_dict() converts the counts to native Python ints; dict(series) keeps
    # NumPy scalars, which print as np.int64(...) on NumPy >= 2.
    # Assumes these columns are pandas Series — confirm if the splits change type.
    anx_counts = split_df["anx_group"].value_counts().to_dict()
    sex_counts = split_df["sex_mapped"].value_counts().to_dict()
    print(f"{split_name}: {anx_counts} | {sex_counts}")