# Data Preparation Pipeline

## Pipeline Overview

**Stage 1: Environment Setup**
- Initialize reproducible environment with fixed random seeds
- Load and validate all configuration files
- Audit raw data file inventory

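**Stage 2: Data Ingestion**
- Load and merge the raw data sources
- Split records into baseline and longitudinal timepoints

**Stage 3: Feature Engineering**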
- Select neuroimaging features by family (DTI, cortical area, thickness, etc.)
- Create derived labels (anxiety groups from t-scores)
- Apply transformations (e.g., sex coding)

**Stage 4: Quality Control**
- Apply surface holes QC policy to remove poor-quality scans
- Generate QC visualizations and reports
- Save pre-QC dataset for visualization and post-QC for downstream analysis

**Stage 5: Data Splitting**
- Create stratified train/validation/test splits
- Ensure reproducible splits with fixed random seeds

In [None]:
%load_ext autoreload
%autoreload 2

from core.config import initialize_notebook
#from core.preprocessing.pipeline import preprocess_abcd_data

# initialize_notebook() takes the notebook name; the default is "anxiety".
# Passing regenerate_run_id=True creates a new run id.
env = initialize_notebook()
# Convenience alias for the configuration object carried by env.
configs = env.configs

In [None]:
#train, val, test = preprocess_abcd_data(env)

In [None]:
from IPython.display import display

In [None]:
from core.preprocessing.ingest import load_and_merge

# Build the dataset with load_and_merge and preview its first rows.
df = load_and_merge(env)
display(df.head())

In [None]:
from core.preprocessing.splits import timepoint_split

# Split df into the two frames returned by timepoint_split and preview each.
baseline, longitudinal = timepoint_split(env, df)
display(baseline.head())
display(longitudinal.head())

In [None]:
from core.preprocessing.transforms import recode, binning

# Chain the transforms on the baseline frame only: recode first, then bin
# the recoded result. Both intermediate frames are previewed.
recoded = recode(env, baseline)
binned = binning(env, recoded)
display(recoded.head())
display(binned.head())



In [None]:
from core.preprocessing.qc import quality_control

# Run QC on the binned frame; quality_control returns two frames, both of
# which are previewed below.
qc_df, qc_mask = quality_control(env, binned)
display(qc_df.head())
display(qc_mask.head())

# Tally passes and failures from the "qc_pass" column, computing the
# failure mask once so it can be reused for the reason breakdown.
qc_passed = qc_mask["qc_pass"]
qc_failed = ~qc_passed
total_pass = int(qc_passed.sum())
total_fail = int(qc_failed.sum())

print(f"QC pass: {total_pass}")
print(f"QC fail: {total_fail}")

# Only break failures down by "qc_reason" when there is at least one.
if total_fail > 0:
    reason_counts = qc_mask.loc[qc_failed, "qc_reason"].value_counts()
    print("Fail reasons:\n" + reason_counts.to_string())

In [None]:
from core.preprocessing.missing import (
    summarize_missing,
    handle_missing,
)

def imaging_columns(env, df):
    """Return the sorted, de-duplicated imaging columns present in ``df``.

    A column counts as an imaging column when its name starts with any prefix
    listed under ``env.configs.data["columns"]["imaging"][<family>]["prefixes"]``.

    Args:
        env: Object exposing ``configs.data`` as a nested mapping.
        df: Object exposing ``columns`` (an iterable of column labels).

    Returns:
        Sorted list of unique column names matching at least one prefix.
        Empty when no prefixes are configured or nothing matches.
    """
    imaging_cfg = env.configs.data["columns"]["imaging"]

    prefixes = []
    for cfg in imaging_cfg.values():
        # ``or []`` also covers a key that is present but null, which
        # ``.get("prefixes", [])`` would pass through as None.
        family_prefixes = cfg.get("prefixes") or []
        # A bare string would otherwise be iterated character by character
        # and match far more columns than intended.
        if isinstance(family_prefixes, str):
            family_prefixes = [family_prefixes]
        prefixes.extend(family_prefixes)

    # str.startswith accepts a tuple, so every prefix is tested in one call;
    # an empty tuple matches nothing.
    prefix_tuple = tuple(prefixes)

    # Non-string labels can never match a string prefix, so skip them rather
    # than fail on a missing ``startswith``.
    return sorted(
        {
            col
            for col in df.columns
            if isinstance(col, str) and col.startswith(prefix_tuple)
        }
    )

def show_missing_summary(env, df, label):
    """Display missing-data summaries for the metadata and imaging columns.

    Metadata column names are read from
    ``env.configs.data["columns"]["metadata"]``; imaging columns are resolved
    with ``imaging_columns``. Each group is passed to ``summarize_missing``
    and the first ten rows of each result are displayed under a heading.

    Configured metadata columns that are absent from ``df`` are skipped and
    reported instead of raising ``KeyError``, so the summary still works on a
    frame whose columns have been reduced.

    Args:
        env: Environment object exposing ``configs.data``.
        df: Frame to summarize.
        label: Heading text printed above the two summaries.
    """
    # assumes the metadata entry is a list of column names -- TODO confirm
    meta_cols = env.configs.data["columns"]["metadata"]
    present_meta = [col for col in meta_cols if col in df.columns]
    absent_meta = [col for col in meta_cols if col not in df.columns]
    imaging_cols = imaging_columns(env, df)

    meta_missing = summarize_missing(env, df[present_meta])
    imaging_missing = summarize_missing(env, df[imaging_cols])

    print(f"=== {label} ===")
    if absent_meta:
        # Surface the skipped columns so the omission is not silent.
        print(f"Metadata columns not in frame: {', '.join(map(str, absent_meta))}")
    display(meta_missing.head(10))
    display(imaging_missing.head(10))
# Missing-data summary on the QC output, before any cleanup.
show_missing_summary(env, qc_df, "Before Cleanup")

# Apply missing-data handling; drop_rows=True is forwarded to handle_missing.
clean_df = handle_missing(env, qc_df, drop_rows=True)

# Same summary on the cleaned frame for a before/after comparison.
show_missing_summary(env, clean_df, "After Cleanup")

# Report how much the cleanup shrank the frame in each dimension.
rows_removed = len(qc_df) - len(clean_df)
columns_removed = len(qc_df.columns) - len(clean_df.columns)

# Plain string: the heading has no placeholders, so no f-prefix is needed.
print("=== Summary ===")
print(f"Total rows removed: {rows_removed:,}")
print(f"Total columns removed: {columns_removed:,}")