# 01 - Data Ingestion & Profiling

Load all hackathon datasets from `data/raw/`, profile them, and identify schemas and join keys. Finally, clean each dataset and save it as Parquet under `data/processed/`.

In [None]:
import sys
sys.path.insert(0, "..")
from src.ingest import load_all_datasets, profile_dataset
from src.clean import clean_pipeline
from pathlib import Path

## Load All Datasets

In [None]:
# Load everything found under ../data/raw and list each dataset with its shape.
datasets = load_all_datasets(Path("../data/raw"))
print(f"Loaded {len(datasets)} dataset(s)")
for name, frame in datasets.items():
    print(f"  - {name}: {frame.shape}")

## Profile Each Dataset

In [None]:
# Profile every loaded dataset, printing a 60-character rule before each one
# so the individual reports are easy to tell apart in the cell output.
for dataset_name, frame in datasets.items():
    print(f"\n{'='*60}")
    profile_dataset(frame, name=dataset_name)

## Identify Join Keys

Look for column names shared by more than one dataset (e.g. `account_id`, `customer_id`); these are candidate join keys. Names are compared case-insensitively.

In [None]:
from collections import Counter

# Tally, for every (lower-cased) column name, how many datasets contain it.
all_cols = []
for df in datasets.values():
    # Normalise labels with str() so non-string column labels (e.g. integer
    # headers) do not break the comparison, and de-duplicate within a single
    # dataset so that e.g. "ID" and "id" in one table count as ONE dataset.
    # dict.fromkeys keeps first-seen order, which keeps the output stable.
    all_cols.extend(dict.fromkeys(str(col).lower() for col in df.columns))
common = Counter(all_cols)

# Report only names that occur in at least two datasets, most frequent first.
print("Columns appearing in multiple datasets:")
for col, count in common.most_common():
    if count > 1:
        print(f"  {col}: {count} datasets")

## Clean & Save

In [None]:
# Pass each dataset through clean_pipeline, keep the result in `cleaned`
# (keyed by the original dataset name), and write it to
# ../data/processed/<stem>.parquet without the index.
cleaned = {}
for name, df in datasets.items():
    tidy = clean_pipeline(df)
    cleaned[name] = tidy

    out_path = Path(f"../data/processed/{Path(name).stem}.parquet")
    # Create the output directory on demand; a no-op once it exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    tidy.to_parquet(out_path, index=False)
    print(f"Saved {out_path} ({tidy.shape})")