# 03 | Silver Cleaning

## Stage Contract
- Type cast and normalize data.
- Enforce business keys and apply the deduplication strategy.
- Produce deterministic Silver Parquet outputs.

In [None]:
# Parameters
# Run-level inputs for the Silver stage. Presumably these defaults are
# overridden by the notebook runner at execution time — TODO confirm.
source = "fivethirtyeight"  # source label; not referenced in the visible cells
# Dataset names as one comma-separated string; handed to
# parse_dataset_argument() in the processing cell.
dataset = "recent_grads,bechdel_movies"
run_date = "2026-02-22"  # run date string; not referenced in the visible cells
force_refresh = False  # not referenced in the visible cells — NOTE(review): confirm it is still needed here

In [None]:
import sys
from pathlib import Path

# Treat the current working directory as the project root and make sure it is
# on the import path (presumably so the `src.*` imports further down resolve).
ROOT_DIR = Path.cwd()
_root_entry = str(ROOT_DIR)
if _root_entry not in sys.path:
    # Prepend rather than append so this entry is searched first.
    sys.path.insert(0, _root_entry)

In [None]:
import time

import pandas as pd

from src.common.datasets import parse_dataset_argument
from src.common.io import update_pipeline_metrics, update_stage_metrics
from src.common.paths import BRONZE_DIR, SILVER_DIR
from src.common.pipeline import clean_bechdel_movies, clean_recent_grads, write_parquet

# Silver stage driver: for each requested dataset, read its Bronze Parquet
# file, clean it with the dataset-specific cleaner, validate the result, write
# it under SILVER_DIR, and finally record stage/pipeline metrics.
selected_datasets = parse_dataset_argument(dataset)
stage_start = time.perf_counter()

rows_silver = 0
freshness_values = []
summary_rows = []
# Names that were actually cleaned and written. This can differ from
# selected_datasets because names without a cleaner are skipped below.
processed_datasets = []

for dataset_name in selected_datasets:
    bronze_path = BRONZE_DIR / dataset_name / 'data.parquet'
    bronze_df = pd.read_parquet(bronze_path)

    if dataset_name == 'recent_grads':
        silver_df = clean_recent_grads(bronze_df)
        # Business-key checks. Raised explicitly instead of using `assert`,
        # because assert statements are removed when Python runs with -O.
        major_code = silver_df['major_code']
        if not major_code.notna().all():
            raise ValueError("recent_grads: 'major_code' contains null values")
        if not major_code.is_unique:
            raise ValueError("recent_grads: 'major_code' contains duplicate values")
    elif dataset_name == 'bechdel_movies':
        silver_df = clean_bechdel_movies(bronze_df)
        if not silver_df['binary'].isin(['PASS', 'FAIL']).all():
            raise ValueError(
                "bechdel_movies: 'binary' contains values other than 'PASS'/'FAIL'"
            )
        max_year = silver_df['year'].max()
        # An empty frame or an all-null 'year' column yields NaN/NA here, which
        # int() cannot convert; in that case no freshness value is recorded.
        if pd.notna(max_year):
            freshness_values.append(f'{int(max_year)}-01-01')
    else:
        # No cleaner is defined for this dataset name: skip it (best effort).
        continue

    silver_path = SILVER_DIR / dataset_name / 'data.parquet'
    write_parquet(silver_df, silver_path)

    processed_datasets.append(dataset_name)
    rows_silver += len(silver_df)
    summary_rows.append(
        {
            'dataset': dataset_name,
            'rows_bronze': len(bronze_df),
            'rows_silver': len(silver_df),
            'silver_path': str(silver_path),
        }
    )

runtime_seconds = round(time.perf_counter() - stage_start, 2)

update_stage_metrics(
    'silver',
    {
        'runtime_seconds': runtime_seconds,
        'rows_silver': rows_silver,
        # Report only datasets that were really written, not every requested name.
        'datasets_processed': processed_datasets,
    },
)
update_pipeline_metrics(
    {
        'rows_silver': rows_silver,
        # Values are 'YYYY-01-01' strings; max() picks the latest one, or None
        # is reported when no dataset contributed a freshness value.
        'data_freshness': max(freshness_values) if freshness_values else None,
    }
)

# Last expression of the cell: per-dataset summary table for notebook display.
pd.DataFrame(summary_rows)