# 01 | Data Sources

This notebook documents the upstream datasets, their URLs, and their licensing. It writes a machine-readable manifest to `reports/dataset_manifest.json` and records summary metrics for the `sources` stage.

In [None]:
# Parameters
# Run-level settings for this notebook.
# NOTE(review): none of these names are read by the cells visible in this
# notebook; presumably they are shared pipeline parameters that a runner
# may override -- confirm before removing or renaming any of them.
source = "fivethirtyeight"  # identifier of the upstream data source
dataset = "recent_grads,bechdel_movies"  # presumably a comma-separated list of dataset names -- TODO confirm
run_date = "2026-02-22"  # date string in YYYY-MM-DD form
force_refresh = False  # presumably forces re-fetching of cached data -- TODO confirm

In [None]:
import sys
from pathlib import Path

def find_project_root(start: Path, marker: Path = Path("src") / "common") -> Path:
    """Return the nearest ancestor of *start* (inclusive) that contains *marker*.

    The notebook imports from ``src.common``, so the directory that holds
    ``src/common`` is the one that has to be on ``sys.path``. The working
    directory is only that directory when the kernel is started from the
    project root; when it is started from a subfolder, the search walks
    upwards until the marker is found.

    Args:
        start: Directory at which the upward search begins.
        marker: Relative path that must exist as a directory inside the root.

    Returns:
        The first directory, from *start* upwards, that contains *marker*;
        *start* itself when no ancestor does, which preserves the previous
        ``Path.cwd()`` behaviour.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start


ROOT_DIR = find_project_root(Path.cwd())
# Put the project root first on the import path so ``src`` resolves to this
# checkout; skip the insert when it is already listed to avoid duplicates on
# repeated cell execution.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

In [None]:
import pandas as pd

from src.common.datasets import DATASETS, SOURCE_LICENSE, SOURCE_NAME, SOURCE_REPOSITORY
from src.common.io import update_stage_metrics, write_json
from src.common.paths import REPORTS_DIR

# One manifest entry per registered dataset; the source name and licence are
# repeated on every row so each entry is self-describing.
manifest_rows = [
    {
        'dataset': dataset_name,
        'url': spec['url'],
        'raw_filename': spec['raw_filename'],
        'description': spec['description'],
        'source': SOURCE_NAME,
        'license': SOURCE_LICENSE,
    }
    for dataset_name, spec in DATASETS.items()
]

# Top-level document written to disk: source-wide metadata plus the rows.
manifest_payload = {
    'source': SOURCE_NAME,
    'source_repository': SOURCE_REPOSITORY,
    'license': SOURCE_LICENSE,
    'datasets': manifest_rows,
}

# Tabular view of the same rows, displayed as the cell output below.
manifest_df = pd.DataFrame(manifest_rows)

write_json(REPORTS_DIR / 'dataset_manifest.json', manifest_payload)

# Summary figures recorded for the 'sources' stage.
stage_metrics = {
    'dataset_count': len(DATASETS),
    'source': SOURCE_NAME,
    'license': SOURCE_LICENSE,
}
update_stage_metrics('sources', stage_metrics)

manifest_df