In [7]:
import os
import pandas as pd
from dotenv import load_dotenv

# -----------------------------
# Environment + input file locations
# -----------------------------
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Root data folder; every other path in this notebook is built from it.
DATA_PATH = os.environ.get("DATA_PATH")
if not DATA_PATH:
    raise ValueError("DATA_PATH not found. Check your .env file and that load_dotenv() is running.")

# List of mover households.
movers_path = os.path.join(DATA_PATH, "movers.csv")

# Census household files live under <DATA_PATH>/Microsimulations/with_chn.
with_chn_path = os.path.join(DATA_PATH, "Microsimulations", "with_chn")
census_2024_path = os.path.join(with_chn_path, "census2024_household_chn.csv")
census_2025_path = os.path.join(with_chn_path, "census2025_household_chn.csv")

# -----------------------------
# Sanity checks: show each resolved path and whether it exists
# -----------------------------
print("DATA_PATH:", DATA_PATH)
for _label, _path in (
    ("Movers path:", movers_path),
    ("Census 2024 path:", census_2024_path),
    ("Census 2025 path:", census_2025_path),
):
    print(_label, _path, "exists?", os.path.exists(_path))

# Only the movers file is required at this point; the census files are
# checked again when they are actually read.
if not os.path.exists(movers_path):
    raise FileNotFoundError(f"Movers file not found: {movers_path}")

# -----------------------------
# Load movers IDs (as 64-bit ints)
# -----------------------------
movers_df = pd.read_csv(movers_path)

if "HH_ID" not in movers_df.columns:
    raise ValueError("movers.csv is missing required column: HH_ID")

# Values that cannot be parsed as numbers become NaN and are dropped below.
movers_ids_numeric = pd.to_numeric(movers_df["HH_ID"], errors="coerce")

# Surface dropped rows instead of discarding them silently.
n_unparseable = int(movers_ids_numeric.isna().sum())
if n_unparseable:
    print(f"Warning: dropped {n_unparseable} movers rows with missing or non-numeric HH_ID")

# Use an explicit 64-bit dtype: plain `int` resolves to a 32-bit integer on
# Windows with NumPy < 2, which would silently wrap large household IDs.
movers_ids = (
    movers_ids_numeric
      .dropna()
      .astype("int64")
      .drop_duplicates()  # each household only needs to appear once
)

print("Movers HH_ID count:", len(movers_ids))

# -----------------------------
# Helper: subset + save
# -----------------------------
def subset_and_save(year: int, census_path: str, movers_ids: pd.Series, out_folder: str) -> pd.DataFrame:
    """Filter one census file down to the mover households and save it as CSV.

    The census file is read in full, its ``HH_ID`` column is coerced to numeric
    (unparseable values become NaN and therefore never match), and the rows
    whose ``HH_ID`` appears in ``movers_ids`` are written to
    ``<out_folder>/<year>subset.csv`` without the index.

    Parameters
    ----------
    year : int
        Label for this census file; used in the output file name and in the
        printed / raised messages.
    census_path : str
        Path to the census CSV. It must contain an ``HH_ID`` column.
    movers_ids : pd.Series
        Household IDs to keep.
    out_folder : str
        Directory that receives the output CSV. It is created if it does not
        exist yet; an empty string writes to the current working directory.

    Returns
    -------
    pd.DataFrame
        A copy of the matching census rows, with ``HH_ID`` coerced to numeric.

    Raises
    ------
    FileNotFoundError
        If ``census_path`` does not exist.
    ValueError
        If the census file has no ``HH_ID`` column.
    """
    if not os.path.exists(census_path):
        raise FileNotFoundError(f"[{year}] Census file not found: {census_path}")

    df = pd.read_csv(census_path)

    if "HH_ID" not in df.columns:
        raise ValueError(f"[{year}] Census file missing required column: HH_ID")

    # Coerce so that IDs read as text still compare equal to numeric mover IDs.
    df["HH_ID"] = pd.to_numeric(df["HH_ID"], errors="coerce")

    # .copy() detaches the result from `df` so callers can modify it freely.
    subset = df[df["HH_ID"].isin(movers_ids)].copy()

    # to_csv fails on a missing directory, so create it up front. The guard
    # keeps out_folder == "" working (os.makedirs("") would raise).
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)

    out_path = os.path.join(out_folder, f"{year}subset.csv")
    subset.to_csv(out_path, index=False)

    print(f"[{year}] Total rows: {len(df)} | Subset rows: {len(subset)} | Unique HH_ID: {subset['HH_ID'].nunique()}")
    print(f"[{year}] Saved subset to: {out_path}")

    return subset

# -----------------------------
# Build + save the movers subset for each census year
# -----------------------------
# Both outputs are written directly into DATA_PATH; 2024 runs before 2025.
subset_2024_movers = subset_and_save(
    year=2024,
    census_path=census_2024_path,
    movers_ids=movers_ids,
    out_folder=DATA_PATH,
)
subset_2025_movers = subset_and_save(
    year=2025,
    census_path=census_2025_path,
    movers_ids=movers_ids,
    out_folder=DATA_PATH,
)


DATA_PATH: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data
Movers path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\movers.csv exists? True
Census 2024 path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2024_household_chn.csv exists? True
Census 2025 path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2025_household_chn.csv exists? True
Movers HH_ID count: 455
[2024] Total rows: 32972 | Subset rows: 455 | Unique HH_ID: 455
[2024] Saved subset to: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\2024subset.csv
[2025] Total rows: 32972 | Subset rows: 455 | Unique HH_ID: 455
[2025] Saved s

In [6]:
# Write the in-memory 2024 movers subset to <DATA_PATH>/2024subset.csv.
# NOTE(review): subset_and_save(2024, ..., DATA_PATH) writes to this same path,
# so this cell looks redundant unless the frame was changed in between - confirm.
subset_out_path = os.path.join(DATA_PATH, "2024subset.csv")
subset_2024_movers.to_csv(path_or_buf=subset_out_path, index=False)
print("Saved subset to:", subset_out_path)


Saved subset to: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\2024subset.csv
