In [3]:
import os
import pandas as pd
from dotenv import load_dotenv

# -----------------------------
# Environment + input locations
# -----------------------------
# Read the local .env file so DATA_PATH becomes visible to os.environ.
load_dotenv()

DATA_PATH = os.environ.get("DATA_PATH")
if not DATA_PATH:
    raise ValueError("DATA_PATH not found. Check your .env file and that load_dotenv() is running.")

# Movers file: must provide an HH_ID column (checked below).
movers_path = os.path.join(DATA_PATH, "moversd.csv")

# Census household files for the two years that get subset.
with_chn_path = os.path.join(DATA_PATH, "Microsimulations", "with_chn")
census_2024_path = os.path.join(with_chn_path, "census2024_household_chn.csv")
census_2025_path = os.path.join(with_chn_path, "census2025_household_chn.csv")

# -----------------------------
# Report what was resolved
# -----------------------------
print(f"DATA_PATH: {DATA_PATH}")
print(f"Movers path: {movers_path} exists? {os.path.exists(movers_path)}")
print(f"Census 2024 path: {census_2024_path} exists? {os.path.exists(census_2024_path)}")
print(f"Census 2025 path: {census_2025_path} exists? {os.path.exists(census_2025_path)}")

# Only the movers file is enforced here; the census files are checked
# inside subset_and_save when they are actually read.
if not os.path.exists(movers_path):
    raise FileNotFoundError(f"Movers file not found: {movers_path}")

# -----------------------------
# Movers HH_IDs as unique ints
# -----------------------------
movers_df = pd.read_csv(movers_path)

if "HH_ID" not in movers_df:
    raise ValueError("moversd.csv is missing required column: HH_ID")

# Unparseable IDs become NaN and are dropped before the int cast;
# repeated IDs are collapsed to a single entry.
movers_ids = (
    pd.to_numeric(movers_df["HH_ID"], errors="coerce")
      .dropna()
      .astype(int)
      .drop_duplicates()
)

print(f"Movers HH_ID count: {len(movers_ids)}")

# -----------------------------
# Helper: subset + save
# -----------------------------
def subset_and_save(year: int, census_path: str, movers_ids: pd.Series, out_folder: str) -> pd.DataFrame:
    """Keep the census rows whose HH_ID is in ``movers_ids`` and write them to CSV.

    The census file is read from ``census_path``. Rows are matched on a
    numeric view of ``HH_ID``; the ``HH_ID`` column itself is left exactly as
    it was read, so IDs are written back in their original form even when the
    column contains non-numeric entries.

    Parameters
    ----------
    year : int
        Used in log/error messages and in the output file name
        (``f"{year}subsetd.csv"``).
    census_path : str
        Path of the census CSV; must contain an ``HH_ID`` column.
    movers_ids : pd.Series
        Numeric household IDs to keep.
    out_folder : str
        Existing folder that receives the subset CSV.

    Returns
    -------
    pd.DataFrame
        A copy of the matching census rows.

    Raises
    ------
    FileNotFoundError
        If ``census_path`` does not exist.
    ValueError
        If the census file has no ``HH_ID`` column.
    """
    if not os.path.exists(census_path):
        raise FileNotFoundError(f"[{year}] Census file not found: {census_path}")

    df = pd.read_csv(census_path)

    if "HH_ID" not in df.columns:
        raise ValueError(f"[{year}] Census file missing required column: HH_ID")

    # Match on a separate numeric series instead of overwriting df["HH_ID"]:
    # coercing in place turns the column into float64 whenever any ID fails to
    # parse, which would rewrite every saved ID (e.g. 1 -> 1.0).
    hh_id_num = pd.to_numeric(df["HH_ID"], errors="coerce")
    is_mover = hh_id_num.isin(movers_ids)

    subset = df[is_mover].copy()

    out_path = os.path.join(out_folder, f"{year}subsetd.csv")
    subset.to_csv(out_path, index=False)

    # Count distinct IDs on the numeric view so the figure reflects the values
    # that were actually matched.
    print(f"[{year}] Total rows: {len(df)} | Subset rows: {len(subset)} | Unique HH_ID: {hh_id_num[is_mover].nunique()}")
    print(f"[{year}] Saved subset to: {out_path}")

    return subset

# -----------------------------
# Build the mover subsets (2024, 2025)
# -----------------------------
# Both years use the same movers list and write into DATA_PATH.
subset_2024_movers = subset_and_save(
    year=2024,
    census_path=census_2024_path,
    movers_ids=movers_ids,
    out_folder=DATA_PATH,
)
subset_2025_movers = subset_and_save(
    year=2025,
    census_path=census_2025_path,
    movers_ids=movers_ids,
    out_folder=DATA_PATH,
)


DATA_PATH: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data
Movers path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\moversd.csv exists? True
Census 2024 path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2024_household_chn.csv exists? True
Census 2025 path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2025_household_chn.csv exists? True
Movers HH_ID count: 176
[2024] Total rows: 32972 | Subset rows: 176 | Unique HH_ID: 176
[2024] Saved subset to: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\2024subsetd.csv
[2025] Total rows: 32972 | Subset rows: 176 | Unique HH_ID: 176
[2025] Saved

In [4]:
# Write the 2024 movers subset (built by subset_and_save) to a second file
# name in DATA_PATH, without the index column.
subset_out_path = os.path.join(DATA_PATH, "2024subset.csv")
subset_2024_movers.to_csv(path_or_buf=subset_out_path, index=False)

print(f"Saved subset to: {subset_out_path}")


Saved subset to: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\2024subset.csv


In [5]:
import os
import pandas as pd
from dotenv import load_dotenv

# -----------------------------
# Environment + input locations
# -----------------------------
# Read the local .env file so DATA_PATH becomes visible to os.environ.
load_dotenv()

DATA_PATH = os.environ.get("DATA_PATH")
if not DATA_PATH:
    raise ValueError("DATA_PATH not found. Check your .env file and that load_dotenv() is running.")

movers_path = os.path.join(DATA_PATH, "moversd.csv")

with_chn_path = os.path.join(DATA_PATH, "Microsimulations", "with_chn")
census_2025_path = os.path.join(with_chn_path, "census2025_household_chn.csv")

# -----------------------------
# Report + enforce that inputs exist
# -----------------------------
print(f"DATA_PATH: {DATA_PATH}")
print(f"Movers path: {movers_path} exists? {os.path.exists(movers_path)}")
print(f"Census 2025 path: {census_2025_path} exists? {os.path.exists(census_2025_path)}")

if not os.path.exists(movers_path):
    raise FileNotFoundError(f"Movers file not found: {movers_path}")
if not os.path.exists(census_2025_path):
    raise FileNotFoundError(f"Census 2025 file not found: {census_2025_path}")

# -----------------------------
# Movers HH_IDs as a set of ints
# -----------------------------
movers_df = pd.read_csv(movers_path)

if "HH_ID" not in movers_df:
    raise ValueError("moversd.csv is missing required column: HH_ID")

# Unparseable IDs become NaN and are dropped; duplicates are removed.
movers_ids = (
    pd.to_numeric(movers_df["HH_ID"], errors="coerce")
      .dropna()
      .astype(int)
      .drop_duplicates()
)

movers_set = set(movers_ids.tolist())
print(f"Movers HH_ID count: {len(movers_set)}")

# -----------------------------
# 2025 census with a 0/1 mover column
# -----------------------------
census_2025 = pd.read_csv(census_2025_path)

if "HH_ID" not in census_2025:
    raise ValueError("[2025] Census file missing required column: HH_ID")

# Matching uses a separate nullable-integer series, so the HH_ID column in
# the frame (and in the saved file) stays exactly as it was read.
hh_id_num = pd.to_numeric(census_2025["HH_ID"], errors="coerce").astype("Int64")

census_2025["mover"] = hh_id_num.isin(movers_set).astype(int)

# -----------------------------
# Write the full file including the flag
# -----------------------------
out_path = os.path.join(with_chn_path, "census2025_household_chn_with_mover.csv")
census_2025.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
print("Mover counts:")
print(census_2025["mover"].value_counts(dropna=False))


DATA_PATH: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data
Movers path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\moversd.csv exists? True
Census 2025 path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2025_household_chn.csv exists? True
Movers HH_ID count: 176
Saved: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2025_household_chn_with_mover.csv
Mover counts:
mover
0    32796
1      176
Name: count, dtype: int64


In [6]:
# -----------------------------
# Input locations
# -----------------------------
nonmovers_path = os.path.join(DATA_PATH, "nonmoversd.csv")
census_2024_path = os.path.join(with_chn_path, "census2024_household_chn.csv")

# Fail early, with the offending path in the message, if an input is absent.
if not os.path.exists(nonmovers_path):
    raise FileNotFoundError(f"Non-movers file not found: {nonmovers_path}")
if not os.path.exists(census_2024_path):
    raise FileNotFoundError(f"Census 2024 file not found: {census_2024_path}")

# -----------------------------
# Non-mover HH_IDs as a set of ints
# -----------------------------
nonmovers_df = pd.read_csv(nonmovers_path)

if "HH_ID" not in nonmovers_df:
    raise ValueError("nonmoversd.csv is missing required column: HH_ID")

# Unparseable IDs become NaN and are dropped; duplicates are removed.
nonmover_ids = (
    pd.to_numeric(nonmovers_df["HH_ID"], errors="coerce")
      .dropna()
      .astype(int)
      .drop_duplicates()
)

nonmover_set = set(nonmover_ids.tolist())

print(f"Non-mover HH_ID count: {len(nonmover_set)}")

# -----------------------------
# 2024 census rows for those IDs
# -----------------------------
census_2024 = pd.read_csv(census_2024_path)

if "HH_ID" not in census_2024:
    raise ValueError("[2024] Census file missing required column: HH_ID")

# Separate nullable-integer series used only for matching; the HH_ID column
# in census_2024 is not modified.
hh_id_num = pd.to_numeric(census_2024["HH_ID"], errors="coerce").astype("Int64")

subset_2024_nonmovers = census_2024.loc[hh_id_num.isin(nonmover_set)].copy()

# -----------------------------
# Write the subset
# -----------------------------
out_path = os.path.join(with_chn_path, "census2024_household_chn_nonmovers.csv")
subset_2024_nonmovers.to_csv(out_path, index=False)

print(f"Saved: {out_path}")
print(
    f"[2024] Total rows: {len(census_2024)} "
    f"| Subset rows: {len(subset_2024_nonmovers)} "
    f"| Unique HH_ID: {subset_2024_nonmovers['HH_ID'].nunique()}"
)


Non-mover HH_ID count: 2214
Saved: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2024_household_chn_nonmovers.csv
[2024] Total rows: 32972 | Subset rows: 2214 | Unique HH_ID: 2214


In [7]:
def add_mover_flag(
    census_path: str,
    movers_path: str,
    out_path: str,
    year: int
) -> pd.DataFrame:
    """Add a 0/1 ``mover`` column to a census file and save the result.

    A census row is flagged 1 when the numeric value of its ``HH_ID`` appears
    among the ``HH_ID`` values of the movers file, otherwise 0. Census IDs
    that are missing, non-numeric, or not whole numbers are flagged 0 rather
    than aborting the run. The census ``HH_ID`` column itself is not modified.

    Parameters
    ----------
    census_path : str
        Census CSV to read; must contain an ``HH_ID`` column.
    movers_path : str
        Movers CSV to read; must contain an ``HH_ID`` column.
    out_path : str
        Destination CSV for the census rows plus the ``mover`` column.
    year : int
        Used only in log and error messages.

    Returns
    -------
    pd.DataFrame
        The census frame with the added ``mover`` column.

    Raises
    ------
    FileNotFoundError
        If either input file does not exist.
    ValueError
        If either input file has no ``HH_ID`` column.
    """
    if not os.path.exists(census_path):
        raise FileNotFoundError(f"[{year}] Census file not found: {census_path}")
    if not os.path.exists(movers_path):
        raise FileNotFoundError(f"Movers file not found: {movers_path}")

    # Load movers
    movers_df = pd.read_csv(movers_path)
    if "HH_ID" not in movers_df.columns:
        # Name the file that was actually passed in (identical text for
        # moversd.csv, correct for any other movers file).
        raise ValueError(f"{os.path.basename(movers_path)} missing HH_ID column")

    # Explicit int64 so the ID width does not depend on the platform's
    # default integer size.
    movers_set = set(
        pd.to_numeric(movers_df["HH_ID"], errors="coerce")
          .dropna()
          .astype("int64")
          .tolist()
    )

    # Load census
    df = pd.read_csv(census_path)
    if "HH_ID" not in df.columns:
        raise ValueError(f"[{year}] Census file missing HH_ID column")

    # Compare on the plain numeric series. Casting to nullable "Int64" first
    # raises TypeError as soon as one ID is a non-integer number (e.g. 2.5),
    # which would abort the whole file; here such rows simply do not match.
    hh_id_num = pd.to_numeric(df["HH_ID"], errors="coerce")

    # Assign mover flag
    df["mover"] = hh_id_num.isin(movers_set).astype(int)

    # Save
    df.to_csv(out_path, index=False)

    print(
        f"[{year}] Saved with mover flag:",
        out_path,
        "| movers flagged:", df["mover"].sum()
    )

    return df


In [8]:
# Inputs
movers_path = os.path.join(DATA_PATH, "moversd.csv")

census_2024_path = os.path.join(with_chn_path, "census2024_household_chn.csv")
census_2025_path = os.path.join(with_chn_path, "census2025_household_chn.csv")

# Outputs (written next to the input census files)
out_2024 = os.path.join(with_chn_path, "census2024_household_chn_with_mover.csv")
out_2025 = os.path.join(with_chn_path, "census2025_household_chn_with_mover.csv")

# Flag and save each year with the same movers list
census_2024 = add_mover_flag(
    census_path=census_2024_path,
    movers_path=movers_path,
    out_path=out_2024,
    year=2024,
)

census_2025 = add_mover_flag(
    census_path=census_2025_path,
    movers_path=movers_path,
    out_path=out_2025,
    year=2025,
)


[2024] Saved with mover flag: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2024_household_chn_with_mover.csv | movers flagged: 176
[2025] Saved with mover flag: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2025_household_chn_with_mover.csv | movers flagged: 176


In [9]:
# -----------------------------
# Input locations
# -----------------------------
nonmovers_path = os.path.join(DATA_PATH, "nonmoversd.csv")
census_2024_path = os.path.join(with_chn_path, "census2024_household_chn.csv")

# Fail early, with the offending path in the message, if an input is absent.
if not os.path.exists(nonmovers_path):
    raise FileNotFoundError(f"Non-movers file not found: {nonmovers_path}")

if not os.path.exists(census_2024_path):
    raise FileNotFoundError(f"Census 2024 file not found: {census_2024_path}")

# -----------------------------
# Non-mover HH_IDs as a set of ints
# -----------------------------
nonmovers_df = pd.read_csv(nonmovers_path)

if "HH_ID" not in nonmovers_df:
    raise ValueError("nonmoversd.csv is missing required column: HH_ID")

# Unparseable IDs become NaN and are dropped; duplicates are removed.
nonmovers_ids = (
    pd.to_numeric(nonmovers_df["HH_ID"], errors="coerce")
      .dropna()
      .astype(int)
      .drop_duplicates()
)

nonmovers_set = set(nonmovers_ids.tolist())

print(f"Non-movers HH_ID count: {len(nonmovers_set)}")

# -----------------------------
# 2024 census rows for those IDs
# -----------------------------
census_2024 = pd.read_csv(census_2024_path)

if "HH_ID" not in census_2024:
    raise ValueError("[2024] Census file missing required column: HH_ID")

# Separate nullable-integer series used only for matching; the HH_ID column
# in census_2024 is not modified.
hh_id_num = pd.to_numeric(census_2024["HH_ID"], errors="coerce").astype("Int64")

subset_2024_nonmovers = census_2024.loc[hh_id_num.isin(nonmovers_set)].copy()

# -----------------------------
# Write the subset
# -----------------------------
out_path = os.path.join(with_chn_path, "census2024_household_chn_nonmovers.csv")

subset_2024_nonmovers.to_csv(out_path, index=False)

print(
    f"[2024] Total rows: {len(census_2024)} "
    f"| Subset rows: {len(subset_2024_nonmovers)} "
    f"| Unique HH_ID: {subset_2024_nonmovers['HH_ID'].nunique()}"
)
print(f"Saved: {out_path}")


Non-movers HH_ID count: 2214
[2024] Total rows: 32972 | Subset rows: 2214 | Unique HH_ID: 2214
Saved: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2024_household_chn_nonmovers.csv


In [10]:
# -----------------------------
# Input locations
# -----------------------------
chn2021_path = os.path.join(DATA_PATH, "2021chnhouseholds.csv")
census_2022_path = os.path.join(with_chn_path, "census2022_household_chn.csv")

# Fail early, with the offending path in the message, if an input is absent.
if not os.path.exists(chn2021_path):
    raise FileNotFoundError(f"2021 CHN file not found: {chn2021_path}")

if not os.path.exists(census_2022_path):
    raise FileNotFoundError(f"Census 2022 file not found: {census_2022_path}")

# -----------------------------
# 2021 CHN HH_IDs as a set of ints
# -----------------------------
chn2021_df = pd.read_csv(chn2021_path)

if "HH_ID" not in chn2021_df:
    raise ValueError("2021chnhouseholds.csv is missing required column: HH_ID")

# Unparseable IDs become NaN and are dropped; duplicates are removed.
chn2021_ids = (
    pd.to_numeric(chn2021_df["HH_ID"], errors="coerce")
      .dropna()
      .astype(int)
      .drop_duplicates()
)

chn2021_set = set(chn2021_ids.tolist())

print(f"2021 CHN HH_ID count: {len(chn2021_set)}")

# -----------------------------
# 2022 census rows for those IDs
# -----------------------------
census_2022 = pd.read_csv(census_2022_path)

if "HH_ID" not in census_2022:
    raise ValueError("[2022] Census file missing required column: HH_ID")

# Separate nullable-integer series used only for matching; the HH_ID column
# in census_2022 is not modified.
hh_id_num_2022 = pd.to_numeric(census_2022["HH_ID"], errors="coerce").astype("Int64")

subset_2022_from_2021chn = census_2022.loc[hh_id_num_2022.isin(chn2021_set)].copy()

# -----------------------------
# Write the subset
# -----------------------------
out_path = os.path.join(with_chn_path, "census2022_household_chn_from_2021chn.csv")

subset_2022_from_2021chn.to_csv(out_path, index=False)

print(
    f"[2022] Total rows: {len(census_2022)} "
    f"| Subset rows: {len(subset_2022_from_2021chn)} "
    f"| Unique HH_ID: {subset_2022_from_2021chn['HH_ID'].nunique()}"
)
print(f"Saved: {out_path}")


2021 CHN HH_ID count: 6814
[2022] Total rows: 32972 | Subset rows: 6814 | Unique HH_ID: 6814
Saved: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2022_household_chn_from_2021chn.csv


In [11]:
# censush.csv sits under the Statistics Canada / Census 2021 / Hierarchical
# folders inside DATA_PATH.
censush_path = os.path.join(
    DATA_PATH, "Statistics Canada", "Census 2021", "Hierarchical", "censush.csv"
)

print(f"CensusH path: {censush_path} exists? {os.path.exists(censush_path)}")

# Raise with the resolved path in the message before attempting the read.
if not os.path.exists(censush_path):
    raise FileNotFoundError(f"censush.csv not found: {censush_path}")

censush = pd.read_csv(censush_path)


CensusH path: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Statistics Canada\Census 2021\Hierarchical\censush.csv exists? True


In [14]:
# -----------------------------
# Required columns
# -----------------------------
required = {"PP_ID", "AGEGRP", "WEIGHT", "PR"}
missing = required - set(censush.columns)
if missing:
    raise ValueError(f"censush missing columns: {missing}")

# -----------------------------
# Keep rows with PR == 35 (Ontario)
# -----------------------------
censush_on = censush.loc[censush["PR"] == 35].copy()

print(f"Rows after PR==35 filter: {len(censush_on)}")

# -----------------------------
# Usable rows: AGEGRP present, WEIGHT present and numeric
# -----------------------------
# Rows lacking AGEGRP or WEIGHT go first; WEIGHT is then coerced to a number
# and any row whose weight could not be parsed is removed as well.
censush2 = (
    censush_on
    .dropna(subset=["AGEGRP", "WEIGHT"])
    .assign(WEIGHT=lambda frame: pd.to_numeric(frame["WEIGHT"], errors="coerce"))
    .dropna(subset=["WEIGHT"])
)

# -----------------------------
# Per-AGEGRP totals
# -----------------------------
# weighted_population: sum of WEIGHT; unweighted_n: distinct PP_ID count.
age_dist = censush2.groupby("AGEGRP", as_index=False).agg(
    weighted_population=pd.NamedAgg(column="WEIGHT", aggfunc="sum"),
    unweighted_n=pd.NamedAgg(column="PP_ID", aggfunc="nunique"),
)

# Each group's fraction of the summed weights.
age_dist["share"] = age_dist["weighted_population"].div(
    age_dist["weighted_population"].sum()
)

# -----------------------------
# Order by the numeric value of AGEGRP
# -----------------------------
# A temporary numeric key drives the sort (AGEGRP itself breaks ties) and is
# removed again afterwards.
age_dist = (
    age_dist
    .assign(AGEGRP_sort=pd.to_numeric(age_dist["AGEGRP"], errors="coerce"))
    .sort_values(["AGEGRP_sort", "AGEGRP"])
    .drop(columns=["AGEGRP_sort"])
)

age_dist


Rows after PR==35 filter: 139734


Unnamed: 0,AGEGRP,weighted_population,unweighted_n,share
0,1,1333345.0,13278,0.095023
1,2,707241.2,7043,0.050403
2,3,708747.5,7058,0.05051
3,4,797717.5,7944,0.056851
4,5,886486.6,8828,0.063177
5,6,933381.7,9295,0.066519
6,7,874938.6,8713,0.062354
7,8,824328.1,8209,0.058747
8,9,827742.3,8243,0.058991
9,10,880160.3,8765,0.062726


In [15]:
# Excel-friendly copy of age_dist: the fractional "share" column is replaced
# by "share_pct" (share * 100, rounded to 2 decimals).
excel_ready = (
    age_dist
    .assign(share_pct=age_dist["share"].mul(100).round(2))
    .drop(columns=["share"])
)

# Tab-separated text, without the index, for pasting into a spreadsheet.
print(excel_ready.to_csv(sep="\t", index=False))


AGEGRP	weighted_population	unweighted_n	share_pct
1	1333344.9955818378	13278	9.5
2	707241.2113181867	7043	5.04
3	708747.4754342982	7058	5.05
4	797717.4758926133	7944	5.69
5	886486.6411354468	8828	6.32
6	933381.6639503827	9295	6.65
7	874938.6162452593	8713	6.24
8	824328.1419439152	8209	5.87
9	827742.3406071011	8243	5.9
10	880160.3318477789	8765	6.27
11	1924704.2875671852	19167	13.72
12	1441796.0119418607	14358	10.28
13	975356.2239860212	9713	6.95
88	915808.5825957494	9120	6.53



In [16]:
# Sum of WEIGHT over the cleaned PR == 35 rows.
total_weight_on = censush2.loc[:, "WEIGHT"].sum()

print(f"Total weighted population (PR = 35): {total_weight_on!s}")


Total weighted population (PR = 35): 14031754.00004763
