In [28]:

import pandas as pd
import os
import statsmodels.api as sm
import numpy as np
from dotenv import load_dotenv

In [29]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# DATA_PATH is the root data directory. Fail fast with a readable message when
# it is missing instead of letting os.path.join raise a TypeError on None.
data_dir = os.getenv("DATA_PATH")
if data_dir is None:
    raise EnvironmentError(
        "DATA_PATH is not set. Define it in the environment or in a .env file."
    )

# All inputs and outputs of this notebook live under <DATA_PATH>/Microsimulations.
folder_path = os.path.join(data_dir, "Microsimulations")

In [30]:
# Define input and output paths
input_base_path = os.path.join(folder_path, "household")
output_base_path = os.path.join(folder_path, "with_chn")

# Ensure output directory exists
os.makedirs(output_base_path, exist_ok=True)

# Columns the need flags are derived from. 'stir' is NOT created in this cell,
# so it has to be present in the input files already.
CHN_INPUT_COLUMNS = {
    'SHELCO', 'totalincome', 'NOS', 'REPAIR', 'mmr',
    'student_household', 'non_family_household', 'stir',
}


def flag_housing_need(census_df, threshold):
    """Return a 0/1 Series flagging households in housing need at `threshold`.

    A household is flagged when all of the following hold:
      * it has a housing issue: SHELCO * 12 / totalincome > threshold
        (unaffordable), NOS == 0 (unsuitable) or REPAIR == 3 (inadequate);
      * market rent is unaffordable: mmr * 12 > threshold * totalincome;
      * it is not both a student household and a non-family household;
      * its 'stir' value is not >= 1.

    Parameters
    ----------
    census_df : pandas.DataFrame
        Household records containing the CHN_INPUT_COLUMNS.
    threshold : float
        Income share used for both affordability tests (0.30 for chn,
        0.50 for dchn).

    Returns
    -------
    pandas.Series
        Integer 0/1 flags aligned with `census_df`.
    """
    housing_issue = (
        (census_df['SHELCO'] * 12 / census_df['totalincome'] > threshold) |  # Unaffordable
        (census_df['NOS'] == 0) |  # Unsuitable
        (census_df['REPAIR'] == 3)  # Inadequate
    )
    market_unaffordable = census_df['mmr'] * 12 > threshold * census_df['totalincome']
    student_non_family = (
        (census_df['student_household'] == 1) &
        (census_df['non_family_household'] == 1)
    )
    # Households with stir >= 1 are never flagged.
    stir_at_least_one = census_df['stir'] >= 1

    flagged = housing_issue & market_unaffordable & ~student_non_family & ~stir_at_least_one
    return flagged.astype(int)


# Loop through the years 2022 to 2030
for year in range(2022, 2031):
    input_file_path = os.path.join(input_base_path, f"census{year}_household.csv")
    output_file_path = os.path.join(output_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(input_file_path):
        print(f"File not found: {input_file_path}")
        continue

    census_df = pd.read_csv(input_file_path)

    # Report missing inputs by name instead of failing with a bare KeyError.
    missing_columns = CHN_INPUT_COLUMNS - set(census_df.columns)
    if missing_columns:
        print(f"Skipping {year} due to missing columns: {missing_columns}")
        continue

    # Core housing need (30% threshold) and deep core housing need (50% threshold)
    census_df['chn'] = flag_housing_need(census_df, 0.30)
    census_df['dchn'] = flag_housing_need(census_df, 0.50)

    # Export updated data
    census_df.to_csv(output_file_path, index=False)

    print(f"Updated CHN/DCHN values for {year}")

Updated CHN values and added stir/alt_stir for 2022
Updated CHN values and added stir/alt_stir for 2023
Updated CHN values and added stir/alt_stir for 2024
Updated CHN values and added stir/alt_stir for 2025
Updated CHN values and added stir/alt_stir for 2026
Updated CHN values and added stir/alt_stir for 2027
Updated CHN values and added stir/alt_stir for 2028
Updated CHN values and added stir/alt_stir for 2029
Updated CHN values and added stir/alt_stir for 2030


In [31]:
# Add the 2021 census file to the with_chn folder

# Source and destination folders
input_base_path = os.path.join(folder_path, "household")
output_base_path = os.path.join(folder_path, "with_chn")

# 2021 file names and their full paths
input_file_name = "census2021_household.csv"
output_file_name = "census2021_household_chn.csv"
input_file_path = os.path.join(input_base_path, input_file_name)
output_file_path = os.path.join(output_base_path, output_file_name)

# Read the 2021 households and derive the ratio / net-income columns in one pass.
# Keyword order matters: netinc reads the netshare column created just before it.
df = pd.read_csv(input_file_path).assign(
    # annual SHELCO and annual mmr, each as a share of totalincome
    stir=lambda d: d['SHELCO'] * 12 / d['totalincome'],
    alt_stir=lambda d: (d['mmr']) * 12 / d['totalincome'],
    # net income share, capped at 1.0 (no lower bound is applied)
    netshare=lambda d: (d['TOTINC_AT'] / d['totalincome']).clip(upper=1.0),
    # net income
    netinc=lambda d: d['totalincome'] * d['netshare'],
)

# Save the modified dataframe
df.to_csv(output_file_path, index=False)

print(f"File saved to: {output_file_path}")

File saved to: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2021_household_chn.csv


In [32]:

# Define the path for processed files
output_base_path = os.path.join(folder_path, "with_chn")

# Weighted household counts per year (HCORENEED_IND == 888 excluded)
chn_weighted_counts = {}
dchn_weighted_counts = {}

# Total WEIGHT per year over all rows, collected in the same pass so that each
# CSV is read only once.
total_weights = {}

# Loop through years 2021 to 2030
for year in range(2021, 2031):
    file_path = os.path.join(output_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # Load the data
    census_df = pd.read_csv(file_path)

    # The total is taken before any filtering, and even when the CHN columns
    # are missing, as long as WEIGHT itself is available.
    if 'WEIGHT' in census_df.columns:
        total_weights[year] = census_df['WEIGHT'].sum()

    # Check if required columns exist
    required_columns = {'chn', 'dchn', 'WEIGHT', 'HCORENEED_IND'}
    missing_columns = required_columns - set(census_df.columns)

    if missing_columns:
        print(f"Skipping {year} due to missing columns: {missing_columns}")
        continue  # Skip processing this file

    # Exclude households where HCORENEED_IND == 888
    filtered_df = census_df[census_df['HCORENEED_IND'] != 888]

    # Weighted count of households where chn == 1
    chn_weight = filtered_df.loc[filtered_df['chn'] == 1, 'WEIGHT'].sum()
    chn_weighted_counts[year] = chn_weight

    # Weighted count of households where dchn == 1
    dchn_weight = filtered_df.loc[filtered_df['dchn'] == 1, 'WEIGHT'].sum()
    dchn_weighted_counts[year] = dchn_weight

    print(f"[{year}] CHN weighted count: {chn_weight}, DCHN weighted count: {dchn_weight}")

print("\nTotal household weights per year:")

# Print the sum of WEIGHT for each year
for weight_year, total_weight in total_weights.items():
    print(f"Sum of WEIGHT for {weight_year}: {total_weight}")


[2021] CHN weighted count: 684245.5791455524, DCHN weighted count: 154141.0278820697
[2022] CHN weighted count: 768459.7971429066, DCHN weighted count: 189153.10440888445
[2023] CHN weighted count: 878051.8715488891, DCHN weighted count: 235515.96500592493
[2024] CHN weighted count: 918123.9496870566, DCHN weighted count: 250291.51886131393
[2025] CHN weighted count: 942574.9216363034, DCHN weighted count: 260233.4142954576
[2026] CHN weighted count: 962076.6157002894, DCHN weighted count: 266639.7399537616
[2027] CHN weighted count: 958460.4392855032, DCHN weighted count: 266588.1096230006
[2028] CHN weighted count: 970296.3275081685, DCHN weighted count: 271314.65747217473
[2029] CHN weighted count: 977263.0225034186, DCHN weighted count: 272381.90853394964
[2030] CHN weighted count: 987783.0503600006, DCHN weighted count: 275454.0300173497

Total household weights per year:
Sum of WEIGHT for 2021: 3310969.362428404
Sum of WEIGHT for 2022: 3393369.992059277
Sum of WEIGHT for 2023: 35

In [33]:

# Year whose person-level census file is summarised
year = 2021
file_path = os.path.join(folder_path, f"census{year}.csv")

# Nothing happens (and nothing is reported) when the file is absent
if os.path.exists(file_path):
    df = pd.read_csv(file_path)

    # jobless = 1 when LFACT is in the inclusive range 3..10, else 0
    lfact = df["LFACT"]
    df["jobless"] = ((lfact >= 3) & (lfact <= 10)).astype(int)

    # Two-way split on IMMSTAT: code 3 versus everything else
    immstat_group = df["IMMSTAT"].apply(
        lambda code: "IMMSTAT_3" if code == 3 else "IMMSTAT_not_3"
    )

    # Share of records with jobless == 1 for each AGEGRP x IMMSTAT group
    jobless_share = df.groupby(["AGEGRP", immstat_group])["jobless"].mean()
    summary = jobless_share.reset_index()

    # Save summary to a CSV file
    output_path = os.path.join(folder_path, f"census_share_{year}.csv")
    summary.to_csv(output_path, index=False)


In [34]:

# Folder holding the *_household_chn.csv files for every year
input_base_path = os.path.join(folder_path, "with_chn")

# Recompute the 2021 net income share (capped at 1.0) from the saved 2021 file
census2021_household = pd.read_csv(os.path.join(input_base_path, "census2021_household_chn.csv"))
income_ratio_2021 = census2021_household['TOTINC_AT'] / census2021_household['totalincome']
netshare_2021 = income_ratio_2021.clip(upper=1.0)

# Apply the 2021 shares to years 2022-2030
for year in range(2022, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    # Positional assignment: row i of this year receives row i of 2021.
    # This assumes every year has the same row order and row count as 2021.
    census_df['netshare'] = netshare_2021.values
    census_df['netinc'] = census_df['totalincome'] * census_df['netshare']

    # Overwrite the yearly file in place
    census_df.to_csv(file_path, index=False)
    print(f"✅ Updated netshare and netinc for {year}")

✅ Updated netshare and netinc for 2022
✅ Updated netshare and netinc for 2023
✅ Updated netshare and netinc for 2024
✅ Updated netshare and netinc for 2025
✅ Updated netshare and netinc for 2026
✅ Updated netshare and netinc for 2027
✅ Updated netshare and netinc for 2028
✅ Updated netshare and netinc for 2029
✅ Updated netshare and netinc for 2030


In [35]:

input_base_path = os.path.join(folder_path, "with_chn")

for year in range(2021, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    # The gap is only computed where CHN = 1; everyone else keeps 0.0
    chn_condition = census_df['chn'] == 1

    # SHELCO is the cost basis when NOS == 1, REPAIR != 3 and SHELCO < mmr
    shelco_applies = (
        (census_df['NOS'] == 1) &
        (census_df['REPAIR'] != 3) &
        (census_df['SHELCO'] < census_df['mmr'])
    )
    use_shelco = chn_condition & shelco_applies

    # Every other CHN household uses AMR (mmr) as the cost basis
    use_amr = chn_condition & ~use_shelco

    # Annual cost minus 30% of total income, for each cost basis
    shelco_gap = census_df['SHELCO'] * 12 - 0.3 * census_df['totalincome']
    amr_gap = (census_df['mmr']) * 12 - 0.3 * census_df['totalincome']

    census_df['gap'] = 0.0
    census_df.loc[use_shelco, 'gap'] = shelco_gap[use_shelco]
    census_df.loc[use_amr, 'gap'] = amr_gap[use_amr]

    # Overwrite the yearly file in place
    census_df.to_csv(file_path, index=False)
    print(f"✅ Calculated gap for {year} using updated CHN conditions")

✅ Calculated gap for 2021 using updated CHN conditions
✅ Calculated gap for 2022 using updated CHN conditions
✅ Calculated gap for 2023 using updated CHN conditions
✅ Calculated gap for 2024 using updated CHN conditions
✅ Calculated gap for 2025 using updated CHN conditions
✅ Calculated gap for 2026 using updated CHN conditions
✅ Calculated gap for 2027 using updated CHN conditions
✅ Calculated gap for 2028 using updated CHN conditions
✅ Calculated gap for 2029 using updated CHN conditions
✅ Calculated gap for 2030 using updated CHN conditions


In [36]:
# After assigning gaps: sanity check comparing the number of rows with chn == 1
# against the number of rows with a positive gap.
# NOTE: `year` and `census_df` are not defined in this cell; they are whatever the
# previously executed loop left behind, so this only reports that loop's final year.
print(f"{year}: CHN=1 count: {census_df['chn'].sum()}, GAP > 0 count: {(census_df['gap'] > 0).sum()}")


2030: CHN=1 count: 8505, GAP > 0 count: 8505


In [37]:
import os
import pandas as pd

# NPR household IDs, read from a CSV with an HH_ID column
npr_hh_path = os.path.join(folder_path, "npr_household_ids.csv")
npr_hh_df = pd.read_csv(npr_hh_path)
npr_household_ids = npr_hh_df['HH_ID'].tolist()

input_base_path = os.path.join(folder_path, "with_chn")

for year in range(2021, 2031):  # 2021 to 2030 inclusive
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    # nprhh = 1 when the household ID is in the imported NPR list.
    # Without an HH_ID column every household gets 0.
    if 'HH_ID' not in census_df.columns:
        print(f"⚠️ HH_ID column not found in {year} dataset.")
        census_df['nprhh'] = 0
    else:
        census_df['nprhh'] = census_df['HH_ID'].isin(npr_household_ids).astype(int)

    # cohb defaults to 0.0 and is only filled for qualifying households
    census_df['cohb'] = 0.0

    # Qualifying households: renter (TENUR == 2), in core housing need, stir > 0.3
    condition = (
        (census_df['TENUR'] == 2) &
        (census_df['chn'] == 1) &
        (census_df['stir'] > 0.3)
    )
    eligible = census_df.loc[condition]

    # Eligible annual cost is the larger of 80% of annual mmr and annual SHELCO,
    # with SHELCO capped at 100% of annual mmr. Missing values are skipped by max().
    mmr_80 = 0.8 * 12 * eligible['mmr']
    shelco_100_capped = (12 * eligible['SHELCO']).clip(upper=(12 * eligible['mmr']))
    eligible_cost = pd.concat([mmr_80, shelco_100_capped], axis=1).max(axis=1)

    # cohb = eligible cost minus 30% of net income, floored at zero
    netinc_30 = 0.3 * eligible['netinc']
    cohb_values = eligible_cost - netinc_30
    census_df.loc[condition, 'cohb'] = cohb_values.clip(lower=0)

    # Report: nprhh == 1, chn == 1, and HCORENEED_IND != 888
    filter_condition = (
        (census_df['nprhh'] == 1) &
        (census_df['chn'] == 1) &
        (census_df['HCORENEED_IND'] != 888)
    )

    if 'WEIGHT' in census_df.columns:
        weighted_count = census_df.loc[filter_condition, 'WEIGHT'].sum()
        print(f"📅 {year}: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: {weighted_count:,.0f}")
    else:
        print(f"⚠️ WEIGHT column missing in {year} data.")

    # Overwrite the yearly file in place
    census_df.to_csv(file_path, index=False)
    print(f"✅ Finished processing {year}")


📅 2021: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 27,916
✅ Finished processing 2021
📅 2022: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 33,665
✅ Finished processing 2022
📅 2023: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 51,530
✅ Finished processing 2023
📅 2024: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 67,545
✅ Finished processing 2024
📅 2025: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 62,548
✅ Finished processing 2025
📅 2026: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 57,076
✅ Finished processing 2026
📅 2027: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 49,609
✅ Finished processing 2027
📅 2028: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 49,606
✅ Finished processing 2028
📅 2029: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 49,920
✅ Finished processing 2029
📅

In [38]:
import os
import pandas as pd


def _weighted_mean(frame, column):
    """WEIGHT-weighted mean of `column` in `frame`; 0 when `frame` has no rows."""
    if frame.empty:
        return 0
    return (frame[column] * frame['WEIGHT']).sum() / frame['WEIGHT'].sum()


mean_cohb = {}
mean_gap = {}
mean_income = {}  # weighted average income per year

needed_columns = ['cohb', 'gap', 'WEIGHT', 'HCORENEED_IND', 'totalincome']

# NOTE: input_base_path is not set in this cell; it comes from an earlier cell.
for year in range(2021, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for {year}")
        continue

    census_df_full = pd.read_csv(file_path)  # full, unfiltered

    if not all(col in census_df_full.columns for col in needed_columns):
        print(f"❌ Missing columns in {year}, skipping.")
        continue

    # COHB and GAP means exclude HCORENEED_IND == 888 and use positive values only
    census_df = census_df_full[census_df_full['HCORENEED_IND'] != 888].copy()
    mean_cohb[year] = _weighted_mean(census_df[census_df['cohb'] > 0], 'cohb')
    mean_gap[year] = _weighted_mean(census_df[census_df['gap'] > 0], 'gap')

    # The income mean uses the full, unfiltered dataframe
    mean_income[year] = _weighted_mean(census_df_full, 'totalincome')

# Print results
print("\n📊 Weighted Mean COHB (for values > 0, excluding HCORENEED_IND == 888):")
for year, val in mean_cohb.items():
    print(f"{year}: {val:.2f}")

print("\n📊 Weighted Mean GAP (for values > 0, excluding HCORENEED_IND == 888):")
for year, val in mean_gap.items():
    print(f"{year}: {val:.2f}")

print("\n📊 Weighted Mean Income (ALL households, INCLUDING HCORENEED_IND == 888):")
for year, val in mean_income.items():
    print(f"{year}: ${val:,.2f}")



📊 Weighted Mean COHB (for values > 0, excluding HCORENEED_IND == 888):
2021: 4401.67
2022: 4919.80
2023: 5625.75
2024: 6019.27
2025: 6287.90
2026: 6476.86
2027: 6539.40
2028: 6670.99
2029: 6723.64
2030: 6829.38

📊 Weighted Mean GAP (for values > 0, excluding HCORENEED_IND == 888):
2021: 3783.17
2022: 4105.45
2023: 4540.28
2024: 4840.74
2025: 5028.76
2026: 5155.58
2027: 5230.50
2028: 5342.70
2029: 5409.72
2030: 5513.75

📊 Weighted Mean Income (ALL households, INCLUDING HCORENEED_IND == 888):
2021: $59,184.27
2022: $60,507.28
2023: $61,719.46
2024: $64,771.23
2025: $66,716.81
2026: $68,422.17
2027: $70,485.50
2028: $72,256.10
2029: $74,023.55
2030: $75,798.59


In [39]:
# Trace file: follow one row position through every yearly file

# Folder path
base_path = os.path.join(folder_path, "with_chn")

# Years to process
years = range(2021, 2031)

# Row number to extract (0-based index)
target_row = 51  # Change this to any row index you want

# List to store selected rows
selected_rows = []

for year in years:
    file_name = f"census{year}_household_chn.csv"
    file_path = os.path.join(base_path, file_name)

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    df = pd.read_csv(file_path)
    if len(df) > target_row:
        # Positional lookup: the same row index is taken from every year
        selected_rows.append(df.iloc[target_row])
    else:
        print(f"File {file_name} has less than {target_row + 1} rows.")

# Combine and save the trace file
if selected_rows:
    chn_trace_df = pd.DataFrame(selected_rows)
    # The file name uses target_row + 2 — presumably the spreadsheet row number
    # (1-based, plus the header line); TODO confirm. The name is built once so
    # the message below always reports the file that was actually written.
    trace_file_name = f"chn_trace_row{target_row + 2}.csv"
    output_path = os.path.join(base_path, trace_file_name)
    chn_trace_df.to_csv(output_path, index=False)
    print(f"{trace_file_name} created at {output_path}")
else:
    print("No data found to create trace file.")

chn_trace_row52.csv created at C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\chn_trace_row53.csv


In [40]:


# Load the data
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2021_household_chn.csv"))

# Check required columns
required_cols = ['netinc', 'totalincome', 'WEIGHT', 'quintile']
if all(col in df.columns for col in required_cols):
    # netshare = netinc / totalincome. Infinite ratios are turned into NaN in
    # this column only, so the dropna below removes them together with missing
    # ratios while every other column keeps its values and dtype.
    df['netshare'] = (df['netinc'] / df['totalincome']).replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=['netshare'])

    # Weighted average netshare per quintile. The value columns are selected
    # explicitly so apply() does not operate on the grouping column, which is
    # deprecated in recent pandas versions.
    summary = (
        df.groupby('quintile')[['netshare', 'WEIGHT']]
        .apply(lambda g: (g['netshare'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
        .reset_index(name='weighted_netshare')
    )

    # Format output
    summary['weighted_netshare'] = summary['weighted_netshare'].round(4)
    print("\n✅ Weighted Average Netshare by Quintile:\n")
    print(summary)
else:
    print("❌ Missing required columns: netinc, totalincome, WEIGHT, or quintile.")



✅ Weighted Average Netshare by Quintile:

   quintile  weighted_netshare
0         1             0.9442
1         2             0.9152
2         3             0.8822


  .apply(lambda g: (g['netshare'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())


In [41]:
# Ten rows with the smallest netshare, shown with the incomes they derive from.
# NOTE: `df` is not loaded in this cell; it is the frame left by an earlier cell.
income_columns = ['totalincome', 'netinc', 'netshare']
lowest_netshare = df[income_columns].sort_values(by='netshare').head(10)
print(lowest_netshare)


       totalincome   netinc   netshare
5860           401  -7000.0 -17.456359
30378         1000  -6000.0  -6.000000
25865         4400 -23000.0  -5.227273
8073         11300 -50000.0  -4.424779
5262          1000  -4000.0  -4.000000
16587        13400 -53000.0  -3.955224
24238          801  -3000.0  -3.745318
9991          6800 -20000.0  -2.941176
7770         11900 -30000.0  -2.521008
4357          4500 -11000.0  -2.444444


Ignore for now: COHB/affordable housing program analysis

In [42]:

import pandas as pd

# Destination of the 2024 subset written at the end of this cell
output_path = os.path.join(folder_path, "with_chn", "subset_2024.csv")

# Load the 2024 household file
census_2024 = pd.read_csv(os.path.join(folder_path, "with_chn", "census2024_household_chn.csv"))

# Keep rows with TENUR == 2, chn == 1 and SUBSIDY == 0
is_renter = census_2024['TENUR'] == 2
in_core_need = census_2024['chn'] == 1
no_subsidy = census_2024['SUBSIDY'] == 0
df = census_2024[is_renter & in_core_need & no_subsidy].copy()




def select_households_below_weighted_avg(df, target_avg=20000):
    """Select the lowest-income households whose weighted mean income is closest to a target.

    Rows are sorted by ascending ``totalincome``. The running WEIGHT-weighted
    mean income is computed over the sorted rows, and every row up to and
    including the one where that running mean is closest to ``target_avg``
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``totalincome`` and ``WEIGHT`` columns. It is not modified.
    target_avg : float, default 20000
        Weighted mean income to approximate.

    Returns
    -------
    pandas.DataFrame
        The selected rows with all original columns, sorted by ``totalincome``
        and re-indexed from 0. Empty when ``df`` is empty or when no running
        mean can be computed (for example, all weights are missing).
    """
    df_sorted = df.sort_values(by="totalincome").reset_index(drop=True)

    # Nothing to select from; idxmin() would raise on an empty Series.
    if df_sorted.empty:
        return df_sorted

    # Running totals are kept in local Series rather than scratch columns, so
    # caller columns that share a scratch name are never overwritten or dropped.
    cum_weight = df_sorted['WEIGHT'].cumsum()
    cum_weighted_income = (df_sorted['totalincome'] * df_sorted['WEIGHT']).cumsum()
    cum_weighted_avg = cum_weighted_income / cum_weight

    # Distance of each running mean from the target
    abs_diff = (cum_weighted_avg - target_avg).abs()
    if abs_diff.isna().all():
        return df_sorted.iloc[0:0].copy()

    # The index was reset above, so this label slice is positional and inclusive
    best_idx = abs_diff.idxmin()
    return df_sorted.loc[:best_idx].copy()



# Lowest-income households whose weighted mean income is closest to the default target
subset_df = select_households_below_weighted_avg(df)

# estgap: annual mmr minus 30% of total income
# ntgap:  80% of annual mmr minus 30% of total income
subset_df = subset_df.assign(
    estgap=lambda d: 12 * d['mmr'] - 0.3 * d['totalincome'],
    ntgap=lambda d: 0.8 * 12 * d['mmr'] - 0.3 * d['totalincome'],
)

# Report the weighted mean income actually reached, then export the subset
weighted_income_total = (subset_df['totalincome'] * subset_df['WEIGHT']).sum()
weighted_avg = weighted_income_total / subset_df['WEIGHT'].sum()
print(f"Weighted average income: ${weighted_avg:,.2f}")
subset_df.to_csv(output_path, index=False)


Weighted average income: $19,998.39


In [43]:
import pandas as pd

output_path = os.path.join(folder_path, "with_chn", "final_subset_2024_greedy.csv")

# Load the 2024 subset
df = pd.read_csv(os.path.join(folder_path, "with_chn", "subset_2024.csv"))

target_weight = 22000
target_avg_cohb = 10600

# Distance of each household's cohb from the target (kept as a column, so it
# is also written to the output file)
df['cohb_diff'] = (df['cohb'] - target_avg_cohb).abs()

# Households closest to the target cohb come first
df_sorted = df.sort_values(by='cohb_diff').reset_index(drop=True)

selected_rows = []
total_weight = 0
total_weighted_cohb = 0

# Greedy fill: take whole households until the next one would exceed the
# target weight, then take only the remaining fraction of that household.
for _, row in df_sorted.iterrows():
    weight, cohb = row['WEIGHT'], row['cohb']
    exceeds_target = total_weight + weight > target_weight

    if not exceeds_target:
        selected_rows.append(row)
        total_weighted_cohb += cohb * weight
        total_weight += weight
        continue

    # Last household: its WEIGHT is trimmed to what is left of the target
    remaining_weight = target_weight - total_weight
    row_copy = row.copy()
    row_copy['WEIGHT'] = remaining_weight
    selected_rows.append(row_copy)
    total_weighted_cohb += cohb * remaining_weight
    total_weight = target_weight
    break

# Calculate final weighted average
weighted_avg_cohb = total_weighted_cohb / total_weight
print(f"✅ Final weighted avg COHB: ${weighted_avg_cohb:,.2f}")

# Save
final_df = pd.DataFrame(selected_rows)
final_df.to_csv(output_path, index=False)


✅ Final weighted avg COHB: $10,165.81


In [44]:
import pandas as pd

# Households selected from the 2024 data by the greedy step
final_subset = pd.read_csv(os.path.join(folder_path, "with_chn", "final_subset_2024_greedy.csv"))

# Unique IDs of the selected households
selected_hh_ids = final_subset['HH_ID'].unique()

# Years to check: 2024 to 2030 inclusive
years = range(2024, 2031)

for year in years:
    # Load that year's census file and keep only the selected households
    year_path = os.path.join(folder_path, "with_chn", f"census{year}_household_chn.csv")
    df_year = pd.read_csv(year_path)
    is_selected = df_year['HH_ID'].isin(selected_hh_ids)
    df_matched = df_year[is_selected].copy()

    if df_matched.empty:
        print(f"⚠️ No matching HH_IDs found in {year} data.")
        continue

    # Weighted average COHB, using that year's full WEIGHT values
    weighted_cohb_total = (df_matched['cohb'] * df_matched['WEIGHT']).sum()
    weighted_avg_cohb = weighted_cohb_total / df_matched['WEIGHT'].sum()

    print(f"✅ {year}: Weighted avg COHB = ${weighted_avg_cohb:,.2f} over {df_matched['WEIGHT'].sum():,.0f} weighted households")


✅ 2024: Weighted avg COHB = $10,165.12 over 22,007 weighted households
✅ 2025: Weighted avg COHB = $10,587.99 over 21,979 weighted households
✅ 2026: Weighted avg COHB = $10,902.19 over 21,885 weighted households
✅ 2027: Weighted avg COHB = $11,070.07 over 21,788 weighted households
✅ 2028: Weighted avg COHB = $11,299.94 over 21,955 weighted households
✅ 2029: Weighted avg COHB = $11,449.78 over 22,134 weighted households
✅ 2030: Weighted avg COHB = $11,656.61 over 22,318 weighted households


In [45]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2023_household_chn.csv"))

# Income ceiling for each exact bedsuit value; bedsuit >= 4 shares one ceiling
income_caps = {0: 40000, 1: 49000, 2: 56000, 3: 62000}

# A household qualifies when its income is at or below the ceiling for its bedsuit
combined_condition = (df['bedsuit'] >= 4) & (df['totalincome'] <= 75000)
for bedsuit_value, income_cap in income_caps.items():
    matches_cap = (df['bedsuit'] == bedsuit_value) & (df['totalincome'] <= income_cap)
    combined_condition = combined_condition | matches_cap

# Create the filtered subset
subset_df = df[combined_condition].copy()


In [46]:
# Households in core housing need, excluding HCORENEED_IND == 888
# NOTE: `subset_df` is not built in this cell; it comes from an earlier cell.
in_valid_chn = (subset_df['chn'] == 1) & (subset_df['HCORENEED_IND'] != 888)
chn_df = subset_df[in_valid_chn]

# Total WEIGHT per TENUR value
weighted_chn_by_tenur = chn_df['WEIGHT'].groupby(chn_df['TENUR']).sum()

# Display the result
print(weighted_chn_by_tenur)


TENUR
1.0    241147.576884
2.0    445077.157150
Name: WEIGHT, dtype: float64


Calculate CNIT to use for asset threshold shares.

In [47]:
import pandas as pd

# Step 1: Load the data
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2023_household_chn.csv"))

# Step 2: Keep households in core housing need, excluding HCORENEED_IND == 888
df_copy = df[(df['chn'] == 1) & (df['HCORENEED_IND'] != 888)].copy()

# Step 3: cnit is the income at which annual mmr equals 30% of income
df_copy['cnit'] = (12 * df_copy['mmr']) / 0.3

# Step 4: Weighted average CNIT by bedsuit. The value columns are selected
# explicitly so apply() does not operate on the grouping column, which is
# deprecated in recent pandas versions.
weighted_avg_cnit = (
    df_copy
    .groupby('bedsuit')[['cnit', 'WEIGHT']]
    .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
)

# Display the result
print(weighted_avg_cnit)


# Filter households with bedsuit >= 4
df_4plus = df_copy[df_copy['bedsuit'] >= 4]

# Weighted average CNIT for the pooled 4+ group; guarded so an empty group is
# reported instead of producing a 0/0 result.
if not df_4plus.empty:
    weighted_avg_cnit_4plus = (df_4plus['cnit'] * df_4plus['WEIGHT']).sum() / df_4plus['WEIGHT'].sum()
    print(f"Weighted average CNIT for 4+ bedroom-suitable households: {weighted_avg_cnit_4plus:.2f}")
else:
    print("No 4+ bedroom-suitable households in this year.")



bedsuit
0.0     46739.260049
1.0     58171.510037
2.0     68383.754562
3.0     82392.623954
4.0    104664.859686
5.0    103760.971054
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 104547.97


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())


In [48]:
import pandas as pd
import os

# Path to the directory with the yearly files
base_path = os.path.join(folder_path, "with_chn")

# Loop through years 2023 to 2030
for year in range(2023, 2031):
    file_path = os.path.join(base_path, f"census{year}_household_chn.csv")

    # Skip missing years with a message, like the other yearly loops in this notebook
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # Load the data
    df = pd.read_csv(file_path)

    # Keep households in core housing need, excluding HCORENEED_IND == 888
    df_copy = df[(df['chn'] == 1) & (df['HCORENEED_IND'] != 888)].copy()

    # cnit is the income at which annual mmr equals 30% of income
    df_copy['cnit'] = (12 * df_copy['mmr']) / 0.3

    # Weighted average CNIT by bedsuit. The value columns are selected
    # explicitly so apply() does not operate on the grouping column, which is
    # deprecated in recent pandas versions.
    weighted_avg_cnit = (
        df_copy
        .groupby('bedsuit')[['cnit', 'WEIGHT']]
        .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
    )

    # Display the result for this year
    print(f"\nYear {year} - Weighted average CNIT by bedsuit:")
    print(weighted_avg_cnit)

    # Filter households with bedsuit >= 4
    df_4plus = df_copy[df_copy['bedsuit'] >= 4]

    # Weighted average CNIT for the pooled 4+ group
    if not df_4plus.empty:
        weighted_avg_cnit_4plus = (df_4plus['cnit'] * df_4plus['WEIGHT']).sum() / df_4plus['WEIGHT'].sum()
        print(f"Weighted average CNIT for 4+ bedroom-suitable households: {weighted_avg_cnit_4plus:.2f}")
    else:
        print("No 4+ bedroom-suitable households in this year.")


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2023 - Weighted average CNIT by bedsuit:
bedsuit
0.0     46739.260049
1.0     58171.510037
2.0     68383.754562
3.0     82392.623954
4.0    104664.859686
5.0    103760.971054
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 104547.97


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2024 - Weighted average CNIT by bedsuit:
bedsuit
0.0     49168.414346
1.0     61234.767167
2.0     72050.645307
3.0     86849.170762
4.0    110182.989784
5.0    108820.268116
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 109994.55


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2025 - Weighted average CNIT by bedsuit:
bedsuit
0.0     50890.834160
1.0     63433.209644
2.0     74552.513586
3.0     89787.577795
4.0    113900.082036
5.0    113075.526211
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 113790.38


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2026 - Weighted average CNIT by bedsuit:
bedsuit
0.0     52211.967423
1.0     64891.608915
2.0     76405.471141
3.0     92143.129955
4.0    116577.713349
5.0    116421.940234
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 116557.97


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2027 - Weighted average CNIT by bedsuit:
bedsuit
0.0     53095.496088
1.0     65941.991508
2.0     77756.706121
3.0     93747.119127
4.0    118842.391422
5.0    118866.606682
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 118845.37


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2028 - Weighted average CNIT by bedsuit:
bedsuit
0.0     54153.253545
1.0     67136.744979
2.0     79283.313113
3.0     95687.747382
4.0    121389.292514
5.0    121249.097609
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 121372.00


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2029 - Weighted average CNIT by bedsuit:
bedsuit
0.0     54998.407829
1.0     68162.101955
2.0     80546.498405
3.0     97269.013974
4.0    123246.887682
5.0    123089.473839
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 123227.23

Year 2030 - Weighted average CNIT by bedsuit:
bedsuit
0.0     55983.419020
1.0     69327.349676
2.0     81964.661446
3.0     99078.868801
4.0    125429.553285
5.0    125352.727606
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 125419.91


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())


In [49]:
import pandas as pd

# Load the data
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2023_household_chn.csv"))

# Subset for chn == 1 and HCORENEED_IND != 888.
# An nprhh == 0 restriction was considered here but is not applied.
in_valid_chn = (df['chn'] == 1) & (df['HCORENEED_IND'] != 888)
filtered_df = df[in_valid_chn]

# Weighted counts by tenure, with TENUR == 1 split by PRESMORTG
is_tenur_1 = filtered_df['TENUR'] == 1
tenur_2 = filtered_df.loc[filtered_df['TENUR'] == 2, 'WEIGHT'].sum()
tenur_1_presmortg_0 = filtered_df.loc[is_tenur_1 & (filtered_df['PRESMORTG'] == 0), 'WEIGHT'].sum()
tenur_1_presmortg_1 = filtered_df.loc[is_tenur_1 & (filtered_df['PRESMORTG'] == 1), 'WEIGHT'].sum()

# Total weighted households (after filtering)
total_weighted = filtered_df['WEIGHT'].sum()

# Print the results
print(f"Weighted count (TENUR = 2): {tenur_2:,.0f}")
print(f"Weighted count (TENUR = 1 & PRESMORTG = 0): {tenur_1_presmortg_0:,.0f}")
print(f"Weighted count (TENUR = 1 & PRESMORTG = 1): {tenur_1_presmortg_1:,.0f}")
print(f"Total weighted households (filtered): {total_weighted:,.0f}")

Weighted count (TENUR = 2): 547,209
Weighted count (TENUR = 1 & PRESMORTG = 0): 122,220
Weighted count (TENUR = 1 & PRESMORTG = 1): 208,622
Total weighted households (filtered): 878,052


In [50]:
import os
import pandas as pd

# Path to the directory with the yearly files
base_path = os.path.join(folder_path, "with_chn")

# Loop through years 2023 to 2030
for year in range(2023, 2031):
    file_path = os.path.join(base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for year {year}, skipping.")
        continue

    # Load the data
    df = pd.read_csv(file_path)

    # Subset for chn == 1 and HCORENEED_IND != 888 (no nprhh filter is applied)
    in_valid_chn = (df['chn'] == 1) & (df['HCORENEED_IND'] != 888)
    filtered_df = df[in_valid_chn]

    # Weighted counts by tenure, with TENUR == 1 split by PRESMORTG
    is_tenur_1 = filtered_df['TENUR'] == 1
    tenur_2 = filtered_df.loc[filtered_df['TENUR'] == 2, 'WEIGHT'].sum()
    tenur_1_presmortg_0 = filtered_df.loc[is_tenur_1 & (filtered_df['PRESMORTG'] == 0), 'WEIGHT'].sum()
    tenur_1_presmortg_1 = filtered_df.loc[is_tenur_1 & (filtered_df['PRESMORTG'] == 1), 'WEIGHT'].sum()

    # Total weighted households (after filtering)
    total_weighted = filtered_df['WEIGHT'].sum()

    # Print the results for this year
    print(f"\n📅 Year {year}:")
    print(f"Weighted count (TENUR = 2): {tenur_2:,.0f}")
    print(f"Weighted count (TENUR = 1 & PRESMORTG = 0): {tenur_1_presmortg_0:,.0f}")
    print(f"Weighted count (TENUR = 1 & PRESMORTG = 1): {tenur_1_presmortg_1:,.0f}")
    print(f"Total weighted households (filtered): {total_weighted:,.0f}")



📅 Year 2023:
Weighted count (TENUR = 2): 547,209
Weighted count (TENUR = 1 & PRESMORTG = 0): 122,220
Weighted count (TENUR = 1 & PRESMORTG = 1): 208,622
Total weighted households (filtered): 878,052

📅 Year 2024:
Weighted count (TENUR = 2): 576,120
Weighted count (TENUR = 1 & PRESMORTG = 0): 126,014
Weighted count (TENUR = 1 & PRESMORTG = 1): 215,990
Total weighted households (filtered): 918,124

📅 Year 2025:
Weighted count (TENUR = 2): 589,236
Weighted count (TENUR = 1 & PRESMORTG = 0): 131,226
Weighted count (TENUR = 1 & PRESMORTG = 1): 222,112
Total weighted households (filtered): 942,575

📅 Year 2026:
Weighted count (TENUR = 2): 597,275
Weighted count (TENUR = 1 & PRESMORTG = 0): 136,773
Weighted count (TENUR = 1 & PRESMORTG = 1): 228,028
Total weighted households (filtered): 962,077

📅 Year 2027:
Weighted count (TENUR = 2): 590,953
Weighted count (TENUR = 1 & PRESMORTG = 0): 141,208
Weighted count (TENUR = 1 & PRESMORTG = 1): 226,300
Total weighted households (filtered): 958,460


In [51]:
#calculate avg income growth

