In [44]:

import pandas as pd
import os
import statsmodels.api as sm
import numpy as np
from dotenv import load_dotenv

In [45]:
# Load variables from a local .env file (if present) and resolve the data root.
load_dotenv()
data_dir = os.getenv("DATA_PATH")
if data_dir is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise EnvironmentError(
        "DATA_PATH is not set; define it in the environment or in a .env file."
    )
# Every later cell builds its input/output paths from this folder.
folder_path = os.path.join(data_dir, "Microsimulations")

In [46]:
# Flag core housing need (chn, 30% threshold) and deep core housing need
# (dchn, 50% threshold) on each projected household file for 2022-2030 and
# write the result to the "with_chn" folder.

# Define input and output paths
input_base_path = os.path.join(folder_path, "household")
output_base_path = os.path.join(folder_path, "with_chn")

# Ensure output directory exists
os.makedirs(output_base_path, exist_ok=True)

# Dictionary to store updated data
census_data = {}


def _core_need_flag(households, threshold):
    """Return a 0/1 int64 Series marking households in core housing need.

    A household is flagged when all of the following hold:
      * it has a housing issue: annual shelter cost (12 * SHELCO) above
        `threshold` of totalincome, or NOS == 0 (unsuitable), or
        REPAIR == 3 (inadequate);
      * annual market rent (12 * mmr) is also above `threshold` of totalincome;
      * it is not both a student household and a non-family household;
      * its `stir` is below 1.

    Parameters
    ----------
    households : pandas.DataFrame
        Must contain SHELCO, totalincome, NOS, REPAIR, mmr, student_household,
        non_family_household and stir.
    threshold : float
        Share of income used as the affordability cut-off
        (0.30 for chn, 0.50 for dchn).
    """
    housing_issue = (
        (households['SHELCO'] * 12 / households['totalincome'] > threshold) |  # Unaffordable
        (households['NOS'] == 0) |  # Unsuitable
        (households['REPAIR'] == 3)  # Inadequate
    )
    market_unaffordable = households['mmr'] * 12 > threshold * households['totalincome']
    student_non_family = (
        (households['student_household'] == 1) &
        (households['non_family_household'] == 1)
    )
    stir_at_least_one = households['stir'] >= 1
    flagged = housing_issue & market_unaffordable & ~student_non_family & ~stir_at_least_one
    return flagged.astype('int64')


# Loop through the years 2022 to 2030
for year in range(2022, 2031):
    input_file_path = os.path.join(input_base_path, f"census{year}_household.csv")
    output_file_path = os.path.join(output_base_path, f"census{year}_household_chn.csv")

    if os.path.exists(input_file_path):
        # Load the data
        census_df = pd.read_csv(input_file_path)

        # The STIR >= 1 exclusion needs a `stir` column. Use the one shipped in
        # the input file when present; otherwise derive it from SHELCO and
        # totalincome instead of failing with a KeyError.
        if 'stir' not in census_df.columns:
            census_df['stir'] = census_df['SHELCO'] * 12 / census_df['totalincome']

        # Core housing need (30%) and deep core housing need (50%)
        census_df['chn'] = _core_need_flag(census_df, 0.30)
        census_df['dchn'] = _core_need_flag(census_df, 0.50)

        # Export updated data
        census_df.to_csv(output_file_path, index=False)

        print(f"Updated CHN values and added stir/alt_stir for {year}")

    else:
        print(f"File not found: {input_file_path}")

Updated CHN values and added stir/alt_stir for 2022
Updated CHN values and added stir/alt_stir for 2023
Updated CHN values and added stir/alt_stir for 2024
Updated CHN values and added stir/alt_stir for 2025
Updated CHN values and added stir/alt_stir for 2026
Updated CHN values and added stir/alt_stir for 2027
Updated CHN values and added stir/alt_stir for 2028
Updated CHN values and added stir/alt_stir for 2029
Updated CHN values and added stir/alt_stir for 2030


In [47]:
# Copy the 2021 census household file into the "with_chn" folder, adding the
# derived ratio and net-income columns the later cells rely on.

input_base_path = os.path.join(folder_path, "household")
output_base_path = os.path.join(folder_path, "with_chn")

input_file_name = "census2021_household.csv"
output_file_name = "census2021_household_chn.csv"

input_file_path = os.path.join(input_base_path, input_file_name)
output_file_path = os.path.join(output_base_path, output_file_name)

# Each keyword below adds one column; later keywords may read earlier ones.
df = pd.read_csv(input_file_path).assign(
    # shelter-cost-to-income ratio on actual shelter cost
    stir=lambda d: d['SHELCO'] * 12 / d['totalincome'],
    # same ratio using market rent (mmr) in place of actual shelter cost
    alt_stir=lambda d: d['mmr'] * 12 / d['totalincome'],
    # after-tax share of total income, capped at 1
    netshare=lambda d: (d['TOTINC_AT'] / d['totalincome']).clip(upper=1.0),
    # net income implied by that share
    netinc=lambda d: d['totalincome'] * d['netshare'],
)

df.to_csv(output_file_path, index=False)

print(f"File saved to: {output_file_path}")

File saved to: C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\census2021_household_chn.csv


In [48]:

# Weighted number of households in core housing need (chn) and deep core
# housing need (dchn) per year, plus the total household weight per year.
# Each file is read once; the total weight is captured in the same pass.

# Define the path for processed files
output_base_path = os.path.join(folder_path, "with_chn")

# Dictionaries to store weighted household counts
chn_weighted_counts = {}
dchn_weighted_counts = {}
# Sum of WEIGHT over all households (no HCORENEED_IND filter), per year
total_weights = {}

required_columns = {'chn', 'dchn', 'WEIGHT', 'HCORENEED_IND'}

# Loop through years 2021 to 2030
for year in range(2021, 2031):
    file_path = os.path.join(output_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # Load the data
    census_df = pd.read_csv(file_path)

    # Record the total weight even if the year is skipped below, so the
    # per-year totals cover every file that exists.
    if 'WEIGHT' in census_df.columns:
        total_weights[year] = census_df['WEIGHT'].sum()

    # Check if required columns exist
    missing_columns = required_columns - set(census_df.columns)
    if missing_columns:
        print(f"Skipping {year} due to missing columns: {missing_columns}")
        continue  # Skip processing this file

    # Exclude households where HCORENEED_IND == 888
    filtered_df = census_df[census_df['HCORENEED_IND'] != 888]

    # Calculate weighted count of households where chn == 1
    chn_weight = filtered_df.loc[filtered_df['chn'] == 1, 'WEIGHT'].sum()
    chn_weighted_counts[year] = chn_weight

    # Calculate weighted count of households where dchn == 1
    dchn_weight = filtered_df.loc[filtered_df['dchn'] == 1, 'WEIGHT'].sum()
    dchn_weighted_counts[year] = dchn_weight

    print(f"[{year}] CHN weighted count: {chn_weight}, DCHN weighted count: {dchn_weight}")

print("\nTotal household weights per year:")

# Print the sum of WEIGHT for each year
for year, total_weight in total_weights.items():
    print(f"Sum of WEIGHT for {year}: {total_weight}")


[2021] CHN weighted count: 684245.5791455524, DCHN weighted count: 154141.0278820697
[2022] CHN weighted count: 833459.1897082782, DCHN weighted count: 230163.96291553596
[2023] CHN weighted count: 832018.319150569, DCHN weighted count: 227946.11747487643
[2024] CHN weighted count: 872540.6530378839, DCHN weighted count: 244929.585172378
[2025] CHN weighted count: 937383.7454420876, DCHN weighted count: 267785.88209152524
[2026] CHN weighted count: 967357.5286619415, DCHN weighted count: 280889.29216062214
[2027] CHN weighted count: 978742.445329859, DCHN weighted count: 287704.8630338368
[2028] CHN weighted count: 988579.358870241, DCHN weighted count: 292470.2648668414
[2029] CHN weighted count: 1002778.3887848369, DCHN weighted count: 299798.03430102044
[2030] CHN weighted count: 1022751.9155978468, DCHN weighted count: 309470.2860218115

Total household weights per year:
Sum of WEIGHT for 2021: 3310969.362428404
Sum of WEIGHT for 2022: 3395297.595946454
Sum of WEIGHT for 2023: 3505

In [49]:

# Share of jobless records by AGEGRP and immigrant-status group for one census
# year, written to census_share_<year>.csv. Nothing happens (and nothing is
# printed) when the input file is absent.
year = 2021
file_path = os.path.join(folder_path, f"census{year}.csv")

if os.path.exists(file_path):
    df = pd.read_csv(file_path)

    # jobless = 1 when LFACT falls in 3..10 (both ends included)
    lfact = df["LFACT"]
    df["jobless"] = ((lfact >= 3) & (lfact <= 10)).astype(int)

    # Two-way split on IMMSTAT: exactly 3 versus everything else
    immstat_group = df["IMMSTAT"].eq(3).map({True: "IMMSTAT_3", False: "IMMSTAT_not_3"})

    # Mean of the 0/1 flag is the jobless share within each group
    summary = df.groupby(["AGEGRP", immstat_group])["jobless"].mean().reset_index()

    output_path = os.path.join(folder_path, f"census_share_{year}.csv")
    summary.to_csv(output_path, index=False)


In [50]:

# Carry the 2021 after-tax income share (netshare) forward to every projected
# year and derive net income from it.

# Load netshare from 2021
input_base_path = os.path.join(folder_path, "with_chn")
census2021_household = pd.read_csv(os.path.join(input_base_path, "census2021_household_chn.csv"))
netshare_2021 = (census2021_household['TOTINC_AT'] / census2021_household['totalincome']).clip(upper=1.0)

# Apply to years 2022-2030
for year in range(2022, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if os.path.exists(file_path):
        census_df = pd.read_csv(file_path)

        # The assignment below is positional, so it is only valid when the
        # year file has the same households in the same row order as 2021.
        # Verify that instead of silently attaching the wrong share.
        if len(census_df) != len(netshare_2021):
            raise ValueError(
                f"census{year}_household_chn.csv has {len(census_df)} rows but the 2021 "
                f"file has {len(netshare_2021)}; cannot copy netshare by position."
            )
        if 'HH_ID' in census_df.columns and 'HH_ID' in census2021_household.columns:
            same_order = np.array_equal(
                census_df['HH_ID'].to_numpy(),
                census2021_household['HH_ID'].to_numpy(),
            )
            if not same_order:
                raise ValueError(
                    f"HH_ID order in census{year}_household_chn.csv differs from the 2021 "
                    "file; cannot copy netshare by position."
                )

        # Assign netshare from 2021 (row i of 2021 -> row i of this year)
        census_df['netshare'] = netshare_2021.values
        census_df['netinc'] = census_df['totalincome'] * census_df['netshare']

        # Save updated file (overwrites the input in place)
        census_df.to_csv(file_path, index=False)
        print(f"‚úÖ Updated netshare and netinc for {year}")
    else:
        print(f"‚ùå File not found for {year}: {file_path}")

‚úÖ Updated netshare and netinc for 2022
‚úÖ Updated netshare and netinc for 2023
‚úÖ Updated netshare and netinc for 2024
‚úÖ Updated netshare and netinc for 2025
‚úÖ Updated netshare and netinc for 2026
‚úÖ Updated netshare and netinc for 2027
‚úÖ Updated netshare and netinc for 2028
‚úÖ Updated netshare and netinc for 2029
‚úÖ Updated netshare and netinc for 2030


In [51]:

# Affordability gap for households in core housing need: annual housing cost
# minus 30% of total income. Households with chn != 1 keep a gap of 0.0.
input_base_path = os.path.join(folder_path, "with_chn")

for year in range(2021, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"‚ùå File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    chn_condition = census_df['chn'] == 1

    # Actual shelter cost is used only when NOS == 1, REPAIR != 3 and the
    # household pays less than market rent
    use_shelco = (
        chn_condition &
        (census_df['NOS'] == 1) &
        (census_df['REPAIR'] != 3) &
        (census_df['SHELCO'] < census_df['mmr'])
    )
    # Every other CHN household is costed at market rent (mmr)
    use_amr = chn_condition & ~use_shelco

    census_df['gap'] = 0.0
    income_30 = 0.3 * census_df['totalincome']
    for rows, cost_col in ((use_shelco, 'SHELCO'), (use_amr, 'mmr')):
        census_df.loc[rows, 'gap'] = census_df.loc[rows, cost_col] * 12 - income_30[rows]

    # Overwrite the year file with the new column
    census_df.to_csv(file_path, index=False)
    print(f"‚úÖ Calculated gap for {year} using updated CHN conditions")

‚úÖ Calculated gap for 2021 using updated CHN conditions
‚úÖ Calculated gap for 2022 using updated CHN conditions
‚úÖ Calculated gap for 2023 using updated CHN conditions
‚úÖ Calculated gap for 2024 using updated CHN conditions
‚úÖ Calculated gap for 2025 using updated CHN conditions
‚úÖ Calculated gap for 2026 using updated CHN conditions
‚úÖ Calculated gap for 2027 using updated CHN conditions
‚úÖ Calculated gap for 2028 using updated CHN conditions
‚úÖ Calculated gap for 2029 using updated CHN conditions
‚úÖ Calculated gap for 2030 using updated CHN conditions


In [52]:
# After assigning gaps
# Sanity check: number of rows with chn == 1 versus rows with a positive gap.
# NOTE(review): `year` and `census_df` are not defined in this cell; they are
# whatever the most recently executed loop cell left behind, so this reports a
# single year only (the recorded output shows 2030). Counts are unweighted.
print(f"{year}: CHN=1 count: {census_df['chn'].sum()}, GAP > 0 count: {(census_df['gap'] > 0).sum()}")


2030: CHN=1 count: 8742, GAP > 0 count: 8742


In [53]:
import os
import pandas as pd

# Tag NPR households and compute the COHB amount for eligible renters in every
# year file, then report the weighted number of NPR households in CHN.

# HH_ID values listed in npr_household_ids.csv
npr_hh_path = os.path.join(folder_path, "npr_household_ids.csv")
npr_hh_df = pd.read_csv(npr_hh_path)
npr_household_ids = npr_hh_df['HH_ID'].tolist()

input_base_path = os.path.join(folder_path, "with_chn")

for year in range(2021, 2031):  # 2021 to 2030 inclusive
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"‚ùå File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    # nprhh = 1 when the household's HH_ID is in the imported list;
    # without an HH_ID column every household gets 0
    if 'HH_ID' not in census_df.columns:
        print(f"‚ö†Ô∏è HH_ID column not found in {year} dataset.")
        census_df['nprhh'] = 0
    else:
        census_df['nprhh'] = census_df['HH_ID'].isin(npr_household_ids).astype(int)

    # COHB applies to TENUR == 2 households with chn == 1 and stir above 0.3
    condition = (
        (census_df['TENUR'] == 2) &
        (census_df['chn'] == 1) &
        (census_df['stir'] > 0.3)
    )
    eligible = census_df.loc[condition]

    # Eligible cost = larger of 80% of annual market rent and annual shelter
    # cost capped at annual market rent
    mmr_80 = 0.8 * 12 * eligible['mmr']
    shelco_100_capped = (12 * eligible['SHELCO']).clip(upper=(12 * eligible['mmr']))
    eligible_cost = pd.concat([mmr_80, shelco_100_capped], axis=1).max(axis=1)

    # Benefit = eligible cost minus 30% of net income, floored at zero
    netinc_30 = 0.3 * eligible['netinc']
    cohb_values = eligible_cost - netinc_30

    census_df['cohb'] = 0.0
    census_df.loc[condition, 'cohb'] = cohb_values.clip(lower=0)

    # NPR households in CHN, leaving out HCORENEED_IND == 888
    filter_condition = (
        (census_df['nprhh'] == 1) &
        (census_df['chn'] == 1) &
        (census_df['HCORENEED_IND'] != 888)
    )

    if 'WEIGHT' not in census_df.columns:
        print(f"‚ö†Ô∏è WEIGHT column missing in {year} data.")
    else:
        weighted_count = census_df.loc[filter_condition, 'WEIGHT'].sum()
        print(f"üìÖ {year}: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: {weighted_count:,.0f}")

    # Overwrite the year file with the new columns
    census_df.to_csv(file_path, index=False)
    print(f"‚úÖ Finished processing {year}")


üìÖ 2021: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 27,916
‚úÖ Finished processing 2021
üìÖ 2022: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 34,932
‚úÖ Finished processing 2022
üìÖ 2023: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 48,281
‚úÖ Finished processing 2023
üìÖ 2024: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 63,397
‚úÖ Finished processing 2024
üìÖ 2025: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 66,205
‚úÖ Finished processing 2025
üìÖ 2026: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 63,775
‚úÖ Finished processing 2026
üìÖ 2027: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 59,948
‚úÖ Finished processing 2027
üìÖ 2028: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND != 888: 55,839
‚úÖ Finished processing 2028
üìÖ 2029: Weighted households with nprhh == 1, chn == 1, HCORENEED_IND 

In [54]:
import os
import pandas as pd

# Weighted means per year: COHB and gap over households with a positive value
# (HCORENEED_IND == 888 left out), and total income over all households.
# NOTE: `input_base_path` is taken from an earlier cell.
mean_cohb = {}
mean_gap = {}
mean_income = {}  # weighted average income per year


def _weighted_mean_of_positive(frame, value_col):
    """WEIGHT-weighted mean of `value_col` over rows where it is > 0 (0 if none)."""
    positive = frame[frame[value_col] > 0]
    if positive.empty:
        return 0
    return (positive[value_col] * positive['WEIGHT']).sum() / positive['WEIGHT'].sum()


for year in range(2021, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"‚ùå File not found for {year}")
        continue

    census_df_full = pd.read_csv(file_path)  # full, unfiltered

    needed = ['cohb', 'gap', 'WEIGHT', 'HCORENEED_IND', 'totalincome']
    if not all(col in census_df_full.columns for col in needed):
        print(f"‚ùå Missing columns in {year}, skipping.")
        continue

    # COHB and gap: drop HCORENEED_IND == 888 first
    census_df = census_df_full[census_df_full['HCORENEED_IND'] != 888].copy()
    mean_cohb[year] = _weighted_mean_of_positive(census_df, 'cohb')
    mean_gap[year] = _weighted_mean_of_positive(census_df, 'gap')

    # Income: every household in the file, 888 included
    if census_df_full.empty:
        mean_income[year] = 0
    else:
        weighted_income = (census_df_full['totalincome'] * census_df_full['WEIGHT']).sum()
        mean_income[year] = weighted_income / census_df_full['WEIGHT'].sum()

# Report
print("\nüìä Weighted Mean COHB (for values > 0, excluding HCORENEED_IND == 888):")
for year, val in mean_cohb.items():
    print(f"{year}: {val:.2f}")

print("\nüìä Weighted Mean GAP (for values > 0, excluding HCORENEED_IND == 888):")
for year, val in mean_gap.items():
    print(f"{year}: {val:.2f}")

print("\nüìä Weighted Mean Income (ALL households, INCLUDING HCORENEED_IND == 888):")
for year, val in mean_income.items():
    print(f"{year}: ${val:,.2f}")



üìä Weighted Mean COHB (for values > 0, excluding HCORENEED_IND == 888):
2021: 4401.67
2022: 5213.31
2023: 5527.46
2024: 5966.27
2025: 6515.30
2026: 6816.78
2027: 6998.51
2028: 7169.25
2029: 7367.51
2030: 7579.33

üìä Weighted Mean GAP (for values > 0, excluding HCORENEED_IND == 888):
2021: 3783.17
2022: 4232.18
2023: 4545.48
2024: 4874.24
2025: 5199.40
2026: 5406.06
2027: 5552.32
2028: 5694.64
2029: 5856.39
2030: 6024.88

üìä Weighted Mean Income (ALL households, INCLUDING HCORENEED_IND == 888):
2021: $59,184.27
2022: $59,726.20
2023: $63,792.93
2024: $66,903.60
2025: $68,837.10
2026: $70,701.35
2027: $72,841.41
2028: $75,057.74
2029: $77,016.16
2030: $78,855.19


In [55]:
# Trace file: pull the same row position out of every year file so that one
# household record can be followed across 2021-2030.

# Folder path
base_path = os.path.join(folder_path, "with_chn")

# Years to process
years = range(2021, 2031)

# Row number to extract (0-based index)
target_row = 51  # Change this to any row index you want

# List to store selected rows
selected_rows = []

for year in years:
    file_name = f"census{year}_household_chn.csv"
    file_path = os.path.join(base_path, file_name)

    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        if len(df) > target_row:
            selected_row = df.iloc[target_row]
            selected_rows.append(selected_row)
        else:
            print(f"File {file_name} has less than {target_row + 1} rows.")
    else:
        print(f"File not found: {file_path}")

# Combine and save to the trace file
if selected_rows:
    chn_trace_df = pd.DataFrame(selected_rows)
    # The file name is built once and reused in the message, so the reported
    # name always matches the file actually written. The suffix is
    # target_row + 2 -- presumably the 1-based line number in the CSV including
    # its header row; confirm the intended convention.
    trace_file_name = f"chn_trace_row{target_row + 2}.csv"
    output_path = os.path.join(base_path, trace_file_name)
    chn_trace_df.to_csv(output_path, index=False)
    print(f"{trace_file_name} created at {output_path}")
else:
    print("No data found to create trace file.")

chn_trace_row52.csv created at C:/Users/mgordon/OneDrive - Financial Accountability Office of Ontario/FA2404 Housing and Homelessness Update/Data\Microsimulations\with_chn\chn_trace_row53.csv


In [56]:


# Weighted average net-income share (netinc / totalincome) by income quintile
# for the 2021 file.

# Load the data
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2021_household_chn.csv"))

# Check required columns
required_cols = ['netinc', 'totalincome', 'WEIGHT', 'quintile']
if all(col in df.columns for col in required_cols):
    # Compute netshare; a zero totalincome yields +/-inf (or NaN), which is
    # turned into NaN and dropped. Only the netshare column is cleaned so the
    # dtypes and values of every other column are left untouched.
    df['netshare'] = (df['netinc'] / df['totalincome']).replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=['netshare'])

    # Group by quintile and calculate weighted average netshare. Selecting the
    # two value columns keeps the grouping column out of the applied frame,
    # which is what newer pandas versions warn about.
    summary = (
        df.groupby('quintile')[['netshare', 'WEIGHT']]
        .apply(lambda g: (g['netshare'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
        .reset_index(name='weighted_netshare')
    )

    # Format output
    summary['weighted_netshare'] = summary['weighted_netshare'].round(4)
    print("\n‚úÖ Weighted Average Netshare by Quintile:\n")
    print(summary)
else:
    print("‚ùå Missing required columns: netinc, totalincome, WEIGHT, or quintile.")



‚úÖ Weighted Average Netshare by Quintile:

   quintile  weighted_netshare
0         1             0.9442
1         2             0.9152
2         3             0.8822


  .apply(lambda g: (g['netshare'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())


In [57]:
# Ten households with the smallest netshare (negative values come from negative netinc)
lowest_netshare = df[['totalincome', 'netinc', 'netshare']].sort_values(by='netshare').head(10)
print(lowest_netshare)


       totalincome   netinc   netshare
5860           401  -7000.0 -17.456359
30378         1000  -6000.0  -6.000000
25865         4400 -23000.0  -5.227273
8073         11300 -50000.0  -4.424779
5262          1000  -4000.0  -4.000000
16587        13400 -53000.0  -3.955224
24238          801  -3000.0  -3.745318
9991          6800 -20000.0  -2.941176
7770         11900 -30000.0  -2.521008
4357          4500 -11000.0  -2.444444


Ignore for now: COHB/affordable housing program analysis

In [58]:

import pandas as pd
# Destination for the 2024 subset written at the end of this cell
output_path = os.path.join(folder_path, "with_chn", "subset_2024.csv")

# 2024 households with TENUR == 2, chn == 1 and SUBSIDY == 0
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2024_household_chn.csv"))
candidate_rows = (df['TENUR'] == 2) & (df['chn'] == 1) & (df['SUBSIDY'] == 0)
df = df[candidate_rows].copy()




def select_households_below_weighted_avg(df, target_avg=20000):
    """Return the lowest-income households whose weighted mean income is closest to `target_avg`.

    Rows are ordered by ascending `totalincome`; the running WEIGHT-weighted
    mean income is computed down that ordering, and every row up to and
    including the one where the running mean is nearest `target_avg` is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `totalincome` and `WEIGHT`. It is not modified.
    target_avg : float, default 20000
        Weighted mean income to aim for.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted by income with a fresh 0..n-1 index and the
        same columns as the sorted input (helper columns removed).
    """
    helper_cols = ['cum_weighted_income', 'cum_weight', 'cum_weighted_avg', 'abs_diff']

    ranked = (
        df.sort_values(by="totalincome")
        .reset_index(drop=True)
        .assign(
            cum_weighted_income=lambda d: (d['totalincome'] * d['WEIGHT']).cumsum(),
            cum_weight=lambda d: d['WEIGHT'].cumsum(),
            cum_weighted_avg=lambda d: d['cum_weighted_income'] / d['cum_weight'],
            # distance of the running weighted mean from the target
            abs_diff=lambda d: (d['cum_weighted_avg'] - target_avg).abs(),
        )
    )

    # Label of the row whose running mean is nearest the target; .loc slicing
    # includes that row.
    cutoff = ranked['abs_diff'].idxmin()
    return ranked.loc[:cutoff].drop(columns=helper_cols)



# Keep the lowest-income households up to the default weighted-average target
subset_df = select_households_below_weighted_avg(df)

# Gaps against 30% of total income: full annual market rent, and 80% of it
income_30 = 0.3 * subset_df['totalincome']
subset_df['estgap'] = 12 * subset_df['mmr'] - income_30
subset_df['ntgap'] = 0.8 * 12 * subset_df['mmr'] - income_30

# Weighted mean income of the selected households
income_times_weight = subset_df['totalincome'] * subset_df['WEIGHT']
weighted_avg = income_times_weight.sum() / subset_df['WEIGHT'].sum()
print(f"Weighted average income: ${weighted_avg:,.2f}")
subset_df.to_csv(output_path, index=False)


Weighted average income: $19,997.94


In [59]:
import os
import pandas as pd
import numpy as np

# Greedy selection of households from the 2024 subset until their combined
# WEIGHT reaches `target_weight`, steering the weighted mean COHB toward
# `target_avg`. The last household taken has its WEIGHT trimmed to fit.
df = pd.read_csv(os.path.join(folder_path, "with_chn", "subset_2024.csv"))

target_weight = 25600
target_avg = 10283

# Two queues, each ordered outward from the target: `below` starts at the
# largest COHB not above the target, `above` at the smallest COHB above it.
below = df[df["cohb"] <= target_avg].sort_values("cohb", ascending=False).reset_index(drop=True)
above = df[df["cohb"] > target_avg].sort_values("cohb", ascending=True).reset_index(drop=True)

selected = []
W = 0.0   # weight accumulated so far
WC = 0.0  # accumulated weight * cohb


def score(row):
    """Distance from target_avg of the running mean if `row` were added next.

    Reads the running totals W and WC at call time; the row's weight is capped
    so the total never exceeds target_weight.
    """
    capped_total = min(target_weight, W + row["WEIGHT"])
    usable_weight = capped_total - W
    projected_avg = (WC + row["cohb"] * usable_weight) / capped_total
    return abs(projected_avg - target_avg)


i = j = 0
while W < target_weight and (i < len(below) or j < len(above)):
    # Head of each queue that still has rows ("below" first, so it wins ties)
    candidates = []
    if i < len(below):
        candidates.append(("below", below.iloc[i]))
    if j < len(above):
        candidates.append(("above", above.iloc[j]))

    side, row = min(candidates, key=lambda pair: score(pair[1]))

    # Full weight, or only what is left before hitting target_weight
    take = min(row["WEIGHT"], target_weight - W)

    trimmed_row = row.copy()
    trimmed_row["WEIGHT"] = take
    selected.append(trimmed_row)

    W += take
    WC += row["cohb"] * take

    if side == "below":
        i += 1
    else:
        j += 1

final_avg = WC / W
print(f"Final weighted avg COHB: {final_avg:,.2f} (target {target_avg:,.2f}), total weight {W:,.0f}")

final_df = pd.DataFrame(selected)
final_df.to_csv(os.path.join(folder_path, "with_chn", "final_subset_2024_balanced.csv"), index=False)


Final weighted avg COHB: 10,300.57 (target 10,283.00), total weight 25,600


In [60]:
import os
import pandas as pd

# Follow the households selected for 2024 through 2024-2030 and report their
# mean COHB each year, always weighted by the (trimmed) 2024 selection weight.
final_subset = pd.read_csv(os.path.join(folder_path, "with_chn", "final_subset_2024_balanced.csv"))

# HH_ID -> total selected weight in 2024
sel_w = (
    final_subset.groupby("HH_ID", as_index=False)["WEIGHT"].sum()
    .rename(columns={"WEIGHT": "SEL_WEIGHT_2024"})
)

years = range(2024, 2031)

for year in years:
    df_year = pd.read_csv(os.path.join(folder_path, "with_chn", f"census{year}_household_chn.csv"))

    # Inner join keeps only the selected households and attaches their 2024 weight
    df_matched = df_year.merge(sel_w, on="HH_ID", how="inner")

    if df_matched.empty:
        print(f"‚ö†Ô∏è No matching HH_IDs found in {year} data.")
        continue

    fixed_weight = df_matched["SEL_WEIGHT_2024"]
    weighted_avg_cohb = (df_matched["cohb"] * fixed_weight).sum() / fixed_weight.sum()

    print(f"‚úÖ {year}: Weighted avg COHB = ${weighted_avg_cohb:,.2f} "
          f"over {fixed_weight.sum():,.0f} fixed-weight households")


‚úÖ 2024: Weighted avg COHB = $10,300.57 over 25,600 fixed-weight households
‚úÖ 2025: Weighted avg COHB = $11,070.20 over 25,600 fixed-weight households
‚úÖ 2026: Weighted avg COHB = $11,525.30 over 25,600 fixed-weight households
‚úÖ 2027: Weighted avg COHB = $11,840.37 over 25,600 fixed-weight households
‚úÖ 2028: Weighted avg COHB = $12,142.45 over 25,600 fixed-weight households
‚úÖ 2029: Weighted avg COHB = $12,454.82 over 25,600 fixed-weight households
‚úÖ 2030: Weighted avg COHB = $12,787.49 over 25,600 fixed-weight households


In [61]:
####### COHB counterfactual ##############

In [62]:
# Distinct HH_IDs of the households in the balanced 2024 selection
balanced_path = os.path.join(folder_path, "with_chn", "final_subset_2024_balanced.csv")
final_subset = pd.read_csv(balanced_path)

selected_hh_ids = final_subset["HH_ID"].unique()

print(f"Selected households: {len(selected_hh_ids):,}")


Selected households: 227


In [63]:
# Stack the selected households' rows from every year file (2024-2030) into
# one long panel with a `year` column.
panel = []

for year in range(2024, 2031):
    year_file = os.path.join(folder_path, "with_chn", f"census{year}_household_chn.csv")
    df_year = pd.read_csv(year_file)

    keep = df_year["HH_ID"].isin(selected_hh_ids)
    df_sel = df_year[keep].copy()
    df_sel["year"] = year
    panel.append(df_sel)

panel_df = pd.concat(panel, ignore_index=True)

In [64]:
# Attach each household's 2024 market rent and shelter cost to all of its
# rows, and add `t`, the number of years since 2024.
#
# Safe to re-run: if the preserved baseline columns (mmr_orig / SHELCO_orig)
# already exist they are used as the source, so the anchor is never taken from
# counterfactual values, and any mmr_2024 / SHELCO_2024 columns left by a
# previous run are dropped before merging to avoid _x/_y duplicates.
mmr_source = "mmr_orig" if "mmr_orig" in panel_df.columns else "mmr"
shelco_source = "SHELCO_orig" if "SHELCO_orig" in panel_df.columns else "SHELCO"

base_2024 = panel_df.loc[
    panel_df["year"] == 2024, ["HH_ID", mmr_source, shelco_source]
].rename(columns={
    mmr_source: "mmr_2024",
    shelco_source: "SHELCO_2024"
})

panel_df = panel_df.drop(columns=["mmr_2024", "SHELCO_2024"], errors="ignore")
panel_df = panel_df.merge(base_2024, on="HH_ID", how="left")

panel_df["t"] = panel_df["year"] - 2024

In [65]:
# Counterfactual: market rent and shelter cost grow at a flat annual rate from
# their 2024 values.
SHELTER_GROWTH = 0.02

growth_since_2024 = (1 + SHELTER_GROWTH) ** panel_df["t"]
panel_df["mmr_cf"] = panel_df["mmr_2024"] * growth_since_2024
panel_df["SHELCO_cf"] = panel_df["SHELCO_2024"] * growth_since_2024

In [66]:
# Keep the baseline shelter values, then overwrite mmr / SHELCO with the
# counterfactual path. The baseline copy is taken only once: on a re-run
# `mmr` / `SHELCO` already hold counterfactual values and must not replace
# the preserved originals.
if "mmr_orig" not in panel_df.columns:
    panel_df["mmr_orig"] = panel_df["mmr"]
if "SHELCO_orig" not in panel_df.columns:
    panel_df["SHELCO_orig"] = panel_df["SHELCO"]

panel_df["mmr"] = panel_df["mmr_cf"]
panel_df["SHELCO"] = panel_df["SHELCO_cf"]

In [67]:
# COHB under the counterfactual shelter path (mmr / SHELCO were overwritten in
# the previous cell). Eligibility still uses the chn and stir already on file.
condition = (
    (panel_df["TENUR"] == 2) &
    (panel_df["chn"] == 1) &
    (panel_df["stir"] > 0.3)
)
eligible = panel_df.loc[condition]

# Larger of 80% of annual market rent and annual shelter cost capped at
# annual market rent
mmr_80 = 0.8 * 12 * eligible["mmr"]
shelco_100 = (12 * eligible["SHELCO"]).clip(upper=(12 * eligible["mmr"]))
eligible_cost = pd.concat([mmr_80, shelco_100], axis=1).max(axis=1)

netinc_30 = 0.3 * eligible["netinc"]

# Benefit floored at zero; ineligible rows stay at 0.0
panel_df["cohb_cf"] = 0.0
panel_df.loc[condition, "cohb_cf"] = (eligible_cost - netinc_30).clip(lower=0)


In [68]:
# Weighted yearly averages of baseline COHB, counterfactual COHB and their
# difference, plus the total weight behind each average.
# Only the value columns are passed to apply(), which keeps the grouping
# column out of the applied frame (newer pandas warns otherwise).
# NOTE(review): this weights by each year's WEIGHT column, whereas the
# fixed-weight summary earlier in the notebook uses the trimmed 2024 selection
# weight (SEL_WEIGHT_2024); confirm which weighting is intended.
avg_by_year = (
    panel_df
    .groupby("year")[["cohb", "cohb_cf", "WEIGHT"]]
    .apply(lambda g: pd.Series({
        "avg_cohb_baseline": np.average(g["cohb"], weights=g["WEIGHT"]),
        "avg_cohb_cf":       np.average(g["cohb_cf"], weights=g["WEIGHT"]),
        "avg_diff":          np.average(g["cohb_cf"] - g["cohb"], weights=g["WEIGHT"]),
        "total_weight":      g["WEIGHT"].sum()
    }))
    .reset_index()
)

print(avg_by_year)

###### end of COHB counterfactual ##############

   year  avg_cohb_baseline   avg_cohb_cf      avg_diff  total_weight
0  2024       10304.424757  10304.424757  3.553642e-14  25619.623854
1  2025       11074.774893  10486.796790 -5.879781e+02  25895.741021
2  2026       11522.772854  10690.916772 -8.318561e+02  25906.801561
3  2027       11827.932123  10890.500870 -9.374313e+02  25861.820248
4  2028       12124.126560  11096.576050 -1.027551e+03  25841.334673
5  2029       12434.887910  11318.528128 -1.116360e+03  25951.051765
6  2030       12775.147795  11555.842887 -1.219305e+03  26192.297527


  .apply(lambda g: pd.Series({


In [69]:

# Second counterfactual: shelter costs grow each year at a blend of the
# baseline growth factor and a flat CPI factor, compounded forward from each
# household's 2024 value. Ends by overwriting `mmr` / `SHELCO` with the
# blended path.

# --- Preserve true baseline shelter if not already preserved ---
# (an earlier cell may already have stored these before overwriting mmr/SHELCO)
if "mmr_orig" not in panel_df.columns:
    panel_df["mmr_orig"] = panel_df["mmr"]
if "SHELCO_orig" not in panel_df.columns:
    panel_df["SHELCO_orig"] = panel_df["SHELCO"]

# --- Re-anchor 2024 shelter values ---
# Taken from the preserved *_orig columns, not from the possibly overwritten
# mmr / SHELCO.
base_2024 = panel_df.loc[
    panel_df["year"] == 2024, ["HH_ID", "mmr_orig", "SHELCO_orig"]
].copy()

base_2024 = base_2024.rename(columns={
    "mmr_orig": "mmr_2024",
    "SHELCO_orig": "SHELCO_2024"
})

# Drop any anchors left by an earlier cell so the merge does not create
# suffixed duplicate columns.
panel_df = panel_df.drop(columns=["mmr_2024", "SHELCO_2024"], errors="ignore")
panel_df = panel_df.merge(base_2024, on="HH_ID", how="left")

# --- Sort for forward construction ---
# The loop below compounds row by row, so rows must be in year order within
# each household.
panel_df = panel_df.sort_values(["HH_ID", "year"])

# --- Known baseline shelter growth path ---
# Year-over-year growth factor applied to reach each listed year. 2024 has no
# entry, so it maps to NaN below; 2024 rows never read the growth factor.
baseline_growth = {
    2025: 1.057,
    2026: 1.034,
    2027: 1.025,
    2028: 1.024,
    2029: 1.024,
    2030: 1.024
}

# Weight on the baseline growth factor; the remainder goes to CPI_GROWTH.
BASELINE_WEIGHT = 0.5
CPI_GROWTH = 1.02

# --- Map baseline growth and blended growth factors ---
panel_df["baseline_growth_factor"] = panel_df["year"].map(baseline_growth)

panel_df["mmr_growth_blend"] = (
    BASELINE_WEIGHT * panel_df["baseline_growth_factor"] +
    (1 - BASELINE_WEIGHT) * CPI_GROWTH
)

# Shelter cost uses the same blended factor as market rent.
panel_df["SHELCO_growth_blend"] = panel_df["mmr_growth_blend"]

# --- Initialize blended shelter paths ---
# Placeholder values; every row is overwritten by the loop below.
panel_df["mmr_blend"] = panel_df["mmr_orig"]
panel_df["SHELCO_blend"] = panel_df["SHELCO_orig"]

# --- Build blended paths forward from 2024 ---
for hh_id, g in panel_df.groupby("HH_ID"):
    idx = g.index

    mmr_vals = []
    shelco_vals = []

    for _, row in g.iterrows():
        if row["year"] == 2024:
            # Start of the chain: the household's own 2024 values
            mmr_vals.append(row["mmr_2024"])
            shelco_vals.append(row["SHELCO_2024"])
        else:
            # Previous row's value times this year's blended factor.
            # A household with no 2024 row would fail here (empty list).
            growth = row["mmr_growth_blend"]

            prev_mmr = mmr_vals[-1]
            prev_shelco = shelco_vals[-1]

            mmr_vals.append(prev_mmr * growth)
            shelco_vals.append(prev_shelco * growth)

    # Write the household's path back in the same row order as `g`
    panel_df.loc[idx, "mmr_blend"] = mmr_vals
    panel_df.loc[idx, "SHELCO_blend"] = shelco_vals

# --- Overwrite shelter with blended path ---
panel_df["mmr"] = panel_df["mmr_blend"]
panel_df["SHELCO"] = panel_df["SHELCO_blend"]

# --- Recompute COHB using blended shelter ---
# Eligibility uses the chn and stir already on file; only mmr / SHELCO differ
# from the baseline calculation.
condition = (
    (panel_df["TENUR"] == 2) &
    (panel_df["chn"] == 1) &
    (panel_df["stir"] > 0.3)
)

# 80% of annual market rent
mmr_80 = 0.8 * 12 * panel_df.loc[condition, "mmr"]

# Annual shelter cost, capped at annual market rent
shelco_100 = (12 * panel_df.loc[condition, "SHELCO"]).clip(
    upper=(12 * panel_df.loc[condition, "mmr"])
)

# Eligible cost is the larger of the two
eligible_cost = pd.concat(
    [mmr_80, shelco_100], axis=1
).max(axis=1)

netinc_30 = 0.3 * panel_df.loc[condition, "netinc"]

# Benefit floored at zero; ineligible rows stay at 0.0
panel_df["cohb_blend"] = 0.0
panel_df.loc[condition, "cohb_blend"] = (
    eligible_cost - netinc_30
).clip(lower=0)

# --- Summarize blended vs baseline ---
# Only the value columns are passed to apply(), which keeps the grouping
# column out of the applied frame (newer pandas warns otherwise).
avg_by_year = (
    panel_df
    .groupby("year")[["cohb", "cohb_blend", "WEIGHT"]]
    .apply(lambda g: pd.Series({
        "avg_cohb_baseline": np.average(g["cohb"], weights=g["WEIGHT"]),
        "avg_cohb_blend":    np.average(g["cohb_blend"], weights=g["WEIGHT"]),
        "avg_diff_blend":   np.average(g["cohb_blend"] - g["cohb"], weights=g["WEIGHT"]),
        "total_weight":     g["WEIGHT"].sum()
    }))
    .reset_index()
)

print(avg_by_year)

### END OF 2nd COUNTERFACTUAL #####

   year  avg_cohb_baseline  avg_cohb_blend  avg_diff_blend  total_weight
0  2024       10304.424757    10304.424757    3.553642e-14  25619.623854
1  2025       11074.774893    10781.087885   -2.936870e+02  25895.741021
2  2026       11522.772854    11106.673755   -4.160991e+02  25906.801561
3  2027       11827.932123    11356.858161   -4.710740e+02  25861.820248
4  2028       12124.126560    11606.895192   -5.172314e+02  25841.334673
5  2029       12434.887910    11874.594534   -5.602934e+02  25951.051765
6  2030       12775.147795    12159.620947   -6.155268e+02  26192.297527


  .apply(lambda g: pd.Series({


In [70]:
import pandas as pd
import numpy as np

# Load the 2023 household file that already carries the chn flag
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2023_household_chn.csv"))

# totalincome ceiling for each exact bedsuit value; bedsuit values of 4 and
# above share a single ceiling.
income_ceiling_by_bedsuit = {0: 40000, 1: 49000, 2: 56000, 3: 62000}
bedsuit_top_code = 4
income_ceiling_top_code = 75000

# One boolean mask per rule: bedsuit matches AND totalincome is at or below
# the ceiling for that bedsuit.
conditions = [
    (df['bedsuit'] == bedsuit_value) & (df['totalincome'] <= ceiling)
    for bedsuit_value, ceiling in income_ceiling_by_bedsuit.items()
]
conditions.append(
    (df['bedsuit'] >= bedsuit_top_code) & (df['totalincome'] <= income_ceiling_top_code)
)

# A row is kept when any rule holds. The masks are combined into a new Series
# rather than with `|=` on conditions[0]: an in-place `|=` on a Series mutates
# that object, which would silently overwrite the first rule in `conditions`.
combined_condition = pd.concat(conditions, axis=1).any(axis=1)

# Independent copy of the matching rows, safe to modify downstream
subset_df = df[combined_condition].copy()


In [71]:
# Keep the rows of the income-filtered subset that are flagged chn == 1 and
# whose HCORENEED_IND is not 888 (888 is treated as not valid here).
chn_flagged = subset_df['chn'].eq(1)
ind_not_888 = subset_df['HCORENEED_IND'].ne(888)
chn_df = subset_df.loc[chn_flagged & ind_not_888]

# Total WEIGHT within each TENUR value
weighted_chn_by_tenur = chn_df['WEIGHT'].groupby(chn_df['TENUR']).sum()

# Show the per-TENUR totals
print(weighted_chn_by_tenur)


TENUR
1.0    229651.528029
2.0    424459.059190
Name: WEIGHT, dtype: float64


Calculate CNIT (12 × `mmr` ÷ 0.3) to use for asset threshold shares.

In [72]:
import pandas as pd

# Step 1: Load the 2023 household file (with the chn flag)
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2023_household_chn.csv"))

# Step 2: Keep rows with chn == 1 and HCORENEED_IND != 888. The .copy() makes
# an independent frame so the new column below is not written to a slice of df.
df_copy = df[(df['chn'] == 1) & (df['HCORENEED_IND'] != 888)].copy()

# Step 3: cnit = 12 * mmr / 0.3, i.e. the income at which 12 * mmr equals
# 30% of income
df_copy['cnit'] = (12 * df_copy['mmr']) / 0.3

# Step 4: WEIGHT-weighted average cnit for each bedsuit value.
# Computed as grouped sum(cnit * WEIGHT) / grouped sum(WEIGHT). This gives the
# same numbers as the former groupby('bedsuit').apply(...) but never passes
# the grouping column to a user function, so it does not raise the pandas
# deprecation warning for groupby.apply operating on grouping columns.
cnit_weight_sum = (df_copy['cnit'] * df_copy['WEIGHT']).groupby(df_copy['bedsuit']).sum()
weight_sum = df_copy['WEIGHT'].groupby(df_copy['bedsuit']).sum()
weighted_avg_cnit = cnit_weight_sum / weight_sum

# Display the result
print(weighted_avg_cnit)


# Pool all rows with bedsuit >= 4 into one group
df_4plus = df_copy[df_copy['bedsuit'] >= 4]

# WEIGHT-weighted average cnit for the pooled group
weighted_avg_cnit_4plus = (df_4plus['cnit'] * df_4plus['WEIGHT']).sum() / df_4plus['WEIGHT'].sum()

# Display the result
print(f"Weighted average CNIT for 4+ bedroom-suitable households: {weighted_avg_cnit_4plus:.2f}")



  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())


bedsuit
0.0     46658.136153
1.0     57890.077750
2.0     68351.707704
3.0     82646.721766
4.0    105104.427040
5.0    103770.710224
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 104928.78


In [73]:
import pandas as pd
import os

# Directory holding the per-year household files with the chn flag
base_path = os.path.join(folder_path, "with_chn")

# For each year 2023-2030, print the WEIGHT-weighted average cnit by bedsuit
# and for the pooled bedsuit >= 4 group.
for year in range(2023, 2031):
    file_path = os.path.join(base_path, f"census{year}_household_chn.csv")

    # Skip years whose file is missing instead of aborting the whole loop
    if not os.path.exists(file_path):
        print(f"File not found for year {year}, skipping.")
        continue

    # Load the data
    df = pd.read_csv(file_path)

    # Keep rows with chn == 1 and HCORENEED_IND != 888. The .copy() makes an
    # independent frame so the new column is not written to a slice of df.
    df_copy = df[(df['chn'] == 1) & (df['HCORENEED_IND'] != 888)].copy()

    # cnit = 12 * mmr / 0.3, i.e. the income at which 12 * mmr equals 30% of income
    df_copy['cnit'] = (12 * df_copy['mmr']) / 0.3

    # WEIGHT-weighted average cnit for each bedsuit value, computed as grouped
    # sum(cnit * WEIGHT) / grouped sum(WEIGHT). Same numbers as the former
    # groupby('bedsuit').apply(...), without the pandas deprecation warning
    # for groupby.apply operating on grouping columns.
    cnit_weight_sum = (df_copy['cnit'] * df_copy['WEIGHT']).groupby(df_copy['bedsuit']).sum()
    weight_sum = df_copy['WEIGHT'].groupby(df_copy['bedsuit']).sum()
    weighted_avg_cnit = cnit_weight_sum / weight_sum

    # Display the result for this year
    print(f"\nYear {year} - Weighted average CNIT by bedsuit:")
    print(weighted_avg_cnit)

    # Pool all rows with bedsuit >= 4 into one group
    df_4plus = df_copy[df_copy['bedsuit'] >= 4]

    # WEIGHT-weighted average cnit for the pooled group, if it has any rows
    if not df_4plus.empty:
        weighted_avg_cnit_4plus = (df_4plus['cnit'] * df_4plus['WEIGHT']).sum() / df_4plus['WEIGHT'].sum()
        print(f"Weighted average CNIT for 4+ bedroom-suitable households: {weighted_avg_cnit_4plus:.2f}")
    else:
        print("No 4+ bedroom-suitable households in this year.")



Year 2023 - Weighted average CNIT by bedsuit:
bedsuit
0.0     46658.136153
1.0     57890.077750
2.0     68351.707704
3.0     82646.721766
4.0    105104.427040
5.0    103770.710224
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 104928.78


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2024 - Weighted average CNIT by bedsuit:
bedsuit
0.0     49083.426459
1.0     60942.024003
2.0     71965.839681
3.0     87240.025706
4.0    110645.412118
5.0    108870.932761
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 110394.67

Year 2025 - Weighted average CNIT by bedsuit:
bedsuit
0.0     51879.958786
1.0     64474.329173
2.0     76024.373526
3.0     91991.742676
4.0    116203.548621
5.0    115489.868349
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 116108.36

Year 2026 - Weighted average CNIT by bedsuit:
bedsuit
0.0     53594.082183
1.0     66540.377838
2.0     78495.039104
3.0     94976.493154
4.0    119881.657462
5.0    118606.759727
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 119714.06


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2027 - Weighted average CNIT by bedsuit:
bedsuit
0.0     54915.620180
1.0     68160.790655
2.0     80443.666993
3.0     97326.471340
4.0    122772.479408
5.0    121812.041323
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 122651.11


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())



Year 2028 - Weighted average CNIT by bedsuit:
bedsuit
0.0     56204.000958
1.0     69593.988980
2.0     82336.687917
3.0     99644.251242
4.0    125690.370610
5.0    124978.438805
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 125603.83

Year 2029 - Weighted average CNIT by bedsuit:
bedsuit
0.0     57523.063322
1.0     71216.500778
2.0     84275.547449
3.0    101901.835353
4.0    128165.721388
5.0    128006.092661
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 128146.76

Year 2030 - Weighted average CNIT by bedsuit:
bedsuit
0.0     58901.630488
1.0     72863.829094
2.0     86317.613745
3.0    104289.954933
4.0    131301.387322
5.0    131117.420423
dtype: float64
Weighted average CNIT for 4+ bedroom-suitable households: 131279.52


  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())
  .apply(lambda g: (g['cnit'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())


In [74]:
import pandas as pd

# Read the 2023 household file that already carries the chn flag
df = pd.read_csv(os.path.join(folder_path, "with_chn", "census2023_household_chn.csv"))

# Keep rows with chn == 1 and HCORENEED_IND != 888.
# NOTE(review): no nprhh == 0 restriction is applied in this cell (it was
# commented out) -- confirm whether it should be.
filtered_df = df.loc[df['chn'].eq(1) & df['HCORENEED_IND'].ne(888)]

# Columns used for the breakdown below
hh_weight = filtered_df['WEIGHT']
hh_tenur = filtered_df['TENUR']
hh_presmortg = filtered_df['PRESMORTG']

# Summed WEIGHT for TENUR == 2, and for TENUR == 1 split by PRESMORTG (0 / 1)
tenur_2 = hh_weight[hh_tenur == 2].sum()
tenur_1_presmortg_0 = hh_weight[(hh_tenur == 1) & (hh_presmortg == 0)].sum()
tenur_1_presmortg_1 = hh_weight[(hh_tenur == 1) & (hh_presmortg == 1)].sum()

# Summed WEIGHT over every row that passed the filter
total_weighted = hh_weight.sum()

# Report the totals, rounded to whole households with thousands separators
print(f"Weighted count (TENUR = 2): {tenur_2:,.0f}")
print(f"Weighted count (TENUR = 1 & PRESMORTG = 0): {tenur_1_presmortg_0:,.0f}")
print(f"Weighted count (TENUR = 1 & PRESMORTG = 1): {tenur_1_presmortg_1:,.0f}")
print(f"Total weighted households (filtered): {total_weighted:,.0f}")

Weighted count (TENUR = 2): 518,952
Weighted count (TENUR = 1 & PRESMORTG = 0): 118,726
Weighted count (TENUR = 1 & PRESMORTG = 1): 194,340
Total weighted households (filtered): 832,018


In [75]:
import os
import pandas as pd

# Directory holding the per-year household files with the chn flag
base_path = os.path.join(folder_path, "with_chn")

# For each year 2023-2030, print summed WEIGHT by TENUR / PRESMORTG for rows
# with chn == 1, HCORENEED_IND != 888 and nprhh == 0.
for year in range(2023, 2031):
    file_path = os.path.join(base_path, f"census{year}_household_chn.csv")

    if os.path.exists(file_path):
        # Load the data
        df = pd.read_csv(file_path)

        # Subset for chn == 1, HCORENEED_IND != 888, and nprhh == 0
        filtered_df = df[(df['chn'] == 1) & (df['HCORENEED_IND'] != 888) & (df['nprhh'] == 0)]

        # Summed WEIGHT for TENUR == 2, and for TENUR == 1 split by PRESMORTG (0 / 1)
        tenur_2 = filtered_df[filtered_df['TENUR'] == 2]['WEIGHT'].sum()
        tenur_1_presmortg_0 = filtered_df[(filtered_df['TENUR'] == 1) & (filtered_df['PRESMORTG'] == 0)]['WEIGHT'].sum()
        tenur_1_presmortg_1 = filtered_df[(filtered_df['TENUR'] == 1) & (filtered_df['PRESMORTG'] == 1)]['WEIGHT'].sum()

        # Summed WEIGHT over every row that passed the filter
        total_weighted = filtered_df['WEIGHT'].sum()

        # Print the results for this year.
        # The emoji below were mis-decoded (mojibake) in the previous version
        # of these strings and are restored here.
        print(f"\n📅 Year {year}:")
        print(f"Weighted count (TENUR = 2): {tenur_2:,.0f}")
        print(f"Weighted count (TENUR = 1 & PRESMORTG = 0): {tenur_1_presmortg_0:,.0f}")
        print(f"Weighted count (TENUR = 1 & PRESMORTG = 1): {tenur_1_presmortg_1:,.0f}")
        print(f"Total weighted households (filtered): {total_weighted:,.0f}")

    else:
        # Missing input for a year is reported and skipped, not fatal
        print(f"❌ File not found for year {year}, skipping.")



📅 Year 2023:
Weighted count (TENUR = 2): 477,599
Weighted count (TENUR = 1 & PRESMORTG = 0): 117,129
Weighted count (TENUR = 1 & PRESMORTG = 1): 189,009
Total weighted households (filtered): 783,737

📅 Year 2024:
Weighted count (TENUR = 2): 494,896
Weighted count (TENUR = 1 & PRESMORTG = 0): 119,949
Weighted count (TENUR = 1 & PRESMORTG = 1): 194,299
Total weighted households (filtered): 809,143

📅 Year 2025:
Weighted count (TENUR = 2): 532,679
Weighted count (TENUR = 1 & PRESMORTG = 0): 126,765
Weighted count (TENUR = 1 & PRESMORTG = 1): 211,734
Total weighted households (filtered): 871,179

📅 Year 2026:
Weighted count (TENUR = 2): 551,731
Weighted count (TENUR = 1 & PRESMORTG = 0): 131,616
Weighted count (TENUR = 1 & PRESMORTG = 1): 220,235
Total weighted households (filtered): 903,583

📅 Year 2027:
Weighted count (TENUR = 2): 559,087
Weighted count (TENUR = 1 & PRESMORTG = 0): 136,040
Weighted count (TENUR = 1 & PRESMORTG = 1): 223,668
Total weighted households (filt

In [76]:
#calculate avg income growth

