In [33]:
import pandas as pd
import os
import statsmodels.api as sm
import numpy as np

In [34]:
# Define input and output paths
input_base_path = "../Microsimulations/household/"
output_base_path = "../Microsimulations/with_chn/"

# Ensure output directory exists
os.makedirs(output_base_path, exist_ok=True)


def flag_core_housing_need(households, threshold):
    """Return a 0/1 Series flagging households in core housing need.

    A household is flagged when all of the following hold:
      * it has a housing issue: annual shelter cost (SHELCO * 12) over
        totalincome exceeds ``threshold``, or NOS == 0 (unsuitable), or
        REPAIR == 3 (inadequate);
      * market rent is unaffordable: mmr * 12 > threshold * totalincome;
      * it is not both a student household and a non-family household;
      * its ``stir`` is below 1.

    Parameters
    ----------
    households : pandas.DataFrame
        Must contain SHELCO, totalincome, NOS, REPAIR, mmr,
        student_household, non_family_household and stir.
    threshold : float
        Share of income used for both affordability tests
        (0.30 for chn, 0.50 for dchn).

    Returns
    -------
    pandas.Series of int, aligned with ``households.index``.
    """
    housing_issue = (
        (households['SHELCO'] * 12 / households['totalincome'] > threshold) |  # Unaffordable
        (households['NOS'] == 0) |  # Unsuitable
        (households['REPAIR'] == 3)  # Inadequate
    )
    market_unaffordable = households['mmr'] * 12 > threshold * households['totalincome']
    student_non_family = (
        (households['student_household'] == 1) &
        (households['non_family_household'] == 1)
    )
    # Households with STIR >= 1 are excluded; a NaN stir compares False and is kept.
    stir_at_or_above_one = households['stir'] >= 1

    return (
        housing_issue & market_unaffordable & ~student_non_family & ~stir_at_or_above_one
    ).astype(int)


# Loop through the years 2022 to 2030
for year in range(2022, 2031):
    input_file_path = os.path.join(input_base_path, f"census{year}_household.csv")
    output_file_path = os.path.join(output_base_path, f"census{year}_household_chn.csv")

    if os.path.exists(input_file_path):
        # Load the data
        census_df = pd.read_csv(input_file_path)

        # The STIR exclusion needs a 'stir' column. Use the one shipped with the
        # input when present; otherwise derive it (and alt_stir) with the same
        # formulas the 2021 cell uses, instead of failing with a KeyError.
        if 'stir' not in census_df.columns:
            census_df['stir'] = census_df['SHELCO'] * 12 / census_df['totalincome']
        if 'alt_stir' not in census_df.columns:
            census_df['alt_stir'] = census_df['mmr'] * 12 / census_df['totalincome']

        # Core housing need (30% threshold) and deep core housing need (50% threshold)
        census_df['chn'] = flag_core_housing_need(census_df, 0.30)
        census_df['dchn'] = flag_core_housing_need(census_df, 0.50)

        # Export updated data
        census_df.to_csv(output_file_path, index=False)

        print(f"Updated CHN values and added stir/alt_stir for {year}")

    else:
        print(f"File not found: {input_file_path}")

Updated CHN values and added stir/alt_stir for 2022
Updated CHN values and added stir/alt_stir for 2023
Updated CHN values and added stir/alt_stir for 2024
Updated CHN values and added stir/alt_stir for 2025
Updated CHN values and added stir/alt_stir for 2026
Updated CHN values and added stir/alt_stir for 2027
Updated CHN values and added stir/alt_stir for 2028
Updated CHN values and added stir/alt_stir for 2029
Updated CHN values and added stir/alt_stir for 2030


In [35]:
# Add the 2021 census household file to the with_chn folder

# Source and destination folders
input_base_path = "../Microsimulations/household/"
output_base_path = "../Microsimulations/with_chn/"

# File names and full paths
input_file_name = "census2021_household.csv"
output_file_name = "census2021_household_chn.csv"
input_file_path = os.path.join(input_base_path, input_file_name)
output_file_path = os.path.join(output_base_path, output_file_name)

# Read the file and append the derived columns in one pass:
#   stir     - annual shelter cost (SHELCO * 12) over total income
#   alt_stir - annual market rent (mmr * 12) over total income
#   netshare - TOTINC_AT over total income, capped at 1.0
#   netinc   - total income scaled by netshare
df = pd.read_csv(input_file_path).assign(
    stir=lambda d: d['SHELCO'] * 12 / d['totalincome'],
    alt_stir=lambda d: (d['mmr']) * 12 / d['totalincome'],
    netshare=lambda d: (d['TOTINC_AT'] / d['totalincome']).clip(upper=1.0),
    netinc=lambda d: d['totalincome'] * d['netshare'],
)

# Save the modified dataframe
df.to_csv(output_file_path, index=False)

print(f"File saved to: {output_file_path}")

File saved to: ../Microsimulations/with_chn/census2021_household_chn.csv


In [36]:

# Define the path for processed files
output_base_path = "../Microsimulations/with_chn/"

# Dictionaries to store weighted household counts and total weights per year
chn_weighted_counts = {}
dchn_weighted_counts = {}
total_weights = {}


def weighted_flag_count(households, flag_col):
    """Sum WEIGHT over rows where ``flag_col`` == 1, skipping HCORENEED_IND == 888.

    Parameters
    ----------
    households : pandas.DataFrame
        Must contain ``flag_col``, WEIGHT and HCORENEED_IND.
    flag_col : str
        Name of the 0/1 indicator column (e.g. 'chn' or 'dchn').

    Returns
    -------
    The summed WEIGHT of the matching rows (0 when none match).
    """
    eligible = households[households['HCORENEED_IND'] != 888]
    return eligible.loc[eligible[flag_col] == 1, 'WEIGHT'].sum()


# Loop through years 2021 to 2030; each file is read once
for year in range(2021, 2031):
    file_path = os.path.join(output_base_path, f"census{year}_household_chn.csv")

    if os.path.exists(file_path):
        # Load the data
        census_df = pd.read_csv(file_path)

        # Total weight covers every row (no HCORENEED_IND filter); record it now
        # so the file does not have to be read again for the second report.
        if 'WEIGHT' in census_df.columns:
            total_weights[year] = census_df['WEIGHT'].sum()

        # Check if required columns exist
        required_columns = {'chn', 'dchn', 'WEIGHT', 'HCORENEED_IND'}
        missing_columns = required_columns - set(census_df.columns)

        if missing_columns:
            print(f"Skipping {year} due to missing columns: {missing_columns}")
            continue  # Skip processing this file

        chn_weight = weighted_flag_count(census_df, 'chn')
        chn_weighted_counts[year] = chn_weight

        dchn_weight = weighted_flag_count(census_df, 'dchn')
        dchn_weighted_counts[year] = dchn_weight

        print(f"[{year}] CHN weighted count: {chn_weight}, DCHN weighted count: {dchn_weight}")

    else:
        print(f"File not found: {file_path}")

print("\nTotal household weights per year:")

# Print the sum of WEIGHT for each year that was loaded
for year, total_weight_for_year in total_weights.items():
    print(f"Sum of WEIGHT for {year}: {total_weight_for_year}")


[2021] CHN weighted count: 684245.5791455524, DCHN weighted count: 154141.0278820697
[2022] CHN weighted count: 775663.1996845966, DCHN weighted count: 193171.441375075
[2023] CHN weighted count: 898977.6674971142, DCHN weighted count: 246696.35102513284
[2024] CHN weighted count: 961182.9994609419, DCHN weighted count: 267950.07722888654
[2025] CHN weighted count: 1002609.9839466489, DCHN weighted count: 286965.47288308386
[2026] CHN weighted count: 1041591.8021459938, DCHN weighted count: 308772.29141312605
[2027] CHN weighted count: 1081257.9059293726, DCHN weighted count: 330058.47826986696
[2028] CHN weighted count: 1139147.8167526051, DCHN weighted count: 356986.8514300904
[2029] CHN weighted count: 1201660.7058479595, DCHN weighted count: 388469.46203185746
[2030] CHN weighted count: 1268572.9000548227, DCHN weighted count: 425247.337325424

Total household weights per year:
Sum of WEIGHT for 2021: 3310969.362428404
Sum of WEIGHT for 2022: 3391447.579802723
Sum of WEIGHT for 202

In [37]:

# Year to process and the census file it maps to
year = 2021
file_path = f"../Microsimulations/census{year}.csv"

# Nothing happens when the file is absent
if os.path.exists(file_path):
    df = pd.read_csv(file_path)

    # jobless = 1 when LFACT lies in the closed range [2, 10], else 0
    lfact = df["LFACT"]
    df["jobless"] = ((lfact >= 2) & (lfact <= 10)).astype(int)

    # Two-way grouping key: IMMSTAT == 3 versus everything else
    immstat_group = df["IMMSTAT"].eq(3).map({True: "IMMSTAT_3", False: "IMMSTAT_not_3"})

    # Mean of the 0/1 flag = share of jobless records per AGEGRP x IMMSTAT group
    jobless_share = df.groupby(["AGEGRP", immstat_group])["jobless"].mean()
    summary = jobless_share.reset_index()

    # Save summary to a CSV file
    output_path = f"../Microsimulations/census_share_{year}.csv"
    summary.to_csv(output_path, index=False)


In [38]:

# Folder holding the processed household files
input_base_path = "../Microsimulations/with_chn/"

# Net-income share observed in 2021: TOTINC_AT over totalincome, capped at 1.0
census2021_household = pd.read_csv(os.path.join(input_base_path, "census2021_household_chn.csv"))
netshare_2021 = (
    census2021_household['TOTINC_AT']
    .div(census2021_household['totalincome'])
    .clip(upper=1.0)
)

# Carry the 2021 share forward to each of the years 2022-2030
for year in range(2022, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    # Positional assignment: row i of this year gets row i of the 2021 share,
    # so the files are assumed to have the same row order and row count.
    census_df['netshare'] = netshare_2021.to_numpy()
    census_df['netinc'] = census_df['totalincome'].mul(census_df['netshare'])

    # Overwrite the file in place
    census_df.to_csv(file_path, index=False)
    print(f"✅ Updated netshare and netinc for {year}")

✅ Updated netshare and netinc for 2022
✅ Updated netshare and netinc for 2023
✅ Updated netshare and netinc for 2024
✅ Updated netshare and netinc for 2025
✅ Updated netshare and netinc for 2026
✅ Updated netshare and netinc for 2027
✅ Updated netshare and netinc for 2028
✅ Updated netshare and netinc for 2029
✅ Updated netshare and netinc for 2030


In [39]:

input_base_path = "../Microsimulations/with_chn/"

for year in range(2021, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    # A gap is only computed for households with chn == 1
    in_core_need = census_df['chn'].eq(1)

    # The household's own shelter cost is the benchmark when NOS == 1,
    # REPAIR != 3 and SHELCO is below mmr; otherwise mmr is the benchmark.
    own_cost_applies = (
        in_core_need
        & census_df['NOS'].eq(1)
        & census_df['REPAIR'].ne(3)
        & census_df['SHELCO'].lt(census_df['mmr'])
    )
    monthly_benchmark = census_df['SHELCO'].where(own_cost_applies, census_df['mmr'])

    # gap = annual benchmark cost minus 30% of total income; 0.0 outside CHN
    annual_gap = monthly_benchmark * 12 - 0.3 * census_df['totalincome']
    census_df['gap'] = annual_gap.where(in_core_need, 0.0)

    # Overwrite the file in place
    census_df.to_csv(file_path, index=False)
    print(f"✅ Calculated gap for {year} using updated CHN conditions")

✅ Calculated gap for 2021 using updated CHN conditions
✅ Calculated gap for 2022 using updated CHN conditions
✅ Calculated gap for 2023 using updated CHN conditions
✅ Calculated gap for 2024 using updated CHN conditions
✅ Calculated gap for 2025 using updated CHN conditions
✅ Calculated gap for 2026 using updated CHN conditions
✅ Calculated gap for 2027 using updated CHN conditions
✅ Calculated gap for 2028 using updated CHN conditions
✅ Calculated gap for 2029 using updated CHN conditions
✅ Calculated gap for 2030 using updated CHN conditions


In [40]:
# Sanity check after assigning gaps. `year` and `census_df` are not defined in
# this cell; they are whatever the most recently executed loop left behind.
chn_household_count = census_df['chn'].sum()
positive_gap_count = (census_df['gap'] > 0).sum()
print(f"{year}: CHN=1 count: {chn_household_count}, GAP > 0 count: {positive_gap_count}")


2030: CHN=1 count: 10960, GAP > 0 count: 10960


In [41]:
import os
import pandas as pd

input_base_path = "../Microsimulations/with_chn/"

for year in range(2021, 2031):  # 2021 to 2030 inclusive
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if not os.path.exists(file_path):
        print(f"❌ File not found for {year}: {file_path}")
        continue

    census_df = pd.read_csv(file_path)

    # Rows that receive a value: TENUR == 2, chn == 1 and stir above 0.3
    receives_cohb = (
        census_df['TENUR'].eq(2)
        & census_df['chn'].eq(1)
        & census_df['stir'].gt(0.3)
    )

    # Eligible annual cost is the larger of
    #   - 80% of annual market rent, and
    #   - annual shelter cost capped at 100% of annual market rent
    cost_candidates = pd.DataFrame({
        'mmr_80': 0.8 * 12 * census_df['mmr'],
        'shelco_100_capped': (12 * census_df['SHELCO']).clip(upper=(12 * census_df['mmr'])),
    })
    eligible_cost = cost_candidates.max(axis=1)

    # cohb = eligible cost minus 30% of net income, floored at zero;
    # rows outside the condition keep 0.0
    cohb_values = (eligible_cost - 0.3 * census_df['netinc']).clip(lower=0)
    census_df['cohb'] = cohb_values.where(receives_cohb, 0.0)

    # Save back to file
    census_df.to_csv(file_path, index=False)
    print(f"✅ Added COHB values for renters in CHN in {year}")


✅ Added COHB values for renters in CHN in 2021
✅ Added COHB values for renters in CHN in 2022
✅ Added COHB values for renters in CHN in 2023
✅ Added COHB values for renters in CHN in 2024
✅ Added COHB values for renters in CHN in 2025
✅ Added COHB values for renters in CHN in 2026
✅ Added COHB values for renters in CHN in 2027
✅ Added COHB values for renters in CHN in 2028
✅ Added COHB values for renters in CHN in 2029
✅ Added COHB values for renters in CHN in 2030


In [42]:
# Folder holding the per-year household files. Defined here so this cell does
# not depend on a variable left behind by an earlier cell.
input_base_path = "../Microsimulations/with_chn/"

mean_cohb = {}
mean_gap = {}


def weighted_mean_of_positive(households, value_col, weight_col='WEIGHT'):
    """WEIGHT-weighted mean of ``value_col`` over rows where it is > 0.

    Parameters
    ----------
    households : pandas.DataFrame
        Must contain ``value_col`` and ``weight_col``.
    value_col : str
        Column to average (e.g. 'cohb' or 'gap').
    weight_col : str, default 'WEIGHT'
        Column holding the row weights.

    Returns
    -------
    float
        The weighted mean, or 0.0 when no row is positive or the positive rows
        carry zero total weight (avoids a division by zero).
    """
    positive = households[households[value_col] > 0]
    total_weight = positive[weight_col].sum()
    if positive.empty or total_weight == 0:
        return 0.0
    return (positive[value_col] * positive[weight_col]).sum() / total_weight


for year in range(2021, 2031):
    file_path = os.path.join(input_base_path, f"census{year}_household_chn.csv")

    if os.path.exists(file_path):
        census_df = pd.read_csv(file_path)

        # Ensure the necessary columns exist
        if all(col in census_df.columns for col in ['cohb', 'gap', 'WEIGHT', 'HCORENEED_IND']):
            # Exclude records where HCORENEED_IND == 888
            census_df = census_df[census_df['HCORENEED_IND'] != 888]

            mean_cohb[year] = weighted_mean_of_positive(census_df, 'cohb')
            mean_gap[year] = weighted_mean_of_positive(census_df, 'gap')
        else:
            print(f"❌ Missing columns in {year}, skipping.")
    else:
        print(f"❌ File not found for {year}")

# ✅ Print results
print("\n📊 Weighted Mean COHB (for values > 0, excluding HCORENEED_IND == 888):")
for year, val in mean_cohb.items():
    print(f"{year}: {val:.2f}")

print("\n📊 Weighted Mean GAP (for values > 0, excluding HCORENEED_IND == 888):")
for year, val in mean_gap.items():
    print(f"{year}: {val:.2f}")


📊 Weighted Mean COHB (for values > 0, excluding HCORENEED_IND == 888):
2021: 4401.67
2022: 4946.46
2023: 5712.38
2024: 6168.52
2025: 6626.07
2026: 7156.80
2027: 7646.38
2028: 8236.55
2029: 8893.56
2030: 9675.36

📊 Weighted Mean GAP (for values > 0, excluding HCORENEED_IND == 888):
2021: 3783.17
2022: 4123.14
2023: 4580.42
2024: 4900.40
2025: 5191.18
2026: 5561.62
2027: 5898.50
2028: 6299.93
2029: 6745.96
2030: 7260.19


In [43]:
# Trace file: follow one row position across every yearly file

# Folder path
base_path = "../Microsimulations/with_chn/"

# Years to process
years = range(2021, 2031)

# Row number to extract (0-based index)
target_row = 51  # Change this to any row index you want


def trace_file_name(row_index):
    """Return the trace file name for a 0-based row index.

    The number in the name is ``row_index + 2``.
    NOTE(review): presumably the row number as shown in a spreadsheet
    (header row plus 1-based counting) - confirm.
    """
    return f"chn_trace_row{row_index + 2}.csv"


# List to store selected rows
selected_rows = []

for year in years:
    file_name = f"census{year}_household_chn.csv"
    file_path = os.path.join(base_path, file_name)

    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        if len(df) > target_row:
            selected_row = df.iloc[target_row]
            selected_rows.append(selected_row)
        else:
            print(f"File {file_name} has less than {target_row + 1} rows.")
    else:
        print(f"File not found: {file_path}")

# Combine and save the trace
if selected_rows:
    chn_trace_df = pd.DataFrame(selected_rows)
    trace_name = trace_file_name(target_row)
    output_path = os.path.join(base_path, trace_name)
    chn_trace_df.to_csv(output_path, index=False)
    # Report the name that was actually written (previously the message used
    # target_row + 1 while the file used target_row + 2).
    print(f"{trace_name} created at {output_path}")
else:
    print("No data found to create trace file.")

chn_trace_row52.csv created at ../Microsimulations/with_chn/chn_trace_row53.csv


In [44]:


# Load the data
df = pd.read_csv("../Microsimulations/with_chn/census2021_household_chn.csv")

# Check required columns
required_cols = ['netinc', 'totalincome', 'WEIGHT', 'quintile']
if all(col in df.columns for col in required_cols):
    # Compute netshare; a zero totalincome yields +/-inf or NaN, and those rows
    # are dropped. Only the netshare column is cleaned - other columns are left
    # as loaded.
    df['netshare'] = (df['netinc'] / df['totalincome']).replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=['netshare']).copy()

    # Weighted average netshare per quintile = sum(netshare * WEIGHT) / sum(WEIGHT).
    # Built from two grouped sums instead of groupby.apply, which emits a
    # deprecation warning when the applied frame includes the grouping column.
    weighted_netshare_sum = (df['netshare'] * df['WEIGHT']).groupby(df['quintile']).sum()
    weight_sum = df['WEIGHT'].groupby(df['quintile']).sum()
    summary = (weighted_netshare_sum / weight_sum).reset_index(name='weighted_netshare')

    # Format output
    summary['weighted_netshare'] = summary['weighted_netshare'].round(4)
    print("\n✅ Weighted Average Netshare by Quintile:\n")
    print(summary)
else:
    print("❌ Missing required columns: netinc, totalincome, WEIGHT, or quintile.")



✅ Weighted Average Netshare by Quintile:

   quintile  weighted_netshare
0         1             0.9442
1         2             0.9152
2         3             0.8822


  .apply(lambda g: (g['netshare'] * g['WEIGHT']).sum() / g['WEIGHT'].sum())


In [45]:
# Show the ten rows with the smallest netshare, next to the incomes behind them
income_columns = ['totalincome', 'netinc', 'netshare']
lowest_netshare = df[income_columns].sort_values(by='netshare').head(10)
print(lowest_netshare)


       totalincome   netinc   netshare
5860           401  -7000.0 -17.456359
30378         1000  -6000.0  -6.000000
25865         4400 -23000.0  -5.227273
8073         11300 -50000.0  -4.424779
5262          1000  -4000.0  -4.000000
16587        13400 -53000.0  -3.955224
24238          801  -3000.0  -3.745318
9991          6800 -20000.0  -2.941176
7770         11900 -30000.0  -2.521008
4357          4500 -11000.0  -2.444444


Ignore for now: COHB/affordable housing program analysis

In [None]:

import pandas as pd

# Load the 2024 household file and keep only rows with
# TENUR == 2, HCORENEED_IND != 888 and SUBSIDY == 1
df = pd.read_csv("../Microsimulations/with_chn/census2024_household_chn.csv")
tenure_is_2 = df['TENUR'] == 2
core_need_not_888 = df['HCORENEED_IND'] != 888
subsidy_is_1 = df['SUBSIDY'] == 1
df = df[tenure_is_2 & core_need_not_888 & subsidy_is_1].copy()




def select_households_below_weighted_avg(df, target_avg=20000):
    """Select the lowest-income households whose weighted mean income stays at or below a target.

    Rows are sorted by ``totalincome`` ascending and taken from the top for as
    long as the running WEIGHT-weighted mean of ``totalincome`` is
    <= ``target_avg``; selection stops at the first row that would push the
    mean above the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``totalincome`` and ``WEIGHT``.
    target_avg : float, default 20000
        Upper bound for the weighted mean income of the selection.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with the original index labels, columns and
        dtypes. When nothing qualifies the result is empty but keeps every
        column, so downstream column access still works.
    """
    df_sorted = df.sort_values(by="totalincome")

    # Running weighted mean after including each successive row
    weights = df_sorted['WEIGHT']
    running_weight = weights.cumsum()
    running_weighted_income = (df_sorted['totalincome'] * weights).cumsum()
    running_avg = running_weighted_income / running_weight

    # Keep the leading run of rows that satisfy the target. A NaN mean
    # (missing value or zero cumulative weight) compares False and ends the run.
    within_target = (running_avg <= target_avg).to_numpy()
    n_keep = len(within_target) if within_target.all() else int(within_target.argmin())

    return df_sorted.iloc[:n_keep].copy()


# Pick the low-income subset with the default target average
subset_df = select_households_below_weighted_avg(df)

# Both gaps subtract 30% of total income from a rent benchmark:
#   estgap uses 100% of annual market rent, ntgap uses 80% of it
income_share_30 = 0.3 * subset_df['totalincome']
subset_df['estgap'] = 12 * subset_df['mmr'] - income_share_30
subset_df['ntgap'] = 0.8 * 12 * subset_df['mmr'] - income_share_30

# Report the WEIGHT-weighted mean income of the subset, then save it
weighted_income_total = (subset_df['totalincome'] * subset_df['WEIGHT']).sum()
weighted_avg = weighted_income_total / subset_df['WEIGHT'].sum()
print(f"Weighted average income: ${weighted_avg:,.2f}")
subset_df.to_csv("../Microsimulations/with_chn/subset_2024.csv", index=False)


Weighted average income: $37,548.84


In [None]:
import pandas as pd
import numpy as np

# Load the previously saved subset
subset_df = pd.read_csv("../Microsimulations/with_chn/subset_2022.csv")

# Shuffle with a fixed seed and renumber the rows
shuffled_df = subset_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Walk the shuffled rows until cumulative WEIGHT is within 55606 ± 500
target_weight = 55606
tolerance = 500

selected_rows = []
total_weight = 0

for _, household in shuffled_df.iterrows():
    household_weight = household['WEIGHT']
    # Skip any row that would overshoot the upper bound and keep looking
    if total_weight + household_weight > target_weight + tolerance:
        continue
    selected_rows.append(household)
    total_weight += household_weight
    # Stop as soon as the lower bound is reached
    if total_weight >= target_weight - tolerance:
        break

# Convert to DataFrame
sample_df = pd.DataFrame(selected_rows)

# Weighted count of sampled rows with estgap > 0
has_positive_estgap = sample_df['estgap'] > 0
estgap_positive_weighted = sample_df.loc[has_positive_estgap, 'WEIGHT'].sum()

# Weighted count of sampled rows with ntgap < 0
has_negative_ntgap = sample_df['ntgap'] < 0
ntgap_negative_weighted = sample_df.loc[has_negative_ntgap, 'WEIGHT'].sum()

print(f"✅ Total weighted sample: {total_weight:,.0f}")
print(f"🏠 Weighted count (estgap > 0): {estgap_positive_weighted:,.0f}")
print(f"🏠 Weighted count (ntgap < 0): {ntgap_negative_weighted:,.0f}")


# Save to CSV
sample_df.to_csv("../Microsimulations/with_chn/sample_subset_2022.csv", index=False)

✅ Total weighted sample: 55,181
🏠 Weighted count (estgap > 0): 55,181
🏠 Weighted count (ntgap < 0): 0


In [None]:
import pandas as pd
import numpy as np

# Load your subset
df = pd.read_csv("../Microsimulations/with_chn/subset_2024.csv")
if df.empty:
    raise ValueError("subset_2024.csv has no rows; nothing to sample")

# Target constraints
target_weight = 514
target_avg_cohb = 6154
tolerance = 50  # allowable ± range for avg COHB
max_attempts = 10000
base_seed = 42  # attempt i shuffles with seed base_seed + i, so reruns repeat exactly

best_match = None
best_avg_cohb = None
closest_diff = float('inf')

for i in range(max_attempts):
    # Random order for this attempt
    sample = df.sample(frac=1, replace=False, random_state=base_seed + i)
    selected_rows = []
    total_weight = 0
    total_weighted_cohb = 0

    # Take rows in shuffled order until the target weight is filled; the row
    # that would overshoot is included with its WEIGHT trimmed to the remainder.
    for _, row in sample.iterrows():
        weight = row['WEIGHT']
        cohb = row['cohb']

        if total_weight + weight > target_weight:
            remaining_weight = target_weight - total_weight
            total_weighted_cohb += cohb * remaining_weight
            row_copy = row.copy()
            row_copy['WEIGHT'] = remaining_weight
            selected_rows.append(row_copy)
            total_weight = target_weight
            break
        else:
            total_weight += weight
            total_weighted_cohb += cohb * weight
            selected_rows.append(row)

    if total_weight == 0:
        # No weight was collected, so there is no average to evaluate
        continue

    weighted_avg_cohb = total_weighted_cohb / total_weight
    diff = abs(weighted_avg_cohb - target_avg_cohb)

    if diff < closest_diff:
        closest_diff = diff
        best_avg_cohb = weighted_avg_cohb
        best_match = pd.DataFrame(selected_rows)

    if diff <= tolerance:
        print(f"✅ Match found on attempt {i+1}: Weighted avg COHB = ${weighted_avg_cohb:,.2f}")
        break

else:
    # Loop finished without a match: report the closest attempt, which is the
    # one saved below (not simply the last attempt tried).
    if best_avg_cohb is None:
        print(f"⚠️ No usable sample after {max_attempts:,} attempts")
    else:
        print(f"⚠️ Closest match after {max_attempts:,} attempts: Weighted avg COHB = ${best_avg_cohb:,.2f}")

# Save the best match
# NOTE(review): the input is subset_2024.csv but the output name says 2022 - confirm the intended year.
if best_match is not None:
    best_match.to_csv("../Microsimulations/with_chn/final_subset_2022_random_targeted.csv", index=False)


✅ Match found on attempt 54: Weighted avg COHB = $6,119.38
