In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Read the four Southern California input tables (paths are relative to the
# notebook's working directory).
housing, population, empl_wages, commuting_flows = map(
    pd.read_csv,
    (
        "datasets/socal_housing.csv",
        "datasets/socal_population.csv",
        "datasets/socal_empl_wages.csv",
        "datasets/socal_commuting_flows.csv",
    ),
)

In [9]:
# ------------------------------------------------------------
# Commuting probability matrix λ_ni
# ------------------------------------------------------------

# Shorten the column names, then parse the flow counts: commas and surrounding
# whitespace are stripped from the text form, and anything that still cannot
# be parsed as a number is set to 0.
flows = (
    commuting_flows
    .rename(columns={
        "Workers in Commuting Flow": "flow",
        "Residence_County": "origin",
        "Workplace_County": "dest",
    })
    .assign(
        flow=lambda df: pd.to_numeric(
            df["flow"].astype(str).str.replace(",", "", regex=False).str.strip(),
            errors="coerce",
        ).fillna(0)
    )
)

# ------------------------------------------------------------
# Origin-destination (OD) commuter count matrix F_ni
# ------------------------------------------------------------
# rows    = residence counties (n)
# columns = workplace counties (i)
# cell    = commuters living in n and working in i (0 where no flow is recorded)
F = pd.pivot_table(
    flows,
    values="flow",
    index="origin",
    columns="dest",
    aggfunc="sum",
    fill_value=0,
)

# ------------------------------------------------------------
# Row-normalize the counts into commuting probabilities λ_ni
# ------------------------------------------------------------
# Each row is divided by the total number of commuters originating in that
# county. A zero total is masked to NaN before dividing so such a row ends up
# as all zeros rather than raising or producing inf.
row_sums = F.sum(axis="columns")
lambda_mat = F.div(row_sums.where(row_sums != 0), axis="index").fillna(0)

lambda_mat

dest,Los Angeles,Orange,Riverside,San Bernardino,San Diego,Ventura
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Los Angeles,0.934049,0.041366,0.00354,0.012816,0.000996,0.007232
Orange,0.116475,0.856489,0.010473,0.008358,0.007815,0.000391
Riverside,0.052326,0.074656,0.713976,0.111907,0.046324,0.000811
San Bernardino,0.146244,0.041046,0.086331,0.722741,0.003075,0.000563
San Diego,0.003761,0.007503,0.004326,0.000748,0.983479,0.000183
Ventura,0.169889,0.001479,0.000454,0.001504,0.000855,0.825819


In [10]:
# ------------------------------------------------------------
# Workplace employment (L_i) and weekly wages (w_i), both indexed by county
# ------------------------------------------------------------
L, w = (
    empl_wages.set_index("County")[column]
    for column in ("Employment", "Weekly_Wage")
)

# ------------------------------------------------------------
# Average resident income by county: v̄_n = Σ_i λ_ni * w_i
# ------------------------------------------------------------

# Each residence county's income is the commuting-probability-weighted average
# of the wages paid in the counties where its residents work.
v_bar = lambda_mat.dot(w)

In [12]:
# ------------------------------------------------------------
# Resident population vector R_n
# ------------------------------------------------------------
R = population.set_index("County")["Total Population 24"]

# ------------------------------------------------------------
# Workplace employment implied by residents and commuting shares
# ------------------------------------------------------------
L_pred = lambda_mat.T.dot(R).rename("L_pred")

# ------------------------------------------------------------
# Rescale residents so totals line up with observed employment
# ------------------------------------------------------------

# The population and employment tables come from different sources, so a single
# scalar is applied to R. When every row of lambda_mat sums to 1 this gives
#   Σ_n R_use_n = Σ_i L_i
s = L.sum() / L_pred.sum()
R_use = (s * R).rename("R_use")

# ------------------------------------------------------------
# Model inversion: back out baseline residential amenities B_n
# ------------------------------------------------------------

# ε: migration dispersion parameter, calibrated to 5.
# TODO: estimate from data or run a grid of epsilon values.
# NOTE(review): with this convention (shares ∝ exp(u / ε) in
# implied_residents) a larger ε makes population LESS responsive to utility;
# confirm this is the intended parameterization.
eps = 5

# Residence-choice condition solved for amenities:
#   log B_n = ε * log R_n - log v̄_n + constant

# unnormalized log-amenity index
B_log_raw = np.log(R_use).mul(eps).sub(np.log(v_bar))

# pin down the constant by forcing mean log(B_n) = 0
B_log = B_log_raw - B_log_raw.mean()

# back to levels
B = np.exp(B_log).rename("B")

B.sort_values(ascending=False)


County
Los Angeles       493.684282
San Diego           2.142186
Orange              1.742816
Riverside           0.663178
San Bernardino      0.345290
Ventura             0.002369
Name: B, dtype: float64

In [14]:
# ------------------------------------------------------------
# Counterfactual setup: apply an amenity shock to Los Angeles and recompute
# the equilibrium population allocation
# ------------------------------------------------------------

# county that receives the shock
la = "Los Angeles"

# NOTE: eps is intentionally NOT re-assigned here. The counterfactual must use
# the same eps that was used to invert the model for B; with a different value
# the baseline allocation would no longer reproduce R_use. Change eps only in
# the amenity-inversion cell.

# multipliers applied to LA's amenity only (wildfire scenario);
# e.g. 0.95 corresponds to a 5% decline in amenities
shock_grid = [0.99, 0.95, 0.90]

# Total population is held fixed across counterfactuals, so shocks reallocate
# people across counties without changing the overall population size.
Rtot = R_use.sum()

# ------------------------------------------------------------
# Function to compute equilibrium resident population
# ------------------------------------------------------------
def implied_residents(B_series, vbar_series, eps, Rtot):
    """
    Allocate a fixed total population across counties from amenities and income.

    Each county n receives the share

        share_n = exp(u_n / eps) / Σ_m exp(u_m / eps),   u_n = log(B_n) + log(v̄_n)

    and the function returns ``share_n * Rtot``.

    Parameters:
    - B_series    : amenities by county (indexed by county)
    - vbar_series : average resident income by county (same index as B_series)
    - eps         : migration dispersion parameter; must be strictly positive
    - Rtot        : total population (fixed)

    Returns:
    - R_cf        : resident population by county; sums to Rtot when all
                    inputs are finite and positive

    Raises:
    - ValueError  : if eps is not strictly positive (eps = 0 would divide by
                    zero and silently yield inf/NaN populations)
    """
    if not eps > 0:
        raise ValueError(f"eps must be strictly positive, got {eps!r}")

    # utility index for residence choice: u_n = log(B_n) + log(v̄_n)
    u = np.log(B_series) + np.log(vbar_series)

    # Shares are unchanged by adding a constant to every u_n, so shift by the
    # maximum first: the largest exponent becomes 0 and exp() cannot overflow
    # to inf (which would turn the shares into NaN).
    s = np.exp((u - u.max()) / eps)

    # normalize to population shares, then scale to levels
    shares = s / s.sum()
    return shares * Rtot


In [16]:
# ------------------------------------------------------------
# Baseline vs. counterfactual Los Angeles population for each amenity shock
# ------------------------------------------------------------

# allocation implied by the unshocked amenities
R_base_hat = implied_residents(B, v_bar, eps, Rtot)

results = []
for bhat in shock_grid:
    # scale only the Los Angeles amenity; other counties keep their baseline B
    B_cf = B.copy()
    B_cf.loc[la] *= bhat

    # re-solve the population allocation under the shocked amenities
    R_cf = implied_residents(B_cf, v_bar, eps, Rtot)

    # one summary row per shock size
    results.append({
        "Amenity_Shock_LA": bhat,
        "LA_Pop_Baseline": R_base_hat.loc[la],
        "LA_Pop_Counterfactual": R_cf.loc[la],
        "Pct_Change_LA_Pop": (R_cf.loc[la] / R_base_hat.loc[la] - 1) * 100,
        "Change_LA_Pop": R_cf.loc[la] - R_base_hat.loc[la],
    })

# Presentation formatting: population levels and changes as whole persons,
# percentage change to two decimals.
out = (
    pd.DataFrame(results)
    .round({
        "LA_Pop_Baseline": 0,
        "LA_Pop_Counterfactual": 0,
        "Pct_Change_LA_Pop": 2,
        "Change_LA_Pop": 0,
    })
    .astype({
        "LA_Pop_Baseline": int,
        "LA_Pop_Counterfactual": int,
        "Change_LA_Pop": int,
    })
)


In [17]:
# Show the formatted counterfactual summary table as this cell's output.
out

Unnamed: 0,Amenity_Shock_LA,LA_Pop_Baseline,LA_Pop_Counterfactual,Pct_Change_LA_Pop,Change_LA_Pop
0,0.99,4392231,4387382,-0.11,-4849
1,0.95,4392231,4367492,-0.56,-24739
2,0.9,4392231,4341443,-1.16,-50788
