In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
# import chromatose as ct

import geopandas as gpd
from geopy.distance import geodesic
from itertools import product
import json

from IPython.display import SVG
import scipy.special

  from pandas.core import (


### Renaming original columns

In [19]:
filepath = 'datasets'

# Read the four raw county-level tables from `filepath`, in a fixed order
# that matches the names on the left-hand side.
df_housing, df_pop, df_wages, df_commuting = (
    pd.read_csv(os.path.join(filepath, csv_name))
    for csv_name in (
        "socal_housing.csv",
        "socal_population.csv",
        "socal_empl_wages.csv",
        'socal_commuting_flows.csv',
    )
)

In [20]:
# Original header -> snake_case column name, one mapping per raw table.
pop_columns = {
    'County': 'county',
    'Total Population 24': 'total_population_2024',
    'Total Population 25': 'total_population_2025',
    'Percent Change': 'percent_change',
}
housing_columns = {
    'County': 'county',
    'Total Housing 24': 'total_housing_2024',
    'Total Housing 25': 'total_housing_2025',
    'Percent Change': 'percent_change',
}
wages_columns = {
    'County': 'county',
    'Num_Establishments': 'n_establishments',
    'Employment': 'employment',
    'Annual_Wages': 'wages_annual',
    'Weekly_Wage': 'wages_weekly',
    'Wage_per_Employee': 'wage_per_employee',
}
commuting_columns = {
    'State Name': 'state',
    'Residence_County': 'county_residence',
    'Workplace_County': 'county_workplace',
    'Workers in Commuting Flow': 'n_workers',
    'Margin of Error': 'err_workers',
}

# rename() ignores mapping keys that are absent, so this is safe to re-run.
df_pop = df_pop.rename(columns=pop_columns)
df_housing = df_housing.rename(columns=housing_columns)
df_wages = df_wages.rename(columns=wages_columns)
df_commuting = df_commuting.rename(columns=commuting_columns)

# Strip thousands separators (e.g. "1,234") and convert the worker counts to int.
# Casting through str first keeps this line re-runnable: once the column is
# already integer, a bare `.str` accessor would raise AttributeError.
df_commuting['n_workers'] = (
    df_commuting['n_workers']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

### Construct modeling dataframes $d_{ij}$, $d_{i}$, $d_{j}$

In [21]:
# Display the county names; the printed array was copied and pasted into the
# analysis notebook (keep kernels separate).
df_pop.loc[:, 'county'].values

array(['Los Angeles', 'Orange', 'Riverside', 'San Bernardino',
       'San Diego', 'Ventura'], dtype=object)

In [22]:
counties = df_pop['county'].values

# Two-way lookup between a county's position in `counties` and its name.
d_index_county = dict(enumerate(counties))
d_county_index = {name: idx for idx, name in d_index_county.items()}

# One row per county, carrying both the name and the integer index
# under the i (row) and j (column) labels.
df_model = pd.DataFrame({
    'county_i_name': counties,
    'county_j_name': counties,
}).assign(
    county_i=lambda d: d['county_i_name'].map(d_county_index),
    county_j=lambda d: d['county_j_name'].map(d_county_index),
)

In [24]:
# ----------------- ij ----------------------
# Pairwise commuting flows: i is the residence county, j is the workplace county.
# NOTE(review): a county name missing from d_county_index maps to NaN here — confirm
# the commuting table only contains the counties listed in df_pop.
df_model_ij = (
    df_commuting
    .assign(
        county_i_name=df_commuting['county_residence'],
        county_j_name=df_commuting['county_workplace'],
        L_ij_data=df_commuting['n_workers'],
    )
    .assign(
        county_i=lambda d: d['county_i_name'].map(d_county_index),
        county_j=lambda d: d['county_j_name'].map(d_county_index),
    )
    [['county_i', 'county_j', 'county_i_name', 'county_j_name', 'L_ij_data']]
)

# ----------------- j ----------------------
# Workplace-county (j) quantities taken from the wages table.
df_model_j = (
    df_wages
    .assign(
        county_j_name=df_wages['county'],
        county_j=df_wages['county'].map(d_county_index),
        w_j_emp_data=df_wages['wage_per_employee'],
        w_j_county_data=df_wages['wages_annual'],
        L_j_data=df_wages['employment'],
        _n_firms_j_data=df_wages['n_establishments'],
    )
    [[
        'county_j_name', 'county_j',
        'w_j_emp_data', 'w_j_county_data',
        'L_j_data', '_n_firms_j_data',
    ]]
)

# ----------------- i ----------------------
# Residence-county (i) quantities: housing, population-derived N_i and summed L_i.

# Adjust population and households
# https://census-charts.com/HF/California.html
# presumably average persons per household for each county — verify against the source
d_county_household = {
    'Los Angeles': 2.98,
    'Orange': 3.00,
    'Riverside': 2.98,
    'San Bernardino': 3.15,
    'San Diego': 2.73,
    'Ventura': 3.04
}

housing_pop = df_housing.merge(df_pop, on='county')
avg_per_household = housing_pop['county'].map(d_county_household)

# summing Li for sanity check later - workers who live in county i
L_i_by_residence = (
    df_model_ij
    .groupby('county_i')['L_ij_data']
    .sum()
    .reset_index(name='L_i_data')
)

# N_i divides population by the per-household figure above; the left merge
# keeps one row per county in the order of the housing/population table.
df_model_i = pd.DataFrame({
    'county_i_name': housing_pop['county'],
    'county_i': housing_pop['county'].map(d_county_index),
    'H_i_2024_data': housing_pop['total_housing_2024'],
    'H_i_2025_data': housing_pop['total_housing_2025'],
    'N_i_2024_data': housing_pop['total_population_2024'] / avg_per_household,
    'N_i_2025_data': housing_pop['total_population_2025'] / avg_per_household,
}).merge(L_i_by_residence, how='left', on='county_i')

In [25]:
# Write the three modeling tables to processed_data/.
# Create the folder first: on a fresh checkout (Restart & Run All) this is the
# first cell that writes there, and to_csv does not create missing directories.
os.makedirs("processed_data", exist_ok=True)

df_model_ij.to_csv("processed_data/DF_MODEL_IJ.csv", index=False)
df_model_i.to_csv("processed_data/DF_MODEL_I.csv", index=False)
df_model_j.to_csv("processed_data/DF_MODEL_J.csv", index=False)

In [26]:
# "Annual\nEstablishments": "Num_Establishments", 
# "Annual\nAverage\nEmployment": "Employment",
# "Total\nAnnual\nWages": "Annual_Wages",
# "Annual\nAverage\nWeekly Wage": "Weekly_Wage",
# "Annual\nWages per\nEmployee": "Wage_per_Employee",

# Identity relating the wage columns, first in data terms, then in model notation:
# wages per employee * employment = wages_annual
# w_j_per_employee * L_j = w_j_total_county

### Process shapefiles into geojsons

In [7]:
# KEEP THIS CELL - It's used for conversion (perhaps move to data cleaning)
# --------- RETRIEVING .GEOJSON FROM .SHP -------
# Each shapefile in `shapefiles` is written to processed_data/ under the
# name at the same position in `name_shapefiles`.
# NOTE(review): the inputs are tl_2025 files but the output names say "2020" —
# names are kept as-is because later cells read them; confirm the intended vintage.
shapefiles = [
    'raw_data/tl_2025_06_tract/tl_2025_06_tract.shp',
    "raw_data/tl_2025_us_county/tl_2025_us_county.shp",
    "raw_data/tl_2025_06_tract_simplify/tl_2025_06_tract.shp",
    'raw_data/Palisades_Perimeter_20250121_simplify/Palisades_Perimeter_20250121.shp',
    'raw_data/Eaton_Perimeter_20250121_simplify/Eaton_Perimeter_20250121.shp'
]
name_shapefiles = [
    "california_tracts_2020.geojson",
    "california_counties_2020.geojson",
    "california_tracts_2020_simplify.geojson",
    'palisades_fire_perimeter.geojson',
    'eaton_fire_perimeter.geojson',
]
# zip() silently truncates to the shorter list, so check the pairing up front.
assert len(shapefiles) == len(name_shapefiles), "each shapefile needs an output name"

# The output folder may not exist yet on a fresh checkout.
os.makedirs('processed_data', exist_ok=True)

for shapefile_path, name_shapefile in zip(shapefiles, name_shapefiles):
    gdf = gpd.read_file(shapefile_path)
    print(gdf.crs)
    if gdf.crs is None:
        # The layer carries no CRS information, so the GeoJSON is written
        # without one; flag it so the coordinates get checked downstream.
        print("WARNING: no CRS found for", shapefile_path)

    # Export to GeoJSON
    geojson_path = os.path.join('processed_data', name_shapefile)
    gdf.to_file(geojson_path, driver="GeoJSON")

    print("Saved GeoJSON:", geojson_path)

EPSG:4269
Saved GeoJSON: processed_data/california_tracts_2020.geojson
EPSG:4326
Saved GeoJSON: processed_data/california_counties_2020.geojson
EPSG:4326
Saved GeoJSON: processed_data/california_tracts_2020_simplify.geojson
None
Saved GeoJSON: processed_data/palisades_fire_perimeter.geojson
None
Saved GeoJSON: processed_data/eaton_fire_perimeter.geojson


### Computing county centroids as the (unweighted) mean of census tract centroids

In [8]:
# County name -> county FIPS code (kept as a zero-padded string),
# listed in the same order as the county names elsewhere in the notebook.
d_county_fips = dict([
    ('Los Angeles', '037'),
    ('Orange', '059'),
    ('Riverside', '065'),
    ('San Bernardino', '071'),
    ('San Diego', '073'),
    ('Ventura', '111'),
])

# Reverse lookup: FIPS code -> county name.
d_fips_county = dict(zip(d_county_fips.values(), d_county_fips.keys()))

In [9]:
state_tracts = gpd.read_file('processed_data/california_tracts_2020.geojson')

# ---------------------
# Compute one centroid per county from the tract geometries in state_tracts.
rows = []

# Reproject before taking centroids so they are computed in the projected CRS;
# centroid_x / centroid_y written below are therefore EPSG:3310 coordinates.
projected_crs = "EPSG:3310"
state_tracts = state_tracts.to_crs(projected_crs)

for county_name, fips in d_county_fips.items():
    county_tracts = state_tracts[state_tracts['COUNTYFP'] == fips]
    # Fail loudly instead of writing a NaN centroid if a FIPS code matches nothing.
    assert not county_tracts.empty, f"no tracts found for county {county_name} (FIPS {fips})"

    centroids = county_tracts.geometry.centroid

    # County centroid = unweighted mean of its tract centroids. Averaging the
    # coordinates directly gives the same point as the centroid of the merged
    # point set, without relying on the deprecated GeoSeries.unary_union.
    rows.append({
        "county_name": county_name,
        "fips": fips,
        "centroid_x": centroids.x.mean(),
        "centroid_y": centroids.y.mean()
    })

df_centroid = pd.DataFrame(rows)
df_centroid.to_csv('processed_data/DF_CENTROID.csv', index=False)

In [None]:
# Turn the centroid coordinates into point geometries, declared in the CRS
# given here (EPSG:3310).
centroid_points = gpd.points_from_xy(
    df_centroid["centroid_x"], df_centroid["centroid_y"]
)
gdf_centroid = gpd.GeoDataFrame(df_centroid, geometry=centroid_points, crs="EPSG:3310")

# Reproject to EPSG:4326 and expose the point x / y as lon / lat columns.
gdf_centroid = gdf_centroid.to_crs(epsg=4326)
gdf_centroid["lon"], gdf_centroid["lat"] = (
    gdf_centroid.geometry.x,
    gdf_centroid.geometry.y,
)

# Save gdf_centroid
gdf_centroid.to_file("processed_data/GDF_CENTROID.geojson", driver="GeoJSON")