In [190]:
#!pip install pandas
#!pip install numpy
#!pip install rapidfuzz

In [191]:
import pandas as pd
import numpy as np

pd.set_option('display.float_format', '{:.2f}'.format)

Census data sourced from this site
https://www.census.gov/data/tables/time-series/demo/popest/2020s-metro-and-micro-statistical-areas-detail.html
MSA data dictionary available here
https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2020-2024/CBSA-EST2024-ALLDATA-CHAR.pdf

In [192]:
df_sub = pd.read_csv('data/sub-est2024.csv') # for cities proper

In [193]:
df_msa = pd.read_csv('data/cbsa-est2024-alldata-char.csv') # for msas

df_msa = df_msa[df_msa['SUMLEV'] != 314]

df_sf = df_msa[(df_msa['NAME'].str.contains('San Francisco')) & (df_msa['AGEGRP'] == 0) & (df_msa['YEAR'] == 6)].copy()
df_over5 = df_msa[(df_msa['TOT_POP'] > 5000000) & (df_msa['AGEGRP'] == 0) & (df_msa['YEAR'] == 6)].copy()

df_sf_5yr = df_msa[(df_msa['NAME'].str.contains('San Francisco')) & (df_msa['AGEGRP'] == 0) & (df_msa['YEAR'] == 1)]
df_over5_5yr = df_msa[(df_msa['AGEGRP'] == 0) & (df_msa['YEAR'] == 1)] # No population requirement here because some cities grew bigger

df_sf['4 year population change'] = df_sf['TOT_POP'].values- df_sf_5yr['TOT_POP'].values
df_sf['4 year population change pct'] = (df_sf['TOT_POP'].values/df_sf_5yr['TOT_POP'].values - 1)


df_over5 = df_over5.merge(
    df_over5_5yr[['CBSA', 'TOT_POP']],
    on='CBSA',
    suffixes=('', '_2020')
)

# Calculate changes
df_over5['4 year population change']      = df_over5['TOT_POP'] - df_over5['TOT_POP_2020']
df_over5['4 year population change pct']  = (df_over5['TOT_POP'] / df_over5['TOT_POP_2020'] - 1)

# Clean up
df_over5 = df_over5.drop(columns=['TOT_POP_2020'])

In [194]:
msa_sample = pd.concat([df_sf, df_over5])
msa_sample

Unnamed: 0,SUMLEV,CBSA,MDIV,NAME,LSAD,YEAR,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,...,HBAC_MALE,HBAC_FEMALE,HIAC_MALE,HIAC_FEMALE,HAAC_MALE,HAAC_FEMALE,HNAC_MALE,HNAC_FEMALE,4 year population change,4 year population change pct
101,310,41860,,"San Francisco-Oakland-Fremont, CA",Metropolitan Statistical Area,6,0,4648486,2315027,2333459,...,27269,26652,38063,34766,31522,30685,7108,6636,-105169,-0.02
0,310,35620,,"New York-Newark-Jersey City, NY-NJ",Metropolitan Statistical Area,6,0,19940274,9722142,10218132,...,459029,496725,138250,135405,43772,46034,17057,17528,-143138,-0.01
1,310,31080,,"Los Angeles-Long Beach-Anaheim, CA",Metropolitan Statistical Area,6,0,12927614,6394633,6532981,...,79776,81628,117903,110602,71071,71676,15922,15764,-277043,-0.02
2,310,16980,,"Chicago-Naperville-Elgin, IL-IN",Metropolitan Statistical Area,6,0,9408576,4631974,4776602,...,50060,51917,43131,40243,15126,14989,3370,3346,-45787,-0.0
3,310,19100,,"Dallas-Fort Worth-Arlington, TX",Metropolitan Statistical Area,6,0,8344032,4134816,4209216,...,45334,47329,39539,36865,14937,14999,3797,3749,705774,0.09
4,310,26420,,"Houston-Pasadena-The Woodlands, TX",Metropolitan Statistical Area,6,0,7796182,3867729,3928453,...,58662,60572,49896,45614,16992,16823,4639,4231,645949,0.09
5,310,33100,,"Miami-Fort Lauderdale-West Palm Beach, FL",Metropolitan Statistical Area,6,0,6457988,3170380,3287608,...,81963,82236,17853,15997,10341,10733,3120,3183,319631,0.05
6,310,47900,,"Washington-Arlington-Alexandria, DC-VA-MD-WV",Metropolitan Statistical Area,6,0,6436489,3163503,3272986,...,62665,63118,36167,33115,15028,15387,4546,4556,157894,0.03
7,310,12060,,"Atlanta-Sandy Springs-Roswell, GA",Metropolitan Statistical Area,6,0,6411149,3109052,3302097,...,49592,53941,22126,19041,6630,7052,2241,2352,304344,0.05
8,310,37980,,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Metropolitan Statistical Area,6,0,6330422,3074073,3256349,...,74130,78597,20733,20849,7704,7618,3520,3847,85405,0.01


In [195]:
msa_sample = msa_sample.drop(['SUMLEV', 'CBSA', 'MDIV','YEAR', 'AGEGRP','WA_MALE', 'WA_FEMALE', 'BA_MALE',
       'BA_FEMALE', 'IA_MALE', 'IA_FEMALE', 'AA_MALE', 'AA_FEMALE', 'NA_MALE',
       'NA_FEMALE', 'TOM_MALE', 'TOM_FEMALE', 'WAC_MALE', 'WAC_FEMALE',
       'BAC_MALE', 'BAC_FEMALE', 'IAC_MALE', 'IAC_FEMALE', 'AAC_MALE',
       'AAC_FEMALE', 'NAC_MALE', 'NAC_FEMALE', 'NH_MALE', 'NH_FEMALE',
       'NHWA_MALE', 'NHWA_FEMALE', 'NHBA_MALE', 'NHBA_FEMALE', 'NHIA_MALE',
       'NHIA_FEMALE', 'NHAA_MALE', 'NHAA_FEMALE', 'NHNA_MALE', 'NHNA_FEMALE',
       'NHTOM_MALE', 'NHTOM_FEMALE', 'NHWAC_MALE', 'NHWAC_FEMALE',
       'NHBAC_MALE', 'NHBAC_FEMALE', 'NHIAC_MALE', 'NHIAC_FEMALE',
       'NHAAC_MALE', 'NHAAC_FEMALE', 'NHNAC_MALE', 'NHNAC_FEMALE', 'H_MALE',
       'H_FEMALE', 'HWA_MALE', 'HWA_FEMALE', 'HBA_MALE', 'HBA_FEMALE',
       'HIA_MALE', 'HIA_FEMALE', 'HAA_MALE', 'HAA_FEMALE', 'HNA_MALE',
       'HNA_FEMALE', 'HTOM_MALE', 'HTOM_FEMALE', 'HWAC_MALE', 'HWAC_FEMALE',
       'HBAC_MALE', 'HBAC_FEMALE', 'HIAC_MALE', 'HIAC_FEMALE', 'HAAC_MALE',
       'HAAC_FEMALE', 'HNAC_MALE', 'HNAC_FEMALE'], axis = 1)
msa_sample = msa_sample.reset_index(drop=True)

In [196]:
msa_first_city_state = pd.Series([
    'San Francisco, CA',
 'New York, NY',
 'Los Angeles, CA',
 'Chicago, IL',
 'Dallas, TX',
 'Houston, TX',
 'Miami, FL',
 'Washington, DC',
 'Atlanta, GA',
 'Philadelphia, PA',
 'Phoenix, AZ',
 'Boston, MA'
])
msa_sample['common_name'] = msa_first_city_state

In [197]:
msa_sample.head(24)

Unnamed: 0,NAME,LSAD,TOT_POP,TOT_MALE,TOT_FEMALE,4 year population change,4 year population change pct,common_name
0,"San Francisco-Oakland-Fremont, CA",Metropolitan Statistical Area,4648486,2315027,2333459,-105169,-0.02,"San Francisco, CA"
1,"New York-Newark-Jersey City, NY-NJ",Metropolitan Statistical Area,19940274,9722142,10218132,-143138,-0.01,"New York, NY"
2,"Los Angeles-Long Beach-Anaheim, CA",Metropolitan Statistical Area,12927614,6394633,6532981,-277043,-0.02,"Los Angeles, CA"
3,"Chicago-Naperville-Elgin, IL-IN",Metropolitan Statistical Area,9408576,4631974,4776602,-45787,-0.0,"Chicago, IL"
4,"Dallas-Fort Worth-Arlington, TX",Metropolitan Statistical Area,8344032,4134816,4209216,705774,0.09,"Dallas, TX"
5,"Houston-Pasadena-The Woodlands, TX",Metropolitan Statistical Area,7796182,3867729,3928453,645949,0.09,"Houston, TX"
6,"Miami-Fort Lauderdale-West Palm Beach, FL",Metropolitan Statistical Area,6457988,3170380,3287608,319631,0.05,"Miami, FL"
7,"Washington-Arlington-Alexandria, DC-VA-MD-WV",Metropolitan Statistical Area,6436489,3163503,3272986,157894,0.03,"Washington, DC"
8,"Atlanta-Sandy Springs-Roswell, GA",Metropolitan Statistical Area,6411149,3109052,3302097,304344,0.05,"Atlanta, GA"
9,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Metropolitan Statistical Area,6330422,3074073,3256349,85405,0.01,"Philadelphia, PA"


Zillow housing data sourced from https://www.zillow.com/research/data/

In [198]:
# Mean and Median sale prices
df_mean_sale = pd.read_csv('data/Metro_mean_sale_price_now_uc_sfrcondo_month.csv')
df_mean_sale = df_mean_sale[df_mean_sale['RegionType'] == 'msa']
df_mean_sale['5yr_change_pct'] = df_mean_sale['2025-10-31'].div(df_mean_sale['2020-10-31']).sub(1)
df_mean_sale = df_mean_sale[['RegionName', '2025-10-31', '5yr_change_pct']]
df_mean_sale.columns = ['MSA', 'Mean Sale Price', 'Mean Sale Price 5 yr Percent Change']

df_median_sale = pd.read_csv('data/Metro_median_sale_price_now_uc_sfrcondo_month.csv')
df_median_sale = df_median_sale[df_median_sale['RegionType'] == 'msa']
df_median_sale['5yr_change_pct'] = df_median_sale['2025-10-31'].div(df_median_sale['2020-10-31']).sub(1)
df_median_sale = df_median_sale[['RegionName', '2025-10-31', '5yr_change_pct']]
df_median_sale.columns = ['MSA', 'Median Sale Price', 'Median Sale Price 5 yr Percent Change']

# Tier-specific ZHVI data (top, mid, low)
df_zhvi_top = pd.read_csv('data/Metro_zhvi_uc_sfrcondo_tier_0.67_1.0_sm_sa_month.csv')
df_zhvi_top = df_zhvi_top[df_zhvi_top['RegionType'] == 'msa']
df_zhvi_top = df_zhvi_top[['RegionName', '2025-10-31', '2020-10-31']]
df_zhvi_top['5yr_change_pct'] = df_zhvi_top['2025-10-31'].div(df_zhvi_top['2020-10-31']).sub(1)
df_zhvi_top = df_zhvi_top.drop(['2025-10-31'], axis=1)
df_zhvi_top.columns = ['MSA', 'Top Tier ZHVI', 'Top Tier ZHVI 5 yr Percent Change']

df_zhvi_mid = pd.read_csv('data/Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_month.csv')
df_zhvi_mid = df_zhvi_mid[df_zhvi_mid['RegionType'] == 'msa']
df_zhvi_mid = df_zhvi_mid[['RegionName', '2025-10-31', '2020-10-31']]
df_zhvi_mid['5yr_change_pct'] = df_zhvi_mid['2025-10-31'].div(df_zhvi_mid['2020-10-31']).sub(1)
df_zhvi_mid = df_zhvi_mid.drop(['2025-10-31'], axis=1)
df_zhvi_mid.columns = ['MSA', 'Mid Tier ZHVI', 'Mid Tier ZHVI 5 yr Percent Change']

df_zhvi_low = pd.read_csv('data/Metro_zhvi_uc_sfrcondo_tier_0.0_0.33_sm_sa_month.csv')
df_zhvi_low = df_zhvi_low[df_zhvi_low['RegionType'] == 'msa']
df_zhvi_low = df_zhvi_low[['RegionName', '2025-10-31', '2020-10-31']]
df_zhvi_low['5yr_change_pct'] = df_zhvi_low['2025-10-31'].div(df_zhvi_low['2020-10-31']).sub(1)
df_zhvi_low = df_zhvi_low.drop(['2025-10-31'], axis=1)
df_zhvi_low.columns = ['MSA', 'Bottom Tier ZHVI', 'Bottom Tier ZHVI 5 yr Percent Change']

df_new_homeowner_income_needed = pd.read_csv('data/Metro_new_homeowner_income_needed_downpayment_0.20_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
df_new_homeowner_income_needed = df_new_homeowner_income_needed[df_new_homeowner_income_needed['RegionType'] == 'msa']
df_new_homeowner_income_needed = df_new_homeowner_income_needed[['RegionName', '2025-10-31', '2020-10-31']]
df_new_homeowner_income_needed['5yr_change_pct'] = df_new_homeowner_income_needed['2025-10-31'].div(df_new_homeowner_income_needed['2020-10-31']).sub(1)
df_new_homeowner_income_needed = df_new_homeowner_income_needed.drop(['2025-10-31'], axis=1)
df_new_homeowner_income_needed.columns = ['MSA', 'Income Needed for Monthly Home Payment', 'Income Needed for Monthly Home Payment 5 yr Percent Change']

df_new_renter_income_needed = pd.read_csv('data/Metro_new_renter_income_needed_uc_sfrcondomfr_sm_sa_month.csv')
df_new_renter_income_needed = df_new_renter_income_needed[df_new_renter_income_needed['RegionType'] == 'msa']
df_new_renter_income_needed = df_new_renter_income_needed[['RegionName', '2025-10-31', '2020-10-31']]
df_new_renter_income_needed['5yr_change_pct'] = df_new_renter_income_needed['2025-10-31'].div(df_new_renter_income_needed['2020-10-31']).sub(1)
df_new_renter_income_needed = df_new_renter_income_needed.drop(['2025-10-31'], axis=1)
df_new_renter_income_needed.columns = ['MSA', 'Income Needed for Monthly Rent Payment', 'Income Needed for Monthly Rent Payment 5 yr Percent Change']

df_new_construction = pd.read_csv('data/Metro_new_con_sales_count_raw_uc_sfrcondo_month.csv')
df_new_construction = df_new_construction[df_new_construction['RegionType'] == 'msa']
df_new_construction = df_new_construction[['RegionName', '2025-09-30', '2020-09-30']]
df_new_construction['5yr_change_pct'] = df_new_construction['2025-09-30'].div(df_new_construction['2020-09-30']).sub(1)
df_new_construction = df_new_construction.drop(['2025-09-30'], axis=1)
df_new_construction.columns = ['MSA', 'New Construction Sale Counts (Monthly)', 'New Construction Sale Counts (Monthly) 5 yr Percent Change']

df_days_pending = pd.read_csv('data/Metro_mean_doz_pending_uc_sfrcondo_sm_month.csv')
df_days_pending = df_days_pending[df_days_pending['RegionType'] == 'msa']
df_days_pending = df_days_pending[['RegionName', '2025-10-31', '2020-10-31']]
df_days_pending['5yr_change_pct'] = df_days_pending['2025-10-31'].div(df_days_pending['2020-10-31']).sub(1)
df_days_pending = df_days_pending.drop(['2025-10-31'], axis=1)
df_days_pending.columns = ['MSA', 'Mean Days to Pending', 'Mean Days to Pending 5 yr Percent Change']

df_price_cut = pd.read_csv('data/Metro_perc_listings_price_cut_uc_sfrcondo_sm_month.csv')
df_price_cut = df_price_cut[df_price_cut['RegionType'] == 'msa']
df_price_cut = df_price_cut[['RegionName', '2025-10-31', '2020-10-31']]
df_price_cut['5yr_change_pct'] = df_price_cut['2025-10-31'].div(df_price_cut['2020-10-31']).sub(1)
df_price_cut = df_price_cut.drop(['2025-10-31'], axis=1)
df_price_cut.columns = ['MSA', 'Share of Listings with Price Cut', 'Share of Listings with Price Cut 5 yr Percent Change']

df_market_heat = pd.read_csv('data/Metro_market_temp_index_uc_sfrcondo_month.csv')
df_market_heat = df_market_heat[df_market_heat['RegionType'] == 'msa']
df_market_heat = df_market_heat[['RegionName', '2025-10-31', '2020-10-31']]
df_market_heat['5yr_change_pct'] = df_market_heat['2025-10-31'].div(df_market_heat['2020-10-31']).sub(1)
df_market_heat = df_market_heat.drop(['2025-10-31'], axis=1)
df_market_heat.columns = ['MSA', 'Zillow Market Heat Index', 'Zillow Market Heat Index 5 yr Percent Change']

In [199]:
df_new_construction

Unnamed: 0,MSA,New Construction Sale Counts (Monthly),New Construction Sale Counts (Monthly) 5 yr Percent Change
1,"New York, NY",700.00,-0.46
2,"Los Angeles, CA",569.00,-0.58
3,"Chicago, IL",614.00,-0.10
4,"Dallas, TX",2168.00,-0.02
5,"Houston, TX",2435.00,-0.04
...,...,...,...
391,"Pella, IA",,
392,"Breckenridge, CO",,
393,"Sheridan, WY",,
394,"Mountain Home, ID",,


In [200]:
msa_sample = msa_sample.merge(df_mean_sale, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_median_sale, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_zhvi_top, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_zhvi_mid, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_zhvi_low, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_new_homeowner_income_needed, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_new_renter_income_needed, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_new_construction, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_days_pending, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_price_cut, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])
msa_sample = msa_sample.merge(df_market_heat, how='left', left_on='common_name', right_on='MSA').drop(columns=['MSA'])

In [201]:
df_zori = pd.read_csv('data/Metro_zori_uc_sfrcondomfr_sm_month.csv')
df_zori = df_zori[df_zori['RegionType'] == 'msa']
df_zori = df_zori[['RegionName', '2025-10-31', '2020-10-31']]
df_zori['5yr_change_pct'] = df_zori['2025-10-31'].div(df_zori['2020-10-31']).sub(1)
df_zori = df_zori.drop(['2025-10-31'], axis=1)
df_zori.columns = ['MSA', 'Zillow Observed Rent Index (ZORI)', 'ZORI 5 yr Percent Change']
df_zori

Unnamed: 0,MSA,Zillow Observed Rent Index (ZORI),ZORI 5 yr Percent Change
1,"New York, NY",2356.14,0.44
2,"Los Angeles, CA",2254.08,0.30
3,"Chicago, IL",1511.54,0.37
4,"Dallas, TX",1336.73,0.26
5,"Houston, TX",1347.95,0.22
...,...,...,...
702,"Sterling, CO",,
703,"Jamestown, ND",,
704,"Portales, NM",,
705,"Los Alamos, NM",,


In [202]:
msa_sample = msa_sample.merge(df_zori, how='left', left_on='common_name', right_on = 'MSA')

In [203]:
# Clean up duplicate columns from all merges
msa_sample = msa_sample.drop(['LSAD', 'common_name', 'MSA_x', 'MSA_y', 'MSA_x', 'MSA_y', 'MSA'], axis=1, errors='ignore')
msa_sample = msa_sample.drop([col for col in msa_sample.columns if col.endswith('_x') or col.endswith('_y')], axis=1, errors='ignore')

In [204]:
msa_sample

Unnamed: 0,NAME,TOT_POP,TOT_MALE,TOT_FEMALE,4 year population change,4 year population change pct,Mean Sale Price,Mean Sale Price 5 yr Percent Change,Median Sale Price,Median Sale Price 5 yr Percent Change,...,New Construction Sale Counts (Monthly),New Construction Sale Counts (Monthly) 5 yr Percent Change,Mean Days to Pending,Mean Days to Pending 5 yr Percent Change,Share of Listings with Price Cut,Share of Listings with Price Cut 5 yr Percent Change,Zillow Market Heat Index,Zillow Market Heat Index 5 yr Percent Change,Zillow Observed Rent Index (ZORI),ZORI 5 yr Percent Change
0,"San Francisco-Oakland-Fremont, CA",4648486,2315027,2333459,-105169,-0.02,1437249.0,0.21,1161375.0,0.22,...,303.0,-0.68,26.0,0.69,0.16,0.38,111.0,-0.41,2584.07,0.21
1,"New York-Newark-Jersey City, NY-NJ",19940274,9722142,10218132,-143138,-0.01,847341.0,0.44,656688.0,0.43,...,700.0,-0.46,66.0,-0.09,0.15,0.04,77.0,-0.16,2356.14,0.44
2,"Los Angeles-Long Beach-Anaheim, CA",12927614,6394633,6532981,-277043,-0.02,1250508.0,0.31,947186.0,0.3,...,569.0,-0.58,32.0,0.66,0.16,0.5,83.0,-0.31,2254.08,0.3
3,"Chicago-Naperville-Elgin, IL-IN",9408576,4631974,4776602,-45787,-0.0,411258.0,0.28,327317.0,0.28,...,614.0,-0.1,43.0,-0.23,0.26,0.13,58.0,-0.05,1511.54,0.37
4,"Dallas-Fort Worth-Arlington, TX",8344032,4134816,4209216,705774,0.09,491722.0,0.35,379187.0,0.29,...,2168.0,-0.02,44.0,0.55,0.18,0.85,70.0,-0.34,1336.73,0.26
5,"Houston-Pasadena-The Woodlands, TX",7796182,3867729,3928453,645949,0.09,432112.0,0.33,324778.0,0.27,...,2435.0,-0.04,50.0,0.54,0.22,0.32,59.0,-0.27,1347.95,0.22
6,"Miami-Fort Lauderdale-West Palm Beach, FL",6457988,3170380,3287608,319631,0.05,670086.0,0.54,467250.0,0.49,...,700.0,-0.67,79.0,0.32,0.13,0.54,47.0,-0.26,1752.49,0.52
7,"Washington-Arlington-Alexandria, DC-VA-MD-WV",6436489,3163503,3272986,157894,0.03,683727.0,0.3,556368.0,0.28,...,1265.0,-0.57,22.0,0.68,0.17,0.68,74.0,-0.23,1885.58,0.26
8,"Atlanta-Sandy Springs-Roswell, GA",6411149,3109052,3302097,304344,0.05,454408.0,0.42,355903.0,0.36,...,2888.0,-0.65,40.0,0.82,0.18,0.72,67.0,-0.37,1410.32,0.32
9,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",6330422,3074073,3256349,85405,0.01,423605.0,0.37,351701.0,0.35,...,677.0,-0.68,33.0,0.03,0.18,0.4,71.0,-0.27,1441.08,0.3


In [205]:
msa_sample.to_csv('data/us_city_affordability.csv',index=False)
msa_sample

Unnamed: 0,NAME,TOT_POP,TOT_MALE,TOT_FEMALE,4 year population change,4 year population change pct,Mean Sale Price,Mean Sale Price 5 yr Percent Change,Median Sale Price,Median Sale Price 5 yr Percent Change,...,New Construction Sale Counts (Monthly),New Construction Sale Counts (Monthly) 5 yr Percent Change,Mean Days to Pending,Mean Days to Pending 5 yr Percent Change,Share of Listings with Price Cut,Share of Listings with Price Cut 5 yr Percent Change,Zillow Market Heat Index,Zillow Market Heat Index 5 yr Percent Change,Zillow Observed Rent Index (ZORI),ZORI 5 yr Percent Change
0,"San Francisco-Oakland-Fremont, CA",4648486,2315027,2333459,-105169,-0.02,1437249.0,0.21,1161375.0,0.22,...,303.0,-0.68,26.0,0.69,0.16,0.38,111.0,-0.41,2584.07,0.21
1,"New York-Newark-Jersey City, NY-NJ",19940274,9722142,10218132,-143138,-0.01,847341.0,0.44,656688.0,0.43,...,700.0,-0.46,66.0,-0.09,0.15,0.04,77.0,-0.16,2356.14,0.44
2,"Los Angeles-Long Beach-Anaheim, CA",12927614,6394633,6532981,-277043,-0.02,1250508.0,0.31,947186.0,0.3,...,569.0,-0.58,32.0,0.66,0.16,0.5,83.0,-0.31,2254.08,0.3
3,"Chicago-Naperville-Elgin, IL-IN",9408576,4631974,4776602,-45787,-0.0,411258.0,0.28,327317.0,0.28,...,614.0,-0.1,43.0,-0.23,0.26,0.13,58.0,-0.05,1511.54,0.37
4,"Dallas-Fort Worth-Arlington, TX",8344032,4134816,4209216,705774,0.09,491722.0,0.35,379187.0,0.29,...,2168.0,-0.02,44.0,0.55,0.18,0.85,70.0,-0.34,1336.73,0.26
5,"Houston-Pasadena-The Woodlands, TX",7796182,3867729,3928453,645949,0.09,432112.0,0.33,324778.0,0.27,...,2435.0,-0.04,50.0,0.54,0.22,0.32,59.0,-0.27,1347.95,0.22
6,"Miami-Fort Lauderdale-West Palm Beach, FL",6457988,3170380,3287608,319631,0.05,670086.0,0.54,467250.0,0.49,...,700.0,-0.67,79.0,0.32,0.13,0.54,47.0,-0.26,1752.49,0.52
7,"Washington-Arlington-Alexandria, DC-VA-MD-WV",6436489,3163503,3272986,157894,0.03,683727.0,0.3,556368.0,0.28,...,1265.0,-0.57,22.0,0.68,0.17,0.68,74.0,-0.23,1885.58,0.26
8,"Atlanta-Sandy Springs-Roswell, GA",6411149,3109052,3302097,304344,0.05,454408.0,0.42,355903.0,0.36,...,2888.0,-0.65,40.0,0.82,0.18,0.72,67.0,-0.37,1410.32,0.32
9,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",6330422,3074073,3256349,85405,0.01,423605.0,0.37,351701.0,0.35,...,677.0,-0.68,33.0,0.03,0.18,0.4,71.0,-0.27,1441.08,0.3


# City-Level Analysis
Processing city-level census data with city-level Zillow housing data

Note that we use census sumlev = 162 for city proper.

Per https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2020-2024/SUB-EST2024.pdf SUMLEV 162 corresponds to an incorporated place and most of the entities we are interested in appear to hold this designation.

In [206]:
# Filter df_sub for cities proper (SUMLEV = 162 for cities)
df_cities = df_sub[df_sub['SUMLEV'] == 162].copy()

# Define the same subset of cities as used in MSA analysis
target_cities = [
    'San Francisco',
    'New York',
    'Los Angeles',
    'Chicago',
    'Dallas',
    'Houston',
    'Miami',
    'Washington',
    'Atlanta',
    'Philadelphia',
    'Phoenix',
    'Boston'
]

# Filter for target cities (remove 'city' suffix and clean names)
df_cities['CITY_NAME'] = df_cities['NAME'].str.replace(' city', '').str.replace(' (pt.)', '')
city_sample = df_cities[df_cities['CITY_NAME'].isin(target_cities)].copy()

# Calculate population changes
city_sample['4 year population change'] = city_sample['POPESTIMATE2024'] - city_sample['POPESTIMATE2020']
city_sample['4 year population change pct'] = (city_sample['POPESTIMATE2024'] - city_sample['POPESTIMATE2020']) / city_sample['POPESTIMATE2020']

# Create common names for merging with Zillow data
city_common_names = {
    'San Francisco': 'San Francisco, CA',
    'New York': 'New York, NY',
    'Los Angeles': 'Los Angeles, CA',
    'Chicago': 'Chicago, IL',
    'Dallas': 'Dallas, TX',
    'Houston': 'Houston, TX',
    'Miami': 'Miami, FL',
    'Washington': 'Washington, DC',
    'Atlanta': 'Atlanta, GA',
    'Philadelphia': 'Philadelphia, PA',
    'Phoenix': 'Phoenix, AZ',
    'Boston': 'Boston, MA'
}

city_sample['common_name'] = city_sample['CITY_NAME'].map(city_common_names)
city_sample = city_sample[city_sample['common_name'].notna()]
city_sample['TOT_POP'] = city_sample['POPESTIMATE2024']
# Add city and state columns for proper joining
city_sample['city'] = city_sample['CITY_NAME']
# Convert numeric state codes to state abbreviations to match Zillow data
state_abbr = {
    1: 'AL', 2: 'AK', 4: 'AZ', 5: 'AR', 6: 'CA', 8: 'CO', 9: 'CT', 10: 'DE', 11: 'DC', 12: 'FL',
    13: 'GA', 15: 'HI', 16: 'ID', 17: 'IL', 18: 'IN', 19: 'IA', 20: 'KS', 21: 'KY', 22: 'LA', 23: 'ME',
    24: 'MD', 25: 'MA', 26: 'MI', 27: 'MN', 28: 'MS', 29: 'MO', 30: 'MT', 31: 'NE', 32: 'NV', 33: 'NH',
    34: 'NJ', 35: 'NM', 36: 'NY', 37: 'NC', 38: 'ND', 39: 'OH', 40: 'OK', 41: 'OR', 42: 'PA', 44: 'RI',
    45: 'SC', 46: 'SD', 47: 'TN', 48: 'TX', 49: 'UT', 50: 'VT', 51: 'VA', 53: 'WA', 54: 'WV', 55: 'WI', 56: 'WY'
}
city_sample['state'] = city_sample['STATE'].map(state_abbr)
city_sample['state'] = city_sample['state'].str.upper()

In [207]:
df_cities

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023,POPESTIMATE2024,CITY_NAME
13,162,36,0,51000,0,0,0,A,New York city,New York,8805594,8740306,8453772,8356179,8390888,8478072,New York
33,162,6,0,44000,0,0,0,A,Los Angeles city,California,3899449,3896329,3830987,3832998,3847428,3878704,Los Angeles
45,162,17,0,14000,0,0,0,A,Chicago city,Illinois,2748331,2745196,2706324,2684076,2699144,2721308,Chicago
54,162,48,0,35000,0,0,0,A,Houston city,Texas,2300351,2298945,2291070,2314258,2346908,2390125,Houston
70,162,4,0,55000,0,0,0,A,Phoenix city,Arizona,1608415,1612593,1625132,1644798,1656231,1673164,Phoenix
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80584,162,16,0,85420,0,0,0,A,Warm River city,Idaho,1,1,1,1,1,1,Warm River
80662,162,8,0,12030,0,0,0,A,Carbonate town,Colorado,0,0,0,0,0,0,Carbonate town
80961,162,21,0,72138,0,0,0,A,South Park View city,Kentucky,0,0,0,0,0,0,South Park View
81037,162,29,0,16462,0,0,0,A,Corning town,Missouri,0,0,0,0,0,0,Corning town


In [208]:
city_sample = city_sample[city_sample['TOT_POP'] > 50000] # Get rid of Miami, UT and the like
city_sample

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,...,POPESTIMATE2022,POPESTIMATE2023,POPESTIMATE2024,CITY_NAME,4 year population change,4 year population change pct,common_name,TOT_POP,city,state
13,162,36,0,51000,0,0,0,A,New York city,New York,...,8356179,8390888,8478072,New York,-262234,-0.03,"New York, NY",8478072,New York,NY
33,162,6,0,44000,0,0,0,A,Los Angeles city,California,...,3832998,3847428,3878704,Los Angeles,-17625,-0.0,"Los Angeles, CA",3878704,Los Angeles,CA
45,162,17,0,14000,0,0,0,A,Chicago city,Illinois,...,2684076,2699144,2721308,Chicago,-23888,-0.01,"Chicago, IL",2721308,Chicago,IL
54,162,48,0,35000,0,0,0,A,Houston city,Texas,...,2314258,2346908,2390125,Houston,91180,0.04,"Houston, TX",2390125,Houston,TX
70,162,4,0,55000,0,0,0,A,Phoenix city,Arizona,...,1644798,1656231,1673164,Phoenix,60571,0.04,"Phoenix, AZ",1673164,Phoenix,AZ
79,162,42,0,60000,0,0,0,A,Philadelphia city,Pennsylvania,...,1570554,1563349,1573916,Philadelphia,-26872,-0.02,"Philadelphia, PA",1573916,Philadelphia,PA
97,162,48,0,19000,0,0,0,A,Dallas city,Texas,...,1303012,1317163,1326087,Dallas,22661,0.02,"Dallas, TX",1326087,Dallas,TX
174,162,6,0,67000,0,0,0,A,San Francisco city,California,...,814176,819151,827526,San Francisco,-47300,-0.05,"San Francisco, CA",827526,San Francisco,CA
217,162,11,0,50000,0,0,0,N,Washington city,District of Columbia,...,676725,687324,702250,Washington,31333,0.05,"Washington, DC",702250,Washington,DC
227,162,25,0,7000,0,0,0,A,Boston city,Massachusetts,...,660080,664603,673458,Boston,-2064,-0.0,"Boston, MA",673458,Boston,MA


In [210]:
df_zhvi_city = pd.read_csv('data/City_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
df_zhvi_city = df_zhvi_city[df_zhvi_city['RegionType'] == 'city']
df_zhvi_city = df_zhvi_city[['RegionName', 'State', '2025-10-31', '2020-10-31']]
df_zhvi_city['5yr_change_pct'] = df_zhvi_city['2025-10-31'].div(df_zhvi_city['2020-10-31']).sub(1)
df_zhvi_city = df_zhvi_city.drop(['2025-10-31'], axis=1)
df_zhvi_city.columns = ['city', 'state', 'Mid Tier ZHVI', 'Mid Tier ZHVI 5 yr Percent Change']

In [211]:
df_zori_city = pd.read_csv('data/City_zori_uc_sfrcondomfr_sm_month.csv')
df_zori_city = df_zori_city[df_zori_city['RegionType'] == 'city']
df_zori_city = df_zori_city[['RegionName', 'State', '2025-10-31', '2020-10-31']]
df_zori_city['5yr_change_pct'] = df_zori_city['2025-10-31'].div(df_zori_city['2020-10-31']).sub(1)
df_zori_city = df_zori_city.drop(['2025-10-31'], axis=1)
df_zori_city.columns = ['city', 'state', 'ZORI', 'ZORI 5 yr Percent Change']

Rent values aren't available for DC and Boston for some reason

In [212]:
df_zori_city[df_zori_city['city'] == 'Washington']

Unnamed: 0,city,state,ZORI,ZORI 5 yr Percent Change
27,Washington,DC,,
1231,Washington,PA,,
2155,Washington,UT,,
2238,Washington,MI,,
2530,Washington,MO,,
3160,Washington,NJ,,


In [213]:
df_zori_city[df_zori_city['city'] == 'Boston']

Unnamed: 0,city,state,ZORI,ZORI 5 yr Percent Change
29,Boston,MA,,


In [214]:
# Merge city sample with housing data using both city and state to avoid duplications
city_sample = city_sample.merge(df_zhvi_city, how='left', left_on=['city', 'state'], right_on=['city', 'state'])
city_sample = city_sample.merge(df_zori_city, how='left', left_on=['city', 'state'], right_on=['city', 'state'])

# Clean up duplicate columns
city_sample = city_sample.drop(['city_x', 'state_x', 'city_y', 'state_y'], axis=1, errors='ignore')
city_sample = city_sample.drop_duplicates(subset=['CITY_NAME'], keep='first')

In [215]:
city_sample['NAME'] = city_sample['NAME'].str.replace(' city', '', regex=True)

In [216]:
city_sample.columns

Index(['SUMLEV', 'STATE', 'COUNTY', 'PLACE', 'COUSUB', 'CONCIT',
       'PRIMGEO_FLAG', 'FUNCSTAT', 'NAME', 'STNAME', 'ESTIMATESBASE2020',
       'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022',
       'POPESTIMATE2023', 'POPESTIMATE2024', 'CITY_NAME',
       '4 year population change', '4 year population change pct',
       'common_name', 'TOT_POP', 'city', 'state', 'Mid Tier ZHVI',
       'Mid Tier ZHVI 5 yr Percent Change', 'ZORI',
       'ZORI 5 yr Percent Change'],
      dtype='object')

In [217]:
city_sample = city_sample[['NAME', 'TOT_POP', '4 year population change', '4 year population change pct', 
           'Mid Tier ZHVI',
       'Mid Tier ZHVI 5 yr Percent Change', 'ZORI',
       'ZORI 5 yr Percent Change']]
city_sample.to_csv('data/us_city_affordability_city_level.csv', index=False)


In [218]:
city_sample

Unnamed: 0,NAME,TOT_POP,4 year population change,4 year population change pct,Mid Tier ZHVI,Mid Tier ZHVI 5 yr Percent Change,ZORI,ZORI 5 yr Percent Change
0,New York,8478072,-262234,-0.03,727764.49,0.1,2604.13,0.47
1,Los Angeles,3878704,-17625,-0.0,745184.75,0.25,2269.16,0.24
2,Chicago,2721308,-23888,-0.01,263204.8,0.17,1612.48,0.36
3,Houston,2390125,91180,0.04,202767.93,0.29,1298.91,0.2
4,Phoenix,1673164,60571,0.04,295909.28,0.37,1240.82,0.28
5,Philadelphia,1573916,-26872,-0.02,192195.1,0.19,1383.74,0.24
6,Dallas,1326087,22661,0.02,234467.46,0.29,1296.69,0.23
7,San Francisco,827526,-47300,-0.05,1329621.15,-0.06,2897.42,0.27
8,Washington,702250,31333,0.05,605089.93,-0.04,,
9,Boston,673458,-2064,-0.0,676159.49,0.13,,
