# Compare simulated population to actual

See how well our population synthesis reproduces the actual population in the base/fitted case.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatch
import cenpy
import re

In [None]:
# Apply the 'asu-light' plotting style to every figure below.
# NOTE(review): this does not look like a style that ships with matplotlib — presumably
# a custom style sheet that has to be installed locally; confirm before a fresh run.
plt.style.use('asu-light')

In [None]:
# Base-scenario synthetic population: persons (spop) and households (shh).
# The two are joined on household_id a few cells further down.
spop = pd.read_csv('../model_inputs/base/persons.csv')
shh = pd.read_csv('../model_inputs/base/households.csv')

In [None]:
# give the TAZs back their tract IDs
tract_ids = pd.read_parquet('../la_abm/data/skim_tracts.parquet')
# validate='m:1' makes the merge fail loudly if one TAZ index maps to more than one
# tract row, which would otherwise silently duplicate households (and inflate every
# share computed below); this mirrors the validate= used on the person merge
shh = shh.merge(tract_ids, left_on='TAZ', right_on='idx', how='left', validate='m:1')
# a left join leaves geoid null for any TAZ that has no tract match
assert not shh.geoid.isnull().any(), 'some households have a TAZ with no matching tract'
# and county IDs: the first five characters of the tract geoid, the same codes that
# key the county_names lookup used for figure labels
shh['county'] = shh.geoid.str.slice(0, 5)

In [None]:
# Attach the household attributes (including geoid and county) to every person.
# validate="m:1" makes pandas raise if household_id is not unique in shh.
# NOTE(review): re-running this cell without reloading spop merges a second time and
# produces suffixed duplicate columns.
spop = spop.merge(shh, on="household_id", how="left", validate="m:1")

## First, distributions across the region

In [None]:
# cenpy handle for the 2017 ACS; every from_state() request below goes through it
acs = cenpy.products.ACS(year=2017)

In [None]:
# NOTE(review): bare expression — it only displays the function object and changes no
# state; looks like leftover exploration of the helper used in the cells below.
pd.api.types.is_numeric_dtype

In [None]:
# regional age distributions
# NOTE(review): `counties` is not referenced anywhere else in the visible notebook; the
# request below pulls every California county and later cells pick rows by county code.
counties = ['Los Angeles, CA', 'Orange, CA', 'Riverside, CA', 'San Bernardino, CA', 'Ventura, CA', 'Imperial, CA']

# all B01001_* variables (the sex-by-age columns folded down below) at county level,
# indexed by GEOID
acs_age = acs.from_state('CA', '^B01001_', level='county', return_geometry=False).set_index('GEOID')


In [None]:
# table total, captured before acs_age is rebound below
age_total = acs_age.B01001_001E


def _share_of_age_total(col):
    """Numeric columns become a fraction of B01001_001E; other columns pass through."""
    if pd.api.types.is_numeric_dtype(col):
        return col / age_total
    return col


acs_age = acs_age.apply(_share_of_age_total)

# swap variable codes for their human-readable labels where the ACS metadata has one
acs_age.columns = [
    acs.variables.loc[code, 'label'] if code in acs.variables.index else code
    for code in acs_age.columns
]

In [None]:
# fold categories down
# The detailed ACS age bands are collapsed to the five groups used for the synthetic
# population. One table drives both sexes, so the male and female columns are built
# from exactly the same bands and cannot drift apart.
age_groups = {
    'under_18': [
        'Under 5 years',
        '5 to 9 years',
        '10 to 14 years',
        '15 to 17 years',
    ],
    '18_34': [
        '18 and 19 years',
        '20 years',
        '21 years',
        '22 to 24 years',
        '25 to 29 years',
        '30 to 34 years',
    ],
    '35_49': [
        '35 to 39 years',
        '40 to 44 years',
        '45 to 49 years',
    ],
    '50_64': [
        '50 to 54 years',
        '55 to 59 years',
        '60 and 61 years',
        '62 to 64 years',
    ],
    '65_plus': [
        '65 and 66 years',
        '67 to 69 years',
        '70 to 74 years',
        '75 to 79 years',
        '80 to 84 years',
        '85 years and over',
    ],
}

# creates male_under_18 ... male_65_plus, then female_under_18 ... female_65_plus
for sex in ['Male', 'Female']:
    for group, bands in age_groups.items():
        detail_cols = [f'Estimate!!Total!!{sex}!!{band}' for band in bands]
        # selecting by label raises KeyError if the ACS labels ever change
        acs_age[f'{sex.lower()}_{group}'] = acs_age[detail_cols].sum(axis=1)



In [None]:
# derive an age distribution from the population
# closed-on-the-right bins: [0, 17], (17, 34], (34, 49], (49, 64], (64, 122]
age_edges = [0, 17, 34, 49, 64, 122]
age_labels = ['under_18', '18_34', '35_49', '50_64', '65_plus']
spop['census_age_cat'] = pd.cut(
    spop.age,
    bins=age_edges,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

In [None]:
# sanity check: the youngest and oldest age in each bin should match the bin label.
# Aggregations are named by string because passing the builtins min/max to agg() is
# deprecated in recent pandas; observed=False states the historical default for
# categorical groupers explicitly so the output does not change between versions.
spop.groupby('census_age_cat', observed=False).age.agg(['min', 'max'])

In [None]:
# recode sex to the labels used in the ACS-style column names; the mapping only covers
# codes 1 and 2, so any other value becomes NaN
spop['census_sex'] = spop.sex.map({1: 'male', 2: 'female'})

In [None]:
# make the table like the census one
# observed=False keeps every age category in every county (count 0 when empty). That is
# the historical pandas default for categorical groupers; spelling it out keeps the
# table shape stable when the default changes.
persons_per_county = spop.groupby('county').size()
spop_age = spop.groupby(['county', 'census_age_cat', 'census_sex'], observed=False).size()
# share of each county's synthetic persons in every age/sex cell
spop_age = spop_age / persons_per_county.reindex(spop_age.index, level='county')
# unstacking sex and then age leaves (sex, age) tuples as column labels
spop_age = spop_age.unstack().unstack()
spop_age.columns = [f'{sex}_{age}' for sex, age in spop_age.columns]
# express as percentages
spop_age *= 100

In [None]:
# Convert the ACS shares from fractions to percentages, to match spop_age.
# Only numeric columns are scaled: multiplying the whole frame would also "multiply"
# any non-numeric columns (the is_numeric_dtype guard used when the shares were
# computed implies there are some), which for strings means repeating the text 100x.
# NOTE(review): this cell is still not idempotent — running it twice scales by 10,000.
pct_cols = [c for c in acs_age.columns if pd.api.types.is_numeric_dtype(acs_age[c])]
acs_age[pct_cols] = acs_age[pct_cols] * 100

In [None]:
# split the folded age columns by sex, keeping the order they have in spop_age
male_cols = list(filter(lambda name: name.startswith('male_'), spop_age.columns))
female_cols = list(filter(lambda name: name.startswith('female_'), spop_age.columns))

In [None]:
# five-character county codes (as produced by shh.geoid.str.slice(0, 5)) -> the names
# used to label the figures
county_names = {
    '06025': 'Imperial County',
    '06037': 'Los Angeles County',
    '06059': 'Orange County',
    '06065': 'Riverside County',
    '06071': 'San Bernardino County',
    '06111': 'Ventura County'
}

In [None]:
# synthesized shares in percent: one row per county, one column per sex/age group
spop_age

In [None]:
# the ten folded sex/age shares must account for the whole population of each county
age_sex_cols = list(male_cols) + list(female_cols)
acs_age_row_totals = acs_age[age_sex_cols].sum(axis=1)
assert np.allclose(acs_age_row_totals, 100)

acs_age

In [None]:
# 6 counties x (male, female) panels, plus a narrow third column reserved for the legend
fig, grid = plt.subplots(6, 3, figsize=(8.5, 9), gridspec_kw={"width_ratios": [2, 2, 1]})

# (panel title, grid column, columns to plot)
sex_panels = [('Male', 0, male_cols), ('Female', 1, female_cols)]

for row, county in enumerate(spop_age.index):
    county_name = county_names[county]
    for sex_label, grid_col, sex_cols in sex_panels:
        panel = grid[row, grid_col]

        # titles on the top row only, county names on the left-hand panels only
        if row == 0:
            panel.set_title(sex_label)
        if grid_col == 0:
            panel.set_ylabel(county_name)

        # paired bars: ACS to the left of each tick, synthesized to the right
        positions = np.arange(len(sex_cols))
        panel.bar(positions + 0.2, spop_age.loc[county, sex_cols], width=0.4, color='C1', label='Synthesized')
        panel.bar(positions - 0.2, acs_age.loc[county, sex_cols], width=0.4, color='C0', label='ACS')

        panel.set_xticks(np.arange(5))
        panel.set_xticklabels(['<18', '18–34', '35–49', '50–64', '≥65'])

        # common y scale so panels are comparable across counties
        panel.set_ylim(0, 17)
        panel.set_yticks([0, 5, 10, 15])
        panel.set_yticklabels(['0%', '5%', '10%', '15%'])

# make legend
# https://matplotlib.org/stable/gallery/subplots_axes_and_figures/gridspec_and_subplots.html
# replace the third column of panels with one tall, invisible axes that hosts the legend
legend_grid = grid[0, 2].get_gridspec()
for unused in grid[:, 2]:
    unused.remove()

legend_ax = fig.add_subplot(legend_grid[:, 2])
legend_ax.set_axis_off()
legend_handles = [
    mpatch.Patch(color='C0', label='ACS'),
    mpatch.Patch(color='C1', label='Synthesized'),
]
legend_ax.legend(handles=legend_handles, loc='upper center')

plt.tight_layout()
plt.savefig('../../dissertation/fig/popsyn/cmp_age_sex.pdf', bbox_inches='tight')

## Income

In [None]:
# all B19001_* variables (the income brackets folded down below) at county level,
# keyed by GEOID
inc_raw = acs.from_state('CA', '^B19001_', level='county', return_geometry=False).set_index('GEOID')
inc_total = inc_raw.B19001_001E


def _pct_of_inc_total(col):
    """Numeric columns become a percentage of B19001_001E; other columns pass through."""
    if pd.api.types.is_numeric_dtype(col):
        return col / inc_total * 100
    return col


acs_inc = inc_raw.apply(_pct_of_inc_total)

# swap variable codes for their human-readable labels where the ACS metadata has one
acs_inc.columns = [
    acs.variables.loc[code, 'label'] if code in acs.variables.index else code
    for code in acs_inc.columns
]

In [None]:
# Fold the sixteen ACS income brackets down to the five used for the comparison.
# Every target is assigned with a plain string key. The original assigned '50_100k'
# through a one-element list key, which recent pandas rejects when the value is a
# Series ("Columns must be same length as key").
income_groups = {
    'under_15k': [
        'Less than $10,000',
        '$10,000 to $14,999',
    ],
    '15_25k': [
        '$15,000 to $19,999',
        '$20,000 to $24,999',
    ],
    '25_50k': [
        '$25,000 to $29,999',
        '$30,000 to $34,999',
        '$35,000 to $39,999',
        '$40,000 to $44,999',
        '$45,000 to $49,999',
    ],
    '50_100k': [
        '$50,000 to $59,999',
        '$60,000 to $74,999',
        '$75,000 to $99,999',
    ],
    'over100k': [
        '$100,000 to $124,999',
        '$125,000 to $149,999',
        '$150,000 to $199,999',
        '$200,000 or more',
    ],
}

for group, brackets in income_groups.items():
    detail_cols = [f'Estimate!!Total!!{bracket}' for bracket in brackets]
    # selecting by label raises KeyError if the ACS labels ever change
    acs_inc[group] = acs_inc[detail_cols].sum(axis=1)

In [None]:
# income is in 2000 dollars for ASIM, rescale to 2017
# NOTE(review): 1.44 is presumably the 2000 -> 2017 price-index ratio; its source is
# not shown in this notebook — confirm.
shh['inc17'] = shh.income * 1.44

In [None]:
# Bracket the rescaled incomes the way the ACS table does. inc17 is a float (income
# times 1.44), so the cut points sit on the round thresholds with left-closed bins:
# [.., 15000), [15000, 25000), ... Cutting at 14999/24999/... with right-closed bins
# sent fractional values such as 14,999.50 into the next bracket up.
income_edges = [-np.inf, 15000, 25000, 50000, 100000, np.inf]
income_labels = ['under_15k', '15_25k', '25_50k', '50_100k', 'over100k']
shh['inccat'] = pd.cut(shh.inc17, income_edges, labels=income_labels, right=False)
# sanity check: min/max income per bracket. String aggregation names replace the
# builtins min/max (deprecated as agg() arguments in recent pandas); observed=False
# states the historical default for categorical groupers explicitly.
shh.groupby('inccat', observed=False).inc17.agg(['min', 'max']).round()

In [None]:
# Percentage of each county's synthetic households in every income bracket.
# observed=False keeps brackets with no households as explicit zeros, so all five
# bracket columns exist after unstack() whatever the pandas default is. size() never
# returns NaN, so the original fillna(0) was a no-op and is dropped.
households_per_county = shh.groupby('county').size()
shh_inc = shh.groupby(['county', 'inccat'], observed=False).size()
shh_inc = shh_inc / households_per_county.reindex(shh_inc.index, level='county')
shh_inc *= 100
# counties as rows, brackets as columns
shh_inc = shh_inc.unstack()
shh_inc

In [None]:
# income bracket columns present in both acs_inc and shh_inc, in plotting order
cols = ['under_15k', '15_25k', '25_50k', '50_100k', 'over100k']

In [None]:
# in both tables the five brackets must account for every household in a county
for income_table in (acs_inc, shh_inc):
    assert np.allclose(income_table[cols].sum(axis=1), 100)

In [None]:
# 3 x 2 county panels plus a narrow third column reserved for the legend
fig, panel_grid = plt.subplots(3, 3, figsize=(8.5, 9), gridspec_kw={"width_ratios": [2, 2, 1]})

# flatten the two left-hand columns row by row: one axes per county
county_axes = panel_grid[:, :2].reshape(-1)
bracket_pos = np.arange(5)

for county_ax, county_code in zip(county_axes, shh_inc.index):
    county_ax.set_title(county_names[county_code])
    # paired bars: ACS to the left of each tick, synthesized to the right
    county_ax.bar(bracket_pos - 0.2, acs_inc.loc[county_code, cols], color='C0', width=0.4)
    county_ax.bar(bracket_pos + 0.2, shh_inc.loc[county_code, cols], color='C1', width=0.4)

    # common y scale so panels are comparable across counties
    county_ax.set_ylim(0, 46)
    county_ax.set_yticks([0, 10, 20, 30, 40])
    county_ax.set_yticklabels(['0%', '10%', '20%', '30%', '40%'])

    county_ax.set_xticks(np.arange(5))
    county_ax.set_xticklabels(['<\\$15k', '15–25k', '25–50k', '50–100k', '≥100k'])

# replace the third column of panels with one tall, invisible axes that hosts the legend
legend_grid = panel_grid[0, 2].get_gridspec()
for unused in panel_grid[:, 2]:
    unused.remove()

legend_ax = fig.add_subplot(legend_grid[:, 2])
legend_ax.set_axis_off()
legend_handles = [
    mpatch.Patch(color='C0', label='ACS'),
    mpatch.Patch(color='C1', label='Synthesized'),
]
legend_ax.legend(handles=legend_handles, loc='upper center')

plt.tight_layout()

plt.savefig('../../dissertation/fig/popsyn/cmp_inc.pdf')

## Household size by tenure

In [None]:
# all B25009_* variables (tenure by household size, folded down below) at county
# level, keyed by GEOID
hhs_raw = acs.from_state('CA', '^B25009_', level='county', return_geometry=False).set_index('GEOID')
hhs_total = hhs_raw.B25009_001E


def _pct_of_hhs_total(col):
    """Numeric columns become a percentage of B25009_001E; other columns pass through."""
    if pd.api.types.is_numeric_dtype(col):
        return col / hhs_total * 100
    return col


acs_hhs = hhs_raw.apply(_pct_of_hhs_total)

# swap variable codes for their human-readable labels where the ACS metadata has one
acs_hhs.columns = [
    acs.variables.loc[code, 'label'] if code in acs.variables.index else code
    for code in acs_hhs.columns
]

In [None]:
# show the labelled column names (the fold-down in the next cell selects by label)
acs_hhs.columns

In [None]:
# short names for the tenure x household-size shares; sizes 5, 6 and 7+ are pooled
# into a single "5plus" column
for short_tenure, acs_tenure in [('ownocc', 'Owner occupied'), ('rent', 'Renter occupied')]:
    label_stem = f'Estimate!!Total!!{acs_tenure}!!'

    # sizes 1-4 are straight copies of the ACS column
    for n_persons in range(1, 5):
        acs_hhs[f'{short_tenure}_{n_persons}pers'] = acs_hhs[f'{label_stem}{n_persons}-person household']

    large_household_cols = [
        label_stem + '5-person household',
        label_stem + '6-person household',
        label_stem + '7-or-more person household',
    ]
    acs_hhs[f'{short_tenure}_5pluspers'] = acs_hhs[large_household_cols].sum(axis=1)

In [None]:
# column names shared by acs_hhs and shh_hhs, smallest household first
size_suffixes = ['1pers', '2pers', '3pers', '4pers', '5pluspers']
own_cols = [f'ownocc_{suffix}' for suffix in size_suffixes]
rent_cols = [f'rent_{suffix}' for suffix in size_suffixes]

In [None]:
# tenure label keyed on the `rent` flag; values missing from the mapping become NaN
tenure_by_rent_flag = {False: 'ownocc', True: 'rent'}
shh['tencat'] = shh.rent.map(tenure_by_rent_flag)

# top-code household size at 5 to match the pooled ACS "5plus" column
shh['hhsize_cat'] = np.minimum(shh.hhsize, 5)

In [None]:
# share of each county's synthetic households in every tenure x size cell
households_per_county = shh.groupby('county').size()
tenure_size_counts = shh.groupby(['county', 'tencat', 'hhsize_cat']).size()
tenure_size_share = tenure_size_counts / households_per_county.reindex(tenure_size_counts.index, level='county')

# unstacking size and then tenure leaves (size, tenure) tuples as column labels
shh_hhs = tenure_size_share.unstack().unstack()
shh_hhs.columns = [
    f'{ten}_{hhs}pers' if hhs < 5 else f'{ten}_5pluspers'
    for hhs, ten in shh_hhs.columns
]
# express as percentages
shh_hhs *= 100

In [None]:
# in both tables the ten tenure x size shares must account for every household
tenure_size_cols = [*own_cols, *rent_cols]
for hhs_table in (acs_hhs, shh_hhs):
    assert np.allclose(hhs_table[tenure_size_cols].sum(axis=1), 100)

In [None]:
# 6 counties x (rent, own) panels, plus a narrow third column reserved for the legend
f, axs = plt.subplots(6, 3, figsize=(8.5, 9), gridspec_kw={"width_ratios": [2, 2, 1]})

# Iterate over shh_hhs's own index (the table being plotted) instead of borrowing the
# county list from the age table, so this cell depends only on the household-size cells.
for ctyidx, county in enumerate(shh_hhs.index):
    county_name = county_names[county]
    # the loop variable is `ten_cols`, not `cols`, so it does not overwrite the
    # notebook-level `cols` list that the income cells rely on
    for lbl, ten_cols in [('Rent', rent_cols), ('Own', own_cols)]:
        ax = axs[ctyidx, 1 if lbl == 'Own' else 0]

        # titles on the top row only, county names on the left-hand panels only
        if ctyidx == 0:
            ax.set_title(lbl)

        if lbl == 'Rent':
            ax.set_ylabel(county_name)

        # paired bars: ACS to the left of each tick, synthesized to the right
        ax.bar(np.arange(len(ten_cols)) + 0.2, shh_hhs.loc[county, ten_cols], width=0.4, color='C1', label='Synthesized')
        ax.bar(np.arange(len(ten_cols)) - 0.2, acs_hhs.loc[county, ten_cols], width=0.4, color='C0', label='ACS')

        ax.set_xticks(np.arange(5))
        ax.set_xticklabels(['1', '2', '3', '4', '≥5'])

        # common y scale so panels are comparable across counties
        ax.set_ylim(0, 22)
        ax.set_yticks([0, 10, 20])
        ax.set_yticklabels(['0%', '10%', '20%'])

# make legend
# https://matplotlib.org/stable/gallery/subplots_axes_and_figures/gridspec_and_subplots.html
# replace the third column of panels with one tall, invisible axes that hosts the legend
gs = axs[0, 2].get_gridspec()

for ax in axs[:,2]:
    ax.remove()

legax = f.add_subplot(gs[:, 2])
legax.set_axis_off()
legax.legend(handles=[
    mpatch.Patch(color='C0', label='ACS'),
    mpatch.Patch(color='C1', label='Synthesized'),
], loc='upper center')


plt.tight_layout()
plt.savefig('../../dissertation/fig/popsyn/cmp_tenure_hhsize.pdf', bbox_inches='tight')

## Vehicle ownership

In [None]:
# all B08201_* variables (including the vehicles-available columns used below) at
# county level, keyed by GEOID
veh_raw = acs.from_state('CA', '^B08201_', level='county', return_geometry=False).set_index('GEOID')
veh_total = veh_raw.B08201_001E


def _pct_of_veh_total(col):
    """Numeric columns become a percentage of B08201_001E; other columns pass through."""
    if pd.api.types.is_numeric_dtype(col):
        return col / veh_total * 100
    return col


acs_veh = veh_raw.apply(_pct_of_veh_total)

# swap variable codes for their human-readable labels where the ACS metadata has one
acs_veh.columns = [
    acs.variables.loc[code, 'label'] if code in acs.variables.index else code
    for code in acs_veh.columns
]

In [None]:
# show the labelled column names (the next cell selects the vehicle columns by label)
acs_veh.columns

In [None]:
# short column name -> ACS label suffix for the household-level vehicle counts
vehicle_labels = {
    '0veh': 'No vehicle available',
    '1veh': '1 vehicle available',
    '2veh': '2 vehicles available',
    '3veh': '3 vehicles available',
    '4veh': '4 or more vehicles available',
}

for short_name, acs_label in vehicle_labels.items():
    acs_veh[short_name] = acs_veh[f'Estimate!!Total!!{acs_label}']

In [None]:
# top-code vehicle ownership at 4 (the ACS top bracket is "4 or more") and turn it
# into the '<n>veh' labels used as acs_veh column names
# NOTE(review): assumes auto_ownership is integer-typed; a float column would give
# labels like '2.0veh' — the assert a few cells down would catch that as a KeyError.
capped_vehicles = np.minimum(shh.auto_ownership, 4)
shh['acsveh'] = capped_vehicles.astype('str') + 'veh'

In [None]:
# percentage of each county's synthetic households in every vehicle bracket
households_per_county = shh.groupby('county').size()
vehicle_counts = shh.groupby(['county', 'acsveh']).size()
vehicle_pct = vehicle_counts / households_per_county.reindex(vehicle_counts.index, level='county')
vehicle_pct *= 100

# counties as rows, vehicle brackets as columns
shh_veh = vehicle_pct.unstack()
shh_veh

In [None]:
# vehicle bracket columns in plotting order; in both tables they must account for
# every household in a county
veh_cols = ['0veh', '1veh', '2veh', '3veh', '4veh']
for vehicle_table in (acs_veh, shh_veh):
    assert np.allclose(vehicle_table[veh_cols].sum(axis=1), 100)

In [None]:
# 3 x 2 county panels plus a narrow third column reserved for the legend
f, allaxs = plt.subplots(3, 3, figsize=(8.5, 9), gridspec_kw={"width_ratios": [2, 2, 1]})

# flatten the two left-hand columns row by row: one axes per county
axs = allaxs[:,:2].reshape(-1)


# Iterate over shh_veh's own index (the table being plotted) rather than the income
# table's, so this cell depends only on the vehicle-ownership cells.
for ax, cnty in zip(axs, shh_veh.index):
    ax.set_title(county_names[cnty])
    # paired bars: ACS to the left of each tick, synthesized to the right
    ax.bar(np.arange(5) - 0.2, acs_veh.loc[cnty, veh_cols], color='C0', width=0.4)
    ax.bar(np.arange(5) + 0.2, shh_veh.loc[cnty, veh_cols], color='C1', width=0.4)

    # common y scale so panels are comparable across counties
    ax.set_ylim(0, 43)
    ax.set_yticks([0, 10, 20, 30, 40])
    ax.set_yticklabels(['0%', '10%', '20%', '30%', '40%'])
    
    ax.set_xticks(np.arange(5))
    ax.set_xticklabels(['0', '1', '2', '3', '≥4'])

# replace the third column of panels with one tall, invisible axes that hosts the legend
gs = allaxs[0, 2].get_gridspec()

for ax in allaxs[:,2]:
    ax.remove()

legax = f.add_subplot(gs[:, 2])
legax.set_axis_off()
legax.legend(handles=[
    mpatch.Patch(color='C0', label='ACS'),
    mpatch.Patch(color='C1', label='Synthesized'),
], loc='upper center')
    
plt.tight_layout()

plt.savefig('../../dissertation/fig/popsyn/cmp_veh.pdf')

## Tract level analysis

Just going to look at income for brevity.

In [None]:
# table S1902, not available from CenPy
acs_meaninc = pd.read_csv('../data/acs_mean_income.csv')
# the last 11 characters of `id` become the key that is lined up with the synthesized
# per-geoid means further down
acs_meaninc['geoid'] = acs_meaninc.id.str.slice(-11)
acs_meaninc = acs_meaninc.set_index('geoid')
# quick look at the first rows
acs_meaninc.head()

In [None]:
# the mean-income column arrives as text: '-' and 'N' placeholders become NaN, and the
# rest is parsed as float (any other non-numeric text raises)
mean_income_label = 'Estimate!!Mean income (dollars)!!HOUSEHOLD INCOME!!All households'
missing_markers = {'-': np.nan, 'N': np.nan}
acs_meaninc['meaninc'] = acs_meaninc[mean_income_label].replace(missing_markers).astype('float64')

In [None]:
# mean rescaled household income (inc17) per geoid in the synthetic population
shh_mean_inc = shh.groupby('geoid').inc17.mean()

In [None]:
# display the synthesized per-geoid means
shh_mean_inc

In [None]:
# line the two series up on the synthesized geoids; geoids absent from the ACS table
# come through as NaN in the `acs` column
mean_inc = pd.DataFrame({'acs': acs_meaninc.meaninc.reindex(shh_mean_inc.index), 'synthesized': shh_mean_inc})

In [None]:
# Agreement between synthesized and ACS means, measured against the 1:1 line rather
# than a fitted regression:
#     R^2 = 1 - sum((acs - synthesized)^2) / sum((acs - mean(acs))^2)
# Both sums are taken over the same rows — those where both values are present.
# Summing the full columns lets NaN-skipping choose a different row set for each term
# whenever either column has missing values.
paired = mean_inc.dropna(subset=['acs', 'synthesized'])
tss = np.sum((paired.acs - paired.acs.mean()) ** 2)
rss = np.sum((paired.acs - paired.synthesized) ** 2)
r2 = 1 - rss / tss

f, ax = plt.subplots()

# one dot per row of mean_inc; rows with a missing value are simply not drawn
plt.scatter(mean_inc.acs, mean_inc.synthesized, s=0.5, alpha=0.25, color='black')
# 1:1 reference line, drawn well past the visible range
plt.plot([0, 1000000], [0, 1000000])
plt.xlim(0, 200000)
plt.ylim(0, 200000)
plt.xlabel('Mean income (ACS, 2017 dollars)')
plt.ylabel('Mean income (synthesized, 2017 dollars)')

plt.text(1000, 190000, f'$R^2$: {r2:.2f}')
# thousands separators on both axes
ax.xaxis.set_major_formatter('{x:,.0f}')
ax.yaxis.set_major_formatter('{x:,.0f}')
plt.savefig('../../dissertation/fig/popsyn/cmp_inc_tract.pdf', bbox_inches='tight')