# Population scenarios

This creates the households and persons files used as input to ActivitySim.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Scenario to apply: used below to select the column of the tract scenario table
# and to name the output parquet file.
SCENARIO = 'npv_low_opcost'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cenpy
import geopandas as gp
import io
import sqlalchemy as sq
import pyarrow.parquet as pq
from tqdm import tqdm, trange

## Get the proportion of each property type in each PUMA that is in a particular tract

In [None]:
# cenpy handle for the 2017 ACS product; used below to look up table and variable
# metadata and to download tract-level estimates.
acs17 = cenpy.products.ACS(2017)

In [None]:
# Guard: make sure table B25127 is the table the column lists further down assume.
assert acs17.tables.loc['B25127'].description == 'TENURE BY YEAR STRUCTURE BUILT BY UNITS IN STRUCTURE'

In [None]:
# Counties to pull tract-level B25127 estimates for.
counties = [
    'Los Angeles, CA',
    'Orange, CA',
    'Ventura, CA',
    'Imperial, CA',
    'Riverside, CA',
    'San Bernardino, CA',
]

# One request per county, stacked into a single table (row labels are kept as returned).
county_tables = [
    acs17.from_county(county_name, variables='^B25127', level='tract')
    for county_name in counties
]
tract_property_counts = pd.concat(county_tables)

### Summarize to PUMA level



In [None]:
# Census tract -> PUMA relationship file. Everything is read as strings so that
# codes such as the '06' state prefix keep their leading zeros.
tract_to_puma = pd.read_csv('../data/2010_Census_Tract_to_2010_PUMA.csv', dtype='str')

In [None]:
# Build full GEOIDs by concatenating the component codes (all columns were read as strings).
tract_to_puma['tract_geoid'] = (
    tract_to_puma['STATEFP'] + tract_to_puma['COUNTYFP'] + tract_to_puma['TRACTCE']
)
tract_to_puma['puma_geoid'] = tract_to_puma['STATEFP'] + tract_to_puma['PUMA5CE']

In [None]:
# Attach each tract's PUMA. validate='1:1' makes pandas raise if a merge key is
# duplicated on either side.
puma_lookup = tract_to_puma[['tract_geoid', 'puma_geoid']]
tract_property_counts = tract_property_counts.merge(
    puma_lookup,
    how='left',
    left_on='GEOID',
    right_on='tract_geoid',
    validate='1:1',
)

# one census tract, 06037137000, in Woodland Hills does not match, because it was accidentally
# deleted in preparing geographies for the 2010 Census: https://www2.census.gov/geo/pdfs/reference/Geography_Notes.pdf
# It's back in the latest ACS but not in the PUMA relationship file, and is split across two PUMAs (see below)
# Every tract left without a PUMA must therefore be that one tract.
tracts_without_puma = tract_property_counts.loc[tract_property_counts.puma_geoid.isnull(), 'GEOID']
assert (tracts_without_puma == '06037137000').all()

In [None]:
# PUMA boundaries read with geopandas; used below to find the PUMAs that the unmatched tract overlaps.
# NOTE(review): absolute path to an external volume - the notebook only runs where that
# volume is mounted; consider a configurable data directory.
pumas = gp.read_file('/Volumes/Pheasant Ridge/IPUMS/pumas/socal_pumas_projected.shp')

In [None]:
# Draw the tract that is missing from the relationship file (blue) over the PUMAs it overlaps (pink/red).
f, ax = plt.subplots(figsize=(12, 8))

is_missing_tract = tract_property_counts.GEOID == '06037137000'
missing_tract_shape = tract_property_counts.loc[is_missing_tract].to_crs(epsg=4326).geometry.iloc[0]
overlaps_missing_tract = pumas.to_crs(epsg=4326).geometry.overlaps(missing_tract_shape)

pumas[overlaps_missing_tract].to_crs(epsg=26943).plot(ax=ax, color=['pink', 'red'])
tract_property_counts.loc[is_missing_tract].to_crs(epsg=26943).plot(ax=ax, color='blue', alpha=0.5)

In [None]:
# dump for mapping in geojson.io
missing_tract = tract_property_counts.loc[tract_property_counts.GEOID == '06037137000']
missing_tract_shape = missing_tract.to_crs(epsg=4326).geometry.iloc[0]

# PUMAs overlapping the tract, reprojected to EPSG:4326 and reduced to the columns needed for the map
overlap_pumas = (
    pumas[pumas.to_crs(epsg=4326).geometry.overlaps(missing_tract_shape)]
    .to_crs(epsg=4326)[['geometry', 'PUMA']]
    .reset_index()
)
tract_feature = (
    missing_tract[['geometry', 'puma_geoid', 'tract_geoid']]
    .to_crs(epsg=4326)
    .reset_index()
)

features = pd.concat([overlap_pumas, tract_feature], ignore_index=True)
features.to_file('broken_tract.geojson', driver='GeoJSON')

In [None]:
# looks like the tract in question is split roughly evenly between the two PUMAs. Slice in half and put it in both
is_split_tract = tract_property_counts.GEOID == '06037137000'
count_cols = [col for col in tract_property_counts.columns if col.startswith('B25127')]

# Halve every B25127 count, then duplicate the row so there is one copy per PUMA.
half_tract = tract_property_counts[is_split_tract].copy()
half_tract[count_cols] = half_tract[count_cols] / 2
split_tract = pd.concat([half_tract, half_tract], ignore_index=True)

# One overlapping PUMA per copy; the assignment fails if there are not exactly as many PUMAs as copies.
split_tract['puma_geoid'] = '06' + overlap_pumas.PUMA.to_numpy()
assert not split_tract.puma_geoid.isnull().any()
assert split_tract.puma_geoid.iloc[0] != split_tract.puma_geoid.iloc[1]

# Swap the single unmatched row for the two half-rows.
tract_property_counts = pd.concat(
    [tract_property_counts[~is_split_tract], split_tract],
    ignore_index=True,
)
assert not tract_property_counts.puma_geoid.isnull().any()

In [None]:
# Show the PUMAs that the split tract was assigned to.
overlap_pumas

### Summarize into the property type categories used in the modeling

In [None]:
# Look up the ACS variable label of every B25127 column and split it on '!!', keeping
# parts 2-4 (blank where a label has fewer parts) so the column groups can be checked by eye.
b25127_cols = [col for col in split_tract.columns if col.startswith('B25127')]
label_parts = acs17.variables.loc[b25127_cols].label.str.split('!!', expand=True)
collabels = label_parts[[2, 3, 4]].fillna('')

with pd.option_context('display.max_colwidth', 200, 'display.max_rows', 500):
    display(collabels)

In [None]:
# B25127 estimate columns grouped into the four property types used in the modeling.
#
# The variable numbers are regular: for each tenure the year-built groups start at
# 003 (owner occupied) / 046 (renter occupied) and are 7 variables apart, newest first
# (2010 or later, 2000-2009, 1980-1999, 1960-1979, 1940-1959, 1939 or earlier).
# Within a group, offset +1 is the single-family column and offsets +2..+5 are the
# multifamily columns of varying unit counts. All other variables are left out.
# The sanity-check cells below display the labels so the selection can be verified by eye.
_TENURE_STARTS = (3, 46)     # owner occupied, renter occupied
_GROUP_STRIDE = 7
_NEW_GROUPS = range(0, 2)    # 2010 or later, 2000-2009
_OLD_GROUPS = range(2, 6)    # 1980-1999 back to 1939 or earlier
_SF_OFFSETS = (1,)
_MF_OFFSETS = range(2, 6)


def _b25127_cols(groups, offsets):
    """Return B25127 estimate column names for the given year-built groups and offsets.

    Columns are ordered owner-occupied first, then renter-occupied; within a tenure
    by year-built group, then by offset.
    """
    return [
        f'B25127_{start + _GROUP_STRIDE * group + offset:03d}E'
        for start in _TENURE_STARTS
        for group in groups
        for offset in offsets
    ]


sfh_new_cols = _b25127_cols(_NEW_GROUPS, _SF_OFFSETS)
sfh_old_cols = _b25127_cols(_OLD_GROUPS, _SF_OFFSETS)
mfh_new_cols = _b25127_cols(_NEW_GROUPS, _MF_OFFSETS)
mfh_old_cols = _b25127_cols(_OLD_GROUPS, _MF_OFFSETS)

#### Sanity checks

In [None]:
# Show the label parts of the columns assigned to sfh_new so the selection can be checked by eye.
collabels.loc[sfh_new_cols]

In [None]:
# Show the label parts of the columns assigned to sfh_old so the selection can be checked by eye.
collabels.loc[sfh_old_cols]

In [None]:
# Show the label parts of the columns assigned to mfh_new so the selection can be checked by eye.
collabels.loc[mfh_new_cols]

In [None]:
# Show the label parts of the columns assigned to mfh_old so the selection can be checked by eye.
collabels.loc[mfh_old_cols]

In [None]:
# Units per tract in each modeled property type: row-wise sum of that type's ACS columns.
category_columns = {
    'sfh_new': sfh_new_cols,
    'sfh_old': sfh_old_cols,
    'mfh_new': mfh_new_cols,
    'mfh_old': mfh_old_cols,
}
for category, acs_columns in category_columns.items():
    tract_property_counts[category] = tract_property_counts[acs_columns].sum(axis=1)

In [None]:
# This should be close to but not exactly 1, and never more than 1, because we've excluded mobile homes, boats, rvs etc
modeled_units = (
    tract_property_counts['sfh_new']
    + tract_property_counts['sfh_old']
    + tract_property_counts['mfh_new']
    + tract_property_counts['mfh_old']
)
# Share of the table total (B25127_001E) covered by the four modeled property types.
share_modeled = modeled_units / tract_property_counts['B25127_001E']
share_modeled.describe()

## Update with scenario

This is where the tract-level scenario is applied.

In [None]:
# 100% sample of ACS
# Long table with one row per (PUMA, tract, property type) and the rounded unit count.
property_types = ['sfh_new', 'sfh_old', 'mfh_new', 'mfh_old']
counts_long = (
    tract_property_counts
    .groupby(['puma_geoid', 'GEOID'])[property_types]
    .sum()
    .stack()
    .rename('n_hhs')
    .round()
    .astype('int')
    .reset_index()
    .rename(columns={'GEOID': 'tract_geoid'})
)

# PUMA code without the two-character state prefix, and the property type relabelled
# from sfh_*/mfh_* to SF_*/MF_* (the stacked level is the unnamed third level, 'level_2').
puma_code = counts_long['puma_geoid'].str.slice(2)
type_label = counts_long['level_2'].str.replace('sfh', 'SF').str.replace('mfh', 'MF')

phys_housing_totals = pd.DataFrame({
    'phys_housing': (puma_code + '_').str.cat(type_label),
    'tract_geoid': counts_long['tract_geoid'],
    'n_hhs': counts_long['n_hhs'],
    'puma': puma_code,
    'phys_housing_nopuma': type_label,
})


In [None]:
# Display the long table of unit counts per PUMA x tract x property type.
phys_housing_totals

In [None]:
# Tract-level scenario table produced outside this notebook; the cells below select one
# column from it by scenario name.
# presumably its index labels start with '<tract_geoid>_<housing type>' - see the parsing below
scenarios = pd.read_parquet('../../construction/data/npv_tract_scenarios.parquet')

In [None]:
# Pull the chosen scenario's column out of the scenario table and turn its index into a column.
scenario_chgs = scenarios[SCENARIO].copy().reset_index()

# The first 18 characters of each index label are split on the first underscore only, giving
# the tract GEOID and the housing type (presumably an 11-character GEOID, '_', and a
# 6-character type such as 'SF_new').
# `n` must be passed by keyword: pandas 2.0 made every `str.split` argument except `pat`
# keyword-only, so the positional form `split('_', 1, ...)` raises TypeError there.
scenario_chgs[['tract_geoid', 'phys_housing_nopuma']] = (
    scenario_chgs['index'].str.slice(0, 18).str.split('_', n=1, expand=True)
)
scenario_chgs

In [None]:
# Check which housing-type labels were parsed out of the index and how often each occurs.
scenario_chgs.phys_housing_nopuma.value_counts()

In [None]:
# Collapse to one total per tract x housing type; the result is a Series with a two-level index.
change_by_tract_type = scenario_chgs.groupby(['tract_geoid', 'phys_housing_nopuma'])[SCENARIO]
scenario_chgs = change_by_tract_type.sum()
scenario_chgs.head(25)

In [None]:
# Number of tract x housing-type combinations that have a scenario value.
len(scenario_chgs)

In [None]:
# NOTE(review): presumably 12903 is the length shown by the previous cell, divided by the
# four housing types to approximate a tract count - confirm, and consider computing it
# from len(scenario_chgs) instead of hard-coding the number.
12903 / 4

In [None]:
# this very special tract matches to *two* pumas... break housing change across pumas
#
# The tract's rows are selected with a boolean mask on the 'tract_geoid' index level
# instead of a partial-key `.loc['06037137000'] /= 2`. The partial-key lookup returns a
# Series with the tract level dropped, and assigning that back makes pandas align a
# flat-indexed Series against MultiIndex rows, which can leave NaN in place of the halved
# values (later turned into 0 by the fillna on the merged column). The mask keeps the
# index intact, and is simply a no-op if the tract has no rows in this scenario.
# NOTE: like any in-place halving, re-running this cell halves the values again.
in_split_tract = scenario_chgs.index.get_level_values('tract_geoid') == '06037137000'
scenario_chgs = scenario_chgs.where(~in_split_tract, scenario_chgs / 2)

In [None]:
# Attach the scenario change for each tract x housing type.
# The right side comes from a groupby on exactly these two keys, so every key occurs once
# there; validate='m:1' makes pandas check that and raise instead of silently duplicating
# rows ('m:m' performs no check at all). The left side may repeat a key because the split
# tract appears once per PUMA.
phys_housing_totals = phys_housing_totals.merge(
    pd.DataFrame(scenario_chgs.rename('scenario')),
    left_on=['tract_geoid', 'phys_housing_nopuma'],
    right_index=True,
    how='left',
    validate='m:1',
)

In [None]:
# okay for some to be null, no SF zoning in those locations perhaps (TODO figure this out)
# Share of rows that found no scenario value in the merge.
phys_housing_totals['scenario'].isnull().mean()

In [None]:
# Rows without a scenario value get no change.
phys_housing_totals['scenario'] = phys_housing_totals['scenario'].fillna(0)

In [None]:
# Total before the scenario is applied; used afterwards to scale the counts back to this total.
orig_n_hhs = phys_housing_totals['n_hhs'].sum()

In [None]:
# Apply the scenario change to the counts (not idempotent: re-running adds the change again).
phys_housing_totals['n_hhs'] = phys_housing_totals['n_hhs'] + phys_housing_totals['scenario']

In [None]:
# a handful of tracts have no ACS homes but do have homes in gp16 - patch up
# (counts pushed below zero by the scenario are floored at zero)
nonneg_hhs = np.maximum(phys_housing_totals['n_hhs'], 0)

# TODO scaling back not truly necessary here, but doing to match sorting model - should I?
# Rescale so the total matches the pre-scenario total, then round to whole units.
rescale_factor = orig_n_hhs / nonneg_hhs.sum()
phys_housing_totals['n_hhs'] = (nonneg_hhs * rescale_factor).round().astype('int64')


In [None]:
# Rounding drift: how far the rounded total ended up from the pre-scenario total
# (0 means the total was preserved exactly).
phys_housing_totals.n_hhs.sum() - orig_n_hhs

In [None]:
# Write the per-tract totals for this scenario; the file name carries the scenario name.
output_path = f'../data/{SCENARIO}_proportion_of_puma.parquet'
phys_housing_totals.to_parquet(output_path)

Now, move over to Julia to run the core of the algorithm!