# Extract data for full Los Angeles simulation

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 1

import pandas as pd
import numpy as np
import sqlalchemy as sq
import sys
import matplotlib.pyplot as plt
import geopandas as gp
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from factor_analyzer import FactorAnalyzer
import factor_analyzer

# Capitalization rate (estimated previously, outside this notebook). It is multiplied by an
# owner-occupied home's value below to get an annual cost that is comparable to 12x monthly rent.
CAP_RATE = 0.04424375

In [None]:
plt.style.use('asu-light')

In [None]:
# connect to the SQLite database that holds the IPUMS extract
# NOTE(review): this is an absolute path to an external volume; the notebook cannot be re-run on
# another machine without editing it.
conn = sq.create_engine('sqlite:////Volumes/Pheasant Ridge/IPUMS/scag_sorting_5yr_abm/scag_sorting_5yr_abm.db')

In [None]:
# Pull the columns used in this notebook from the `ipums` table. Rows are later grouped by `serial`
# to build households, so presumably there is one row per person with household fields repeated.
ipums = pd.read_sql('''SELECT
serial,
year,
numprec,
hhwt,
puma,
ownershp,
rentgrs,
hhincome,
builtyr2,
unitsstr,
rooms,
bedrooms,
vehicles,
multgen,
educ,
empstat,
indnaics,
bpl,
valueh,
race,
hispan,
age,
pwpuma00 AS workpuma,
pwstate2 AS workstate,
pwcounty AS workcounty,
tranwork
FROM ipums
''', conn)

In [None]:
ipums.ownershp.value_counts() # what does N/A mean?

In [None]:
# keep only records whose tenure is known (drop the 'N/A' ownership category)
has_tenure = ipums.ownershp.ne('N/A')
ipums = ipums.loc[has_tenure].copy()

## Person-level variables

Summarize person-level variables to household level.

In [None]:
# the two labelled age categories are recoded to numbers so the whole column can be cast to float
age_label_recodes = {
    'Less than 1 year old': 0,
    '90 (90+ in 1980 and 1990)': 90,
}
ipums['age'] = ipums['age'].replace(age_label_recodes).astype('float64')

In [None]:
# person-level flags: holds a four-year degree or more / is currently employed
college_levels = ['4 years of college', '5+ years of college']
ipums['university'] = ipums['educ'].isin(college_levels)
ipums['worker'] = ipums['empstat'].eq('Employed')

In [None]:
# Birthplaces treated as domestic: the states, DC, and US territories/possessions.
# Anyone born anywhere else is flagged as an immigrant.
us_birthplaces = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
    'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois',
    'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
    'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
    'U.S. Virgin Islands', 'Atlantic Islands', 'Guam', 'Puerto Rico', 'Pacific Islands',
    'American Samoa', 'Other US Possessions',
]
ipums['immigrant'] = ~ipums.bpl.isin(us_birthplaces)

In [None]:
ipums.bpl[ipums.immigrant].unique()

In [None]:
ipums.race.unique()

In [None]:
# The same person-level flag is stored under two names so that the household aggregation below
# can take a sum of one and an all() of the other without producing a MultiIndex.
is_nonhisp_white = (ipums.hispan == 'Not Hispanic') & (ipums.race == 'White')
ipums['nonhispwhite_count'] = is_nonhisp_white
ipums['nonhispwhite_all'] = is_nonhisp_white

In [None]:
# age-based person flags: under 18 is a child, 55 and over is a senior
ipums['child'] = ipums['age'].lt(18)
ipums['senior'] = ipums['age'].ge(55)

In [None]:
# Collapse person records to one row per household (keyed by `serial`).
grpd = ipums.groupby('serial')

# fields taken as-is from the first record of each household
household_fields = [
    'year', 'numprec', 'hhwt', 'puma', 'ownershp', 'rentgrs', 'valueh', 'hhincome',
    'builtyr2', 'unitsstr', 'rooms', 'bedrooms', 'vehicles', 'multgen',
]

hhs = pd.DataFrame({
    # person-level flags summarized over household members
    'university': grpd['university'].any(),
    'worker': grpd['worker'].sum(),  # number of employed members
    'immigrant': grpd['immigrant'].any(),
    'child': grpd['child'].any(),
    'senior': grpd['senior'].any(),
    'nonhispwhite_count': grpd['nonhispwhite_count'].sum(),
    'nonhispwhite_all': grpd['nonhispwhite_all'].all(),
    **{field: grpd[field].first() for field in household_fields},
})

In [None]:
# mobile homes and boats/tents/vans are not part of the housing alternatives; drop those households
excluded_structures = ['Mobile home or trailer', 'Boat, tent, van, other']
hhs = hhs.loc[~hhs.unitsstr.isin(excluded_structures)].copy()

In [None]:
# Classify each structure type as single-family (SF) or multifamily (MF).
single_family_types = ['1-family house, attached', '1-family house, detached']
multifamily_types = [
    '2-family building',
    '3-4 family building',
    '5-9 family building',
    '10-19 family building',
    '20-49 family building',
    '50+ family building',
]
structure_class = {
    **{label: 'SF' for label in single_family_types},
    **{label: 'MF' for label in multifamily_types},
}
hhs['sfmf'] = hhs.unitsstr.map(structure_class).astype('category')
# any structure type missing from the mapping would show up as a null here
assert hhs.sfmf.notnull().all()


In [None]:
hhs.unitsstr.unique()

In [None]:
len(hhs)

# Make rent and value comparable

Eventually these will be estimated separately, but for now just put them together using the cap rate previously estimated.

In [None]:
# annual housing cost: twelve months of gross rent for renters, capitalized home value otherwise
is_rented = hhs.ownershp == 'Rented'
annual_rent = hhs.rentgrs * 12
annualized_home_value = hhs.valueh * CAP_RATE
hhs['annvalue'] = np.where(is_rented, annual_rent, annualized_home_value)

In [None]:
hhs['annvalue'].value_counts()

## Breakpoints for new/med/old

When does a property lose its 'new' value? Estimate a simple hedonic to find out.

In [None]:
hhs['builtyr2'] = hhs.builtyr2.astype('category')
hhs.builtyr2.cat.categories

In [None]:
# Representative construction year for each year-built category. Multi-year ranges get a single
# year from within the range; single-year categories (2006-2017) map to themselves.
builtyr_to_year = {
    '1939 or earlier': 1939,
    '1940-1949': 1945,
    '1950-1959': 1955,
    '1960-1969': 1965,
    '1970-1979': 1975,
    '1980-1989': 1985,
    '1990-1994 (1990-1999 in the 2005-onward ACS and the PRCS)': 1995,
    '2000-2004 (1999-2002 in the 2000-2002 ACS)': 2002,
    '2005 (2005 or later in datasets containing 2005, 2006, or 2007 ACS/PRCS data)': 2005,
}
builtyr_to_year.update({str(yr): yr for yr in range(2006, 2018)})

# the int64 cast fails loudly if any category is missing from the mapping
hhs['builtyrmid'] = hhs.builtyr2.map(builtyr_to_year).astype('int64')

# approximate structure age at survey time, binned for the hedonic regression below
structure_age = hhs.year.astype('int64') - hhs.builtyrmid
age_bin_edges = [-1, 5, 10, 15, 30, 40, 50, 60, 70, 100_000]
hhs['appxage'] = pd.cut(structure_age, age_bin_edges)
assert hhs.appxage.notnull().all()

In [None]:
plt.hist(hhs[hhs.ownershp == 'Rented'].annvalue, alpha=0.15, color='red', bins=20)
plt.hist(hhs[hhs.ownershp != 'Rented'].annvalue, alpha=0.15, color='blue', bins=20)

In [None]:
hhs[hhs.ownershp == 'Rented'].annvalue.mean()

In [None]:
hhs[hhs.ownershp != 'Rented'].annvalue.mean()

In [None]:
hhs.appxage.unique()

In [None]:
# Simple hedonic regression of annual housing cost on dummies for structure age, bedrooms,
# tenure, structure type and PUMA.
# dtype='float64' keeps the design matrix purely numeric: current pandas returns boolean dummies
# by default, and once the float constant is added that becomes an object-dtype array, which
# statsmodels refuses.
exog = pd.get_dummies(hhs[[
    'appxage',
    'bedrooms',
    'ownershp',
    'sfmf',
    'puma'
]], dtype='float64').drop(
    # one reference level dropped per categorical variable to avoid perfect collinearity
    columns=['appxage_(70, 100000]', 'bedrooms_No bedrooms', 'sfmf_MF', 'ownershp_Rented', 'puma_03701']
)
hedonic = sm.OLS(hhs.annvalue, sm.add_constant(exog))
hedonic_fit = hedonic.fit()


In [None]:
hhs.puma.unique()

In [None]:
# check vifs
#vifs = pd.Series([variance_inflation_factor(hedonic.exog, i) for i in range(len(hedonic.exog_names))], index=hedonic.exog_names)
#vifs

In [None]:
# with pd.option_context('display.max_rows', None):
#     display(vifs)

In [None]:
hedonic_fit.summary()

In [None]:
# Plot the estimated age-bin premiums with 95% intervals. The oldest bin is the omitted reference
# category, so it is appended with a coefficient and error of zero.
age_bins = hhs.appxage.cat.categories
params = [f'appxage_{i}' for i in age_bins][:-1]

positions = np.arange(len(params) + 1)
premiums = [*hedonic_fit.params[params], 0]
half_widths = [*(hedonic_fit.bse[params] * 1.96), 0]
tick_labels = [
    f'{i.left + 1}–{i.right}' if i.right < 10000 else '71 or more'
    for i in age_bins
]

plt.figure(figsize=(10, 6))
plt.errorbar(positions, premiums, yerr=half_widths)
plt.xticks(positions, tick_labels, rotation='vertical')
plt.xlabel('Age of structure (approximate)')
plt.ylabel('Cost premium (dollars, relative to more than 70 years)')
plt.savefig('../../dissertation/fig/sorting/value_age.pdf', bbox_inches='tight')

In [None]:
hhs.appxage.value_counts()

In [None]:
# Collapse approximate structure age into two categories: 'new' (up to and including 15 years)
# and 'old' (more than 15 years).
hhs['agecat'] = pd.cut(hhs.year.astype('int64') - hhs.builtyrmid, [-1, 15, np.inf], labels=['new', 'old'])

In [None]:
pd.crosstab(hhs.builtyr2, hhs.agecat)

In [None]:
# short tenure label used as the last component of the alternative identifier
tenure_labels = {
    'Rented': 'rent',
    'Owned or being bought (loan)': 'own',
}
hhs['ownrent'] = hhs.ownershp.map(tenure_labels)
# an unexpected ownership category would map to null
assert hhs.ownrent.notnull().all()

In [None]:
# Alternative identifier of the form '<puma>_<SF|MF>_<new|old>_<own|rent>'
hhs['choice'] = hhs.puma.str.cat(hhs.sfmf, sep='_').str.cat(hhs.agecat, sep='_').str.cat(hhs.ownrent, sep='_')
# number of distinct alternatives observed in the sample
len(hhs.choice.unique())

In [None]:
hhs.choice.value_counts().describe()

In [None]:
len(hhs)

## Neighborhood attributes

Add some neighborhood attributes.

### School quality

2012 STAR tests (...ah that brings back memories): https://star.cde.ca.gov/star2012/ResearchFileList.aspx?rf=True&ps=True

Spatial data: http://www.californiaschoolcampusdatabase.org/#download

In [None]:
# 2012 STAR test results, plus the entities file that is later joined to school locations by CDS code
testResults = pd.read_csv('../data/schoolquality/ca2012_1_csv_v3.txt')
entities = pd.read_csv('../data/schoolquality/ca2012entities_csv.txt')
entities = entities[entities['Type Id'] == 7].copy() # only public, not charter

In [None]:
schools = gp.read_file('../data/schoolquality/CSCD_school_centroids.shp')

In [None]:
# 14-character CDS code: 2-digit county + 5-digit district + 7-digit school, each zero-padded
entities['CDSCode'] = (
    entities['County Code'].astype('str').str.zfill(2)
    + entities['District Code'].astype('str').str.zfill(5)
    + entities['School Code'].astype('str').str.zfill(7)
)

In [None]:
# merge with spatial data
# Note that some schools have more than one physical location, so are represented more than once in the output
schools = schools.merge(entities, on='CDSCode', how='inner', validate='m:1')

In [None]:
# build the same zero-padded county + district + school key on the test results
testResults['CDSCode'] = (
    testResults['County Code'].astype('str').str.zfill(2)
    + testResults['District Code'].astype('str').str.zfill(5)
    + testResults['School Code'].astype('str').str.zfill(7)
)

In [None]:
# Fifth-grade math is the quality measure: it is always taken in elementary school (the most
# local school type) and nearly every student takes it.
# NOTE(review): Test Id 8 is presumably the math test; confirm against the STAR file layout.
is_fifth_grade_math = (testResults['Test Id'] == 8) & (testResults.Grade == 5)
fifthGradeMath = testResults[is_fifth_grade_math].copy()

In [None]:
# attach the fifth-grade math result to each school location; validate='m:1' requires that
# fifthGradeMath has at most one row per CDS code
schools = schools.merge(fifthGradeMath, on='CDSCode', how='inner', validate='m:1')
schools = schools[schools['Percentage At Or Above Proficient'] != '*'].copy() # drop schools with missing data (usually less than ten students took test)

In [None]:
schools['Percentage At Or Above Proficient'] = schools['Percentage At Or Above Proficient'].astype('float64')

In [None]:
plt.hist(schools['Percentage At Or Above Proficient'], bins=30)

In [None]:
# aggregate to PUMAs
# NOTE(review): absolute path to an external volume; not portable to another machine.
pumas = gp.read_file('/Volumes/Pheasant Ridge/IPUMS/pumas/ipums_puma_2010.shp')
# keep California (state FIPS '06') and reproject to EPSG:4326 (lon/lat) to match the schools layer
pumas = pumas[pumas.STATEFIP == '06'].to_crs(epsg=4326).copy()

In [None]:
schools = schools.to_crs(epsg=4326)

In [None]:
# tag each school point with the PUMA polygon it falls within
# NOTE(review): newer geopandas releases rename sjoin's `op` argument to `predicate`; check the
# installed version before re-running.
schools = gp.sjoin(schools, pumas, op='within')

In [None]:
pumaScores = schools.groupby('PUMA')['Percentage At Or Above Proficient'].median()

In [None]:
# Attach each PUMA's median proficiency score for the map below. Series.map looks the score up
# by PUMA code and leaves NaN where a PUMA has no reporting school, whereas .loc with a list of
# labels raises KeyError as soon as one label is missing.
pumas['score'] = pumas.PUMA.map(pumaScores)

In [None]:
pumas.plot(column='score', legend=True)
plt.xlim(-119, -117)
plt.ylim(33, 35)

In [None]:
hhs['fifthGradeMathMedianProficient'] = pumaScores.loc[hhs.puma].values

In [None]:
# Convert bedroom labels to integers. A label absent from this table becomes NaN, which makes the
# int64 cast fail, so an unexpected category cannot slip through silently.
bedroom_counts = {
    'No bedrooms': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4 (1970-2000, 2000-2007 ACS/PRCS)': 4,
    '5+ (1970-2000, 2000-2007 ACS/PRCS)': 5,
    '7': 7,
    '9': 9,
    '10': 10,
}
hhs['bedrooms'] = hhs.bedrooms.map(bedroom_counts).astype('int64')

## SLD data

In [None]:
# from my new residents transit project
pumaSld = pd.read_csv('../../../new_residents_transit/data/sld_2010.csv', dtype={'stpuma': 'str'})

In [None]:
# stpuma is the state FIPS (first two characters) followed by the PUMA code; keep California only
state_puma_code = pumaSld.stpuma
pumaSld['state'] = state_puma_code.str[:2]
pumaSld['puma'] = state_puma_code.str[2:]
pumaSld = pumaSld.loc[pumaSld.state == '06'].copy()

In [None]:
hhs = hhs.reset_index().merge(pumaSld[['jobAccessAuto', 'intersectionDens', 'puma']], on='puma', how='left', validate='m:1')\
    .set_index('serial')

In [None]:
# sanity checks after the SLD merge: the alternative labels must have survived the
# reset_index/merge/set_index round trip ...
assert not hhs.choice.isnull().any()
# ... and every household's PUMA must have found a matching SLD row (a left merge would otherwise
# leave NaNs that only surface much later, when the alternatives table is checked for nulls)
assert not hhs[['jobAccessAuto', 'intersectionDens']].isnull().any().any()

In [None]:
# recode the one labelled value so the whole column can be cast to integers
# (presumably the remaining values are already numeric or numeric strings; verify)
hhs['numprec'] = hhs.numprec.replace({
    '1 person record': 1
}).astype('int64')

## Transit access

From University of Minnesota Accessibility Observatory.

In [None]:
tracc = pd.read_csv('../data/ao/transitaccess.csv', dtype={'geoid': 'str'})

In [None]:
tracc = tracc.dropna(subset=['threshold'])

In [None]:
tracc.head()

In [None]:
# MABLE block to puma mapping, weighted by housing units
blockpuma = pd.read_csv('../data/mable_puma_block_hu.csv', skiprows=[1], dtype={'state': 'str', 'county': 'str', 'puma12': 'str', 'tract': 'str', 'block': 'str'})
# Block GEOID = county + tract (decimal point removed) + block.
# regex=False makes the '.' a literal dot regardless of the installed pandas version's default;
# as a regular expression, '.' would match every character.
blockpuma['blockgeoid'] = blockpuma.county.str.cat(blockpuma.tract.str.replace('.', '', regex=False)).str.cat(blockpuma.block)
blockpuma.head()

In [None]:
tracc = tracc.merge(blockpuma, left_on='geoid', right_on='blockgeoid', how='inner', validate='1:1')

In [None]:
# PUMA-level transit job access: average of block-level `jobs`, weighted by 2010 housing units
def _housing_weighted_jobs(blocks):
    """Housing-unit-weighted mean of `jobs` over the blocks of one PUMA."""
    return np.average(blocks.jobs, weights=blocks.hus10)

pumaTracc = (
    tracc.groupby('puma12')
    .apply(_housing_weighted_jobs)
    .rename('jobacc30')
    .to_frame()
)

In [None]:
pumaTracc.head()

In [None]:
hhs = hhs.merge(pumaTracc, left_on='puma', right_index=True, how='left', validate='m:1')

In [None]:
assert not hhs.jobacc30.isnull().any()

## Retail density

For every tract, sum all retail jobs in the tract itself and in every tract that lies within 1500 meters of its boundary, then divide by the combined land area of those tracts.

In [None]:
lodes = pd.read_csv('../data/lodes/ca_wac_S000_JT00_2017.csv', dtype={'w_geocode': 'str'})
lodes['tract'] = lodes.w_geocode.str.slice(0, 11)
lodes.head()

In [None]:
# total retail-trade jobs (LODES column CNS07) per tract
tractlodes = lodes.groupby('tract')['CNS07'].sum().rename('retailjobs').to_frame()
tractlodes.head()

In [None]:
# find all retail within 1500m of tract boundaries
tracts = gp.read_file('../data/tracts/tl_2019_06_tract.shp')
# Keep six counties by FIPS code (037 is LA County and 059 is Orange County) and reproject so
# the buffer distance below can be given in metres.
# NOTE(review): confirm that EPSG:26943 is the intended State Plane zone for this region.
tracts = tracts[tracts.COUNTYFP.isin(['025', '037', '059', '065', '071', '111'])].to_crs(epsg=26943)

In [None]:
buffered = tracts.copy()
buffered['geometry'] = buffered.buffer(1500)

In [None]:
assert not buffered.GEOID.duplicated().any()

In [None]:
# pair every buffered tract with each tract its buffer touches (the tract itself included)
# NOTE(review): newer geopandas releases rename sjoin's `op` argument to `predicate`; check the
# installed version before re-running.
neighbors = gp.sjoin(buffered, tracts, op='intersects')

In [None]:
neighbors = neighbors.merge(tractlodes, left_on='GEOID_right', right_index=True, how='left', validate='m:1')

In [None]:
# For each focal tract, total the retail jobs and the land area (ALAND, square metres) of all
# neighbouring tracts, then express density as jobs per square kilometre.
neighbor_totals = neighbors[['GEOID_left', 'retailjobs', 'ALAND_right']]
tractlocalretail = neighbor_totals.groupby('GEOID_left').sum()
tractlocalretail['retailDensJobsSqKm'] = (
    tractlocalretail['retailjobs'] / tractlocalretail['ALAND_right'] * 1e6
)
tractlocalretail.head()

In [None]:
tracthsg = pd.read_csv('../data/tract_housing/ACS_17_5YR_DP04_with_ann.csv', skiprows=[1], dtype={'GEO.id2': 'str'})
tracthsgheader = pd.read_csv('../data/tract_housing/ACS_17_5YR_DP04_with_ann.csv', nrows=2, header=None).transpose()
tracthsgheader.head()

In [None]:
tractlocalretail = tractlocalretail.merge(tracthsg[['GEO.id2', 'HC01_VC03']].rename(columns={'HC01_VC03': 'totalHousingUnits'}),
                                         left_index=True, right_on='GEO.id2', how='left', validate='1:1')

In [None]:
# read the MABLE geographic crosswalk
tractPumaXwalk = pd.read_csv('../data/mable_tracts_pumas.csv', skiprows=[1], dtype=str)

In [None]:
# Tract GEOID = county + tract with the decimal point removed.
# regex=False makes the '.' a literal dot regardless of the installed pandas version's default;
# as a regular expression, '.' would match every character.
tractPumaXwalk['tract_geoid'] = tractPumaXwalk.county.str.cat(tractPumaXwalk.tract.str.replace('.', '', regex=False))
tractPumaXwalk['puma_geoid'] = tractPumaXwalk.state.str.cat(tractPumaXwalk.puma12)
# afact is read as a string (dtype=str), hence the comparison with the literal text '1 '
assert np.all(tractPumaXwalk.afact == '1 ') # all tracts should be completely in one puma
tractPumaXwalk.head()

In [None]:
tractlocalretail = tractlocalretail.merge(tractPumaXwalk[['tract_geoid', 'puma_geoid']], left_on='GEO.id2', right_on='tract_geoid',
                                          how='left', validate='1:1')

In [None]:
tractlocalretail.head()

In [None]:
# PUMA-level retail density: tract densities averaged with total housing units as weights
def _housing_weighted_retail_density(puma_tracts):
    """Housing-unit-weighted mean retail job density over the tracts of one PUMA."""
    return np.average(puma_tracts.retailDensJobsSqKm, weights=puma_tracts.totalHousingUnits)

pumalocalretail = (
    tractlocalretail.groupby('puma_geoid')
    .apply(_housing_weighted_retail_density)
    .rename('retailJobDensSqKm')
    .to_frame()
    .reset_index()
)
# puma_geoid is state FIPS + PUMA code; strip the two-character state prefix
pumalocalretail['puma'] = pumalocalretail.puma_geoid.str[2:]
pumalocalretail.head()

In [None]:
hhs = hhs.reset_index().merge(pumalocalretail[['retailJobDensSqKm', 'puma']], on='puma', how='left', validate='m:1').set_index('serial')

In [None]:
# One row per alternative. PUMA-level attributes are constant within an alternative, so the first
# row's value is taken; bedrooms is averaged and annual cost uses the median.
def _first_value(values):
    """Return the first element of a group, including NaN if that is what comes first."""
    return values.iloc[0]

alts = hhs.groupby('choice', as_index=False).agg({
    'fifthGradeMathMedianProficient': _first_value,
    'bedrooms': 'mean',
    'jobAccessAuto': _first_value,
    'intersectionDens': _first_value,
    'annvalue': 'median',
    'sfmf': _first_value,
    'retailJobDensSqKm': _first_value,
    'jobacc30': _first_value,
})

## Factor-analyze density variables

To reduce collinearity.

In [None]:
alts['puma'] = alts.choice.str.slice(0, 5)

In [None]:
puma_dens = alts.groupby('puma')[['retailJobDensSqKm', 'jobacc30', 'jobAccessAuto', 'intersectionDens']].first()
puma_dens.head()

Alternatively, a PCA could be used to extract a single factor, since trying to differentiate these variables may be grasping at straws. I also tried an oblimin rotation, but the correlation between the two factors was >.9. A third varimax factor is basically 0.

In [None]:
# two-factor analysis with varimax rotation on the PUMA-level density/access variables
faz = FactorAnalyzer(rotation='varimax', n_factors=2)
faz.fit(puma_dens)
# The factor names are assigned by column position, so the labels are only right if the factors
# come out in the assumed order; the assertions below guard against that.
loadings = pd.DataFrame(faz.loadings_, index=puma_dens.columns, columns=['regacc', 'locacc'])
# make sure they came out in the expected order and with expected sign
assert loadings.loc['jobAccessAuto', 'regacc'] > 0
assert loadings.loc['retailJobDensSqKm', 'locacc'] > 0
assert loadings.loc['jobAccessAuto', 'regacc'] > loadings.loc['jobAccessAuto', 'locacc']
loadings

In [None]:
# Presentation table of factor loadings: weak loadings are blanked and the rest rounded to two
# decimals. The cutoff is on absolute value so that a strong negative loading is shown rather
# than silently blanked.
pretty_loadings = loadings.where(loadings.abs() > 0.3).round(2).fillna('').rename(
    index={
        'retailJobDensSqKm': 'Retail job density near PUMA',
        'jobacc30': 'Access to jobs via transit within 30 minutes',
        'jobAccessAuto': 'Access to jobs via auto within 45 minutes', # TODO confirm time
        'intersectionDens': 'Intersection density'
    },
    columns={
        'regacc': 'Regional access',
        'locacc': 'Local access'
    }
)
pretty_loadings

In [None]:
print(pretty_loadings.to_latex())

In [None]:
puma_dens.corr()

In [None]:
scores = pd.DataFrame(faz.transform(puma_dens), index=puma_dens.index, columns=['regacc', 'locacc'])

In [None]:
np.corrcoef(scores.regacc, scores.locacc)

In [None]:
alts = alts.merge(scores, left_on='puma', right_index=True, how='left', validate='m:1')

## Commute to work

TODO exclude out-of-state workers (data errors?) and telecommuters.

After Tra (2007), just check if the PUMA overlaps the work PUMA - computing distances is too noisy.

In [None]:
# every place of work puma is a collocation of regular PUMAs, and this file from https://usa.ipums.org/usa/volii/10pwpuma.shtml
# maps them to each other
powPumaComposition = pd.read_excel('../data/puma_migpuma1_pwpuma00.xls', skiprows=2).rename(columns={
    'State of Residence (ST)': 'state', 'PUMA': 'puma',
       'Place of Work State (PWSTATE2) or Migration State (MIGPLAC1)': 'powState',
       'PWPUMA00 or MIGPUMA1': 'powPuma'
})

In [None]:
# California rows only, with both PUMA codes normalized to zero-padded 5-character strings.
# NOTE(review): the filter compares `state` with the string '06'; if read_excel parsed that column
# as a number, this would select no rows. Confirm the parsed dtype.
caPowPumaComposition = powPumaComposition.loc[powPumaComposition.state == '06', ['puma', 'powPuma']].copy().astype('int64')
caPowPumaComposition['puma'] = caPowPumaComposition.puma.astype('str').str.zfill(5)
caPowPumaComposition['powPuma'] = caPowPumaComposition.powPuma.astype('str').str.zfill(5)
caPowPumaComposition.head()

In [None]:
ipums['workpuma'] = ipums.workpuma.astype('str').str.zfill(5)

In [None]:
alts = alts.merge(caPowPumaComposition, on='puma', how='left', validate='m:1')

In [None]:
# Compute relevant variables for all individuals
# only count people who commute in person to workplaces in LA County, Orange County
# Originally I was going to include surrounding counties too, but they would have the same effect on all alternatives so would drop out.
# This is much too slow for the full sample. Compute on the fly in eqsormo (later).
# commuters = ipums[ipums.worker & (ipums.tranwork != 'Worked at home') & (ipums.workstate == 'California') &\
#                     ipums.workcounty.isin([25, 37, 59, 65, 71, 111])].copy() # 37: LA county, 59: Orange County
# commuters['forty2'] = 42
# alts['forty2'] = 42
# # cross join
# jtw = commuters[['serial', 'workpuma', 'forty2']].merge(alts.reset_index(), on='forty2', how='left', validate='m:m')

In [None]:
#jtw['liveworksame'] = (jtw.workpuma == jtw.powPuma).astype('float64')

In [None]:
# hhalts = jtw.groupby(['serial', 'choice']).agg({
#     'liveworksame': np.mean
# })

In [None]:
# hhalts.liveworksame.describe()

In [None]:
# alts.set_index('choice', inplace=True)

In [None]:
# reindex to match for non-worker households
# hhalts = hhalts.reindex(pd.MultiIndex.from_product([hhs.index.values, alts.index.values]), fill_value=0)

In [None]:
# hhalts.liveworksame.describe()

## Split alternatives by owner and renter

Pre-interact all terms so we can put them into the model without allowing additional levels of interaction in eqsormo.

In [None]:
# Renter flag. The tenure label in the data is 'Rented', the value used for the annual-cost
# calculation and the own/rent mapping elsewhere in this notebook. Comparing against 'Renter'
# matched nothing and left every household flagged as a non-renter.
hhs['renter'] = hhs.ownershp == 'Rented'

In [None]:
# Rescale the alternative attributes so they are of similar magnitude.
# This cell is not idempotent: running it twice divides twice.
column_divisors = {
    'intersectionDens': 100,
    'retailJobDensSqKm': 100,
    'jobAccessAuto': 1e5,
    'fifthGradeMathMedianProficient': 100,
    'jobacc30': 1e5,
}
for column_name, divisor in column_divisors.items():
    alts[column_name] = alts[column_name] / divisor

# single-family indicator
alts['sfh'] = alts['sfmf'] == 'SF'

# top-decile indicators, with cutoffs taken from the PUMA-level factor scores
regacc_cutoff = np.percentile(scores.regacc, 90)
locacc_cutoff = np.percentile(scores.locacc, 90)
alts['hiregacc'] = alts.regacc >= regacc_cutoff
alts['hilocacc'] = alts.locacc >= locacc_cutoff

In [None]:
alts['annvalue'] /= 1000

In [None]:
hhs['hhincome'] /= 1000

In [None]:
alts = alts.set_index('choice')

In [None]:
# Replace each household's own annual cost with the median cost of its chosen alternative, then
# drop households whose income (in thousands) does not exceed that cost or is below 15.
hhs['annvalue'] = alts.annvalue.loc[hhs.choice].to_numpy()
cannot_afford_choice = hhs.hhincome <= hhs.annvalue
income_below_floor = hhs.hhincome < 15
to_drop = cannot_afford_choice | income_below_floor
print(to_drop.sum())
hhs = hhs.loc[~to_drop].copy()

## Categorical income

In [None]:
hhs.hhincome

In [None]:
# three mutually exclusive income bands (hhincome is in thousands of dollars at this point)
income_k = hhs.hhincome
hhs['inc_under_50k'] = income_k.lt(50)
hhs['inc_50_100k'] = income_k.ge(50) & income_k.lt(100)
hhs['inc_100k_plus'] = income_k.ge(100)
# every household must fall in exactly one band
band_membership = hhs[['inc_under_50k', 'inc_50_100k', 'inc_100k_plus']].sum(axis=1)
assert (band_membership == 1).all()

In [None]:
# income band x immigrant-household interactions
for income_band in ['inc_under_50k', 'inc_50_100k', 'inc_100k_plus']:
    hhs[f'{income_band}:immigrant'] = hhs[income_band] & hhs.immigrant

In [None]:
# income band x has-child x university interactions, for university and non-university households
income_bands = ['inc_under_50k', 'inc_50_100k', 'inc_100k_plus']
education_flags = {
    'university': hhs.university,
    'not_university': ~hhs.university,
}

child_interaction_cols = []
for education_label, education_flag in education_flags.items():
    for income_band in income_bands:
        interaction_name = f'{income_band}:child:{education_label}'
        hhs[interaction_name] = hhs[income_band] & hhs.child & education_flag
        child_interaction_cols.append(interaction_name)

# each household with a child falls in exactly one cell; households without children in none
assert (hhs[child_interaction_cols].sum(axis=1) == hhs.child).all()

In [None]:
hhs['hhwt'] /= 100

In [None]:
# Collapse vehicle availability to '0', '1', '2', '3+'. Values not listed here (e.g. '2') pass
# through unchanged.
vehicle_recodes = {
    'No vehicles available': '0',
    '1 available': '1',
    '3': '3+',
    '4': '3+',
    '5': '3+',
    '6 (6+, 2000, ACS and PRCS)': '3+',
}
hhs['vehchoice'] = hhs.vehicles.replace(vehicle_recodes)

# sorting puts zero-vehicle households first, which makes '0' the reference category
hhs = hhs.sort_values(by='vehchoice')

In [None]:
# rental alternatives are those whose identifier (the index) ends in 'rent'
alts['rent'] = alts.index.str.endswith('rent')

In [None]:
# attributes that get separate ':rent' and ':own' versions below
rent_own_cols = [
    'fifthGradeMathMedianProficient',
    'bedrooms',
    'jobAccessAuto',
    'intersectionDens',
    'retailJobDensSqKm',
    'jobacc30',
    'sfh',
]

In [None]:
# cast every attribute that is split by tenure, plus the two top-decile indicators, to float64
float_columns = rent_own_cols + ['hilocacc', 'hiregacc']
alts[float_columns] = alts[float_columns].astype('float64')

In [None]:
def _tenure_specific(tenure_alts, tenure):
    """Return a copy of `tenure_alts` with the attributes suffixed ':<tenure>'.

    Columns in `rent_own_cols` are renamed. The accessibility factor columns are copied under the
    suffixed name so the non-interacted versions stay available for the vehicle choice model.
    """
    out = tenure_alts.rename(columns={c: f'{c}:{tenure}' for c in rent_own_cols}).copy()
    for factor_col in ['locacc', 'hilocacc', 'regacc', 'hiregacc']:
        out[f'{factor_col}:{tenure}'] = out[factor_col]
    return out

rentals = _tenure_specific(alts[alts.rent], 'rent')
ownoccs = _tenure_specific(alts[~alts.rent], 'own')

In [None]:
# make sure all nulls are from concat
# (alts has no missing values, so after stacking, the only NaNs are the ':rent' columns on owner
# rows and the ':own' columns on renter rows; filling them with 0 switches those terms off)
assert not alts.isnull().any().any()
full_alts = pd.concat([rentals, ownoccs]).fillna(0)

In [None]:
full_alts['hilocacc:rent']

In [None]:
# Write the alternatives and household tables to parquet.
# sfmf (and appxage for households) are dropped before writing, presumably because those
# columns do not serialize cleanly or are not needed downstream; TODO confirm.
full_alts.drop(columns=['sfmf']).to_parquet('../data/full_alts_fixed_acc.parquet')
hhs.drop(columns=['appxage', 'sfmf']).to_parquet('../data/full_hh_fixed_acc.parquet')

In [None]:
full_alts['sfh:rent']

In [None]:
full_alts.locacc

In [None]:
len(hhs)

In [None]:
hhs.numprec.min()

In [None]:
hhs.hhwt

In [None]:
hhs[['inc_under_50k', 'inc_50_100k', 'inc_100k_plus']].sum()