In [1]:
import os
import pandas as pd
from choicemodels import MultinomialLogit, MultinomialLogitResults
from choicemodels.tools import MergedChoiceTable
import pandana as pdna

In [2]:
import numpy as np

In [3]:
# County codes used to keep Bay Area records; they are compared against the
# HCTFIP (home) and CTFIP (place) columns further down.
sfbay_counties = [
    1, 13, 41,
    55, 75, 81,
    85, 95, 97,
]

# Load CHTS Data

In [4]:
# Directory holding the CHTS deliverable CSVs. Set the CHTS_DATA_DIR
# environment variable to run the notebook on another machine; the original
# location stays the default so existing runs behave exactly as before.
data_dir = os.environ.get('CHTS_DATA_DIR', '/home/mgardner/chts-2012/data')

# Person records and their lookup table (joined on SAMPN, PERNO below).
chts_persons = pd.read_csv(os.path.join(data_dir, 'Deliv_PER.csv'), low_memory=False)
chts_persons_lookup = pd.read_csv(os.path.join(data_dir, 'LookUp_PER.csv'))

# Household records and the home-location lookup table (joined on SAMPN below).
chts_households = pd.read_csv(os.path.join(data_dir, 'Deliv_HH.csv'), low_memory=False)
chts_households_lookup = pd.read_csv(os.path.join(data_dir, 'LookUp_Home.csv'))

# Activity records plus the places they occurred at and the place lookup table.
acts = pd.read_csv(os.path.join(data_dir, 'Deliv_ACTIVITY.csv'))
places = pd.read_csv(os.path.join(data_dir, 'Deliv_PLACE.csv'), low_memory=False)
places_lookup = pd.read_csv(os.path.join(data_dir, 'LookUp_PLACE.csv'), low_memory=False)

#### Population Data

In [5]:
# Attach the person lookup columns to each person record. Both frames are
# indexed by (SAMPN, PERNO) so the join is index-to-index; column names present
# in both get the _persons / _lookup suffixes.
chts_persons = (
    chts_persons.set_index(['SAMPN', 'PERNO'])
    .merge(
        chts_persons_lookup.set_index(['SAMPN', 'PERNO']),
        left_index=True,
        right_index=True,
        suffixes=('_persons', '_lookup'),
    )
    .reset_index()
)

# Add the home-location lookup columns, joined index-to-index on SAMPN.
chts_persons = (
    chts_persons.set_index(['SAMPN'])
    .merge(
        chts_households_lookup.set_index(['SAMPN']),
        left_index=True,
        right_index=True,
    )
    .reset_index()
)

#### Restrict to Bay Area Residents

In [6]:
# Keep persons whose home county code (HCTFIP) is one of the Bay Area counties.
# reset_index() is called without drop=True, so the pre-filter row labels are
# kept as an extra 'index' column.
chts_persons = chts_persons.loc[chts_persons['HCTFIP'].isin(sfbay_counties)].reset_index()

In [7]:
# Rows and columns remaining after the Bay Area county filter.
chts_persons.shape

(24030, 188)

#### Discretionary Activities

- 1: PERSONAL ACTIVITIES (SLEEPING, PERSONAL CARE, LEISURE, CHORES)
- 4: EXERCISE (WITH OR WITHOUT EQUIPMENT)/PLAYING SPORTS
- 7: USING COMPUTER/TELEPHONE/CELL OR SMART PHONE OR OTHER COMMUNICATIONS DEVICE FOR PERSONAL ACTIVITIES
- 13: NON-WORK RELATED ACTIVITIES (SOCIAL CLUBS, ETC)
- 14: EXERCISE/SPORTS
- 15: VOLUNTEER WORK/ACTIVITIES
- 23: DRIVE THROUGH MEALS (SNACKS, COFFEE, ETC.)
- 24: DRIVE THROUGH OTHER (ATM, BANK)
- 26: SERVICE PRIVATE VEHICLE (GAS, OIL, LUBE, REPAIRS)
- 27: ROUTINE SHOPPING (GROCERIES, CLOTHING, CONVENIENCE STORE, HH MAINTENANCE)
- 28: SHOPPING FOR MAJOR PURCHASES OR SPECIALTY ITEMS (APPLIANCE, ELECTRONICS, NEW VEHICLE, MAJOR HH REPAIRS)
- 29: HOUSEHOLD ERRANDS (BANK, DRY CLEANING, ETC.)
- 30: PERSONAL BUSINESS (VISIT GOVERNMENT OFFICE, ATTORNEY, ACCOUNTANT)
- 31: EAT MEAL AT RESTAURANT/DINER
- 32: HEALTH CARE (DOCTOR, DENTIST, EYE CARE, CHIROPRACTOR, VETERINARIAN)
- 33: CIVIC/RELIGIOUS ACTIVITIES
- 34: OUTDOOR EXERCISE (PLAYING SPORTS/JOGGING, BICYCLING, WALKING, WALKING THE DOG, ETC.)
- 35: INDOOR EXERCISE (GYM, YOGA, ETC.)
- 36: ENTERTAINMENT (MOVIES, WATCH SPORTS, ETC)
- 37: SOCIAL/VISIT FRIENDS/RELATIVES

In [8]:
# Inner-join the Bay Area persons onto their activity records; column names
# present in both frames get the _persons / _acts suffixes.
acts = chts_persons.merge(acts, on=['SAMPN', 'PERNO'], suffixes=('_persons', '_acts'))

# Keep only activities whose purpose code (APURP) is one of the discretionary
# purposes listed in the markdown cell above.
acts_disc = acts.loc[acts['APURP'].isin([
    1, 4, 7,
    13, 14, 15,
    23, 24, 26, 27, 28, 29,
    30, 31, 32, 33, 34, 35, 36, 37,
])]

#### Restrict to In-Region Activities

In [9]:
# Keep places located in California whose county code (CTFIP) is one of the
# Bay Area counties; both conditions are applied in one boolean mask.
places = places[(places['STATE'] == 'CA') & places.CTFIP.isin(sfbay_counties)]

In [10]:
# Join discretionary activities to the in-region places they occurred at.
# The listed columns are removed from `places` before the join; PNAME and BLOCK
# are used later in the notebook, so they are presumably re-supplied by the
# places_lookup join in the next cell — confirm.
acts_places = acts_disc.merge(
    places.loc[:, ~places.columns.isin([
        'PNAME', 'CITY', 'ZIP',
        'XCORD', 'YCORD',
        'CTFIP', 'TRACT', 'BLOCK',
        'PPrimaryCity',
    ])],
    on=['SAMPN', 'PERNO', 'PLANO'],
    suffixes=('_acts_disc', '_places'),
)

In [11]:
# Add the place lookup columns to every activity/place row (inner join on the
# person + place key).
acts_places = acts_places.merge(
    places_lookup,
    on=['SAMPN', 'PERNO', 'PLANO'],
    suffixes=('_acts_disc_places', '_lookup'),
)

In [12]:
# Size of the activity table after both place joins.
acts_places.shape

(86607, 258)

#### Restrict to Out-of-Home Activities

In [13]:
# Drop activities whose place name is exactly 'HOME', then show what is left.
acts_places = acts_places.loc[acts_places['PNAME'].ne('HOME')]
acts_places.shape

(32810, 258)

# Load UrbanSim Block-Level Land Use Data

In [14]:
# Load the UrbanSim model tables needed below.
# The store is opened with mode='r' because pd.HDFStore defaults to append
# mode, which opens the file read-write and would create an empty file if the
# path were wrong. The context manager closes the store even when one of the
# key lookups raises.
with pd.HDFStore(
        '/home/mgardner/PILATES/pilates/urbansim/data/custom_mpo_06197001_model_data.h5',
        mode='r') as store:
    blocks = store['blocks']
    nodes = store['nodes']
    edges = store['edges']
    units = store['residential_units']
    hh = store['households']
    jobs = store['jobs']
    p = store['persons']

# Create UrbanSim-based Pandana Network 

In [15]:
# Index the node table by its 'id' column so node coordinates can be looked up
# by node id. The guard keeps the cell re-runnable: once 'id' is already the
# index, set_index('id') would raise a KeyError.
if nodes.index.name != 'id':
    nodes = nodes.set_index('id')

# Build the routable network from node coordinates and weighted edges.
net = pdna.Network(
    nodes["x"], nodes["y"],
    edges["from"], edges["to"],
    edges[["weight"]])

# Precompute with a horizon of 3000 (in the units of the 'weight' column —
# presumably meters; confirm).
net.precompute(3000)

Generating contraction hierarchies with 36 threads.
Setting CH node vector of size 92395
Setting CH edge vector of size 232266
Range graph removed 233224 edges of 464532
. 10% . 20% . 30% . 40% . 50% . 60% . 70% . 80% . 90% . 100%


In [16]:
# Look up a network node id for every block from the block's x/y coordinates.
blocks['node_id'] = net.get_node_ids(blocks['x'], blocks['y'])

# Get Block-Group Skims from BEAM

In [None]:
# One-off preprocessing, left disabled: it concatenated the BEAM result files,
# kept only the rows with timePeriod == 'AM' and pathType == 'SOV', and wrote
# them to ../data/chts_skims.csv, which the next cell reads.
# NOTE(review): re-enabling this needs `glob` and `tqdm`, neither of which is
# imported in the import cells at the top of this notebook.
# all_files = glob.glob("/home/mgardner/PILATES/pilates/beam/beam_output/result*")
# chts_skims = pd.DataFrame()
# for file in tqdm(all_files):
#     skims = pd.read_csv(
#         file, usecols=['timePeriod', 'pathType', 'origin', 'destination','DIST_meters'], low_memory=False)
#     skims = skims[(skims['timePeriod'] == 'AM') & (skims['pathType'] == 'SOV')]
#     chts_skims = pd.concat((chts_skims, skims), ignore_index=True)
# chts_skims.to_csv('../data/chts_skims.csv', index=False)

In [17]:
# Load the cached skims table; the path is relative to the notebook's working
# directory and matches the file written by the commented-out cell above.
chts_skims = pd.read_csv('../data/chts_skims.csv')

In [18]:
# Rename the origin/destination columns to the block-group names used in the
# rest of the notebook, then store both ids as strings left-padded with zeros
# to 12 characters.
chts_skims = chts_skims.rename(
    columns={'origin': 'block_grp', 'destination': 'dest_block_grp'})
chts_skims = chts_skims.assign(
    block_grp=chts_skims['block_grp'].astype(str).str.zfill(12),
    dest_block_grp=chts_skims['dest_block_grp'].astype(str).str.zfill(12),
)

# Create Estimation Data

#### Alternatives

In [19]:
# Count residential units in each block; the count column is named 'res_units'.
units_per_block = (
    units.reset_index()
    .groupby('block_id')[['unit_id']]
    .count()
    .rename(columns={'unit_id': 'res_units'})
)

In [20]:
# Sum of the households table's 'persons' column within each block_id
# (a Series named 'persons', indexed by block_id).
pp_block = hh.groupby('block_id')['persons'].sum()

In [21]:
# Per-block job counts. groupby(...).count() counts every remaining column;
# the 'sector_id' count is the one renamed and used as the job total.
total_jobs = (
    jobs.groupby('block_id')
    .count()
    .rename(columns={'sector_id': 'total_jobs'})
)

# Same count restricted to jobs whose sector_id is '44-45'.
retail_jobs = (
    jobs[jobs['sector_id'] == '44-45']
    .groupby('block_id')
    .count()
    .rename(columns={'sector_id': 'retail_jobs'})
)

# NOTE(review): this rebinds `jobs` from the raw job table to the per-block
# summary, so the cell cannot be re-run without reloading the raw table.
# Blocks with no '44-45' jobs get retail_jobs = 0 through the outer join + fillna.
jobs = total_jobs.merge(
    retail_jobs, left_index=True, right_index=True, how='outer').fillna(0)
jobs['pct_retail'] = jobs['retail_jobs'].div(jobs['total_jobs'])

In [22]:
# Left-join the per-block job, population and unit summaries onto blocks.
# After each join every missing value in the frame is filled with 0, so blocks
# absent from a summary table get zeros for its columns.
blocks = (
    blocks
    .merge(jobs[['pct_retail', 'total_jobs']],
           left_index=True, right_index=True, how='left').fillna(0)
    .merge(pp_block, left_index=True, right_index=True, how='left').fillna(0)
    .merge(units_per_block, left_index=True, right_index=True, how='left').fillna(0)
)

blocks['pop_density'] = blocks['persons'].div(blocks['square_meters_land'])

# Treat blocks with zero residential units as having one unit so the ratio
# below never divides by zero.
blocks['res_units'] = blocks['res_units'].replace(0, 1)
blocks['jobs_per_res_unit'] = blocks['total_jobs'].div(blocks['res_units'])

# Blocks without land area are dropped; this also removes any rows whose
# pop_density was computed by dividing by zero.
blocks = blocks.loc[blocks['square_meters_land'] > 0]

In [23]:
# Alternatives table: one row per block, with an independent copy of the block
# attributes used below.
alts_df = blocks.loc[:, [
    'total_jobs',
    'pct_retail',
    'persons',
    'pop_density',
    'res_units',
    'jobs_per_res_unit',
    'node_id',
]].copy()

In [24]:
# Reference block groups and network nodes for the two CBD anchors (the
# dtsf_* / dtsj_* columns below).
# NOTE(review): the 'sj' IDs start with 06013. If 'sj' means downtown San Jose,
# Santa Clara County would normally be 06085 — confirm this is the intended
# block group.
embc_bg_id = '060750105002'
sj_bg_id = '060133551141'

# Node ids are taken from one specific block inside each reference block group.
embc_node_id = blocks.loc['060750105002001', 'node_id']
sj_node_id = blocks.loc['060133551141003', 'node_id']

# Each alternative's block group is the first 12 characters of its block id
# (the index); the reference ids are broadcast to every row as constants.
alts_df = alts_df.assign(
    dest_block_grp=alts_df.index.str.slice(0, 12).values,
    dtsj_bg_id=sj_bg_id,
    dtsf_bg_id=embc_bg_id,
    dtsj_node_id=sj_node_id,
    dtsf_node_id=embc_node_id,
)

In [25]:
# Keep only alternatives whose block group appears as a destination in the skims.
alts_df = alts_df.loc[alts_df['dest_block_grp'].isin(chts_skims['dest_block_grp'])]

In [26]:
# Number of candidate destination blocks (rows) and their attributes (columns).
alts_df.shape

(106910, 12)

#### Observations

In [27]:
# Observations: one row per out-of-home discretionary activity, joined on SAMPN
# to the household attributes used for the chooser variables.
obs_df = acts_places[['SAMPN', 'AGE', 'EMPLY', 'HBLOCK', 'BLOCK']].merge(
    chts_households[['SAMPN', 'INCOM', 'HHVEH', 'HHSIZ', 'HHLIC']],
    on='SAMPN',
)

# Store home and activity block ids as strings left-padded with zeros to 15
# characters.
obs_df = obs_df.assign(
    HBLOCK=obs_df['HBLOCK'].astype(str).str.zfill(15),
    BLOCK=obs_df['BLOCK'].astype(str).str.zfill(15),
)
obs_df.shape

(32810, 9)

In [29]:
# Drop activities whose destination block is not in the blocks table.
obs_df = obs_df.loc[obs_df['BLOCK'].isin(blocks.index)]

In [30]:
# Size after dropping activities whose BLOCK is not in blocks.
obs_df.shape

(32722, 9)

In [31]:
# Drop activities whose home block is not in the blocks table.
obs_df = obs_df.loc[obs_df['HBLOCK'].isin(blocks.index)]

In [32]:
# Size after also requiring the home block (HBLOCK) to be in blocks.
obs_df.shape

(32690, 9)

In [33]:
# Network node of each chooser's home block. The values are assigned
# positionally (as a plain array) because the reindexed Series is labelled by
# block id rather than by obs_df's own index.
obs_df['node_id_home'] = (
    blocks['node_id']
    .reindex(obs_df['HBLOCK'])
    .astype(int)
    .to_numpy()
)

In [34]:
# Income indicators from the coded INCOM category. The dollar thresholds in the
# column names are the notebook's own labels for these code cut-offs — confirm
# against the CHTS codebook.
obs_df['inc_lt_75k'] = obs_df['INCOM'].lt(6)
obs_df['inc_gt_150k'] = obs_df['INCOM'].gt(7)
obs_df['inc_gt_200k'] = obs_df['INCOM'].gt(8)

# Age indicators.
obs_df['age_lt_18'] = obs_df['AGE'].lt(18)
obs_df['age_gt_65'] = obs_df['AGE'].gt(65)

# Household vehicle-count indicators.
obs_df['hhveh_gt_1'] = obs_df['HHVEH'].gt(1)
obs_df['hhveh_is_1'] = obs_df['HHVEH'].eq(1)
obs_df['hhveh_is_0'] = obs_df['HHVEH'].eq(0)
obs_df['hhveh_gt_0'] = obs_df['HHVEH'].gt(0)

# Vehicles per household member.
obs_df['veh_pp'] = obs_df['HHVEH'].div(obs_df['HHSIZ'])
obs_df['veh_pp_gt_1'] = obs_df['veh_pp'].gt(1)

# Vehicles per licensed driver.
# NOTE(review): if HHLIC can be 0 this yields inf (or NaN for 0/0) — confirm
# whether such households need special handling.
obs_df['veh_per_driver'] = obs_df['HHVEH'].div(obs_df['HHLIC'])
obs_df['veh_per_driver_gt_1'] = obs_df['veh_per_driver'].gt(1)

In [35]:
# Home block group = first 12 characters of the 15-character home block id.
obs_df['block_grp'] = obs_df['HBLOCK'].str.slice(0, 12)

In [36]:
# Keep choosers whose home block group appears as an origin in the skims.
obs_df = obs_df.loc[obs_df['block_grp'].isin(chts_skims['block_grp'])]

In [37]:
# Size after requiring the home block group to appear as a skim origin.
obs_df.shape

(32690, 24)

In [38]:
# The chosen destination block must itself be one of the alternatives.
obs_df = obs_df.loc[obs_df['BLOCK'].isin(alts_df.index)]

In [39]:
# Size after requiring the chosen block to be among the alternatives.
obs_df.shape

(32690, 24)

In [40]:
# Drop rows where AGE, INCOM or HHSIZ is 98 or higher (these look like survey
# missing-value codes — TODO confirm against the CHTS codebook).
obs_df = obs_df.loc[
    obs_df['AGE'].lt(98)
    & obs_df['INCOM'].lt(98)
    & obs_df['HHSIZ'].lt(98)
]

In [41]:
# Size after dropping rows with AGE, INCOM or HHSIZ of 98 or more.
obs_df.shape

(28800, 24)

#### Distances to CBD for Alternatives

In [42]:
# Index the skims by (origin block group, destination block group) so they can
# be joined by key below. Earlier cells read these two fields as ordinary
# columns, so this cell must run after them and cannot be re-run as-is.
chts_skims = chts_skims.set_index(['block_grp', 'dest_block_grp'])

In [43]:
# Skim distance from each alternative's block group to the 'dtsj' reference
# block group. The skims are indexed by (block_grp, dest_block_grp), so the
# alternative's block group matches the first index level and the reference
# block group the second.
alts_df = alts_df.merge(
    chts_skims[['DIST_meters']],
    left_on=['dest_block_grp', 'dtsj_bg_id'],
    right_index=True,
    how='left',
).rename(columns={'DIST_meters': 'DIST_DTSJ'})

# Same lookup for the 'dtsf' reference block group.
alts_df = alts_df.merge(
    chts_skims[['DIST_meters']],
    left_on=['dest_block_grp', 'dtsf_bg_id'],
    right_index=True,
    how='left',
).rename(columns={'DIST_meters': 'DIST_DTSF'})

# Row-wise minimum of the two distances; NaNs are skipped, so the result is
# NaN only when both skim distances are missing.
alts_df['dist_to_nearest_cbd'] = alts_df[['DIST_DTSF', 'DIST_DTSJ']].min(axis=1)

In [44]:
# Network shortest-path length from each alternative's node to each of the two
# reference nodes, used below as a fallback when the skim distance is missing.
alts_df = alts_df.assign(
    pdna_dtsj_dist=net.shortest_path_lengths(alts_df['node_id'], alts_df['dtsj_node_id']),
    pdna_dtsf_dist=net.shortest_path_lengths(alts_df['node_id'], alts_df['dtsf_node_id']),
)

In [45]:
# Nearest-CBD network distance, with infinite values treated as missing.
# NOTE(review): only np.inf is masked here, while the chooser-alternative
# distances later in the notebook mask values above 1e6 — confirm whether the
# same cut-off should apply to these columns.
alts_df['pdna_dist_to_nearest_cbd'] = (
    alts_df[['pdna_dtsj_dist', 'pdna_dtsf_dist']]
    .min(axis=1)
    .replace(np.inf, np.nan)
)

# Skim-based distance with infinities treated as missing, then gaps filled
# from the network-based distance.
alts_df['dist_to_nearest_cbd'] = (
    alts_df['dist_to_nearest_cbd']
    .replace(np.inf, np.nan)
    .fillna(alts_df['pdna_dist_to_nearest_cbd'])
)

# Create Merged Choice Table 

In [46]:
# Passed as `sample_size` to MergedChoiceTable below, and reused later to count
# how many distances are missing per chooser.
sample_size = 1000

In [47]:
%%time 
mct = MergedChoiceTable(
    obs_df, alts_df, chosen_alternatives='BLOCK', sample_size=sample_size, interaction_terms=chts_skims[['DIST_meters']])

CPU times: user 30.5 s, sys: 7.93 s, total: 38.5 s
Wall time: 38.4 s


In [48]:
# Flatten the merged choice table into a DataFrame, with its index levels
# turned into ordinary columns.
mct_df = mct.to_frame().reset_index()

#### Interaction Terms (Chooser-Alt Distances)

In [49]:
%%time
# Network shortest-path length from each chooser's home node to the node of
# each sampled alternative, one value per row of mct_df.
dists = net.shortest_path_lengths(mct_df['node_id_home'], mct_df['node_id'])

CPU times: user 22min 31s, sys: 1.29 s, total: 22min 33s
Wall time: 46.6 s


In [50]:
# Store the network distances next to the skim distances (same row order as
# the node columns they were computed from).
mct_df['pdna_dists'] = dists

In [51]:
# Treat network distances above 1e6 as missing (presumably pandana's marker for
# an unreachable node pair — confirm).
mct_df['pdna_dists'] = mct_df['pdna_dists'].mask(mct_df['pdna_dists'].gt(1e6))

In [52]:
# replace NaN BEAM skims with Pandana distances
mct_df.loc[pd.isnull(mct_df['DIST_meters']), 'DIST_meters'] = mct_df.loc[pd.isnull(mct_df['DIST_meters']), 'pdna_dists']
mct_df.loc[pd.isnull(mct_df['pdna_dists']), 'pdna_dists'] = mct_df.loc[pd.isnull(mct_df['pdna_dists']), 'DIST_meters']
mct_df.loc[mct_df['pdna_dists'] == 0, 'pdna_dists'] = mct_df.loc[mct_df['pdna_dists'] == 0, 'DIST_meters']

In [53]:
# impute missing distances
counts = mct_df.groupby('obs_id')[['DIST_meters']].count()
mean_dists = mct_df.groupby('obs_id')[['DIST_meters']].mean()
mean_dists.rename(columns={'DIST_meters': 'mean_DIST_meters'}, inplace=True)
print("Imputing {0} values at most for each chooser".format(sample_size - counts.min().values[0]))

Imputing 4 values at most for each chooser


In [54]:
# Distribution across choosers of how many distances are still missing,
# assuming every chooser has exactly sample_size rows.
(sample_size - counts['DIST_meters']).value_counts()

0    26334
1     2200
2      246
3       19
4        1
Name: DIST_meters, dtype: int64

In [55]:
# Attach each chooser's mean distance and use it wherever a distance is still
# missing.
mct_df = mct_df.merge(mean_dists, left_on='obs_id', right_index=True)
mct_df['DIST_meters'] = mct_df['DIST_meters'].fillna(mct_df['mean_DIST_meters'])

#### Additional Post-processing

In [56]:
# Sort by chooser and by the chosen flag, both descending, then keep only the
# model columns and index the frame by (chooser, alternative).
mct_df = (
    mct_df
    .sort_values(['obs_id', 'chosen'], ascending=False)
    .loc[:, [
        'obs_id', 'block_id',
        'age_lt_18', 'age_gt_65',
        'inc_lt_75k', 'inc_gt_150k',
        'veh_pp', 'veh_per_driver', 'veh_pp_gt_1', 'veh_per_driver_gt_1',
        'hhveh_gt_1', 'hhveh_gt_0', 'hhveh_is_0', 'hhveh_is_1',
        'pct_retail', 'total_jobs', 'jobs_per_res_unit',
        'persons', 'pop_density',
        'DIST_meters', 'dist_to_nearest_cbd',
        'chosen',
    ]]
    .set_index(['obs_id', 'block_id'])
)

In [57]:
# Rebuild a MergedChoiceTable from the post-processed frame, which is indexed
# by (obs_id, block_id).
mct = MergedChoiceTable.from_df(mct_df)

# Simple model

In [59]:
%%time
expression = 'np.log1p(pct_retail) + np.log1p(total_jobs) + ' + \
    'np.log1p(persons) + jobs_per_res_unit + np.log1p(dist_to_nearest_cbd):veh_pp - 1'
results = MultinomialLogit(
    mct,
    model_expression=expression)
fitted = results.fit()
print(fitted.report_fit())

                  CHOICEMODELS ESTIMATION RESULTS                   
Dep. Var.:                chosen   No. Observations:          28,800
Model:         Multinomial Logit   Df Residuals:              28,795
Method:       Maximum Likelihood   Df Model:                       5
Date:                 2022-01-05   Pseudo R-squ.:              0.160
Time:                      20:25   Pseudo R-bar-squ.:          0.160
AIC:                 334,411.155   Log-Likelihood:      -167,200.577
BIC:                 334,452.495   LL-Null:             -198,943.352
                                          coef   std err         z     P>|z|   Conf. Int.
-----------------------------------------------------------------------------------------
np.log1p(pct_retail)                    2.2708     0.032    71.363     0.000             
np.log1p(total_jobs)                    0.6120     0.003   233.446     0.000             
np.log1p(persons)                       0.0377     0.002    15.918     0.000            

In [60]:
# Estimated coefficients as a plain list (presumably in the order of the terms
# in the model expression — TODO confirm).
fitted.fitted_parameters

[2.2708005690814117,
 0.6119614583221185,
 0.037655816769314676,
 -9.098902472454982e-05,
 0.1164899750612353]

# Store data to disk

In [62]:
# Persist the alternatives table; with to_csv's defaults the index is written
# as the first CSV column.
alts_df.to_csv('../data/chts_alts_df.csv')