# Congestion model

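This notebook estimates the congestion model: a random forest regressor that predicts `congested_tt_ratio` for tract-to-tract pairs by hour, using skim travel times and distances together with density measures at the origin, at the destination, and along the route.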

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.ensemble
import geopandas as gp
from joblib import dump, load
import openmatrix as omx
import sklearn.inspection

# When True, fit the random forest and save it to ../data/skim_rf.joblib;
# when False, load the previously saved model from that file instead.
ESTIMATE = True

In [None]:
# Load the Uber travel-time records that later cells filter on
# `congested_tt_ratio` and index by (from_geoid, to_geoid, hour).
uber = pd.read_parquet(path='../data/uber_with_tracts.parquet')

In [None]:
# Keep only rows with a congestion ratio; some pairs had no overnight records.
has_ratio = uber['congested_tt_ratio'].notnull()
uber = uber.loc[has_ratio].copy()

In [None]:
# Name the three index levels so later joins can refer to them by name.
uber.index = uber.index.set_names(['from_geoid', 'to_geoid', 'hour'])

## Load skims

In [None]:
# Open the skim matrices read-only; the handle stays open for the cells
# below that read 'car_freeflow' and 'car_distance_km' from it.
skims = omx.open_file('../la_abm/data/skims.omx', mode='r')

In [None]:
# Table whose `geoid` column labels the rows and columns of the skim matrices.
skim_tracts_path = '../la_abm/data/skim_tracts.parquet'
skim_idx = pd.read_parquet(skim_tracts_path)

In [None]:
# NOTE(review): scratch cell - the array is only displayed; it is not assigned
# to a name or used by any other cell shown in this notebook.
np.array([[1, 2], [3, 4]])

In [None]:
def _skim_to_series(matrix_name, series_name):
    """Return one skim matrix as a long Series indexed by (from_geoid, to_geoid).

    The matrix is read from the open `skims` file, labelled on both axes with
    `skim_idx.geoid`, stacked into one value per origin/destination pair and
    named `series_name`.
    """
    matrix = np.array(skims[matrix_name])
    square = pd.DataFrame(matrix, index=skim_idx.geoid, columns=skim_idx.geoid)
    long_form = square.stack().rename(series_name)
    long_form.index = long_form.index.set_names(['from_geoid', 'to_geoid'])
    return long_form


car_tt = _skim_to_series('car_freeflow', 'car_freeflow_tt')

netdist = _skim_to_series('car_distance_km', 'car_net_dist')

In [None]:
# Attach the free-flow travel time and network distance to every Uber record.
# A left join keeps all Uber rows; pairs missing from the skims get NaN.
od_keys = ['from_geoid', 'to_geoid']
for skim_series in (car_tt, netdist):
    uber = uber.join(skim_series, on=od_keys, how='left')
uber.head()

In [None]:
# Tract-level attributes (density columns, GEOID, geometry) used for the
# origin/destination merges and for choosing held-out tracts below.
tracts = gp.read_file(filename='tract_centroids_density.json')

In [None]:
# Try loading the along-route measures; the file might be too big to work with.
# GEOIDs are read as strings so they compare equal to the string GEOIDs used
# elsewhere in this notebook (e.g. '06037930401').
geoid_dtypes = dict.fromkeys(['from_geoid', 'to_geoid'], str)
along_route = pd.read_csv('../data/along_route.csv', dtype=geoid_dtypes)

In [None]:
# The file holds each pair in one direction only. Append a copy with origin
# and destination swapped so both directions are available for the join.
swap_direction = {'from_geoid': 'to_geoid', 'to_geoid': 'from_geoid'}
along_route = pd.concat(
    [along_route, along_route.rename(columns=swap_direction)],
    ignore_index=True,
)

In [None]:
# Number of along-route rows after adding the reverse direction.
along_route.shape[0]

In [None]:
# Pivot to one row per (from_geoid, to_geoid) with one column per
# (measure, band); combinations with no value are filled with -1.
along_route = (
    along_route
    .set_index(keys=['from_geoid', 'to_geoid', 'band'])
    .unstack(level='band')
    .fillna(value=-1)
)

In [None]:
# Flatten the two-level (measure, band) columns into single string names built
# from the measure name plus the band label's characters at positions 1 and 4.
# NOTE(review): presumably band labels look like '[0, 2)' with single-character
# bounds - confirm, since longer bounds would be truncated by this indexing.
along_route.columns = [
    f'{measure}_{band_label[1]}_{band_label[4]}'
    for measure, band_label in along_route.columns
]

In [None]:
# Discard the columns whose names contain 'fromidx' or 'toidx'.
idx_tags = ('fromidx', 'toidx')
idx_columns = [name for name in along_route.columns if any(tag in name for tag in idx_tags)]
along_route = along_route.drop(columns=idx_columns)

In [None]:
# Build the modelling table: Uber records plus along-route measures, with the
# latter prefixed 'along_route_' so they can be told apart from tract columns.
prefixed_along_route = along_route.add_prefix('along_route_')
data = uber.join(prefixed_along_route, on=['from_geoid', 'to_geoid'], how='left')

In [None]:
# Share of records with no along-route match, checked on one representative column.
# TODO is this a reasonable amount of missing data?
data['along_route_pop_dens_sqkm_25_0_2'].isna().mean()

In [None]:
# Replace every remaining missing value (e.g. pairs with no along-route match)
# with the sentinel -1.
data = data.fillna(value=-1)

In [None]:
# Free the along-route table now that it has been joined into `data`;
# the memory is needed for fitting the random forest later on.
del along_route

In [None]:
# Merge in origin and destination tract characteristics.
# The index levels become ordinary columns so they can serve as merge keys.
data = data.reset_index()

# Tract columns that are not used as model inputs.
unused_tract_cols = ['aland', 'total_pop', 'NAME', 'state', 'county', 'tract', 'tract_geoid', 'total_jobs', 'geometry']
tract_attrs = tracts.drop(columns=unused_tract_cols)

# Same merge for each end of the trip: tract columns get a 'from_' / 'to_'
# prefix, and validate='m:1' checks that each GEOID appears once in the tract table.
for side in ('from', 'to'):
    data = data.merge(
        tract_attrs.rename(columns=f'{side}_{{}}'.format),
        left_on=f'{side}_geoid',
        right_on=f'{side}_GEOID',
        how='left',
        validate='m:1',
    )

In [None]:
# Preview the merged table.
data.head(n=5)

In [None]:
# This one GEOID has no data in `tracts`; drop every record that starts or
# ends there from the estimation sample.
tract_without_data = '06037930401'
touches_tract = (data.from_geoid == tract_without_data) | (data.to_geoid == tract_without_data)
data = data[~touches_tract].copy()

In [None]:
# Sanity check: job density must be present at both ends of every trip.
# Nulls in the along-route bands are fine - they mean no tracts fell in that band.
for density_col in ('to_job_dens_sqkm', 'from_job_dens_sqkm'):
    assert not data[density_col].isnull().any()

In [None]:
# Any missing values left after the tract merges become the sentinel -1.
data = data.fillna(value=-1)

## Fit random forest

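Split into test and training data, and fit the random forest. For tractability, the training dataset is capped at 100,000 randomly selected observations; observations that start or end in one of 200 randomly held-out tracts are then removed from it, so the final training set is somewhat smaller.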

In [None]:
# One seeded generator drives both random draws below, so their order matters.
rng = np.random.RandomState(seed=48923)

# Flag 100,000 randomly chosen rows as training candidates.
n_train_candidates = 100_000
train_test = np.zeros(len(data), dtype='bool')
train_test[:n_train_candidates] = True
rng.shuffle(train_test)
data['train'] = train_test

# Test-set performance alone could still hide overfitting, because congestion
# is likely to be correlated across tracts. Hold out 200 tracts completely:
# any record that starts or ends in one of them is removed from training.
excluded_tracts = set(tracts.sample(200, random_state=rng).GEOID)
touches_excluded = data.from_geoid.isin(excluded_tracts) | data.to_geoid.isin(excluded_tracts)
data.loc[touches_excluded, 'train'] = False

# Peek at the first positions flagged by the initial random draw.
np.argwhere(train_test)[:100]

In [None]:
# Feature matrix for the training rows: hour, network distance, free-flow
# time, and every column whose name contains 'dens'.
density_cols = [name for name in data.columns if 'dens' in name]
feature_cols = ['hour', 'car_net_dist', 'car_freeflow_tt'] + density_cols
est_x = data.loc[data.train, feature_cols].copy()

In [None]:
# Either fit the random forest and save it, or reload a previously saved one,
# depending on the ESTIMATE flag. Both branches leave `rf` and `feature_names`
# defined.
if ESTIMATE:
    rf = sklearn.ensemble.RandomForestRegressor(n_estimators=100, min_samples_split=100, random_state=32013)
    rf.fit(est_x, data[data.train].congested_tt_ratio)
    # Store the feature order next to the model so a later load can be checked.
    feature_names = est_x.columns.values
    dump((rf, feature_names), '../data/skim_rf.joblib')
else:
    # NOTE: joblib files are pickle-based; only load files produced by this notebook.
    rf, feature_names = load('../data/skim_rf.joblib')
    # Later cells predict with `est_x.columns`. If the saved model was fit on a
    # different set or order of columns, predictions would be wrong, so fail
    # loudly here instead.
    if list(feature_names) != list(est_x.columns):
        raise ValueError(
            'Saved model features do not match the current feature matrix; '
            're-run with ESTIMATE = True to refit the model.'
        )

## Model fit and stats

Prediction scores on the test set. (An out-of-bag score is not currently computed: the forest is fit without `oob_score=True`.)

In [None]:
# Predict the congestion ratio for every row (training and test alike), using
# the same feature columns, in the same order, as the training matrix.
all_features = data.loc[:, est_x.columns]
data['predicted'] = rf.predict(all_features)

In [None]:
# R^2 on all rows not used for training: 1 - residual SS / total SS.
test = data.loc[~data.train]
test_resid_ss = ((test.congested_tt_ratio - test.predicted) ** 2).sum()
test_total_ss = ((test.congested_tt_ratio - test.congested_tt_ratio.mean()) ** 2).sum()
testr2 = 1 - test_resid_ss / test_total_ss


### Test $R^2$ on only tracts that were entirely excluded from model fitting

In [None]:
# Rows that start or end in one of the held-out tracts.
in_excluded = data.from_geoid.isin(excluded_tracts) | data.to_geoid.isin(excluded_tracts)
test_tracts = data.loc[in_excluded]

# R^2 restricted to those rows: 1 - residual SS / total SS.
excluded_resid_ss = ((test_tracts.congested_tt_ratio - test_tracts.predicted) ** 2).sum()
excluded_total_ss = ((test_tracts.congested_tt_ratio - test_tracts.congested_tt_ratio.mean()) ** 2).sum()
test_tract_r2 = 1 - excluded_resid_ss / excluded_total_ss

# NOTE(review): this counts distinct origins among the selected rows, which
# can include origins that are not themselves held out (rows selected via
# their destination) - confirm this is the intended count.
n_test_tracts = len(test_tracts.from_geoid.unique())
test_tract_ss = len(test_tracts)

In [None]:
# Plain R^2 is reported: an adjusted R^2 probably does not make sense for a
# test score, since there is no need to penalize for parameters out of sample.
report_lines = [
    '',
    f'Test R^2: {testr2:.3f}',
    f'Test R^2 on excluded tracts: {test_tract_r2:.3f}',
    '',
]
print('\n'.join(report_lines))

In [None]:
# Permutation-based feature importance, since "Impurity-based feature importances can be
# misleading for high cardinality features (many unique values)"
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
#
# The permutations are random, so a fixed random_state is passed to make the
# importances reproducible across runs, like the other seeded steps here.
# NOTE(review): importances are computed on the training rows; consider using
# held-out rows if the goal is to rank features by out-of-sample contribution.
feature_importance = sklearn.inspection.permutation_importance(
    rf,
    est_x,
    data[data.train].congested_tt_ratio,
    random_state=59421,
)

In [None]:
# Show the mean permutation importance per feature, most important first,
# with the row limit raised so the full list is printed.
importance_by_feature = pd.Series(feature_importance['importances_mean'], index=est_x.columns)
ranked_importance = importance_by_feature.sort_values(ascending=False)
with pd.option_context('display.max_rows', 300):
    display(ranked_importance)