# Congestion model

Uber Movement provides travel time data for Los Angeles, but not for the entire Southern California region. We use a random forest to extrapolate congestion to the rest of the region.

In [None]:
%reload_ext ipy_blink1

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gp
import cenpy
import rtree
import tqdm
import shapely
import threading
import queue
import multiprocessing

# Connection URI for the local PostgreSQL database; handed to gp.read_postgis
# when the tract centroids are loaded further down.
DB_URI = 'postgresql://matthewc@localhost/matthewc'

## Load Uber Movement data

In [None]:
# Uber Movement travel-time table. The zone ids are read as strings so they can
# be used as labels when matching against the zone geometry file.
uber = pd.read_csv(
    '../data/los_angeles-censustracts-2019-4-OnlyWeekdays-HourlyAggregate.csv',
    dtype={'sourceid': 'str', 'dstid': 'str'},
)

In [None]:
# Uber Movement zone geometries, reprojected to EPSG:26911 (a metre-based UTM
# projection) to match the other layers in this notebook.
uber_geo = gp.read_file('../data/uber_tracts_la.json')
uber_geo = uber_geo.to_crs(epsg=26911)

In [None]:
# The provided file does not say which county each Uber tract is in, so we
# recover it spatially. Collapse every zone to its centroid first so that the
# point-in-county join below gives an unambiguous answer.
uber_geo['geometry'] = uber_geo.geometry.centroid

In [None]:
# County boundaries (2019 TIGER/Line shapefile), in the same CRS as the Uber zones.
counties = gp.read_file('../../sorting/data/counties/tl_2019_us_county.shp')
counties = counties.to_crs(epsg=26911)

In [None]:
# Keep only the six study-area counties in state FIPS 06. County FIPS codes
# repeat across states, so both the state and the county code must match.
in_state = counties.STATEFP == '06'
in_study_area = counties.COUNTYFP.isin(('025', '037', '059', '065', '071', '111'))
counties = counties[in_state & in_study_area].copy()

In [None]:
# Attach the containing county's attributes (STATEFP, COUNTYFP, ...) to each
# zone. A left join keeps zones that fall in none of the counties; their county
# columns come back null.
# NOTE(review): newer geopandas releases name the `op` argument `predicate` --
# confirm against the installed version before upgrading.
uber_geo = gp.sjoin(
    uber_geo,
    counties,
    how='left',
    op='within',
)

In [None]:
# Every zone must have been matched to one of the study-area counties.
assert uber_geo.COUNTYFP.notnull().all()

In [None]:
# Tract GEOID = state FIPS + county FIPS + tract code, concatenated as strings.
uber_geo['geoid'] = uber_geo.STATEFP.str.cat([uber_geo.COUNTYFP, uber_geo.TRACT])

In [None]:
# Index the zones by Uber's MOVEMENT_ID so the travel-time rows (which carry
# those ids in sourceid/dstid) can look up their tract GEOID by label.
uber_geo = uber_geo.set_index('MOVEMENT_ID')

In [None]:
# Translate Uber's zone ids into tract GEOIDs for both ends of each trip.
# .to_numpy() drops the MOVEMENT_ID labels so the values are assigned to the
# travel-time rows positionally.
geoid_by_zone = uber_geo['geoid']
uber['source_geoid'] = geoid_by_zone.loc[uber.sourceid].to_numpy()
uber['target_geoid'] = geoid_by_zone.loc[uber.dstid].to_numpy()

In [None]:
# Preview the travel-time table now that source_geoid/target_geoid are attached.
uber

In [None]:
# Key each row by origin tract, destination tract and hour of day ('hod').
uber = uber.set_index(['source_geoid', 'target_geoid', 'hod'])

In [None]:
# Some tracts are split into several Uber zones, so one (source, target, hour)
# key can occur more than once. Drop the now-redundant zone ids and take the
# unweighted mean of every remaining column within each key.
uber = (
    uber
    .drop(columns=['sourceid', 'dstid'])
    .groupby(level=['source_geoid', 'target_geoid', 'hod'])
    .mean()
)

In [None]:
# After averaging, each (source, target, hour) key must appear exactly once.
assert uber.index.is_unique

## Create the dependent variable

The dependent variable is the ratio of the travel time at a particular hour to the free-flow travel time. For each origin–destination pair, I take the free-flow travel time to be the lowest hourly mean travel time observed between 10 PM and 5 AM.

In [None]:
# Hours of day used as the free-flow window: 0-4 and 22-23.
np.r_[0:5,22:24]

In [None]:
# Free-flow travel time for each (source, target) pair: the smallest hourly
# mean travel time among the overnight hours (0-4 and 22-23).
overnight_hours = np.r_[0:5, 22:24]
overnight = uber.loc[pd.IndexSlice[:, :, overnight_hours]]
fftt = overnight.groupby(level=[0, 1])['mean_travel_time'].min()

In [None]:
# Ratio of each hour's travel time to the pair's free-flow time. fftt is keyed
# by (source, target) only, so look it up with the hour level dropped and then
# relabel the result with the full hourly index before dividing. Pairs with no
# overnight observation have no free-flow time and get a null ratio.
free_flow = fftt.reindex(uber.index.droplevel(2))
free_flow.index = uber.index
uber['congested_tt_ratio'] = uber.mean_travel_time / free_flow

In [None]:
# Share of hourly rows for which no congestion ratio could be computed.
uber.congested_tt_ratio.isnull().mean()

In [None]:
# Save the tract-keyed travel times, including congested_tt_ratio, so other
# analyses can reuse them without repeating the steps above.
uber.to_parquet('../data/uber_with_tracts.parquet')

In [None]:
# Boxplots of the congestion ratio by hour of day. Useful, and intended for the dissertation, but very slow to render, so left disabled.
# uber.groupby(level=2).boxplot(column='congested_tt_ratio', subplots=False)
# plt.ylim(0, 4)
# plt.xticks(np.arange(24), np.arange(24))

## Prepare the covariates for the random forest model

In [None]:
# Tract centroids and land area for the six study-area counties.
# ORDER BY forces a stable row order from one run to the next.
tract_query = '''
    SELECT geoid, aland, ST_Centroid(geog)::geometry AS geom
    FROM diss.ca_tracts
    WHERE countyfp in ('025', '037', '059', '065', '071', '111')
    ORDER BY geoid'''
tract_centroids = gp.read_postgis(tract_query, DB_URI).set_index('geoid')

In [None]:
# cenpy handle for the 2017 American Community Survey; used below to download
# tract-level population.
acs5 = cenpy.products.ACS(2017)

In [None]:
# Download the B01003 table for every tract in each study-area county and
# stack the per-county results into one table.
study_counties = [
    'Los Angeles, CA',
    'Riverside, CA',
    'San Bernardino, CA',
    'Ventura, CA',
    'Imperial, CA',
    'Orange, CA',
]
tract_pop = pd.concat(
    [
        acs5.from_county(name, level='tract', variables=['^B01003'], return_geometry=False)
        for name in study_counties
    ],
    ignore_index=True,
)

In [None]:
# Attach total population to each tract. The left side is matched on its geoid
# index, the right side on the ACS GEOID column; validate='1:1' makes the merge
# fail loudly if either side repeats a tract. Later cells refer to the tract id
# through the GEOID column brought in here.
pop_by_tract = tract_pop.rename(columns={'B01003_001E': 'total_pop'})
tract_centroids = tract_centroids.merge(
    pop_by_tract,
    how='left',
    left_index=True,
    right_on='GEOID',
    validate='1:1',
)
# every tract must have received a population figure
assert tract_centroids.total_pop.notnull().all()

In [None]:
# LODES workplace-area file; the workplace geocode is kept as a string so it
# can be sliced by character position below.
lodes = pd.read_csv(
    '../data/ca_wac_S000_JT00_2017.csv',
    dtype={'w_geocode': 'str'},
)

In [None]:
# The first 11 characters of the workplace geocode are used as the tract id;
# sum the C000 job count over all rows that share a tract.
lodes['tract_geoid'] = lodes.w_geocode.str[:11]
total_jobs = lodes.groupby('tract_geoid', as_index=False)['C000'].sum()

In [None]:
# Attach job totals by tract id. Tracts that do not appear in the LODES file
# come back null from the left join and are treated as having zero jobs.
jobs_by_tract = total_jobs.rename(columns={'C000': 'total_jobs'})
tract_centroids = tract_centroids.merge(
    jobs_by_tract,
    how='left',
    left_on='GEOID',
    right_on='tract_geoid',
    validate='1:1',
)
tract_centroids['total_jobs'] = tract_centroids['total_jobs'].fillna(0)

In [None]:
# Reproject the centroids to EPSG:26911 (metre-based) so that the search
# windows and distance bands computed below are expressed in metres.
tract_centroids = tract_centroids.to_crs(epsg=26911)

In [None]:
# Population and job density per square kilometre of land. Dividing aland by
# 1000**2 treats it as square metres.
# NOTE(review): a tract with aland == 0 would produce inf/NaN here -- confirm
# none exist in the study area.
land_sqkm = tract_centroids.aland / 1000**2
tract_centroids['pop_dens_sqkm'] = tract_centroids.total_pop / land_sqkm
tract_centroids['job_dens_sqkm'] = tract_centroids.total_jobs / land_sqkm

In [None]:
# Preview the tract table with population, jobs and the two density columns.
tract_centroids

## Create bands around each tract

In [None]:
# Spatial index over the tract centroids, used to find nearby tracts quickly.
# Each centroid is stored as a degenerate box (min corner == max corner), keyed
# by the tract's row label.
# NOTE(review): rtree ids are integers, so this relies on the row labels being
# integer-valued at this point -- confirm if the merges above change.
tract_idx = rtree.index.Index()
for idx, pt in zip(tract_centroids.index, tract_centroids.geom):
    tract_idx.insert(idx, (pt.x, pt.y, pt.x, pt.y))

In [None]:
%%time
# For every tract, summarise the population and job density of the surrounding
# tracts in four concentric distance bands (centroid to centroid).
# The lowest bound is 1e-5 km rather than 0 so the tract itself is not included.
band_edges_km = [(1e-5, 2), (2, 4), (4, 6), (6, 8)]
percentiles = [25, 50, 75, 95]
density_cols = ['pop_dens_sqkm', 'job_dens_sqkm']
# pop_dens_sqkm_25 ... pop_dens_sqkm_95, then job_dens_sqkm_25 ... job_dens_sqkm_95
stat_names = [f'{col}_{pct}' for col in density_cols for pct in percentiles]

densbands = {}

tract_rows = zip(
    tract_centroids.index,
    tract_centroids.geom,
    tract_centroids.geom.x,
    tract_centroids.geom.y,
)
for i, (idx, geom, x, y) in enumerate(tract_rows):
    # progress indicator
    if i % 500 == 0:
        print(f'{i}..', end='')

    # The index query is a 16 km square window around the centroid, so it
    # over-selects; exact distances are computed and filtered per band below.
    search_window = (x - 8000, y - 8000, x + 8000, y + 8000)
    candidate_tracts = tract_idx.intersection(search_window)
    distances = tract_centroids.loc[candidate_tracts].distance(geom)

    for low, high in band_edges_km:
        in_band = (distances > (low * 1000)) & (distances <= (high * 1000))
        tracts_in_dist = tract_centroids.loc[distances[in_band].index]
        if tracts_in_dist.empty:
            # no neighbours in this band; the gap is filled with -1 afterwards
            continue

        band_stats = np.concatenate([
            np.percentile(tracts_in_dist[col], percentiles)
            for col in density_cols
        ])
        densbands[idx, f'({low}, {high}]'] = pd.Series(band_stats, index=stat_names)

# Reshape to one row per tract and one column per (statistic, band), flatten the
# column labels to '<statistic>_<band>', and mark empty bands with -1.
densbands = pd.DataFrame(densbands).T.unstack()
densbands.columns = ['_'.join(labels) for labels in densbands.columns]
densbands = densbands.fillna(-1)

In [None]:
# Add the band statistics to the tract table. Tracts that had no neighbours in
# any band are absent from densbands; reindexing gives them -1 in every band
# column.
bands_for_all_tracts = densbands.reindex(tract_centroids.index, fill_value=-1)
tract_centroids = tract_centroids.join(bands_for_all_tracts)

In [None]:
# Save the tract centroids with their density and band columns; a later step
# turns these into along-route bands.
# NOTE(review): the geometries are still in the projected EPSG:26911 CRS at this
# point -- confirm the downstream reader expects that rather than lon/lat.
tract_centroids.to_file('../data/tract_centroids_density.json', driver='GeoJSON')