# Assemble skims

Put together all the skims from different places.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import openmatrix as omx

## Read original skims

This already has walking and driving free-flow skims in it.

In [None]:
# Zone (tract) ordering table; its `geoid` column labels the skim rows/columns below.
skim_tract_mapping = pd.read_parquet('../la_abm/data/skim_tracts.parquet')
# Source skims, opened read-only.
orig_skims = omx.open_file('../data/skim_inputs/skims_wrong_names.omx', mode='r')
# Output skims. NOTE: mode 'w' starts a fresh file, so every later cell must run to rebuild it.
skims = omx.open_file('../model_inputs/skims.omx', mode='w')

In [None]:
# Tract GEOIDs as a plain array; used further down to put transit skims into zone order.
geoid_order = skim_tract_mapping['geoid'].to_numpy()

## Walking skims

Just copy these over.

In [None]:
# The MTC skims are in miles:
# https://github.com/BayAreaMetro/travel-model-one/blob/master/model-files/scripts/skims/HwySkims.job
# so convert the km input, then code missing (NaN) pairs as -999.
skims['DISTWALK'] = np.nan_to_num(
    np.array(orig_skims['walk_dist_km']) / 1.609,
    nan=-999,
)

In [None]:
# DIST in the MTC data appears to be a network distance (the matrix is not symmetrical),
# so the car network distance is used here: km -> miles, NaN -> -999.
skims['DIST'] = np.nan_to_num(
    np.array(orig_skims['car_distance_km']) / 1.609,
    nan=-999,
)

In [None]:
# Register a zero-based 'taz' mapping with one entry per row of the tract table.
# Workaround for https://github.com/ActivitySim/activitysim/issues/390
zone_count = len(skim_tract_mapping)
skims.create_mapping('taz', np.arange(zone_count))

## Driving skims

A random forest model was used to create predictions for the ratio of congested travel time to uncongested travel time at each hour of the day.

In [None]:
# Random-forest predictions; read below through a three-level index and the
# `congested_tt_ratio` column.
rf_res = pd.read_parquet(path='predicted_congestion_ratio.parquet')

In [None]:
# number of prediction rows
rf_res.shape[0]

In [None]:
# peek at the first few predictions
rf_res.head(n=5)

In [None]:
# Skim time periods as (start_hour, end_hour) pairs; the end hour is treated as
# exclusive where these are used, so hours 0-2 fall in no period.
time_periods = dict(
    EA=(3, 5),
    AM=(5, 9),
    MD=(9, 14),
    PM=(14, 18),
    EV=(18, 24),
)

In [None]:
# Label the free-flow matrix with tract GEOIDs on both axes, then stack it into a
# long Series indexed by (row geoid, column geoid) so it can be aligned with the
# per-pair congestion ratios.
tract_ids = skim_tract_mapping.geoid.to_numpy()
free_flow_drive_time = pd.DataFrame(
    np.array(orig_skims['car_freeflow']),
    index=tract_ids,
    columns=tract_ids,
).stack()

In [None]:
# For each time period: average the predicted congestion ratio over the period's
# hours, scale the free-flow times by it, and write SOV time/distance skims.
for name, (start, end) in time_periods.items():
    print(name)

    # range() leaves out `end` on purpose: it is the hour after the window is over
    hours_in_period = list(range(start, end))
    scale_factors = (
        rf_res.loc[pd.IndexSlice[:, :, hours_in_period]]
        .groupby(level=[0, 1])
        .congested_tt_ratio
        .mean()
    )

    # align the ratios with the free-flow pairs; any pair without a prediction
    # becomes NaN and trips the first assertion
    congested_travel_time = free_flow_drive_time * scale_factors.reindex(
        free_flow_drive_time.index, fill_value=np.nan
    )
    assert not congested_travel_time.isnull().any()
    assert (congested_travel_time >= free_flow_drive_time).all()

    # long Series -> square matrix in skim zone order
    zone_order = skim_tract_mapping.geoid.to_numpy()
    congested_travel_time = congested_travel_time.unstack()
    mtx = congested_travel_time.loc[zone_order, zone_order].to_numpy()

    # free-flow drive times are already in minutes, so no unit conversion for time
    skims[f'SOV_TIME__{name}'] = mtx
    # distance: km -> miles
    skims[f'SOV_DIST__{name}'] = np.array(orig_skims['car_distance_km']) / 1.609

In [None]:
# Drop the large driving-skim intermediates so the kernel can reclaim their memory.
del rf_res
del free_flow_drive_time, scale_factors, congested_travel_time

### Non-SOV skims

These are just computed as some fraction of the SOV trips: HOV2, HOV3, and all toll trips in the AM and PM periods of 5 miles or more are assumed to take 10% less time than SOV trips, but never less than the free-flow time (i.e. whichever of the two is slower). In all other periods they are identical to the SOV skims.

In [None]:
# Derive HOV and toll time/distance skims from the SOV skims.
sov_dist = np.array(skims['SOV_DIST__AM'])  # distance does not vary by period
ff_time = np.array(orig_skims['car_freeflow'])

for name in time_periods:
    sov_time = np.array(skims[f'SOV_TIME__{name}'])

    if name not in ('AM', 'PM'):
        # off-peak: same as SOV
        non_sov_time = sov_time
    else:
        # peak: 10% faster for trips of 5+ miles, floored at the free-flow time
        discounted = np.where(sov_dist >= 5, sov_time * 0.9, sov_time)
        non_sov_time = np.maximum(discounted, ff_time)

    for vehicle_class in ('HOV2', 'HOV3', 'SOVTOLL', 'HOV2TOLL', 'HOV3TOLL'):
        skims[f'{vehicle_class}_TIME__{name}'] = non_sov_time
        skims[f'{vehicle_class}_DIST__{name}'] = sov_dist

### Tolls

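Assume bridge tolls are 0, and value tolls are $1.57/mile peak and $1.48/mile off-peak in 2021 dollars, charged on one quarter of the trip distance. The code below works in cents and divides by `cpi_2000_2021`.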

CA 73 is [tolled for 14.6 miles](https://www.google.com/maps/dir/33.6547791,-117.8651856/33.5390436,-117.6747031/@33.6067673,-117.8551035,12.75z/data=!4m2!4m1!3e0?hl=en) and costs $1.57/mi peak and $1.48/mi offpeak. https://thetollroads.com/sites/default/files/FY21_RateCard.pdf

In [None]:
# Per-mile toll rates: 157 (peak) and 148 (off-peak) cents, divided by the CPI factor
# (presumably deflating 2021 cents to year-2000 cents -- confirm against the model's units).
cpi_2000_2021 = 1.55
peak_toll, offpeak_toll = (cents / cpi_2000_2021 for cents in (157, 148))

for name in time_periods:
    toll_rate = peak_toll if name in ('AM', 'PM') else offpeak_toll

    for toll_mode in ('SOVTOLL', 'HOV2TOLL', 'HOV3TOLL'):
        # toll roads are rare in SoCal, so assume a quarter of each trip is on a tolled facility
        skims[f'{toll_mode}_VTOLL__{name}'] = (sov_dist / 4) * toll_rate
        # no toll bridges in SoCal; https://en.wikipedia.org/wiki/Vincent_Thomas_Bridge was the only one
        skims[f'{toll_mode}_BTOLL__{name}'] = np.zeros_like(sov_dist)

    # the non-toll modes still need an (all-zero) bridge toll matrix
    for mode in ('SOV', 'HOV2', 'HOV3'):
        skims[f'{mode}_BTOLL__{name}'] = np.zeros_like(sov_dist)


## Transit skims

Reformat these from the Julia output.

In [None]:
# Multipliers for the transit skims: seconds -> hundredths of a minute, and
# meters -> hundredths of a mile (using 1609 m per mile).
seconds_to_hundredths_of_minutes = 100.0 / 60.0
meters_to_hundredths_of_miles = 100.0 / 1609.0


def generate_skim (df, col, idxs, missings, idx_shape):
    """Gather one column of `df` into a dense matrix using precomputed row positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the values to gather.
    col : str
        Name of the column to read.
    idxs : 1-D integer array
        Positional row index into `df` for every matrix cell, flattened. Cells
        without a source row must still hold a valid position (any placeholder
        works); they are blanked out through `missings`.
    missings : 1-D boolean array
        Same length as `idxs`; True where the cell has no source row.
    idx_shape : tuple of int
        Shape of the matrix to return.

    Returns
    -------
    numpy.ndarray
        Floating-point array of shape `idx_shape` with NaN in the missing cells.
        Float columns keep their dtype; any other dtype is promoted to float.
    """
    selected = df[col].iloc[idxs]
    # explicit copy so writing NaN below can never touch `df` or hit a read-only view
    out = selected.to_numpy(copy=True)
    if out.dtype.kind != 'f':
        # integer/bool/nullable columns (e.g. boarding counts) cannot store NaN
        out = selected.to_numpy(dtype=float, na_value=np.nan)
    out[missings] = np.nan
    return out.reshape(idx_shape)
    
# Flat fare per transit mode: a cents amount divided by the CPI factor
# (presumably 2021 cents deflated to year-2000 cents -- confirm).
fares = {
    fare_mode: fare_cents / cpi_2000_2021
    for fare_mode, fare_cents in (
        ('LOC', 175),
        ('LRF', 175),
        ('HVY', 175),
        ('COM', 500),
        ('EXP', 250),
        ('TRN', 200),
    )
}
    

def _skim_matrix(df, col, idxs, missings, idx_shape, scale=1.0):
    """Gather column `col` into a zone-by-zone matrix, multiply by `scale`, and code missing pairs as -999."""
    return np.nan_to_num(generate_skim(df, col, idxs, missings, idx_shape) * scale, nan=-999)


# NOTE(review): the mode and access/egress lists are currently narrowed to a single
# combination (the full lists are commented out). The output file is opened in 'w'
# mode at the top of the notebook, so a fresh run only writes DRV_LOC_WLK transit
# skims -- restore the full lists before a complete rebuild.
#for mode in ['LOC', 'LRF', 'HVY', 'COM', 'EXP', 'TRN']:
for mode in ['LOC']:
    for acc, egr in [
        #('WLK', 'WLK'),
        #('WLK', 'DRV'),
        ('DRV', 'WLK')
    ]:
        prefix = f'{acc}_{mode}_{egr}'
        print(prefix)
        tr_data = pd.read_parquet(f'../data/skim_inputs/transit_skims_{prefix}.parquet')
        tr_data = tr_data.set_index(['from_geoid', 'to_geoid'])
        tr_data['time_period'] = tr_data.time_period.astype('category')

        for name in time_periods.keys():
            tp_data = tr_data[tr_data.time_period == name].copy()
            tp_data['intloc'] = np.arange(len(tp_data))
            # unstack once and get a list of row positions to avoid having to unstack once per column
            idxs = tp_data.intloc.unstack(fill_value=-1).reindex(index=geoid_order, columns=geoid_order, fill_value=-1).to_numpy()
            idx_shape = idxs.shape
            idxs = idxs.reshape(-1)
            missings = idxs == -1
            # the indexer cannot contain -1/NaN, so point missing cells at row 0 (they are
            # blanked again via `missings`); np.where builds a new array instead of writing
            # into one that may be a read-only view
            idxs = np.where(missings, 0, idxs)

            lookup = (idxs, missings, idx_shape)
            to_min = seconds_to_hundredths_of_minutes

            skims[f'{prefix}_WAIT__{name}'] = _skim_matrix(tp_data, 'wait', *lookup, scale=to_min)
            totivt = _skim_matrix(tp_data, 'total_ivt', *lookup, scale=to_min)
            if mode != 'TRN':
                skims[f'{prefix}_KEYIVT__{name}'] = _skim_matrix(tp_data, 'key_ivt', *lookup, scale=to_min)
                skims[f'{prefix}_TOTIVT__{name}'] = totivt
            else:
                skims[f'{prefix}_IVT__{name}'] = totivt
            skims[f'{prefix}_WAUX__{name}'] = _skim_matrix(tp_data, 'walk_time_xfers', *lookup, scale=to_min)
            skims[f'{prefix}_IWAIT__{name}'] = _skim_matrix(tp_data, 'initial_wait', *lookup, scale=to_min)
            skims[f'{prefix}_XWAIT__{name}'] = _skim_matrix(tp_data, 'wait_time_xfers', *lookup, scale=to_min)
            # boardings are a count, not a duration: do NOT apply the seconds -> minutes factor
            skims[f'{prefix}_BOARDS__{name}'] = _skim_matrix(tp_data, 'n_boardings', *lookup)
            skims[f'{prefix}_WACC__{name}'] = _skim_matrix(tp_data, 'walk_access_time', *lookup, scale=to_min)
            skims[f'{prefix}_WEGR__{name}'] = _skim_matrix(tp_data, 'walk_egress_time', *lookup, scale=to_min)
            # constant matrices are shaped from this loop's own zone grid rather than
            # from a variable left over by an earlier cell
            skims[f'{prefix}_FAR__{name}'] = np.full(idx_shape, fares[mode], dtype=float)

            if acc == 'DRV' or egr == 'DRV':
                if 'drive_time' in tp_data.columns:
                    skims[f'{prefix}_DTIM__{name}'] = _skim_matrix(tp_data, 'drive_time', *lookup, scale=to_min)
                    skims[f'{prefix}_DDIST__{name}'] = _skim_matrix(tp_data, 'drive_dist', *lookup, scale=meters_to_hundredths_of_miles)
                else:
                    print('WARN: USING ARBITRARY DRIVE TIMES AND DISTANCES!!!!!!!!!!!!!!!!!!!!!!!!')
                    skims[f'{prefix}_DTIM__{name}'] = np.full(idx_shape, 1500, dtype=float)
                    skims[f'{prefix}_DDIST__{name}'] = np.full(idx_shape, 500, dtype=float)

            if mode == 'LRF':
                # no ferries in this model
                skims[f'{prefix}_FERRYIVT__{name}'] = np.full(idx_shape, 0, dtype=float)

In [None]:
# for skim in list(skims.list_matrices()):
#     if 'DRV_LOC_WLK' in skim:
#         skims.remove_node(f'/data/{skim}')

## Bike distances

Assuming same as walk distance for now.

In [None]:
# Bike distance is assumed equal to walk distance: read the walk matrix back from
# the output file and store it again under the bike name.
walk_dist_miles = np.array(skims['DISTWALK'])
skims['DISTBIKE'] = walk_dist_miles

## Prune skims to only what is needed

ActivitySim loads all skims into memory, so remove any skims that aren't in the SF model input data.

In [None]:
# Collect the matrix names of the SF example skims, then close that file straight away.
sf_skims = omx.open_file('../example_mtc_full/example_mtc_full/data/skims.omx')
sf_matrices = {matrix_name for matrix_name in sf_skims.list_matrices()}
sf_skims.close()

In [None]:
# Work out what to drop before touching the file, then remove every matrix
# that the SF example does not have.
unneeded = [mtx_name for mtx_name in skims.list_matrices() if mtx_name not in sf_matrices]
for skim in unneeded:
    skims.remove_node(f'/data/{skim}')

## Intrazonal skims

Just assume 0.5 * min(dist_to_other_zones) as was done in https://github.com/BayAreaMetro/travel-model-one/blob/master/model-files/scripts/skims/HwySkims.job

This is also inordinately slow (1+ hr) for reasons I do not understand. This should take only a few minutes.

In [None]:
# Fill each matrix diagonal with half of the smallest valid off-diagonal value in its row.
# NOTE(review): the filter below is narrowed to the DRV_LOC_WLK matrices (the general
# condition is commented out) -- restore it when rebuilding all skims.
for mtxname in skims.list_matrices():
    #if '_FAR__' not in mtxname:  # don't scale back fares
    if 'DRV_LOC_WLK_' in mtxname and '_FAR__' not in mtxname:
        mtx = np.array(skims[mtxname])
        # -999 and NaN both mean "no value"; mask them and the diagonal with +inf so
        # neither can be picked as the row minimum
        candidates = np.where((mtx == -999) | np.isnan(mtx), np.inf, mtx)
        np.fill_diagonal(candidates, np.inf)
        nearest = candidates.min(axis=1)
        # a row with no valid neighbour keeps the -999 missing code instead of a huge sentinel
        new_diag = np.where(np.isfinite(nearest), nearest / 2, -999)
        np.fill_diagonal(mtx, new_diag)
        skims.remove_node(f'/data/{mtxname}')  # no remove_matrix call?
        # `nan` must be passed by keyword: the second positional argument of nan_to_num is `copy`
        skims[mtxname] = np.nan_to_num(mtx, nan=-999)  # just for good measure, make sure all nans are banished

In [None]:
# After pruning, the output should hold exactly as many matrices as the SF example.
n_la_matrices, n_sf_matrices = len(skims.list_matrices()), len(sf_matrices)
assert n_la_matrices == n_sf_matrices, 'not all SF matrices present in LA data'

In [None]:
# Diagnostic: SF matrix names that are still absent from the LA output.
la_matrices = skims.list_matrices()
missing_from_la = [sf_name for sf_name in sf_matrices if sf_name not in la_matrices]
missing_from_la

In [None]:
# Close both OMX handles: the output so it is finalized on disk, and the read-only
# input opened at the top of the notebook, which was otherwise left open.
skims.close()  # phew, all done!
orig_skims.close()