## Setup

In [1]:
import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import matplotlib.colors as colors
from datetime import datetime
import ast

import numpy as np
import pandas as pd
import geopandas as gpd
import contextily as ctx
import random
import isuelogit as isl
import glob
import time

from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [2]:
import tensorflow as tf
# Pass an empty device list so that TensorFlow does not see any GPU and the notebook runs on CPU
tf.config.set_visible_devices([], 'GPU')

In [3]:
# Path management
# The project root is taken to be two levels above the current working directory.
# NOTE(review): the cell changes the working directory, so running it a second time without
# restarting the kernel moves two more levels up -- run it once per kernel session.
main_dir = str(Path(os.path.abspath("")).parents[1])
os.chdir(main_dir)
print('main dir:', main_dir)

# Make the modules stored under 'src' importable
sys.path.append(os.path.join(main_dir, 'src'))

# Relative path (resolved against the working directory set above) from which isuelogit reads network data
isl.config.dirs['read_network_data'] = "input/network-data/fresno/"

main dir: /Users/pablo/github/nesuelogit


In [4]:
%load_ext autoreload
%autoreload 2

from pesuelogit.networks import read_OD, load_k_shortest_paths
from pesuelogit.etl import data_curation, add_period_id

# Functions from internal modules
from nesuelogit.models import compute_generated_trips, compute_generation_factors, \
    regularization_kfold, create_tvgodlulpe_model_fresno
from nesuelogit.etl import build_network, get_tensors_by_year
from nesuelogit.visualizations import plot_flow_vs_traveltime, plot_congestion_maps
from nesuelogit.metrics import mse, mape, r2_score,  z2score, mdape
from nesuelogit.utils import read_paths

In [5]:
# Seed for reproducibility
# The same seed is set in NumPy, in Python's random module and in TensorFlow
_SEED = 2023
np.random.seed(_SEED)
random.seed(_SEED)
tf.random.set_seed(_SEED)

In [6]:
# Start time of the notebook; the last cell subtracts it from the current time to report the global runtime
t0_global = time.time()

In [7]:
# Timestamp (format yymmddHHMMSS) that is added to the names of the files written to disk
ts = datetime.now().strftime('%y%m%d%H%M%S')
print('Timestamp:',ts)

Timestamp: 231219120556


## Configuration

In [8]:
# Critical hyperparameters
# Number of epochs and learning rates are keyed by training stage
_EPOCHS = {'learning':10}
# NOTE(review): the 'generation' learning rate is not read by any of the cells shown in this notebook -- confirm it is still needed
_LR = {'learning': 1e-1, 'generation':10}
_BATCH_SIZE = 1

# Number of splits for k-fold method
_N_SPLITS_HP = 5
# Candidate values for the weight of the equilibrium component of the loss
_GRID_EQUILIBRIUM_HP = [0, 1e-3, 1e-2, 1e-1, 1e0, 1e1]
# _GRID_EQUILIBRIUM_HP = [0, 5e-1, 1, 2]
# _GRID_EQUILIBRIUM_HP = [0, 5e-1]

# These hyperparameters can be left in their current values
# Default weight of every loss component; the 'equilibrium' entry is replaced by each value of the
# grid above during the hyperparameter search
_LOSS_WEIGHTS ={'od': 0, 'traveltime': 1, 'flow': 1, 'equilibrium': 1}
_EQUILIBRIUM_STAGE = False
# Passed as threshold_relative_gap to the training and prediction calls
# NOTE(review): presumably an infinite threshold disables the stopping criterion on the relative gap -- confirm
_RELATIVE_GAP = float('inf')
_LOSS_METRIC  = z2score
_EVALUATION_METRIC = mdape
_DTYPE = tf.float32
_OPTIMIZERS = {'learning': tf.keras.optimizers.legacy.Adam(learning_rate=_LR['learning'])}

# Tuesdays to Thursdays
_DAYSOFWEEK = [1,2,3] # Monday:0, Sunday:6
# Hours of the day from 6 to 20 (the upper bound of arange is exclusive)
_HOURS = np.arange(6,21)
#_HOURS = [6,7,8, 15,16,17]

# Exogenous attributes in utility function
_FEATURES_Z = ['tt_sd', 'median_inc', 'incidents', 'bus_stops', 'intersections']

## Read nodes and link-specific data

In [9]:
# Node-level data
nodes_df = pd.read_csv('./input/network-data/fresno/nodes/fresno-nodes-gis-data.csv')

# Link-level data; the columns 'link_key' and 'pems_id' are stored as text in the file and are parsed
# back into Python literals with ast.literal_eval
links_df = pd.read_csv('./input/network-data/fresno/links/fresno-link-specific-data.csv',
                       converters={"link_key": ast.literal_eval, "pems_id": ast.literal_eval})

## Build Fresno network

In [11]:
# Build the network object, keyed as 'fresno', from the link and node tables read above
network = build_network(links_df=links_df, nodes_df=nodes_df, crs='epsg:4326', key= 'fresno')

In [None]:
## Display network
# Read the links shapefile and declare its coordinate reference system as EPSG:2228
links_gdf = gpd.read_file('./input/network-data/fresno/gis/links/fresno-links-gis.shp').set_crs(
        'EPSG:2228')
# Reproject the links to EPSG:3857 before plotting them and adding the OpenStreetMap basemap
ax = links_gdf.to_crs(epsg=3857).plot(figsize=(10, 10), alpha=0.5)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
plt.show()

## Read and load OD matrix

In [12]:
# Read the O-D matrix in sparse format and load it into the network
read_OD(network=network, sparse=True)

Matrix Q (1789, 1789) read in 0.0[s] with sparse format
66266.3 trips were loaded among 6970 o-d pairs



## Read and load paths

In [13]:
# Read the paths from file, load them into the network and update the incidence matrices
# read_paths(network=network, update_incidence_matrices=True, filename = 'paths-fresno-k3.csv')
read_paths(network=network, update_incidence_matrices=True, filename = 'paths-full-model-fresno.csv')

18289 paths were read and incidence matrices were built


## Read and process spatio-temporal link-level data

In [26]:
folderpath = './input/network-data/fresno/links/spatiotemporal-data/'

# glob returns the matching files in arbitrary order, so the list is sorted to make the row order
# of the concatenated table reproducible across machines and file systems
df = pd.concat([pd.read_csv(file) for file in sorted(glob.glob(folderpath + "*link-data*"))], axis=0)

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['year'] = df.date.dt.year

# Link keys are stored as text in the files; they are parsed and stored as a categorical whose
# categories follow the order of the links in the network
df['link_key'] = pd.Categorical(df['link_key'].apply(ast.literal_eval), list(network.links_dict.keys()))
# Parse the period labels and write them back with the format year-month-day-hour
df['period'] = pd.to_datetime(df['period'], format = '%Y-%m-%d-%H').dt.strftime('%Y-%m-%d-%H')

# Keep the days of the week listed in _DAYSOFWEEK (Tuesdays to Thursdays with the current configuration)
df = df[df['date'].dt.dayofweek.isin(_DAYSOFWEEK)]

# Select data from first Tuesdays of 2019 and 2020
# df = df[df['date'].isin(["2019-10-01", "2020-10-06"])]

In [27]:
# Add period id for time-varying estimation
# The name of the column that defines the periods is set once and reused in every call below
period_feature = 'hour'

df = add_period_id(df, period_feature=period_feature)

# Lookup table with one row per period: value of the period feature and the id assigned to it
period_keys = df[[period_feature,'period_id']].drop_duplicates().reset_index(drop=True).sort_values(period_feature)
print(period_keys)

    hour  period_id
11     6          0
12     7          1
13     8          2
14     9          3
0     10          4
1     11          5
2     12          6
3     13          7
4     14          8
5     15          9
6     16         10
7     17         11
8     18         12
9     19         13
10    20         14


In [28]:
# Data curation
# Free flow travel time: length over reference speed for links of type 'LWRLK' and zero for any other link type
df['tt_ff'] = np.where(df['link_type'] != 'LWRLK', 0,df['length']/df['speed_ref_avg'])
# A reference speed equal to zero gives an undefined travel time, which is stored as a missing value
df.loc[(df.link_type == "LWRLK") & (df.speed_ref_avg == 0),'tt_ff'] = float('nan')

# Average travel time, computed in the same way but with the historical average speed
df['tt_avg'] = np.where(df['link_type'] != 'LWRLK', 0,df['length']/df['speed_hist_avg'])
df.loc[(df.link_type == "LWRLK") & (df.speed_hist_avg == 0),'tt_avg'] = float('nan')

# Standard deviation of the average travel time within every combination of period and link
tt_sd_adj = df.groupby(['period_id','link_key'])[['tt_avg']].std().reset_index().rename(columns = {'tt_avg': 'tt_sd_adj'})

# Attach the standard deviation to every observation of the same period and link
df = df.merge(tt_sd_adj, on = ['period_id','link_key'])

df = data_curation(df)

In [29]:
# Travel time features are rescaled by a factor of 60 to go from hours to minutes.
# The standard deviation is taken from the adjusted column computed during data curation.
df['tt_sd'] = df['tt_sd_adj']*60
df['tt_avg'] = df['tt_avg']*60
df['tt_ff'] = df['tt_ff']*60

# The free flow travel time of a link is the minimum value observed for that link, and it is
# stored in the performance function of the link
tt_ff_links = df.groupby('link_key')['tt_ff'].min()
for link in network.links:
    is_current_link = tt_ff_links.index==link.key
    network.links_dict[link.key].performance_function.tf = float(tt_ff_links[is_current_link].iloc[0])

## Process node-level data

In [30]:
nodes_df = nodes_df.rename(columns ={'pop_tract':'population','stops_tract': 'bus_stops','median_inc':'income'})

# Node attributes used as features of the trip generation model
features_generation = ['population','income', 'bus_stops']

# An explicit copy is taken so that the column assignments below write into an independent table
# and do not trigger pandas' SettingWithCopyWarning
nodes_df = nodes_df[['key','type'] + features_generation].copy()

# Missing values are replaced by the mean of the feature
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(nodes_df[features_generation])
nodes_df[features_generation] = imp_mean.transform(nodes_df[features_generation])

# Features are standardized with the scaler fitted on the imputed values
scaler = preprocessing.StandardScaler().fit(nodes_df[features_generation].values)
nodes_df[features_generation] = scaler.transform(nodes_df[features_generation].values)

## Exploratory Data Analysis

In [31]:
# To check that there is a balanced amount of observations per date
# Number of non-missing values of 'hour' in every date
obs_date = df.groupby('date')['hour'].count()

In [32]:
# Stats by date
# Daily means of speed and counts, with the number of observations per date added as the column 'total_obs'
df.groupby('date')[['speed_sd','speed_avg', 'counts']].mean().assign(total_obs = obs_date)

Unnamed: 0_level_0,speed_sd,speed_avg,counts,total_obs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-10-01,1.731787,17.175187,1770.335035,36195
2019-10-02,1.760109,17.169768,1746.651824,36195
2019-10-03,1.754288,17.092304,1785.115209,36195
2019-10-08,1.84706,18.165569,1747.732955,36195
2019-10-09,1.917923,18.137042,1756.834846,36195
2019-10-10,1.830232,18.107925,1793.51234,36195
2019-10-15,1.831527,18.114384,1750.339155,36195
2019-10-16,1.82368,18.162625,1760.170975,36195
2019-10-17,1.832219,18.08086,1775.411385,36195
2019-10-22,1.837839,18.175561,1738.314834,36195


In [33]:
## Link-level attributes in utility function
# Summary statistics of the exogenous features listed in _FEATURES_Z
df[_FEATURES_Z].describe()

Unnamed: 0,tt_sd,median_inc,incidents,bus_stops,intersections
count,1013460.0,1013460.0,1013460.0,1013460.0,1013460.0
mean,0.0181368,26.21913,0.7441093,0.1500207,0.8765023
std,0.02783112,21.35738,3.193143,0.4411927,1.319496
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.01113761,24.821,0.0,0.0,0.0
75%,0.0254887,41.681,0.0,0.0,1.0
max,0.7459602,115.893,40.0,4.0,9.0


## Training and validation sets

In [35]:
# Build the feature (XT) and observation (YT) tensors by year using only the hours listed in _HOURS
XT, YT = get_tensors_by_year(df[df.hour.isin(_HOURS)], features_Z = _FEATURES_Z, links_keys=list(network.links_dict.keys()))

# Split in training (2019) and validation (2020) sets, cast to the data type set in the configuration
XT_train, XT_val, YT_train, YT_val = map(lambda x: tf.cast(x, dtype = _DTYPE), [XT[2019], XT[2020], YT[2019], YT[2020]])

# Number of distinct values in the last feature column, which is the column that later cells
# pass as period_column to compute_generation_factors
n_periods = len(np.unique(XT_train[:, :, -1].numpy().flatten()))

### Reference O-D matrix and trip generation vectors

In [36]:
# The flattened O-D matrix of the network is repeated once per hour in _HOURS
q_historic = np.repeat(network.q.flatten()[np.newaxis, :], len(_HOURS), axis=0)

#Adjust historic O-D and historic trip generation
# Generation factors are computed from the period column of the features and the column with index 1
# of the observations, which other cells of this notebook treat as the flow column
# NOTE(review): with the period table printed above, reference period 10 maps to hour 16 -- update it if _HOURS changes
generation_factors = compute_generation_factors(period_column=XT_train[:, :, -1, None].numpy(),
                                                              flow_column=YT_train[:,:,1, None].numpy(), reference_period=10)

# Every row (period) of the historic O-D matrix is multiplied by the generation factor of that period
reference_q = q_historic*np.tile(generation_factors.values,(q_historic.shape[1],1)).T

# Trips generated per node, derived from the reference O-D matrix
reference_g = compute_generated_trips(q = reference_q, ods= network.ods, n_nodes = len(network.nodes))

# Total trips tvodlulpe pesuelogit:
# Epoch 0: 6.6e+04 6.6e+04 6.6e+04 6.6e+04 6.6e+04 6.6e+04
# Final epoch: 6.4e+04 6.6e+04 6.3e+04 7.8e+04 7.9e+04 7.9e+04
# Growth factor captures the difference between the reference OD at epoch 0 and the estimated OD in Guarda et al., (2024), Transportation Research Part C
# growth_factor = 7.9/6.6
# reference_g = growth_factor*reference_g
# reference_q = growth_factor*reference_q

## Models

In [37]:
# Containers keyed by model name for the training results, the validation results and the models
train_results_dfs = {}
val_results_dfs = {}
models = {}

## Search for the optimal hyperparameter weighting the equilibrium component

In [38]:
# Metric and loss component that drive the choice of the hyperparameter
target_metric = 'mse'
target_component = 'flow'

# A grid given as a single number is wrapped in a list so that it can be iterated
if isinstance(_GRID_EQUILIBRIUM_HP, (int, float)):
    _GRID_EQUILIBRIUM_HP = [_GRID_EQUILIBRIUM_HP]

# One dictionary of loss weights per grid value: a copy of the default weights in which only the
# weight of the equilibrium component is replaced
loss_weights = [{**_LOSS_WEIGHTS, 'equilibrium': weight_equilibrium}
                for weight_equilibrium in _GRID_EQUILIBRIUM_HP]

In [39]:
# Model used in the hyperparameter search
model = create_tvgodlulpe_model_fresno(network = network, n_periods = n_periods, features_Z = _FEATURES_Z,
                                       historic_g = reference_g, historic_q = reference_q)

# K-fold search over the candidate loss weights built in the previous cell. The target metric and
# component are taken from the variables set in that cell, so they are defined in a single place.
hp_metrics_df, optimal_weights, optimal_metrics_kfold_df, optimal_parameters_kfold_df \
    = regularization_kfold(
    loss_weights=loss_weights,
    target_metric = target_metric,
    target_component = target_component,
    n_splits=_N_SPLITS_HP,
    random_state=_SEED,
    model=model,
    X=XT_train, Y=YT_train,
    optimizers=_OPTIMIZERS,
    node_data=nodes_df,
    loss_metric=_LOSS_METRIC,
    evaluation_metric=_EVALUATION_METRIC,
    epochs_print_interval = _EPOCHS,
    threshold_relative_gap=_RELATIVE_GAP,
    batch_size=_BATCH_SIZE,
    epochs=_EPOCHS,
)


Replicate: 1/6

weights:  {'od': 0, 'traveltime': 1, 'flow': 1, 'equilibrium': 0}


2023-12-19 12:16:11.641500: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz



Fold 1/5

Model training

Pretraining generation weights

period 0 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 1 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 2 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 3 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 4 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 5 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 6 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 7 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 8 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 9 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 10 -> p-values kappa: {'population': 0.0017,

KeyboardInterrupt: 

In [None]:
# Write the metrics of the hyperparameter search to a csv file whose name starts with the timestamp
filepath = f"output/tables/{ts}_hyperparameter_tuning_{'fresno'}.csv"
hp_metrics_df.to_csv(filepath, index=False)

In [None]:
# Read back the file written in the previous cell and sort the rows for plotting
hp_plot_df = pd.read_csv(filepath)
hp_plot_df = hp_plot_df.sort_values(by = ['component', 'lambda_equilibrium', 'dataset'])
hp_plot_df

In [None]:
# Losses in validation set

fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, figsize = (12,6))

# Travel time and flow losses (log10 scale) against the weight of the equilibrium component
x = np.log10(hp_plot_df[(hp_plot_df.dataset == 'validation') & (hp_plot_df.component == 'traveltime') ]['value'])
y = np.log10(hp_plot_df[(hp_plot_df.dataset == 'validation') & (hp_plot_df.component == 'flow') ]['value'])
z = hp_plot_df['lambda_equilibrium'].sort_values().unique()

# One relative gap per value of lambda, sorted by lambda so that it is aligned with z. Duplicates are
# dropped by lambda and not by relative gap: two values of lambda with the same relative gap would
# otherwise be merged and the color vector would be shorter than z.
c = hp_plot_df[['lambda_equilibrium', 'relative_gap']].sort_values(
    ['lambda_equilibrium'], kind='mergesort').drop_duplicates(subset='lambda_equilibrium')['relative_gap'].values

p = ax.scatter(x,y,z,
               c =c,
               # c =np.log10(hyperparameter_search_eq['loss_eq']),
               norm=colors.LogNorm(vmin=1e-2, vmax=6e-2),
               s=40, cmap='Blues_r')

# The colorbar is drawn in its own axes, positioned manually at the right of the 3D plot
cbar = plt.colorbar(p,
                    #ticks=[1e-3,1e-4,1e-5,1e-6,1e-7],
                    #ticks=np.linspace(start = 1e-6, stop = 1e-7,num = 5),
                    cax = fig.add_axes([0.78, 0.28, 0.03, 0.38]))

ax.set_xlabel(r'$\log(\ell_t)$')
ax.set_ylabel(r'$\log(\ell_x)$')
ax.set_zlabel(r'$\lambda_{e}$')

ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

ax.view_init(elev=10., azim=-20, roll=0)

plt.tight_layout()

plt.show()

In [None]:
# Losses in training set

fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, figsize = (12,6))

# Travel time and flow losses (log10 scale) against the weight of the equilibrium component
x = np.log10(hp_plot_df[(hp_plot_df.dataset == 'training') & (hp_plot_df.component == 'traveltime') ]['value'])
y = np.log10(hp_plot_df[(hp_plot_df.dataset == 'training') & (hp_plot_df.component == 'flow') ]['value'])
z = hp_plot_df['lambda_equilibrium'].sort_values().unique()

# One relative gap per value of lambda, sorted by lambda so that it is aligned with z. Duplicates are
# dropped by lambda and not by relative gap: two values of lambda with the same relative gap would
# otherwise be merged and the color vector would be shorter than z.
c = hp_plot_df[['lambda_equilibrium', 'relative_gap']].sort_values(
    ['lambda_equilibrium'], kind='mergesort').drop_duplicates(subset='lambda_equilibrium')['relative_gap'].values

p = ax.scatter(x,y,z,
               c =c,
               # c =np.log10(hyperparameter_search_eq['loss_eq']),
               norm=colors.LogNorm(vmin=1e-2, vmax=6e-2),
               s=40, cmap='Blues_r')

# The colorbar is drawn in its own axes, positioned manually at the right of the 3D plot
cbar = plt.colorbar(p,
                    #ticks=[1e-3,1e-4,1e-5,1e-6,1e-7],
                    #ticks=np.linspace(start = 1e-6, stop = 1e-7,num = 5),
                    cax = fig.add_axes([0.78, 0.28, 0.03, 0.38]))

ax.set_xlabel(r'$\log(\ell_t)$')
ax.set_ylabel(r'$\log(\ell_x)$')
ax.set_zlabel(r'$\lambda_{e}$')

ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.2f'))

ax.view_init(elev=10., azim=-25, roll=0)

plt.tight_layout()

plt.show()

### Estimation of TVGODLULPE with optimal hyperparameters

In [40]:
print('\ntvgodlulpe: Time specific utility and generation, and link specific parameters for performance functions')

# To report runtime
# The clock starts before the model is created, so the runtime printed after training includes this step
t0 = time.time()

# New model instance, stored under the key 'tvgodlulpe'
models['tvgodlulpe'] = create_tvgodlulpe_model_fresno(network = network, n_periods = n_periods, features_Z = _FEATURES_Z,
                                                      historic_g = reference_g, historic_q = reference_q)


tvgodlulpe: Time specific utility and generation, and link specific parameters for performance functions


In [41]:
# Use optimal hyperparameter and do not run equilibrium stage
# optimal_weights is only defined once the hyperparameter search cell has run to completion;
# otherwise this cell raises a NameError
_LOSS_WEIGHTS = optimal_weights.copy()
optimal_weights

NameError: name 'optimal_weights' is not defined

In [43]:
# Fit the model with the 2019 tensors; the 2020 tensors are passed as validation data
train_results_dfs['tvgodlulpe'], val_results_dfs['tvgodlulpe'] = models['tvgodlulpe'].fit(
    XT_train, YT_train, XT_val, YT_val,
    node_data=nodes_df,
    optimizers= _OPTIMIZERS,
    batch_size=_BATCH_SIZE,
    loss_weights= _LOSS_WEIGHTS,
    loss_metric=_LOSS_METRIC,
    evaluation_metric=_EVALUATION_METRIC,
    equilibrium_stage=_EQUILIBRIUM_STAGE,
    threshold_relative_gap=_RELATIVE_GAP,
    epochs=_EPOCHS)

# Time elapsed since t0 was set in the cell that created the model
print(f'runtime: {time.time()-t0:0.1f} [s]')

# Save model weights for prediction analyses
# The weights are written to the file path stored in the model; the inference model loads them from the same path
models['tvgodlulpe'].save_weights(models['tvgodlulpe']._filepath_weights)
print(f"\nModel weights were saved at '{models['tvgodlulpe']._filepath_weights}'")


Model training

Pretraining generation weights

period 0 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 1 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 2 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 3 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 4 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 5 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 6 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 7 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 8 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 9 -> p-values kappa: {'population': 0.0017, 'income': 0.5592, 'bus_stops': 0.341}
period 10 -> p-values kappa: {'population': 0.0017, 'income':

KeyboardInterrupt: 

# Forecasting

In [None]:
# Generation factors by period, computed with the same arguments as in the reference O-D section
generation_factors = compute_generation_factors(period_column=XT_train[:, :, -1, None].numpy(),
                                                flow_column=YT_train[:,:,1, None].numpy(), reference_period=10)

print(generation_factors)

n_periods = len(np.unique(XT_train[:, :, -1].numpy().flatten()))

# Ratio between the total trips at the final epoch (7.9e4) and at epoch 0 (6.6e4) that are reported in
# the comments of the reference O-D section
growth_factor = 7.9/6.6

# Generated trips of the O-D matrix of the network, scaled by the generation factor of every period
# and by the growth factor
generated_trips = growth_factor*generation_factors.values[:,np.newaxis]*compute_generated_trips(
    q = network.q.flatten()[np.newaxis,:], ods= network.ods, n_nodes = len(network.nodes))

In [None]:
# Create model for inference
# NOTE(review): historic_g includes the growth factor while historic_q (reference_q) does not -- confirm this is intended
inference_model = create_tvgodlulpe_model_fresno(network = network, n_periods = n_periods, features_Z = _FEATURES_Z,
                                                 historic_g = generated_trips, historic_q = reference_q)
inference_model.build()
# Load the weights saved after training the 'tvgodlulpe' model
inference_model.load_weights(models['tvgodlulpe']._filepath_weights)

In [None]:
# Make prediction on 2020, the validation set, without computing equilibrium
# The loss weights are the ones that were passed to fit the model (_LOSS_WEIGHTS), so that training and
# inference use the same weights and this cell does not depend on a second variable left by the
# hyperparameter search
_ = inference_model.predict(XT_val,
                            node_data=nodes_df,
                            loss_metric=_LOSS_METRIC,
                            evaluation_metric=_EVALUATION_METRIC,
                            batch_size= _BATCH_SIZE,
                            optimizer= _OPTIMIZERS['learning'],
                            pretrain_link_flows = False,
                            loss_weights= _LOSS_WEIGHTS,
                            threshold_relative_gap=_RELATIVE_GAP,
                            epochs=100)

In [None]:
# Metrics of the inference model in the validation set, printed with three significant digits
with pd.option_context('display.float_format', '{:0.3g}'.format):
    print('\n')
    # The evaluation metric set in the configuration is reported together with the mse and the r2
    validation_metrics = inference_model.compute_loss_metrics(metrics = {_EVALUATION_METRIC.__name__: _EVALUATION_METRIC,
                                                                     'mse': mse, 'r2': r2_score}, X = XT_val, Y = YT_val)
    print(validation_metrics)

In [None]:
# Plot of flow against travel time for the validation set.
# Column 0 of the observations is passed as travel time and column 1 as flow, both after being masked by the model.
# period_col maps the period ids stored in the last feature column back to hours with the period_keys table.
fig, axs = plot_flow_vs_traveltime(model = inference_model,
                        observed_traveltime=inference_model.mask_observed_traveltime(YT_val[:, :, 0]),
                        observed_flow= inference_model.mask_observed_flow(YT_val[:,:,1]),
                        # scatter_kws={"color": sns.color_palette("deep")[0], 's':4, 'alpha': 1}, line_kws={"color": "black"},
                        period_col = pd.DataFrame({'period': list(XT_val[:, :, -1].numpy().astype(int).flatten())})['period'].map(dict(zip(period_keys.period_id, period_keys.hour))).values.flatten(),
                        hour_label=True,
                        all_metrics = False
                        )

# Saves the current matplotlib figure; the output folder must already exist
plt.savefig('output/figures/results/fresno-scatter-flow-traveltime-outofsample-tvgodlulpe-without-equilibrium.png')

plt.show()

## Comparison against the top-performing data-driven benchmark

In [None]:
# Link-level spatial information
# Link keys are parsed from text and stored as a categorical whose categories follow the order of the links in the network
links_gdf['link_key'] = pd.Categorical(links_gdf['key'].apply(ast.literal_eval), list(network.links_dict.keys()))

# Create dataframe with data collected in 2020 during the hours listed in _HOURS
model_df = df[(df.hour.isin(_HOURS)) & (df['year']==2020)].sort_values(['period','link_key'])
# links_gdf = links_gdf.sort_values(['link_key'])

# Build dataset with data collected between 4-5pm in the first Tuesdays of Oct 2019 and 2020.
# The dates are converted to datetime before the comparison because matching a datetime column
# against strings with isin is deprecated in pandas.
benchmark_dates = pd.to_datetime(['2019-10-01', '2020-10-06'])
benchmark_df = df[(df.hour == 16) & df['date'].isin(benchmark_dates)].sort_values(['period','link_key'])

# Maps of speed and flow; the links are sorted by link key before being passed to the plotting function
fig_speed, fig_flow = plot_congestion_maps(model=inference_model, model_df=model_df, benchmark_df = benchmark_df,
                     gdf=links_gdf.sort_values(['link_key']), features=_FEATURES_Z, cmap = 'viridis')

In [None]:
# Display the figure assigned to fig_flow by the call to plot_congestion_maps
fig_flow

In [None]:
# Display the figure assigned to fig_speed by the call to plot_congestion_maps
fig_speed



## Global runtime

In [None]:
# Time elapsed, in seconds, since t0_global was set at the top of the notebook
print(f'runtime: {time.time()-t0_global:0.1f} [s]')