In [1]:
import pandas as pd
from itertools import product
from tqdm.notebook import tqdm

import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.storage import (
    get_demand_model_data,
    get_demand_orig_dest_model_data,
)
from modules.config import *
from modules.svm import *

# Support Vector Machine (SVM) Training
In this notebook we train Support Vector Machines (SVMs) to predict demand. The process is split into two stages. First, we determine the best hyperparameters for a fixed H3 resolution and time interval length. Then, we use these hyperparameters to train models for all resolutions.

In the first stage, we perform three separate grid searches, one for each kernel: linear, RBF, and polynomial. To speed up this stage, we use a grid search with successive halving. It first evaluates all candidates (combinations of hyperparameters) on a small fraction of the training set and then trains the best third of the candidates on a fraction of the dataset that is three times as large as in the previous iteration. This process repeats until the best hyperparameters are found and evaluated on the whole dataset.

As the scikit-learn documentation states:

"The implementation is based on libsvm. The fit time complexity is more than quadratic with the number of samples which makes it hard to scale to datasets with more than a couple of 10000 samples."

We attempted model fitting on training sets of different sizes and found 30,000 samples to be the best choice for achieving results comparable to the other models. Even at this size, the execution time was between 6 and 10 hours depending on current RAM usage, which is why we do not advise re-running this notebook.


For all three kernels we try values on a log scale for the regularization parameter C, which controls the trade-off between fitting the training data and keeping the model coefficients small (the regularization strength is inversely proportional to C). For the RBF kernel we also try values on a log scale for the bandwidth γ, which determines how smooth the feature functions are. Lastly, we vary the degree d of the polynomial for the polynomial kernel.

In [2]:
# Hyperparameter grids for the first-stage search: one grid per kernel.
# Each value is a list of candidates to try for that parameter.
# NOTE(review): -1 looks like a "not applicable" placeholder for gamma/degree;
# confirm how modules.svm interprets it.
all_possible_metas = [
    # Linear kernel: only C is varied.
    dict(
        kernel=['linear'],
        C=[0.1, 1, 10, 100, 1000],
        gamma=[-1],
        degree=[-1],
        max_iter=[1000000],
    ),
    # RBF kernel: C and gamma are varied.
    dict(
        kernel=['rbf'],
        C=[0.1, 1, 10, 100, 1000],
        gamma=[0.1, 0.01, 0.001, 0.0001],
        degree=[-1],
        max_iter=[1000000],
    ),
    # Polynomial kernel: C (shifted one decade higher) and degree are varied.
    dict(
        kernel=['poly'],
        C=[1, 10, 100, 1000, 10000],
        gamma=[-1],
        degree=[2, 3, 4, 5, 6],
        max_iter=[1000000],
    ),
]

In [3]:
# First stage: hyperparameter search at the single tuning resolution
# (TUNE_H3_RESOLUTION, TUNE_TIME_INTERVAL_LENGTH) over every grid in
# all_possible_metas.
# NOTE(review): the captured log marks each candidate "# already trained", so
# execute_stage presumably skips candidates whose results are already stored;
# confirm in modules.svm.
execute_stage(
    'first_stage',
    SVM_FIRST_STAGE_DEMAND_RESULTS_PATH,
    SVM_SECOND_STAGE_DEMAND_RESULTS_PATH,
    TUNE_H3_RESOLUTION,
    TUNE_TIME_INTERVAL_LENGTH,
    all_possible_metas,
    get_demand_model_data,  # data loader handed to execute_stage
)

h3_res: 8 - time_interval_length: 6 - param_kernel: linear - param_C: 0.1 - param_gamma: -1 - param_degree: -1 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: linear - param_C: 1 - param_gamma: -1 - param_degree: -1 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: linear - param_C: 10 - param_gamma: -1 - param_degree: -1 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: linear - param_C: 100 - param_gamma: -1 - param_degree: -1 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: linear - param_C: 1000 - param_gamma: -1 - param_degree: -1 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: rbf - param_C: 0.1 - param_gamma: 0.1 - param_degree: -1 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: rbf - param_C: 0.1 - param_gamma: 0.01 - param_degree: -1 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: rbf - param_C: 0.1 - param_gamma: 0.001 - param_de

0it [00:00, ?it/s]

In [4]:
# Load the stored first-stage search results and display the two candidates
# with the highest mean cross-validated test score.
results = pd.read_parquet(SVM_FIRST_STAGE_DEMAND_RESULTS_PATH)
results.sort_values('mean_test_score', ascending=False).iloc[:2]

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_max_iter,params,...,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,n_iter,h3_res,time_interval_length,param_gamma,param_degree
52,2,29997,610.163423,4.774594,52.755195,4.02238,100.0,poly,1000000,"{'C': 100.0, 'degree': 5.0, 'gamma': None, 'ke...",...,-5.297303,-5.170202,-5.103442,-5.184127,0.064518,0,8,6,,5.0
53,2,29997,726.315229,11.302701,93.564156,4.522072,100.0,poly,1000000,"{'C': 100.0, 'degree': 3.0, 'gamma': None, 'ke...",...,-9.430366,-9.380489,-9.514133,-9.401522,0.077954,0,8,6,,3.0


In [5]:
# Best candidate per kernel, taken as one complete row of the results table.
# GroupBy.first() returns the first non-null value of each column separately,
# so it can combine values from different candidates when a column holds NaN.
# Keeping the top-ranked row per kernel guarantees that the score and the
# hyperparameters shown together belong to the same candidate.
# NOTE(review): the ranking covers every row of the halving search, including
# rows evaluated on fewer resources; confirm that this is intended.
(
    results
    .sort_values(by='mean_test_score', ascending=False)
    .drop_duplicates(subset='param_kernel', keep='first')  # top row per kernel
    .set_index('param_kernel')
    .sort_index()  # same alphabetical kernel order as the groupby output
    .loc[:, [
        'mean_test_score',
        'param_C',
        'param_degree',
        'param_gamma',
    ]]
)

Unnamed: 0_level_0,mean_test_score,param_C,param_degree,param_gamma
param_kernel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
linear,-53.220806,100.0,,
poly,-17.281016,100.0,5.0,
rbf,-18.259941,100.0,,0.01


As we can see, the best-performing model uses the polynomial kernel with degree 5 and regularization parameter C = 100. It is closely followed by the model with the RBF kernel.

In [6]:
# Sum of the per-candidate mean fit times for each kernel, divided by 60
# (seconds -> minutes, assuming mean_fit_time is stored in seconds; TODO confirm).
results.groupby('param_kernel')['mean_fit_time'].sum() / 60

param_kernel
linear    48.731966
poly      51.119908
rbf       64.210966
Name: mean_fit_time, dtype: float64

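Next, we train models for all resolutions with the hyperparameters found above. We first build the list of (H3 resolution, time interval length) combinations to train on.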

In [7]:
# Every (H3 resolution, time interval length) pair from the two config lists,
# followed by the extra pairs in ADDITIONAL_PREDICTIVE_RESOLUTIONS.
resolutions = (
    list(product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS))
    + ADDITIONAL_PREDICTIVE_RESOLUTIONS
)
resolutions

[(7, 1), (7, 2), (7, 6), (7, 24), (8, 1), (8, 2), (8, 6), (8, 24), (9, 24)]

We now train one model for each of these resolutions, using the hyperparameters found in the first stage.

In [8]:
# Second stage: run execute_stage once per (h3_res, time_interval_length) pair
# in `resolutions`, with a tqdm progress bar over the pairs.
# NOTE(review): the captured log shows only the poly / C=100 / degree=5
# configuration, so the second stage presumably reads the best first-stage
# candidate from SVM_FIRST_STAGE_DEMAND_RESULTS_PATH; confirm in modules.svm.
for h3_res, time_interval_length in tqdm(resolutions):
    execute_stage(
        'second_stage',
        SVM_FIRST_STAGE_DEMAND_RESULTS_PATH,
        SVM_SECOND_STAGE_DEMAND_RESULTS_PATH,
        h3_res,
        time_interval_length,
        all_possible_metas,
        get_demand_model_data,  # data loader handed to execute_stage
    )

  0%|          | 0/9 [00:00<?, ?it/s]

h3_res: 7 - time_interval_length: 1 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained
h3_res: 7 - time_interval_length: 2 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained
h3_res: 7 - time_interval_length: 6 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained
h3_res: 7 - time_interval_length: 24 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained
h3_res: 8 - time_interval_length: 1 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained
h3_res: 8 - time_interval_length: 2 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained
h3_res: 8 - time_interval_length: 6 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained
h3_res: 8 - time_interval_length: 24 - param_kernel: poly - param_C: 100.0 - param

In addition, we train a model that predicts demand for origin-destination pairs instead of just the origin. Because the dimensionality of the data increases drastically when using origin-destination pairs, we only use a low H3 resolution (7) and a large time interval (24h).

In [9]:
# Second-stage run for the origin-destination demand model.
execute_stage(
    'second_stage',
    SVM_FIRST_STAGE_DEMAND_RESULTS_PATH,
    SVM_SECOND_STAGE_DEMAND_RESULTS_PATH,
    # The origin-destination resolution is written twice, side by side, as a
    # single integer (logged as "h3_res: 77" in the captured output).
    int(f"{ORIGIN_DESTINATION_H3_RESOLUTION}{ORIGIN_DESTINATION_H3_RESOLUTION}"),
    ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH,
    all_possible_metas,
    # The origin-destination loader is called without arguments, so the two
    # positional arguments this lambda receives are ignored.
    lambda _, __, : get_demand_orig_dest_model_data(),
)

h3_res: 77 - time_interval_length: 24 - param_kernel: poly - param_C: 100.0 - param_gamma: nan - param_degree: 5.0 # already trained


In [10]:
# Load the stored second-stage results and display all rows, ordered from the
# highest to the lowest mean cross-validated test score.
results = pd.read_parquet(SVM_SECOND_STAGE_DEMAND_RESULTS_PATH)
results.sort_values('mean_test_score', ascending=False)

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_kernel,param_max_iter,...,std_train_score,n_iter,h3_res,time_interval_length,test_mse,test_rmse,test_mae,test_non_zero_mape,test_zero_accuracy,param_gamma
4,0,30000,137.719914,2.752086,30.158763,1.748708,100.0,5.0,poly,1000000,...,0.001647,61079,8,1,0.512375,0.715804,0.275154,0.622341,0.915995,
0,0,30000,59.233981,0.939136,10.778968,0.38459,100.0,5.0,poly,1000000,...,0.013615,81625,7,1,2.665077,1.632506,0.511289,0.448406,0.898903,
5,0,30000,183.379655,4.889796,35.336638,5.014597,100.0,5.0,poly,1000000,...,0.032401,168426,8,2,3.374197,1.836899,0.646611,0.556452,0.859193,
9,0,30000,66.164266,1.448703,13.217245,1.073573,100.0,5.0,poly,1000000,...,0.048807,37817,77,24,4.138852,2.034417,0.380661,0.353108,0.925897,
8,0,30000,562.608911,15.436321,109.632418,16.429736,100.0,5.0,poly,1000000,...,0.114783,354683,9,24,10.447953,3.232329,1.427058,0.448487,0.846259,
6,0,30000,216.618685,4.189967,36.682684,4.344279,100.0,5.0,poly,1000000,...,0.318483,301713,8,6,16.535265,4.066358,1.400172,0.40086,0.832144,
1,0,30000,106.339026,3.105151,12.387918,2.700414,100.0,5.0,poly,1000000,...,0.220275,214645,7,2,19.301339,4.393329,1.431508,0.366964,0.811201,
7,0,30000,244.099163,3.017364,39.720194,3.371004,100.0,5.0,poly,1000000,...,0.45891,344639,8,24,40.259285,6.34502,2.317048,0.250466,0.857496,
2,0,30000,174.270175,6.620389,13.206788,2.18766,100.0,5.0,poly,1000000,...,0.920956,509383,7,6,90.02311,9.488051,3.000653,0.220547,0.783393,
3,0,8797,11.47264,0.427771,1.319576,0.303373,100.0,5.0,poly,1000000,...,9.973864,92597,7,24,384.364754,19.605223,6.656288,0.167891,0.790748,
