In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from itertools import product

from sklearn.svm import SVR
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    train_test_split,
    HalvingGridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)

import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
def get_svm_results_columns():
    return [
        'h3_res',
        'time_interval_length',
        'mean_fit_time',
        'std_fit_time',
        'mean_score_time',
        'std_score_time',
        'param_C',
        'param_kernel',
        'param_gamma',
        'param_degree',
        'params',
        'split0_test_score',
        'split1_test_score',
        'split2_test_score',
        'split3_test_score',
        'split4_test_score',
        'mean_test_score',
        'std_test_score',
        'rank_test_score'
    ]


def get_results_df(path):
    if os.path.isfile(path):
        return pd.read_parquet(path)

    results = pd.DataFrame(columns=get_svm_results_columns())
    results.to_parquet(path)
    return results


def store_results(new_results, path):
    results = pd.read_parquet(path)
    results = pd.concat([results, new_results], ignore_index=True)
    results.to_parquet(path)

In [4]:
# this model will filter out correct h3 resolution and time interval
# it will also one hot encode start and end hexagons and merge them to original dataframe

# data = pd.read_feather(MODEL_DATA_PATH)
# def get_model_data(h3_res, time_interval_length):
#     model_data = data[(data['h3_res'] == h3_res) & (data['time_interval_length'] == time_interval_length)]
#     start_hex_dummies = pd.get_dummies(model_data.start_hex_id, prefix="start_")
#     end_hex_dummies = pd.get_dummies(model_data.end_hex_id, prefix="end_")
#     model_data = pd.concat([model_data, start_hex_dummies, end_hex_dummies], axis=1)
#     model_data = model_data.drop(columns=['start_hex_id', 'end_hex_id'])
#     return model_data

In [5]:
# this method will get model data for a specific h3 resolution and time interval length
def get_model_data(h3_res, time_interval_length):
    model_data = pd.read_feather(os.path.join(MODEL_DATA_DIR_PATH, f"{h3_res}_{time_interval_length}.feather"))
    return model_data

In [6]:
def split_and_scale_data(model_data):
    y = model_data["demand"]
    X = model_data.drop(columns=["demand"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [7]:
def train_model(param_grid, X_train, y_train):
    svr = SVR()
    models = HalvingGridSearchCV(svr, param_grid, n_jobs=-1, scoring="neg_mean_squared_error", random_state=42)
    models.fit(X_train, y_train)
    return models

In [8]:
def mean_average_percentage_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred) / y_true.mean()


def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5


def evaluate_model(models, X_test, y_test):
    results = pd.DataFrame(models.cv_results_)
    results.sort_values(by="mean_test_score", ascending=False)

    y_pred = models.best_estimator_.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_average_percentage_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    results['mse'] = 0
    results['mae'] = 0
    results['mape'] = 0
    results['rmse'] = 0

    results['n_iter'] = 0

    results.loc[0, 'mse'] = mse
    results.loc[0, 'mae'] = mae
    results.loc[0, 'mape'] = mape
    results.loc[0, 'rmse'] = rmse

    results.loc[0, 'n_iter'] = models.best_estimator_.n_iter_

    return results

In [9]:
def get_svm_metas():
    return [
        {'kernel': ['linear'], 'C': [1, 10, 100], 'gamma': [-1],            'degree': [-1],         'max_iter': [100000]},
        {'kernel': ['rbf'],    'C': [1, 10, 100], 'gamma': [0.001, 0.0001], 'degree': [-1],         'max_iter': [100000]},
        {'kernel': ['poly'],   'C': [1, 10, 100], 'gamma': [-1],            'degree': [2, 3, 4, 5], 'max_iter': [100000]}
    ]


def check_if_model_result_empty(meta, results):
    return results[
        (results['h3_res'] == meta[0]) &
        (results['time_interval_length'] == meta[1]) &
        (results['param_kernel'] == meta[2]) &
        (results['param_C'] == meta[3]) &
        ((results['param_gamma'] == meta[4]) | (pd.isnull(results['param_gamma']))) &
        ((results['param_degree'] == meta[5]) | (pd.isnull(results['param_degree']))) 
    ]['mean_test_score'].empty


def get_param_grid(model_meta):
    param_grid = {
        'kernel': [model_meta[2]],
        'C': [model_meta[3]],
        'max_iter': [model_meta[6]]
    }
    if model_meta[4] > 0:
        param_grid = {**param_grid, 'gamma': [model_meta[4]]}
    if model_meta[5] > 0:
        param_grid = {**param_grid, 'degree': [model_meta[5]]}
    
    return param_grid


def get_availabe_models_metas(path):
    results = get_results_df(path)
    all_metas = get_svm_metas()

    # the following code will create all possible combinations of parameters for all models
    metas = [list(product(*meta.values())) for meta in all_metas]
    metas = [item for sublist in metas for item in sublist]
    metas = list(product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS, metas)) 
    metas = [[item[0], item[1], *item[2]] for item in metas]

    available_metas = metas
    if not results.empty:
        available_metas = [meta for meta in metas if check_if_model_result_empty(meta, results)]

    # group_by h3 and time, put other params in param grid
    metas_grouped = []
    for h3_res in PREDICTIVE_H3_RESOLUTIONS:
        for time_interval_length in CALC_TIME_INTERVAL_LENGTHS:
            for kernel in ['linear', 'rbf', 'poly']:
                param_grid = [get_param_grid(meta) for meta in available_metas if (meta[0] == h3_res and meta[1] == time_interval_length and meta[2] == kernel)]
                if len(param_grid) == 0:
                    continue
                metas_grouped.append({
                    'h3_res': h3_res,
                    'time_interval_length': time_interval_length,
                    'param_grid': param_grid
                })

    return metas_grouped

In [10]:
metas = get_availabe_models_metas(SVM_RESULTS_PATH)
for meta in tqdm(metas):
    h3_res = meta['h3_res']
    time_interval_length = meta['time_interval_length']
    param_grid = meta['param_grid']

    feedback = f"h3: {h3_res} | t:{time_interval_length} | - " + param_grid[0]["kernel"][0]
    tqdm.write(feedback, end="\r")
    
    model_data = get_model_data(h3_res, time_interval_length)
    model_data = model_data.iloc[:100000]

    X_train, X_test, y_train, y_test = split_and_scale_data(model_data)
    models = train_model(param_grid, X_train, y_train)

    results = evaluate_model(models, X_test, y_test)
    results['h3_res'] = h3_res
    results['time_interval_length'] = time_interval_length
    store_results(results, SVM_RESULTS_PATH)  
       
    tqdm.write(feedback + " ✓")

  0%|          | 0/24 [00:00<?, ?it/s]

h3: 7 | t:1 | - linear

In [None]:
# results = pd.read_parquet(SVM_RESULTS_PATH)
# results[results['mape'] != 0].sort_values(by='mape', ascending=True)

Unnamed: 0,h3_res,time_interval_length,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,param_degree,...,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,mse,mae,mape,rmse,n_iter
72,7,6,0.209522,0.05099,0.031091,0.006581,1,poly,,2.0,...,-19.565808,-19.79093,-18.724065,-19.790411,0.657035,0.009769,0.098289,0.009879,0.098836,2850.0
102,7,24,0.274267,0.071746,0.041648,0.024144,1,poly,,2.0,...,-907.401871,-952.31614,-972.196186,-968.508369,37.210181,30.812127,1.599946,0.026143,5.550867,6932.0
42,7,2,0.247914,0.039923,0.02861,0.013269,1,poly,,2.0,...,-2.953061,-2.857672,-2.362335,-2.952981,0.381423,0.009738,0.098256,0.030753,0.098681,1730.0
12,7,1,0.172843,0.031846,0.019002,0.008299,1,poly,,2.0,...,-0.953403,-0.925942,-0.780402,-0.872231,0.06186,0.009627,0.097337,0.044849,0.098117,1787.0
222,8,24,0.996214,0.082982,0.211409,0.01267,1,poly,,2.0,...,-0.507881,-0.519747,-0.622409,-0.545693,0.054091,0.009314,0.094925,0.060436,0.096511,4186.0
192,8,6,0.899835,0.025538,0.213601,0.012843,1,poly,,2.0,...,-0.163172,-0.152372,-0.113371,-0.171679,0.040652,0.008672,0.090276,0.074732,0.093122,2293.0
132,8,1,0.603008,0.078774,0.132185,0.010098,1,poly,,2.0,...,-0.04184,-0.035887,-0.031921,-0.040382,0.005665,0.008216,0.087088,0.077966,0.090644,1272.0
162,8,2,0.626836,0.060522,0.119809,0.007023,1,poly,,2.0,...,-0.070038,-0.070348,-0.068376,-0.077144,0.014435,0.008509,0.089282,0.079198,0.092242,1493.0
124,8,1,3.736032,0.455232,0.720866,0.16215,1,rbf,0.001,,...,-0.130583,-0.140416,-0.132555,-0.142583,0.011088,0.070152,0.11191,0.100188,0.264862,79773.0
154,8,2,3.34177,0.167798,0.61585,0.029495,1,rbf,0.001,,...,-0.125097,-0.136185,-0.151924,-0.138655,0.010777,0.078246,0.142829,0.126696,0.279724,100000.0
