In [1]:
from time import time
import pandas as pd
from tqdm.notebook import tqdm

# for model training
from sklearn.svm import SVR

# for evaluation & preprocessing
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    HalvingGridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

# for displaying results & feedback
# from tabulate import tabulate
import matplotlib.pyplot as plt

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
# Load the complete modelling dataset into memory once; get_model_data() later filters this
# global frame per (h3_res, time_interval_length) combination.
# NOTE(review): MODEL_DATA_PATH is presumably supplied by the star import from modules.config — confirm.
data = pd.read_feather(MODEL_DATA_PATH)

In [4]:
def get_model_data(h3_res, time_interval_length):
    """Select one resolution/interval slice of the global ``data`` frame and one-hot encode it.

    Rows are kept when both ``h3_res`` and ``time_interval_length`` match the
    arguments. ``start_hex_id`` and ``end_hex_id`` are replaced by indicator
    columns produced by ``pd.get_dummies``. Because the prefixes already end
    in an underscore and pandas inserts its default ``_`` separator, the new
    columns are named ``start__<id>`` and ``end__<id>``.

    Parameters
    ----------
    h3_res
        Value compared against the ``h3_res`` column.
    time_interval_length
        Value compared against the ``time_interval_length`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with the two hexagon id columns dropped and the
        indicator columns appended on the right.
    """
    matches_resolution = data['h3_res'] == h3_res
    matches_interval = data['time_interval_length'] == time_interval_length
    subset = data[matches_resolution & matches_interval]

    # Original columns first, then the start indicators, then the end indicators.
    pieces = [
        subset,
        pd.get_dummies(subset['start_hex_id'], prefix="start_"),
        pd.get_dummies(subset['end_hex_id'], prefix="end_"),
    ]
    return pd.concat(pieces, axis=1).drop(columns=['start_hex_id', 'end_hex_id'])

In [5]:
# Alternative (currently disabled) loader: reads the data for one h3 resolution and
# time interval from its own feather file instead of filtering and encoding `data` in memory.
# def get_model_data(h3_res, time_interval_length):
#     model_data = pd.read_feather(os.path.join(MODEL_DATA_DIR_PATH, f"{h3_res}_{time_interval_length}.feather"))
#     return model_data

In [6]:
def split_and_scale_data(model_data):
    """Split ``model_data`` into train/test partitions and standardise the features.

    The ``demand`` column is the target; every other column is a feature.
    70 % of the rows go to the training partition (``random_state=42`` keeps
    the split reproducible). The ``StandardScaler`` is fitted on the training
    features only and then applied to both partitions.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Frame containing a ``demand`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two feature entries
        are the scaler's transformed output and the two targets are returned
        exactly as produced by ``train_test_split``.
    """
    target = model_data["demand"]
    features = model_data.drop(columns=["demand"])

    features_train, features_test, y_train, y_test = train_test_split(
        features, target, train_size=0.7, random_state=42
    )

    # Fit on the training partition only so no test-set statistics leak into the scaling.
    scaler = StandardScaler().fit(features_train)
    return (
        scaler.transform(features_train),
        scaler.transform(features_test),
        y_train,
        y_test,
    )

In [7]:
def train_model(param_grid, X_train, y_train):
    """Fit a successive-halving grid search over SVR hyper-parameters.

    Parameters
    ----------
    param_grid : dict or list of dict
        Hyper-parameter grid(s) handed to ``HalvingGridSearchCV``.
    X_train, y_train
        Training features and targets passed to ``fit``.

    Returns
    -------
    HalvingGridSearchCV
        The fitted search object. Candidates are scored with negative mean
        squared error, all available cores are used (``n_jobs=-1``) and
        ``random_state=42`` is passed to the search.
    """
    search = HalvingGridSearchCV(
        SVR(),
        param_grid,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)
    return search

In [8]:
def mean_average_percentage_error(y_true, y_pred):
    """Return the mean absolute error as a fraction of the mean of ``y_true``.

    Note that this is MAE divided by the mean target, not the classical MAPE
    that averages per-sample ratios.

    Parameters
    ----------
    y_true
        Ground-truth targets; must expose a ``.mean()`` method.
    y_pred
        Predicted targets.
    """
    absolute_error = mean_absolute_error(y_true, y_pred)
    target_mean = y_true.mean()
    return absolute_error / target_mean


def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error between ``y_true`` and ``y_pred``."""
    squared_error = mean_squared_error(y_true, y_pred)
    return squared_error ** 0.5

In [9]:
def evaluate_model(clf, X_test, y_test):
    """Combine the search's cross-validation table with held-out test metrics.

    The returned frame has one row per entry of ``clf.cv_results_`` (in the
    search's own order) plus four extra float columns: ``mse``, ``mae``,
    ``mape`` and ``rmse``. The metrics are computed from the predictions of
    ``clf.best_estimator_`` on the test set and are written to the row of
    that estimator (``clf.best_index_``). Every other row keeps the
    placeholder ``0.0``.

    Parameters
    ----------
    clf
        Fitted search object exposing ``cv_results_``, ``best_estimator_``
        and ``best_index_`` (e.g. the result of ``train_model``).
    X_test, y_test
        Held-out features and targets.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` as a frame with the four metric columns appended.
    """
    results = pd.DataFrame(clf.cv_results_)

    y_pred = clf.best_estimator_.predict(X_test)
    test_metrics = {
        'mse': mean_squared_error(y_test, y_pred),
        'mae': mean_absolute_error(y_test, y_pred),
        'mape': mean_average_percentage_error(y_test, y_pred),
        'rmse': root_mean_squared_error(y_test, y_pred),
    }

    # cv_results_ arrays are positionally indexed, and the frame built from them has a
    # default RangeIndex, so best_index_ is also the row label of the best candidate.
    best_row = clf.best_index_
    for metric_name, metric_value in test_metrics.items():
        # Float placeholder: an integer column would need a dtype change on assignment.
        # NOTE(review): NaN would be a clearer "not evaluated" marker than 0.0, but would
        # change the values already being stored — confirm before switching.
        results[metric_name] = 0.0
        results.loc[best_row, metric_name] = metric_value

    return results

In [10]:
import pandas as pd
import numpy as np
from itertools import product

In [11]:
def check_file_exists(path):
    """Return ``True`` when ``path`` refers to an existing regular file, else ``False``."""
    is_regular_file = os.path.isfile(path)
    return is_regular_file


def get_svm_results_columns():
    """Return the ordered column names used to initialise an empty SVM results frame.

    The list is: the two grouping keys, the fit/score timing columns, the
    per-parameter columns plus ``params``, the five per-split test scores
    and finally the aggregated test-score columns.
    """
    grouping_columns = ['h3_res', 'time_interval_length']
    timing_columns = [
        'mean_fit_time',
        'std_fit_time',
        'mean_score_time',
        'std_score_time',
    ]
    parameter_columns = [
        'param_C',
        'param_kernel',
        'param_gamma',
        'param_degree',
        'params',
    ]
    split_score_columns = [f'split{fold}_test_score' for fold in range(5)]
    aggregate_score_columns = ['mean_test_score', 'std_test_score', 'rank_test_score']

    return (
        grouping_columns
        + timing_columns
        + parameter_columns
        + split_score_columns
        + aggregate_score_columns
    )


def init_results_df(path):
    """Create an empty results frame, write it to ``path`` as parquet and return it.

    The frame has no rows; its columns are those listed by
    ``get_svm_results_columns()``.
    """
    column_names = get_svm_results_columns()
    empty_results = pd.DataFrame(columns=column_names)
    empty_results.to_parquet(path)
    return empty_results


def get_results_df(path):
    """Load the stored results from ``path``, creating an empty results file first if none exists.

    Returns
    -------
    pandas.DataFrame
        The parquet contents when the file exists, otherwise the freshly
        initialised empty frame from ``init_results_df``.
    """
    if not check_file_exists(path):
        return init_results_df(path)
    return pd.read_parquet(path)


def get_svm_metas():
    """Return the hyper-parameter search space, one grid dictionary per SVR kernel.

    Each dictionary maps ``kernel``, ``C``, ``gamma`` and ``degree`` (in that
    key order) to the list of values to try. A value of ``-1`` marks a
    parameter that is not searched for that kernel; ``get_param_grid`` only
    forwards ``gamma``/``degree`` values greater than zero.
    """
    def _kernel_grid(kernel, gamma, degree):
        # Key order matters: callers expand the grid with product(*grid.values())
        # and read the resulting tuples positionally.
        return {'kernel': [kernel], 'C': [1, 10, 100], 'gamma': gamma, 'degree': degree}

    return [
        _kernel_grid('linear', gamma=[-1], degree=[-1]),
        _kernel_grid('rbf', gamma=[0.001, 0.0001], degree=[-1]),
        _kernel_grid('poly', gamma=[-1], degree=[2, 3, 4, 5]),
    ]


def check_if_model_result_empty(meta, results):
    """Return ``True`` when ``results`` holds no row for the configuration in ``meta``.

    Parameters
    ----------
    meta : sequence
        ``[h3_res, time_interval_length, kernel, C, gamma, degree]``.
    results : pandas.DataFrame
        Stored results with the columns ``h3_res``, ``time_interval_length``,
        ``param_kernel``, ``param_C``, ``param_gamma``, ``param_degree`` and
        ``mean_test_score``.

    Notes
    -----
    A stored row matches on ``gamma`` (and likewise on ``degree``) either when
    the value equals the one in ``meta`` or when the stored value is null.
    """
    same_resolution = results['h3_res'] == meta[0]
    same_interval = results['time_interval_length'] == meta[1]
    same_kernel = results['param_kernel'] == meta[2]
    same_penalty = results['param_C'] == meta[3]
    gamma_compatible = (results['param_gamma'] == meta[4]) | results['param_gamma'].isna()
    degree_compatible = (results['param_degree'] == meta[5]) | results['param_degree'].isna()

    already_stored = (
        same_resolution
        & same_interval
        & same_kernel
        & same_penalty
        & gamma_compatible
        & degree_compatible
    )
    return results.loc[already_stored, 'mean_test_score'].empty


def get_param_grid(model_meta):
    """Build a single-candidate parameter grid from one flattened model configuration.

    Parameters
    ----------
    model_meta : sequence
        ``[h3_res, time_interval_length, kernel, C, gamma, degree]``; only the
        last four entries are read.

    Returns
    -------
    dict
        Always contains ``kernel`` and ``C``. ``gamma`` and ``degree`` are
        included only when their value is greater than zero. Every value is
        wrapped in a one-element list.
    """
    kernel = model_meta[2]
    penalty = model_meta[3]
    gamma = model_meta[4]
    degree = model_meta[5]

    param_grid = {'kernel': [kernel], 'C': [penalty]}
    if gamma > 0:
        param_grid['gamma'] = [gamma]
    if degree > 0:
        param_grid['degree'] = [degree]
    return param_grid


def get_availabe_models_metas(path):
    """List the model configurations that still have to be trained, grouped per kernel.

    Every combination of ``PREDICTIVE_H3_RESOLUTIONS``,
    ``CALC_TIME_INTERVAL_LENGTHS`` and the hyper-parameter combinations from
    ``get_svm_metas()`` is enumerated. Combinations that already have a row
    in the results stored at ``path`` are dropped. The remainder is grouped
    by (h3 resolution, time interval, kernel).

    Parameters
    ----------
    path
        Location of the parquet results file; it is created empty when it
        does not exist yet (see ``get_results_df``).

    Returns
    -------
    list of dict
        One entry per non-empty group with the keys ``h3_res``,
        ``time_interval_length`` and ``param_grid``. ``param_grid`` is a list
        of single-candidate grids as built by ``get_param_grid``. Groups are
        ordered by resolution, then interval, then kernel.
    """
    results = get_results_df(path)
    all_metas = get_svm_metas()

    # Expand each per-kernel grid into (kernel, C, gamma, degree) tuples, then cross
    # them with every resolution/interval pair to get flat six-element configurations.
    kernel_param_combos = [combo for meta in all_metas for combo in product(*meta.values())]
    metas = [
        [h3_res, time_interval_length, *combo]
        for h3_res, time_interval_length, combo in product(
            PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS, kernel_param_combos
        )
    ]

    available_metas = metas
    if not results.empty:
        available_metas = [meta for meta in metas if check_if_model_result_empty(meta, results)]

    # Take the kernels from the search space itself (first-seen order, de-duplicated) so a
    # kernel added to get_svm_metas() is not silently left out of the grouping.
    kernels = list(dict.fromkeys(kernel for meta in all_metas for kernel in meta['kernel']))

    # Group by resolution, interval and kernel; the remaining parameters go into the grid list.
    metas_grouped = []
    for h3_res in PREDICTIVE_H3_RESOLUTIONS:
        for time_interval_length in CALC_TIME_INTERVAL_LENGTHS:
            for kernel in kernels:
                param_grid = [
                    get_param_grid(meta)
                    for meta in available_metas
                    if meta[0] == h3_res and meta[1] == time_interval_length and meta[2] == kernel
                ]
                if not param_grid:
                    continue
                metas_grouped.append({
                    'h3_res': h3_res,
                    'time_interval_length': time_interval_length,
                    'param_grid': param_grid
                })

    return metas_grouped

def store_results(new_results, path):
    """Append ``new_results`` to the parquet file at ``path``.

    The existing file is read in full, the new rows are concatenated below
    the stored ones with a fresh 0..n-1 index, and the combined frame is
    written back to the same path.
    """
    stored = pd.read_parquet(path)
    combined = pd.concat([stored, new_results], ignore_index=True)
    combined.to_parquet(path)

In [12]:
from time import time

In [13]:
# Train and evaluate every (h3 resolution, time interval, kernel) group that has no stored
# result yet, appending each group's results to SVM_RESULTS_PATH as soon as it finishes.
metas = get_availabe_models_metas(SVM_RESULTS_PATH)
# Accumulated wall-clock seconds spent inside get_model_data(). With the active (non-commented)
# get_model_data this is in-memory filtering plus one-hot encoding, not an actual disk read.
disk_read_time = 0
for meta in tqdm(metas):
    h3_res = meta['h3_res']
    time_interval_length = meta['time_interval_length']
    param_grid = meta['param_grid']
    # Groups are built per kernel, so the first grid's kernel labels the whole group.
    feedback = f"h3: {h3_res} | t:{time_interval_length} | - " + param_grid[0]["kernel"][0]
    # end="\r": presumably so the in-progress line is overwritten by the final "✓" line — confirm.
    tqdm.write( feedback, end="\r")
    start = time()
    model_data = get_model_data(h3_res, time_interval_length)
    disk_read_time += (time() - start)

    # print(f"Total data size: {len(model_data.index)}")
    # Only the first 1000 rows (in stored order, not a random sample) are used below.
    # NOTE(review): looks like a speed/debug limit — confirm before treating results as final.
    model_data = model_data.iloc[:1000]

    X_train, X_test, y_train, y_test = split_and_scale_data(model_data)

    # print(h3_res, time_interval_length)
    # print(param_grid)
    clf = train_model(param_grid, X_train, y_train)

    results = evaluate_model(clf, X_test, y_test)
    # Tag the rows with their grouping keys so the stored results can be matched later
    # (check_if_model_result_empty filters on these two columns).
    results['h3_res'] = h3_res
    results['time_interval_length'] = time_interval_length
    # Persist after every group so an interrupted run can resume from the stored results.
    store_results(results, SVM_RESULTS_PATH)     
    tqdm.write(feedback + " ✓")

  0%|          | 0/24 [00:00<?, ?it/s]

h3: 7 | t:1 | - linear ✓
h3: 7 | t:1 | - rbf ✓
h3: 7 | t:1 | - poly ✓
h3: 7 | t:2 | - linear ✓
h3: 7 | t:2 | - rbf ✓
h3: 7 | t:2 | - poly ✓
h3: 7 | t:6 | - linear ✓
h3: 7 | t:6 | - rbf ✓
h3: 7 | t:6 | - poly ✓
h3: 7 | t:24 | - linear ✓
h3: 7 | t:24 | - rbf ✓
h3: 7 | t:24 | - poly ✓
h3: 8 | t:1 | - linear ✓
h3: 8 | t:1 | - rbf ✓
h3: 8 | t:1 | - poly ✓
h3: 8 | t:2 | - linear ✓
h3: 8 | t:2 | - rbf ✓
h3: 8 | t:2 | - poly ✓
h3: 8 | t:6 | - linear ✓
h3: 8 | t:6 | - rbf ✓
h3: 8 | t:6 | - poly ✓
h3: 8 | t:24 | - linear ✓
h3: 8 | t:24 | - rbf ✓
h3: 8 | t:24 | - poly ✓


In [14]:

# Total seconds accumulated in `disk_read_time` by the training loop (time spent in get_model_data).
print(disk_read_time)

120.74320149421692


In [15]:
# results = pd.read_parquet(SVM_RESULTS_PATH)
# results.param_kernel.unique()