In [1]:
from time import time
import pandas as pd

# for model training
from sklearn.svm import SVR

# for evaluation & preprocessing
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    HalvingGridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

# for displaying results & feedback
# from tabulate import tabulate
import matplotlib.pyplot as plt

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
data = pd.read_parquet(MODEL_DATA_PATH)

In [4]:
# this model will filter out correct h3 resolution and time interval
# it will also one hot encode start and end hexagons and merge them to original dataframe
def get_model_data(h3_res, time_interval_length):
    model_data = data[(data['h3_res'] == h3_res) & (data['time_interval_length'] == time_interval_length)]
    start_hex_dummies = pd.get_dummies(model_data.start_hex_id, prefix="start_")
    end_hex_dummies = pd.get_dummies(model_data.end_hex_id, prefix="end_")
    model_data = pd.concat([model_data, start_hex_dummies, end_hex_dummies], axis=1)
    model_data = model_data.drop(columns=['start_hex_id', 'end_hex_id'])
    return model_data

In [5]:
def split_and_scale_data(model_data):
    y = model_data["demand"]
    X = model_data.drop(columns=["demand"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [6]:
def train_model(param_grid, X_train, y_train):
    svr = SVR()
    clf = HalvingGridSearchCV(svr, param_grid, n_jobs=-1, scoring="neg_mean_squared_error", random_state=42)
    clf.fit(X_train, y_train)
    return clf

In [7]:
def mean_average_percentage_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred) / y_true.mean()


def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5

In [8]:
def evaluate_model(clf, X_test, y_test):
    results = pd.DataFrame(clf.cv_results_)
    results.sort_values(by="mean_test_score", ascending=False)

    y_pred = clf.best_estimator_.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_average_percentage_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"MAPE: {mape}")
    print(f"RMSE: {rmse}")

    # todo merge into results

In [9]:
def store_results(new_results, path):
    results = pd.read_parquet(path)
    results = results.append(new_results)
    results.store_parquet(path)

In [10]:
import pandas as pd
import numpy as np
from itertools import product

In [11]:
def check_file_exists(path):
    return os.path.isfile(path)


def get_svm_results_columns():
    return [
        'h3_res',
        'time_interval_length',
        'mean_fit_time',
        'std_fit_time',
        'mean_score_time',
        'std_score_time',
        'param_C',
        'param_kernel',
        'param_gamma',
        'param_degree',
        'params',
        'split0_test_score',
        'split1_test_score',
        'split2_test_score',
        'split3_test_score',
        'split4_test_score',
        'mean_test_score',
        'std_test_score',
        'rank_test_score'
    ]


def init_results_df(path):
    results = pd.DataFrame(columns=get_svm_results_columns())
    results.to_parquet(path)


def get_results_df(path):
    if check_file_exists(path):
        return pd.read_parquet(path)
    return init_results_df(path)


def get_svm_metas():
    return [
        {'kernel': ['linear'], 'C': [1, 10, 100], 'gamma': [-1], 'degree': [0]},
        {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.001, 0.0001], 'degree': [0]},
        {'kernel': ['poly'], 'C': [1, 10, 100], 'gamma': [-1], 'degree': [2, 3, 4, 5]}
    ]


def check_if_model_result_empty(meta, results):
    return results[
        (results['h3_res'] == meta[0]) &
        (results['time_interval_length'] == meta[1]) &
        (results['param_kernel'] == meta[2]) &
        (results['param_C'] == meta[3]) &
        (results['param_gamma'] == meta[4]) &
        (results['param_degree'] == meta[5]) 
    ]['mean_test_score'].empty


def get_param_grid(model_meta):
    param_grid = {
        'kernel': [model_meta[2]],
        'C': [model_meta[3]],
        'degree': [model_meta[5]]
    }
    if model_meta[4] == -1:
        return param_grid

    return {**param_grid, 'degree': model_meta[4]}


def get_availabe_models_metas(path):
    results = get_results_df(path)
    all_metas = get_svm_metas()

    # the following code will create all possible combinations of parameters for all models
    metas = [list(product(*meta.values())) for meta in all_metas]
    metas = [item for sublist in metas for item in sublist]
    metas = list(product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS, metas)) 
    metas = [[item[0], item[1], *item[2]] for item in metas]

    available_metas = [meta for meta in metas if check_if_model_result_empty(meta, results)]

    # group_by h3 and time, put other params in param grid
    metas_grouped = []
    for h3_res in PREDICTIVE_H3_RESOLUTIONS:
        for time_interval_length in CALC_TIME_INTERVAL_LENGTHS:
            for kernel in ['linear', 'rbf', 'poly']:
                param_grid = [get_param_grid(meta) for meta in available_metas if (meta[0] == h3_res and meta[1] == time_interval_length and meta[2] == kernel)]
                metas_grouped.append({
                    'h3_res': h3_res,
                    'time_interval_length': time_interval_length,
                    'param_grid': param_grid
                })

    return metas_grouped

In [12]:
metas = get_availabe_models_metas(SVM_RESULTS_PATH)
for meta in metas:
    h3_res = meta['h3_res']
    time_interval_length = meta['time_interval_length']

    model_data = get_model_data(h3_res, time_interval_length)
    print(f"Total data size: {len(model_data.index)}")
    model_data = model_data.iloc[:10000]

    X_train, X_test, y_train, y_test = split_and_scale_data(model_data)
    print(f"Size of the train dataset is: {X_train.shape[0]}")
    print(f"Size of the test dataset is: {X_test.shape[0]}")

    param_grid = meta['param_grid']
    print(param_grid)
    clf = train_model(param_grid, X_train, y_train)

    results = evaluate_model(clf, X_test, y_test)
    results['h3_res'] = h3_res
    results['time_interval_length'] = time_interval_length
    store_results(results)     

NameError: name 'meta' is not defined

In [None]:
# def check_if_model_result_empty(meta, results):
#   return results[
#           (results['h3_res'] == meta['h3_res']) &
#           (results['time_interval_length'] == meta['time_interval_length']) &
#           (results['param_kernel'] == meta['param_kernel']) &
#           (results['param_C'] == meta['param_C']) &
#           (results['param_gamma'] == meta['param_gamma']) &
#           (results['param_degree'] == meta['param_degree']) 
#       ]['mean_test_score'].empty


# def get_availabe_models_metas(path):
#     results = get_results_df(path)
#     all_metas = get_svm_metas()

#     # the following code will create all possible combinations of parameters for all models
#     metas = [list(product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS, *meta.values())) for meta in all_metas]
#     metas = [item for sublist in metas for item in sublist]

#     params = [] 
#     for meta in metas:
#       kernel = meta[2]
#       comb = {
#         'h3_res': meta[0],
#         'time_interval_length': meta[1],
#         'param_kernel': kernel,
#         'param_C': meta[3],
#       }

#       if kernel == 'rbf':
#         params.append({**comb, 'param_gamma': meta[4]})
#       elif kernel == 'poly':
#         params.append({**comb, 'param_degree': meta[4]})
#       else:
#         params.append({**comb})   

#     available_metas = [param_group for param_group in params if check_if_model_result_empty(param_group, results)]

#     # group_by h3 and time, put other params in param grid
#     return available_metas