In [1]:
from time import time
import pandas as pd

# for model training
from sklearn.svm import SVR

# for evaluation & preprocessing
from sklearn.model_selection import (
    train_test_split,
    ParameterGrid,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

# for displaying results & feedback
# from tabulate import tabulate
import matplotlib.pyplot as plt

In [2]:
import sys, os

# Put the parent directory on the import path; presumably that is where the
# `modules` package imported below lives -- confirm against the repo layout.
sys.path.append(os.path.abspath('..'))
# Turn on IPython's autoreload (mode 2) so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): the wildcard import hides where names come from.
# MODEL_DATA_PATH (used when loading the data) is not defined anywhere in the
# notebook itself, so it presumably comes from here -- confirm.
from modules.config import *

In [3]:
# Load the modelling table from the pickle file at MODEL_DATA_PATH.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
model_data = pd.read_pickle(MODEL_DATA_PATH)
# Preview the first two rows.
model_data.head(2)

Unnamed: 0,demand,hour,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,month_1,...,end_881f1abb2dfffff,end_881f1abb31fffff,end_881f1abb35fffff,end_881f1abb39fffff,end_881f1abb61fffff,end_881f1abb63fffff,end_881f1abb65fffff,end_881f1abb67fffff,end_881f1abb69fffff,end_881f1abb6bfffff
0,3,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,12,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Display the frame; the rendered output elides the middle rows and columns.
model_data

Unnamed: 0,demand,hour,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,month_1,...,end_881f1abb2dfffff,end_881f1abb31fffff,end_881f1abb35fffff,end_881f1abb39fffff,end_881f1abb61fffff,end_881f1abb63fffff,end_881f1abb65fffff,end_881f1abb67fffff,end_881f1abb69fffff,end_881f1abb6bfffff
0,3,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,12,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,6,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,18,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,18,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372203,1,6,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
372204,1,12,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
372205,2,6,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
372206,1,12,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Keep only the leading rows of the table. This is a positional slice of the
# first rows, not a random sample.
n_rows_to_keep = 10000
model_data = model_data.iloc[:n_rows_to_keep]

In [6]:
# Target is the "demand" column; every other column is used as a feature.
y = model_data["demand"]
X = model_data.drop(columns=["demand"])

# First split: 70 % of the rows go to training. Second split: the remaining
# rows are halved into validation and test sets. Both splits use a fixed
# random_state so the partition is reproducible.
X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=0.7, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

# Report the number of rows in each partition.
print(f"Size of the train dataset is: {X_train.shape[0]}")
print(f"Size of the validation dataset is: {X_valid.shape[0]}")
print(f"Size of the test dataset is: {X_test.shape[0]}")

Size of the train dataset is: 7000
Size of the validation dataset is: 1500
Size of the test dataset is: 1500


In [7]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to all three splits so validation/test statistics never
# influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_valid, X_test = [
    scaler.transform(features) for features in (X_train, X_valid, X_test)
]

In [8]:
def mean_average_percentage_error(y_true, y_pred):
    """Return the mean absolute error as a fraction of the mean true value.

    The value computed is ``MAE(y_true, y_pred) / y_true.mean()``, i.e. the
    absolute error normalised by the average target. It is not the per-sample
    ``mean(|y_true - y_pred| / |y_true|)``.

    Args:
        y_true: True target values; must provide a ``.mean()`` method.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The mean absolute error divided by the mean of ``y_true``.
    """
    absolute_error = mean_absolute_error(y_true, y_pred)
    target_mean = y_true.mean()
    return absolute_error / target_mean


def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error.

    Args:
        y_true: True target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        ``mean_squared_error(y_true, y_pred)`` raised to the power 0.5.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return squared_error ** 0.5

In [9]:
# Accumulates one row (hyper-parameters followed by metrics) per trained model.
results = []


def get_results_array(y_pred_for_validation, y_pred_for_test, kernel, C, degree):
    """Build one result row: hyper-parameters, then validation and test metrics.

    The true targets are read from the notebook-level ``y_valid`` and
    ``y_test`` variables.

    Args:
        y_pred_for_validation: Predictions for the validation split.
        y_pred_for_test: Predictions for the test split.
        kernel: Kernel value recorded in the row.
        C: ``C`` value recorded in the row.
        degree: ``degree`` value recorded in the row.

    Returns:
        A list ``[kernel, C, degree, <4 validation metrics>, <4 test metrics>]``
        where each metric group is ordered MSE, MAE,
        ``mean_average_percentage_error``, RMSE.
    """
    metric_functions = (
        mean_squared_error,
        mean_absolute_error,
        mean_average_percentage_error,
        root_mean_squared_error,
    )
    evaluation_pairs = (
        (y_valid, y_pred_for_validation),
        (y_test, y_pred_for_test),
    )

    row = [kernel, C, degree]
    for targets, predictions in evaluation_pairs:
        for metric in metric_functions:
            row.append(metric(targets, predictions))
    return row

In [10]:
def train_model_and_calculate_results(kernel, C, degree):
    """Fit an SVR with the given settings and record its metrics.

    The model is trained on the notebook-level ``X_train`` / ``y_train``,
    used to predict ``X_valid`` and ``X_test``, and the resulting metric row
    is appended to the notebook-level ``results`` list. Nothing is returned.

    Args:
        kernel: Passed to ``SVR`` as ``kernel``.
        C: Passed to ``SVR`` as ``C``.
        degree: Passed to ``SVR`` as ``degree``.
    """
    regressor = SVR(kernel=kernel, degree=degree, C=C)
    regressor.fit(X_train, y_train)

    validation_predictions = regressor.predict(X_valid)
    test_predictions = regressor.predict(X_test)

    results.append(
        get_results_array(validation_predictions, test_predictions, kernel, C, degree)
    )

In [11]:
# Train one polynomial-kernel SVR (C=1, degree=7) and append its metrics to `results`.
# NOTE(review): the saved results table in this notebook lists degree 2, so that
# output appears to come from an earlier run -- re-run all cells to refresh it.
train_model_and_calculate_results("poly", 1, 7)

In [12]:
# kernel_options = ["linear", "poly", "rbf"]
# degree = 0
# polynomial_degree_options = [2, 3, 4, 5, 6, 7]
# C_options = [1, 10, 100]

# for kernel in kernel_options:
#     for C in C_options:
#         if kernel != 'poly':
#             train_model_and_calculate_results(kernel, C, degree)
#             continue

#         for degree in polynomial_degree_options:
#             train_model_and_calculate_results(kernel, C, degree)

In [13]:
# Build the summary table under its own name instead of rebinding `results`.
# `train_model_and_calculate_results` calls `results.append(...)` on the list;
# if `results` were replaced by a DataFrame here, any training call made after
# this cell would fail (or silently discard the row on older pandas versions).
# Keeping the list intact also makes this cell safe to re-run at any time.
results_df = pd.DataFrame(results, columns=[
    "kernel", "C", "degree",
    "val_MSE", "val_MAE", "val_MAPE", "val_RMSE",
    "test_MSE", "test_MAE", "test_MAPE", "test_RMSE"
])
# Rank the trained models by their validation score (best first).
results_df.sort_values(by="val_MAPE")

Unnamed: 0,kernel,C,degree,val_MSE,val_MAE,val_MAPE,val_RMSE,test_MSE,test_MAE,test_MAPE,test_RMSE
0,poly,1,2,1.034219,0.549897,0.356459,1.016966,1.027988,0.531515,0.347244,1.013897
