In [52]:
from time import time
import pandas as pd

# for model training
from sklearn.svm import SVR

# for evaluation & preprocessing
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

# for displaying results & feedback
# from tabulate import tabulate
import matplotlib.pyplot as plt

In [53]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
model_data = pd.read_parquet(MODEL_DATA_PATH)
model_data.head(2)

In [None]:
# encode start and end hexagons as one-hot vectors
model_data = model_data[(model_data['h3_res'] == 7) & (model_data['time_interval_length'] == 24)]
start_hex_dummies = pd.get_dummies(model_data.start_hex_id, prefix="start_")
end_hex_dummies = pd.get_dummies(model_data.end_hex_id, prefix="end_")
model_data = pd.concat([model_data, start_hex_dummies, end_hex_dummies], axis=1)
model_data = model_data.drop(columns=['start_hex_id', 'end_hex_id'])

In [7]:
model_data = model_data.iloc[:100000]

In [8]:
y = model_data["demand"]
X = model_data.drop(columns=["demand"])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
# X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=0.7, random_state=42)
# X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

print(f"Size of the train dataset is: {X_train.shape[0]}")
# print(f"Size of the validation dataset is: {X_valid.shape[0]}")
print(f"Size of the test dataset is: {X_test.shape[0]}")

Size of the train dataset is: 7000
Size of the test dataset is: 3000


In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)



In [12]:
def mean_average_percentage_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred) / y_true.mean()


def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5

In [13]:
results = []
def get_results_array(y_pred_for_validation, y_pred_for_test, kernel, C, degree):
    return [
        kernel,
        C,
        degree,
        
        mean_squared_error(y_valid, y_pred_for_validation),
        mean_absolute_error(y_valid, y_pred_for_validation),
        mean_average_percentage_error(y_valid, y_pred_for_validation),
        root_mean_squared_error(y_valid, y_pred_for_validation),
        
        mean_squared_error(y_test, y_pred_for_test),
        mean_absolute_error(y_test, y_pred_for_test),
        mean_average_percentage_error(y_test, y_pred_for_test),
        root_mean_squared_error(y_test, y_pred_for_test),
    ]

In [14]:
def train_model_and_calculate_results(kernel, C, degree):
    model = SVR(kernel=kernel, degree=degree, C=C)
    model.fit(X_train, y_train)

    y_pred_for_validation = model.predict(X_valid)
    y_pred_for_test = model.predict(X_test)
    
    results_array = get_results_array(y_pred_for_validation, y_pred_for_test, kernel, C, degree)
    results.append(results_array)

In [15]:
# kernel_options = ["linear", "poly", "rbf"]
# degree = 0
# polynomial_degree_options = [2, 3, 4, 5, 6, 7]
# C_options = [1, 10, 100]

# for kernel in kernel_options:
#     for C in C_options:
#         if kernel != 'poly':
#             train_model_and_calculate_results(kernel, C, degree)
#             continue

#         for degree in polynomial_degree_options:
#             train_model_and_calculate_results(kernel, C, degree)

In [16]:
# results = pd.DataFrame(results, columns=[
#     "kernel", "C", "degree",
#     "val_MSE", "val_MAE", "val_MAPE", "val_RMSE",
#     "test_MSE", "test_MAE", "test_MAPE", "test_RMSE"
# ])
# results.sort_values(by="val_MAPE")

In [41]:
parameters = {
    'kernel':("linear", "poly", "rbf"),
    'C':[1, 10],
    'degree':[2, 3, 4, 5, 6, 7]
}
svr = SVR()
clf = GridSearchCV(svr, parameters, n_jobs=-1, verbose=3, scoring="neg_mean_squared_error")
clf.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [42]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_degree',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [49]:
res = pd.DataFrame(clf.cv_results_)
res.sort_values(by="mean_test_score", ascending=False).head(2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
25,6.623288,1.002218,1.849753,0.853562,10,4,poly,"{'C': 10, 'degree': 4, 'kernel': 'poly'}",-100.917669,-114.316443,-132.611728,-107.313398,-116.914132,-114.414674,10.676956,1
22,7.186136,0.888642,2.226804,1.358889,10,3,poly,"{'C': 10, 'degree': 3, 'kernel': 'poly'}",-112.316617,-122.000309,-132.929557,-112.196657,-122.960364,-120.480701,7.72929,2


In [51]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 1181543985494955.0
MAE: 32848471.12409008


In [50]:
y_pred

array([  9.09993172,  55.10001362,  58.10009214, ..., 100.10005765,
        23.09988907,  71.90007431])

In [46]:
y_test

277077     48
275509    133
272556     34
275567    120
275346    107
         ... 
278839     48
271899     23
273888     95
277312     55
275530    131
Name: demand, Length: 3000, dtype: int64

In [None]:
retrain best model on all data