In [1]:
from time import time
import pandas as pd

# for model training
from sklearn.svm import SVR

# for evaluation & preprocessing
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

# for displaying results & feedback
# from tabulate import tabulate
import matplotlib.pyplot as plt

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
model_data = pd.read_parquet(MODEL_DATA_PATH)
model_data.head(2)

Unnamed: 0,start_hex_id,end_hex_id,demand,time_interval_length,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,...,public_transport_poi_start,education_poi_start,arts_and_culture_poi_start,sports_poi_start,h3_res,sustenance_poi_end,public_transport_poi_end,education_poi_end,arts_and_culture_poi_end,sports_poi_end
0,871f1a164ffffff,871f1a164ffffff,1,1,1,0,0,0,0,0,...,503,101,40,45,7,862,503,101,40,45
1,871f1a164ffffff,871f1a164ffffff,1,1,1,0,0,0,0,0,...,503,101,40,45,7,862,503,101,40,45


In [4]:
# encode start and end hexagons as one-hot vectors
model_data = model_data[(model_data['h3_res'] == 7) & (model_data['time_interval_length'] == 24)]
start_hex_dummies = pd.get_dummies(model_data.start_hex_id, prefix="start_")
end_hex_dummies = pd.get_dummies(model_data.end_hex_id, prefix="end_")
model_data = pd.concat([model_data, start_hex_dummies, end_hex_dummies], axis=1)
model_data = model_data.drop(columns=['start_hex_id', 'end_hex_id'])

In [5]:
len(model_data.index)

921849

In [6]:
model_data = model_data.iloc[:50000]

In [7]:
y = model_data["demand"]
X = model_data.drop(columns=["demand"])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

print(f"Size of the train dataset is: {X_train.shape[0]}")
print(f"Size of the test dataset is: {X_test.shape[0]}")

Size of the train dataset is: 35000
Size of the test dataset is: 15000


In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
def mean_average_percentage_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred) / y_true.mean()


def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5

In [10]:
param_grid = [
  {'kernel': ['linear'], 'C': [1, 10, 100]},
  {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.001, 0.0001]},
  {'kernel': ['poly'], 'C': [1, 10, 100], 'degree': [2, 3, 4, 5]}
 ]
svr = SVR()
clf = GridSearchCV(svr, param_grid, n_jobs=-1, verbose=4, scoring="neg_mean_squared_error")
clf.fit(X_train, y_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


In [14]:
results = pd.DataFrame(clf.cv_results_)
results.sort_values(by="mean_test_score", ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,param_degree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,409.630961,49.14997,20.986492,6.767375,100,poly,,5,"{'C': 100, 'degree': 5, 'kernel': 'poly'}",-2.450408,-2.158209,-2.360602,-2.858119,-3.441552,-2.653778,0.455026,1
19,959.026009,79.463981,37.58558,8.075796,100,poly,,4,"{'C': 100, 'degree': 4, 'kernel': 'poly'}",-5.58419,-4.644717,-5.600465,-5.697597,-6.855771,-5.676548,0.702852,2
16,608.832966,108.53516,79.123269,5.808289,10,poly,,5,"{'C': 10, 'degree': 5, 'kernel': 'poly'}",-29.902758,-29.097552,-29.826218,-33.652816,-34.894592,-31.474787,2.335773,3
15,658.872283,83.125998,82.840855,11.312941,10,poly,,4,"{'C': 10, 'degree': 4, 'kernel': 'poly'}",-34.496676,-32.971192,-33.749712,-37.644783,-38.940257,-35.560524,2.318714,4
18,3914.482007,150.344142,16.5936,1.234687,100,poly,,3,"{'C': 100, 'degree': 3, 'kernel': 'poly'}",-43.544568,-44.397436,-43.382867,-45.67911,-46.251486,-44.651093,1.141509,5


In [15]:
y_pred = clf.best_estimator_.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_average_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}")
print(f"RMSE: {rmse}")

MSE: 1.7919743465746079
MAE: 0.2095090314897931
MAPE: 0.005517780749168895
RMSE: 1.3386464606364923


In [16]:
pd.Series(y_pred).describe()

count    15000.000000
mean        37.945909
std         33.676102
min          0.899873
25%         12.899903
50%         28.100005
75%         51.099998
max        193.899954
dtype: float64