In [2]:
from time import time
import pandas as pd

# for model training
from sklearn.svm import SVR

# for evaluation & preprocessing
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    HalvingGridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

# for displaying results & feedback
# from tabulate import tabulate
import matplotlib.pyplot as plt

In [3]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [4]:
# Read the modelling table from the parquet file located at MODEL_DATA_PATH
# and preview its first two rows.
model_data = pd.read_parquet(path=MODEL_DATA_PATH)
model_data.head(n=2)

Unnamed: 0,start_hex_id,end_hex_id,demand,time_interval_length,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,...,public_transport_poi_start,education_poi_start,arts_and_culture_poi_start,sports_poi_start,h3_res,sustenance_poi_end,public_transport_poi_end,education_poi_end,arts_and_culture_poi_end,sports_poi_end
0,871f1a164ffffff,871f1a164ffffff,1,1,1,0,0,0,0,0,...,503,101,40,45,7,862,503,101,40,45
1,871f1a164ffffff,871f1a164ffffff,1,1,1,0,0,0,0,0,...,503,101,40,45,7,862,503,101,40,45


In [5]:
# keep only the rows with h3_res == 7 and time_interval_length == 24
row_filter = (model_data['h3_res'] == 7) & (model_data['time_interval_length'] == 24)
model_data = model_data.loc[row_filter]

# encode start and end hexagons as one-hot vectors
# (the prefixes already end in "_" and get_dummies adds its own "_" separator
# by default, so the new columns are named like "start__<hex id>")
start_hex_dummies = pd.get_dummies(model_data["start_hex_id"], prefix="start_")
end_hex_dummies = pd.get_dummies(model_data["end_hex_id"], prefix="end_")

# append the indicator columns, then remove the raw hexagon id columns
model_data = (
    pd.concat([model_data, start_hex_dummies, end_hex_dummies], axis=1)
    .drop(columns=['start_hex_id', 'end_hex_id'])
)

In [6]:
# number of rows left after filtering and encoding
model_data.shape[0]

921849

In [7]:
# Keep only the first 50,000 rows (in the frame's current order, i.e. this is
# not a random sample) before splitting and training.
model_data = model_data.head(50000)

In [8]:
# "demand" is the regression target; every remaining column is a feature.
X = model_data.drop(columns=["demand"])
y = model_data["demand"]

# 70% of the rows go to training, the rest to testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

print(f"Size of the train dataset is: {X_train.shape[0]}")
print(f"Size of the test dataset is: {X_test.shape[0]}")

Size of the train dataset is: 35000
Size of the test dataset is: 15000


In [9]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted transformation to both splits.
# Note: X_train / X_test are overwritten with the scaled arrays, so re-running
# this cell without re-running the split cell would scale the data a second time.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
def mean_average_percentage_error(y_true, y_pred):
    """Return the mean absolute error relative to the mean of ``y_true``.

    The value is ``mean_absolute_error(y_true, y_pred) / y_true.mean()``,
    i.e. the MAE normalised by the average target value. It is *not* the
    mean of the per-sample relative errors ``|y - y_hat| / |y|``.

    Parameters
    ----------
    y_true : object with a ``.mean()`` method
        Observed target values; the code calls ``y_true.mean()`` directly.
    y_pred : array-like
        Predicted values, passed through to ``mean_absolute_error``.

    Returns
    -------
    The ratio as a fraction (it is not multiplied by 100).
    """
    absolute_error = mean_absolute_error(y_true, y_pred)
    target_mean = y_true.mean()
    return absolute_error / target_mean


def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return squared_error ** 0.5

In [11]:
# Candidate hyper-parameters: one sub-grid per kernel, each paired with the
# kernel-specific parameter it tunes (gamma for rbf, degree for poly).
param_grid = [
    dict(kernel=['linear'], C=[1, 10, 100]),
    dict(kernel=['rbf'], C=[1, 10, 100], gamma=[0.001, 0.0001]),
    dict(kernel=['poly'], C=[1, 10, 100], degree=[2, 3, 4, 5]),
]

svr = SVR()

# Successive-halving search over the grid, scored by negative MSE (higher is
# better), using all available cores and a fixed seed.
clf = HalvingGridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train, y_train)

In [12]:
# Tabulate the cross-validation log and show the five best-scoring candidates
# (scores are negative MSE, so the largest value is the best).
results = pd.DataFrame(data=clf.cv_results_)
results.sort_values("mean_test_score", ascending=False).iloc[:5]

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,param_degree,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
30,2,34992,804.191074,84.796919,47.307169,2.693901,100,poly,,5,...,-2.654826,0.455459,1,-2.250397,-2.107887,-1.953211,-2.334988,-2.411238,-2.211544,0.163772
29,2,34992,1106.830871,60.876727,35.828149,3.590523,100,poly,,4,...,-5.678539,0.703346,2,-4.798248,-4.713985,-4.643493,-4.728365,-4.707741,-4.718366,0.049435
26,1,11664,32.063715,6.620148,4.276786,0.867433,100,poly,,5,...,-16.923292,2.328463,3,-9.87012,-9.524108,-8.759425,-9.15165,-8.341473,-9.129355,0.540863
27,1,11664,31.88968,1.653201,3.444474,1.182337,100,poly,,4,...,-24.259991,3.613449,4,-14.275176,-14.66483,-14.113496,-14.692955,-13.756395,-14.30057,0.351687
28,2,34992,3053.402636,183.363885,16.660356,1.789491,100,poly,,3,...,-44.632951,1.152993,5,-40.710521,-40.936003,-39.848274,-39.847225,-39.969499,-40.262304,0.465664


In [13]:
# Predict on the held-out test set with the best estimator found by the search.
best_model = clf.best_estimator_
y_pred = best_model.predict(X_test)

# Error metrics on the test set. "mape" is the MAE divided by the mean of
# y_test, as computed by mean_average_percentage_error.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_average_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}")
print(f"RMSE: {rmse}")

MSE: 1.7919743465746079
MAE: 0.2095090314897931
MAPE: 0.005517780749168895
RMSE: 1.3386464606364923


In [14]:
# Summary statistics of the predicted values on the test set.
predicted_demand = pd.Series(y_pred)
predicted_demand.describe()

count    15000.000000
mean        37.945909
std         33.676102
min          0.899873
25%         12.899903
50%         28.100005
75%         51.099998
max        193.899954
dtype: float64