In [1]:
import pandas as pd
import numpy as np

# Load the full dataset; index_col=0 uses the first CSV column as the row index.
# This frame is used further down to refit the final model on all rows.
df = pd.read_csv("./datasets/bike_data_normalized.csv", index_col=0)

In [2]:
# Optionally remove correlated one-hot encoded variables
# df.drop("Seasons_Autumn",axis=1, inplace=True) # can be determined from other season columns
# df.drop("DWeek_Saturday",axis=1, inplace=True) # Saturday can be determined from Sunday and weekend
# df.drop("DWeek_Friday",axis=1, inplace=True) # Drop one weekday

In [3]:
# Copy this to other notebooks to get the train and test data
def load_train_test(data_dir="./datasets", target="Count"):
    """Load the pre-split bike train/test CSVs and split features from the target.

    Parameters
    ----------
    data_dir : str or path-like, default "./datasets"
        Directory that contains ``bike_train.csv`` and ``bike_test.csv``.
    target : str, default "Count"
        Name of the column to predict; every other column is a feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` items are NumPy
        arrays holding all non-target columns; the ``y_*`` items are pandas
        Series of the target column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of one of the files.
    ValueError
        If the train and test files do not have identical feature columns
        in the same order.
    """
    # Local import so the function stays self-contained when copied elsewhere.
    from pathlib import Path

    data_dir = Path(data_dir)
    train_df = pd.read_csv(data_dir / "bike_train.csv", index_col=0)
    test_df = pd.read_csv(data_dir / "bike_test.csv", index_col=0)

    train_features = train_df.drop(columns=target)
    test_features = test_df.drop(columns=target)

    # `.values` discards column names, so a mismatch in column set or order
    # would silently feed misaligned features to the model. Fail loudly instead.
    if list(train_features.columns) != list(test_features.columns):
        raise ValueError(
            "Train and test feature columns differ: "
            f"{list(train_features.columns)} vs {list(test_features.columns)}"
        )

    X_train = train_features.values
    y_train = train_df[target]
    X_test = test_features.values
    y_test = test_df[target]
    return (X_train, X_test, y_train, y_test)

## Load Data

In [4]:
# Unpack the pre-split data: X_* are NumPy feature arrays, y_* are the "Count" Series.
X_train, X_test, y_train, y_test = load_train_test()

## SVM

In [5]:
# With some parameters that worked well from the grid search
from sklearn.svm import SVR
# No kernel is passed, so scikit-learn's default kernel (RBF) is used.
regr = SVR(C=1000, gamma=10)
regr.fit(X_train, y_train)
# Last expression, so it is shown as the cell output: R^2 on the test split.
regr.score(X_test, y_test)

0.8161119303300665

In [6]:
# Train r-squared value (scored on the data the model was fit on); compare it
# with the test-split score to see how much the model overfits.
regr.score(X_train, y_train)

0.9001386528682288

In [7]:
# Train on the entire dataset; the fitted model is persisted in the
# "Persist the model" section below.
# Features/target come from `df` (bike_data_normalized.csv), not from the
# train/test split files.
X = df.drop("Count", axis=1).values
y = df["Count"]

# Same hyperparameters as the model evaluated on the train/test split.
svm = SVR(C=1000, gamma=10)
svm.fit(X,y)

SVR(C=1000, gamma=10)

In [8]:
# In-sample R^2: scored on the same X, y the model was just fit on, so this is
# not an estimate of performance on unseen data.
svm.score(X,y)

0.9007983425367263

## Persist the model

In [9]:
import os
from joblib import dump, load

# joblib.dump does not create missing parent directories; make sure ./models
# exists so this cell also works on a fresh checkout (Restart & Run All).
os.makedirs('./models', exist_ok=True)
# Serialize the model fitted on the full dataset; dump() returns the list of
# written file names, which is displayed as the cell output.
dump(svm, './models/svm.joblib') 

['./models/svm.joblib']

In [None]:
# https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/
# Reference on range to search: https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html
from sklearn.model_selection import GridSearchCV
# Coarse 3 x 3 grid (9 candidates) over log-spaced C and gamma values.
param_grid = { 
            'kernel': ['rbf'], 
            'C': np.logspace(-2, 10, 3),  # -> 1e-2, 1e4, 1e10
            'gamma': np.logspace(-1, 3, 3)  # -> 1e-1, 1e1, 1e3
} 
# NOTE(review): C=1000, used for the fitted models in this notebook, is not one
# of the C values in this grid; presumably it came from a different/finer
# search. Confirm before relying on this cell as the source of those parameters.
# verbose=3 prints a score/time line for every individual fold fit, which
# produces a long log; no `cv` is passed, so the default fold count applies.
grid = GridSearchCV(SVR(), param_grid,verbose = 3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=0.01, gamma=0.1, kernel=rbf ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... C=0.01, gamma=0.1, kernel=rbf, score=-0.102, total=   1.6s
[CV] C=0.01, gamma=0.1, kernel=rbf ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] ...... C=0.01, gamma=0.1, kernel=rbf, score=-0.140, total=   1.5s
[CV] C=0.01, gamma=0.1, kernel=rbf ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV] ...... C=0.01, gamma=0.1, kernel=rbf, score=-0.092, total=   1.6s
[CV] C=0.01, gamma=0.1, kernel=rbf ...................................
[CV] ...... C=0.01, gamma=0.1, kernel=rbf, score=-0.066, total=   1.6s
[CV] C=0.01, gamma=0.1, kernel=rbf ...................................
[CV] ...... C=0.01, gamma=0.1, kernel=rbf, score=-0.072, total=   1.5s
[CV] C=0.01, gamma=10.0, kernel=rbf ..................................
[CV] ..... C=0.01, gamma=10.0, kernel=rbf, score=-0.104, total=   1.4s
[CV] C=0.01, gamma=10.0, kernel=rbf ..................................
[CV] ..... C=0.01, gamma=10.0, kernel=rbf, score=-0.141, total=   1.6s
[CV] C=0.01, gamma=10.0, kernel=rbf ..................................
[CV] ..... C=0.01, gamma=10.0, kernel=rbf, score=-0.094, total=   1.6s
[CV] C=0.01, gamma=10.0, kernel=rbf ..................................
[CV] ..... C=0.01, gamma=10.0, kernel=rbf, score=-0.067, total=   1.6s
[CV] C=0.01, gamma=10.0, kernel=rbf ..................................
[CV] .