In [1]:
%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib as mpl
import IPython.display as ipd
import sklearn as sk
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.compose

In [2]:
# Load the day-level bike-sharing table; the first CSV column becomes the index.
data = pd.read_csv('datasets/bikesharing-day.csv', index_col=0)

# Parse the date strings and truncate every timestamp to midnight.
data['dteday'] = pd.to_datetime(data['dteday'], format='%Y-%m-%d').dt.normalize()

# Calendar and weather codes are treated as categorical.
# 'holiday' and 'workingday' are left exactly as loaded (no bool cast).
data = data.astype({
    'season': 'category',
    'mnth': 'category',
    'weekday': 'category',
    'weathersit': 'category',
})

# 'hr' is converted only when the loaded frame actually has that column.
if 'hr' in data:
    data['hr'] = data['hr'].astype('category')

In [3]:
# Columns kept for modelling; 'cnt' is the column later passed as the target.
columns = ['season','weekday','holiday','temp','hum','cnt']
projected_data = data.loc[:, columns]

# A fixed seed keeps the train/test partition (and therefore every reported
# metric) identical across Restart Kernel -> Run All.
train, test = sk.model_selection.train_test_split(projected_data, random_state=42)

# One-hot encode the two categorical columns; every other column is passed
# through unchanged.
ct = sk.compose.ColumnTransformer([
    ('col',sk.preprocessing.OneHotEncoder(categories='auto'),[
        'season', 'weekday'
    ])
], remainder='passthrough')

In [4]:
def regression_model(model, predict, param_grid, transform=True, random_state=None):
    """Tune `model` with a randomized search and score it on the held-out split.

    Reads the notebook-level `train`, `test`, `projected_data` and `ct`
    objects; when `transform` is true, `ct` is (re)fitted as a side effect.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator handed to `RandomizedSearchCV`.
    predict : str
        Name of the target column. Every other column of `train` is a feature.
    param_grid : dict
        Forwarded as `param_distributions` to `RandomizedSearchCV`.
    transform : bool, default True
        When true, features are encoded with `ct` before the search is fitted
        and before predicting, so both steps see the same representation.
    random_state : int or None, default None
        Forwarded to `RandomizedSearchCV` to make the sampled candidates
        reproducible. `None` keeps the previous non-deterministic sampling.

    Returns
    -------
    pandas.Series
        'RMSE' and 'MAE' measured on `test`, plus the `model` object that was
        passed in, named after the model's class.
    """
    variates = train.columns.drop(predict)

    trainX = train.loc[:, variates]
    trainy = train.loc[:, predict]
    testX = test.loc[:, variates]
    testy = test.loc[:, predict]

    if transform:
        # Encode BEFORE fitting the search. Previously the search was fitted on
        # the raw features and only the prediction inputs were encoded, so the
        # fitted model and the data it predicted on did not match.
        # The encoder is fitted on the full projected data so that train and
        # test are mapped onto the same set of one-hot columns.
        ct.fit(projected_data.loc[:, variates])
        trainX = ct.transform(trainX)
        testX = ct.transform(testX)

    # Randomized search with 5-fold cross-validation over 10 sampled candidates.
    rgs = sk.model_selection.RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        cv=5,
        n_iter=10,
        n_jobs=-1,
        verbose=4,
        random_state=random_state,
    )
    rgs.fit(trainX, trainy)
    print(rgs)

    predictedy = rgs.predict(testX)
    return pd.Series({
        'RMSE': np.sqrt(sk.metrics.mean_squared_error(testy, predictedy)),
        'MAE': sk.metrics.mean_absolute_error(testy, predictedy),
        'model': model
    }, name=type(model).__name__)

In [5]:
# Search space for the support-vector regressors: five log-spaced values each
# for C (1e-2 .. 1e10) and gamma (1e-9 .. 1e3), tried with two kernels.
# NOTE(review): the recorded run of the NuSVR cell shows 17 of 50 fits taking
# 39.5 min; the very large C values may be the cause - confirm.
param_dist_svr = dict(
    C=np.logspace(-2, 10, num=5),
    gamma=np.logspace(-9, 3, num=5),
    kernel=['rbf', 'linear'],
)

In [None]:
import sklearn.linear_model
import sklearn.svm
import sklearn.neural_network


# Earlier experiments, kept as notes only:
#   * an untuned comparison of LinearRegression, SVR(gamma='scale') and
#     MLPRegressor(solver='lbfgs'); that call predates the param_grid argument
#     and no longer matches regression_model's signature.
#   * a smaller SVR grid: C in [0.001, 0.01, 0.1, 1, 10],
#     gamma in [0.001, 0.01, 0.1, 1], kernel 'rbf' only.

# Tune each (estimator, search space) pair on the 'cnt' target using the raw
# (un-encoded) features, then list the scores from lowest to highest RMSE.
pd.DataFrame([
    regression_model(estimator, 'cnt', search_space, transform=False)
    for estimator, search_space in [
        (sk.svm.NuSVR(), param_dist_svr),
    ]
]).drop(columns='model').sort_values('RMSE')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 39.5min


In [7]:
# Tune a plain SVR on the 'cnt' target with the raw (un-encoded) features.
# Uses param_dist_svr: the previous name, param_dist_svc, is never assigned in
# the visible notebook and raised NameError on a fresh kernel.
pd.DataFrame([regression_model(m, 'cnt', param_dist_svr, transform=False) for m in [
    sk.svm.SVR()
]]).drop(columns='model').sort_values('RMSE')

{'C': array([1.e-02, 1.e+01, 1.e+04, 1.e+07, 1.e+10]),
 'gamma': array([1.e-09, 1.e-06, 1.e-03, 1.e+00, 1.e+03]),
 'kernel': ['rbf', 'linear']}