In [1]:
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
import json
import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error

# Raise pandas' display limits (999 columns, 500 rows) for frames shown in the notebook.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 500

In [2]:
# Directory containing the input datasets, relative to the working directory.
# The trailing slash is required: file names are appended by plain string
# concatenation (path + "<file>").
path = "./Datasets/"

In [3]:
# Load the dataset, separate the target column from the features, and build
# a 70/30 train/test split stored as C-contiguous NumPy arrays.

resp_df = pd.read_parquet(f"{path}resp_df.parquet")

# Features are every column except the target.
X = resp_df.drop(columns=["av_rolling_resp_time_250"])
y = resp_df["av_rolling_resp_time_250"]

# train_test_split returns [X_train, X_test, y_train, y_test]; each piece is
# converted to a contiguous ndarray in that same order.
X_train, X_test, y_train, y_test = map(
    np.ascontiguousarray,
    train_test_split(X, y, test_size=0.30, random_state=42),
)

In [4]:
# Show the first row of the training array as a quick sanity check of the feature values.
X_train[0]

array([0.5       , 1.        , 0.2       , 1.        , 0.05      ,
       0.0952381 , 0.22222222, 0.15      , 0.25      , 0.05555556,
       0.05555556, 0.        , 0.05      , 0.125     , 0.        ,
       0.        , 0.05555556, 0.05263158, 0.        , 0.2       ,
       0.16666667, 0.19047619, 0.11111111, 0.        , 0.11764706,
       0.05555556, 0.0625    , 0.11764706, 0.05555556, 0.23529412,
       0.        , 0.05555556, 0.11764706, 0.04347826, 0.27777778,
       0.        , 0.09090909, 0.11764706, 0.22222222, 0.0625    ,
       0.        , 0.125     , 0.05882353, 0.26666667, 0.05263158,
       0.        , 0.17391304, 0.13333333, 0.05882353, 0.25      ,
       0.03846154, 0.16666667, 0.0625    , 0.        , 0.04761905,
       0.        , 0.0625    , 0.0625    , 0.05882353, 0.10526316,
       0.375     , 0.1       , 0.25      , 0.13043478, 0.05      ,
       0.17647059, 0.        , 0.13333333, 0.25      , 0.        ,
       0.11764706, 0.11111111, 0.0625    , 0.1875    , 0.14285

In [5]:
# Show the dimensions of the training array.
X_train.shape

(2974918, 133)

In [None]:
# Train the final Ridge model using the best params from grid search, evaluate
# it on the hold-out split, and persist both the metrics and the fitted model.

# Notes on the estimator arguments:
# - `normalize` is intentionally not passed. False was its default, and the
#   argument was removed in scikit-learn 1.2 (passing it raises TypeError).
# - `random_state` is pinned because the 'sag' solver shuffles the data;
#   without a seed the coefficients and the saved metrics differ between runs.
model = Ridge(
    alpha=0.01,
    copy_X=True,
    fit_intercept=False,
    max_iter=None,
    random_state=42,
    solver='sag',
    tol=0.001,
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Compute the MSE once and derive the RMSE from it instead of making a second
# pass over the test predictions.
mse = mean_squared_error(y_test, y_pred)

models_dict = {
    'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
    'Mean Squared Error': mse,
    'Root Mean Squared Error': np.sqrt(mse),
    # Same text as str(now) with the date/time space replaced by "_".
    "Datetime": datetime.datetime.now().isoformat(sep="_")
}

print(models_dict)

# Make sure the output directory exists so a long fit is not lost to a
# FileNotFoundError when saving.
os.makedirs('./Models', exist_ok=True)

with open('./Models/super_model_results.pkl', 'wb') as f:
    pickle.dump(models_dict, f)

with open('./Models/super_model.pkl', 'wb') as f:
    pickle.dump(model, f)
