In [1]:
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
import json
import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error

pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 500)

In [2]:
path = "./Datasets/"

In [3]:
# Extract and prepare data

resp_df = pd.read_parquet(path + "resp_df.parquet")

y = resp_df.av_rolling_resp_time_250

X = resp_df.drop(columns=["av_rolling_resp_time_250"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = np.ascontiguousarray(X_train)
y_train = np.ascontiguousarray(y_train)

X_test = np.ascontiguousarray(X_test)
y_test = np.ascontiguousarray(y_test)

In [4]:
X_train[0]

array([0.5       , 1.        , 0.2       , 1.        , 0.05      ,
       0.0952381 , 0.22222222, 0.15      , 0.25      , 0.05555556,
       0.05555556, 0.        , 0.05      , 0.125     , 0.        ,
       0.        , 0.05555556, 0.05263158, 0.        , 0.2       ,
       0.16666667, 0.19047619, 0.11111111, 0.        , 0.11764706,
       0.05555556, 0.0625    , 0.11764706, 0.05555556, 0.23529412,
       0.        , 0.05555556, 0.11764706, 0.04347826, 0.27777778,
       0.        , 0.09090909, 0.11764706, 0.22222222, 0.0625    ,
       0.        , 0.125     , 0.05882353, 0.26666667, 0.05263158,
       0.        , 0.17391304, 0.13333333, 0.05882353, 0.25      ,
       0.03846154, 0.16666667, 0.0625    , 0.        , 0.04761905,
       0.        , 0.0625    , 0.0625    , 0.05882353, 0.10526316,
       0.375     , 0.1       , 0.25      , 0.13043478, 0.05      ,
       0.17647059, 0.        , 0.13333333, 0.25      , 0.        ,
       0.11764706, 0.11111111, 0.0625    , 0.1875    , 0.14285

In [5]:
X_train.shape

(2974918, 133)

In [10]:
'''
RandomForestRegressor()
"RandomForestRegressor": {
        "model": RandomForestRegressor(),
        "params": {
        "bootstrap": [True, False],
        "max_depth": [2, 3, 5, 10, 20, 40, 80, None],
        "max_features": ["sqrt"],
        "min_samples_leaf": [1, 2, 4],
        "min_samples_split": [2, 5, 10],
        "n_jobs": [-1],
        "n_estimators": [
            10,
            50,
            100,
            400,
            1000,
            2000,
'''
# Train model using best params from grid search

models_dict = dict()

model = RandomForestRegressor(max_depth=5, max_features="sqrt",  n_jobs=-1, n_estimators= 300)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

models_dict = {
    'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
    'Mean Squared Error': mean_squared_error(y_test, y_pred),
    'Root Mean Squared Error': np.sqrt(mean_squared_error(y_test, y_pred)),
    "Datetime": "_".join(str(datetime.datetime.now()).split())
}


print(model.feature_importances_)

print(models_dict)
'''
with open('./Models/super_model_results.pkl', 'wb') as f:
    pickle.dump(models_dict, f)

with open('./Models/super_model.pkl', 'wb') as f:
    pickle.dump(model, f)
'''

[3.70818597e-02 1.23920906e-01 2.94673355e-01 5.72380492e-02
 2.05085804e-04 1.48241052e-04 1.45755789e-05 1.68148639e-03
 4.18791980e-04 7.68001932e-04 3.14455965e-04 4.76376790e-04
 8.42445310e-04 3.40887843e-04 8.78524744e-04 8.66071992e-04
 9.18656527e-05 9.18710244e-05 1.29323661e-03 1.42433388e-03
 1.42346779e-04 3.36256145e-02 1.45349984e-03 1.09461619e-03
 4.98357558e-05 3.47126935e-04 8.48574502e-05 2.53983761e-05
 3.62868629e-05 9.28796425e-04 4.29608170e-04 5.78591410e-04
 8.43944519e-04 7.90896198e-04 8.53230217e-05 3.73332693e-04
 5.17529843e-05 1.63204939e-04 9.25962977e-05 3.70278783e-05
 1.72064815e-04 8.63702456e-05 5.90125447e-05 4.61577771e-04
 1.87392233e-02 3.79801011e-05 1.34517440e-02 1.12635708e-03
 5.98737669e-05 1.05515980e-03 9.34317959e-04 4.60509825e-04
 7.96357248e-05 8.91686239e-04 3.06458131e-03 6.79716500e-05
 3.62355142e-04 4.17772269e-04 1.70607867e-04 1.11118639e-02
 2.35365006e-04 6.14643296e-02 2.62666175e-04 2.13297759e-02
 2.71320896e-02 7.266945

"\nwith open('./Models/super_model_results.pkl', 'wb') as f:\n    pickle.dump(models_dict, f)\n\nwith open('./Models/super_model.pkl', 'wb') as f:\n    pickle.dump(model, f)\n"

In [11]:
with open('./Models/RF_model_results.pkl', 'wb') as f:
    pickle.dump(models_dict, f)

with open('./Models/RF_super_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [6]:
# Train model using best params from grid search

models_dict = dict()

model = Ridge(alpha=0.01, copy_X=True, fit_intercept=False, max_iter=None, normalize=False, random_state=None, solver='sag', tol=0.001)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

models_dict = {
    'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
    'Mean Squared Error': mean_squared_error(y_test, y_pred),
    'Root Mean Squared Error': np.sqrt(mean_squared_error(y_test, y_pred)),
    "Datetime": "_".join(str(datetime.datetime.now()).split())
}

print(models_dict)

with open('./Models/super_model_results.pkl', 'wb') as f:
    pickle.dump(models_dict, f)

with open('./Models/super_model.pkl', 'wb') as f:
    pickle.dump(model, f)


{'Mean Absolute Error': 0.03996184862674655, 'Mean Squared Error': 0.0044228674397935475, 'Root Mean Squared Error': 0.06650464224242957, 'Datetime': '2019-09-23_19:21:30.430228'}


In [7]:
model.coef_

array([-0.0047369 ,  0.02699067, -0.14108559, -0.07388089,  0.03612215,
       -0.00536044,  0.02247008, -0.003981  , -0.00391725, -0.0287208 ,
        0.04016192,  0.01426214, -0.00420039, -0.02032164, -0.01983818,
       -0.00467941,  0.0085137 ,  0.01741639,  0.00528081,  0.02975786,
        0.01811988,  0.06377715,  0.03570359,  0.02145515,  0.01854402,
        0.01322352,  0.02312829,  0.02288895,  0.02188773, -0.00364208,
       -0.00343423, -0.00872134, -0.02833609,  0.03390692, -0.00283011,
       -0.01460388,  0.01587269,  0.02623964,  0.02588624,  0.01233973,
        0.00249478,  0.03226455,  0.03191723, -0.00373894,  0.01406448,
        0.04049004, -0.01265896,  0.02362847,  0.04486216,  0.02372124,
        0.08126604,  0.04131858,  0.02267117,  0.02549065,  0.07015365,
        0.00645866,  0.00998998,  0.00709957,  0.01810426,  0.01121783,
        0.03753595,  0.13341443,  0.04979112,  0.0593423 ,  0.03202561,
        0.02410554,  0.01372761,  0.01436381,  0.01387061,  0.01