In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    r2_score,
    root_mean_squared_error,
    mean_absolute_error,
    mean_squared_error,
    make_scorer,
)
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Read the pre-cleaned train and test splits from disk.
train_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in ("clean_data/train.csv", "clean_data/test.csv")
]

In [3]:
# Features are every column except "On" and "Off"; the regression target is "On".
# errors="ignore" keeps the selection tolerant of a missing column.
Y_train = train_df["On"]
X_train = train_df.drop(columns=["On", "Off"], errors="ignore")

In [4]:
# Same feature/target split as the training frame, applied to the test frame.
Y_test = test_df["On"]
X_test = test_df.drop(columns=["On", "Off"], errors="ignore")

In [None]:
# Baseline model: gradient boosting with library-default hyperparameters,
# a fixed seed, and per-iteration progress logging.
gbr = GradientBoostingRegressor(
    verbose=2,
    random_state=42,
)
gbr.fit(X=X_train, y=Y_train)

In [None]:
# Baseline predictions on the test set, rounded down to whole numbers.
# NOTE(review): flooring shifts every prediction downward; confirm this is
# preferred over rounding to the nearest integer.
raw_test_pred = gbr.predict(X_test)
Y_pred_gbr = np.floor(raw_test_pred).astype(int)

In [None]:
# Test-set RMSE of the baseline model, rounded to three decimals.
# root_mean_squared_error is the same helper the other metric cells use, and
# numeric rounding avoids formatting the value to a string and parsing it back.
rmse_gbr = round(float(root_mean_squared_error(Y_test, Y_pred_gbr)), 3)

In [None]:
# Show the rounded baseline test RMSE as the cell output.
rmse_gbr

In [5]:
# Relative folder into which the fitted models are pickled by later cells.
MODELS_FOLDER = "models"

In [None]:
# Persist the baseline model so it can be reloaded without refitting.
# open(..., "wb") does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs(MODELS_FOLDER, exist_ok=True)
# The context manager flushes and closes the file even if pickling raises,
# instead of leaving the handle open until garbage collection.
with open(os.path.join(MODELS_FOLDER, "base_gbt.pkl"), "wb") as model_file:
    pickle.dump(gbr, model_file)

In [None]:
# Baseline model metrics on the test set (using the floored predictions).
mae = mean_absolute_error(y_true=Y_test, y_pred=Y_pred_gbr)
rmse = root_mean_squared_error(y_true=Y_test, y_pred=Y_pred_gbr)
r2 = r2_score(y_true=Y_test, y_pred=Y_pred_gbr)
print(f"test rmse: {rmse}, mae: {mae}, r2: {r2}")

In [None]:
# Baseline model metrics on the training set, for comparison with the test
# metrics (a large gap indicates over-fitting).
# The train predictions get their own name so that Y_pred_gbr keeps holding
# the test-set predictions; reusing that name would make a re-run of the
# test-metrics cell score train predictions against Y_test.
Y_pred_gbr_train = np.floor(gbr.predict(X_train)).astype(int)
r2 = r2_score(Y_train, Y_pred_gbr_train)
rmse = root_mean_squared_error(Y_train, Y_pred_gbr_train)
mae = mean_absolute_error(Y_train, Y_pred_gbr_train)
print(f"train rmse: {rmse}, mae: {mae}, r2: {r2}")

In [13]:
# Estimator whose fixed settings are shared by every grid-search candidate.
base_gbt = GradientBoostingRegressor(
    loss="squared_error",
    criterion="friedman_mse",
    min_samples_leaf=7,
    min_samples_split=14,
    random_state=42,
    verbose=2,
)

# Axes that are searched identically in every sub-grid.
shared_axes = {
    "learning_rate": [0.001, 0.1],
    "subsample": [0.8, 1.0],
}

# Each tuple pins (n_estimators, max_depth, max_features) for one sub-grid;
# combined with the shared axes this gives 4 sub-grids x 2 x 2 = 16 candidates.
param_grid = [
    {
        **shared_axes,
        "n_estimators": [n_trees],
        "max_depth": [depth],
        "max_features": [feature_rule],
    }
    for n_trees, depth, feature_rule in (
        (50, 20, 1.0),
        (50, 100, "sqrt"),
        (10, 70, 1.0),
        (10, 100, "sqrt"),
    )
]

In [14]:
# MSE is a loss, so greater_is_better=False makes the scorer return its
# negative; the search then maximises that (i.e. minimises MSE).
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 3-fold cross-validated search over param_grid, using all available cores.
grid_search = GridSearchCV(
    estimator=base_gbt,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [15]:
# Run the cross-validated hyperparameter search on the training data.
grid_search.fit(X=X_train, y=Y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
         1        2087.1713         355.3899           17.93m
         1        2079.8868         343.0553           17.93m
         1        2467.7258           3.7548           18.01m
         1        2455.4581           3.6229           18.10m
         1        2446.0411           3.6425           18.28m
         1        2069.7851         345.0086           18.37m
     

In [16]:
# Tabulate the per-candidate cross-validation results; scores are negated MSE,
# so values closer to zero are better.
pd.DataFrame(data=grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_max_features,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1192.335052,4.454699,24.065205,0.315125,0.001,20,1.0,50,0.8,"{'learning_rate': 0.001, 'max_depth': 20, 'max...",-2316.155829,-2292.2619,-2249.587536,-2286.001755,27.534542,10
1,1441.229446,4.380348,25.188702,0.477633,0.001,20,1.0,50,1.0,"{'learning_rate': 0.001, 'max_depth': 20, 'max...",-2315.840445,-2291.895284,-2249.760211,-2285.83198,27.315711,9
2,1221.31535,1.434354,18.360734,0.135439,0.1,20,1.0,50,0.8,"{'learning_rate': 0.1, 'max_depth': 20, 'max_f...",-602.879907,-590.303962,-567.286072,-586.823314,14.738079,1
3,1411.918435,52.394547,18.384771,0.593044,0.1,20,1.0,50,1.0,"{'learning_rate': 0.1, 'max_depth': 20, 'max_f...",-616.910621,-604.837185,-592.255086,-604.667631,10.066294,4
4,605.676696,2.447325,73.07829,0.737162,0.001,100,sqrt,50,0.8,"{'learning_rate': 0.001, 'max_depth': 100, 'ma...",-2320.359644,-2297.024085,-2253.553709,-2290.312479,27.683238,12
5,721.158717,6.471939,87.468974,2.847364,0.001,100,sqrt,50,1.0,"{'learning_rate': 0.001, 'max_depth': 100, 'ma...",-2319.231686,-2295.597175,-2252.595126,-2289.141329,27.584612,11
6,662.171662,1.531661,79.043533,2.139171,0.1,100,sqrt,50,0.8,"{'learning_rate': 0.1, 'max_depth': 100, 'max_...",-601.621072,-588.631223,-572.459033,-587.570443,11.928958,2
7,795.735319,4.882697,95.261302,1.202407,0.1,100,sqrt,50,1.0,"{'learning_rate': 0.1, 'max_depth': 100, 'max_...",-604.23036,-598.743866,-580.853467,-594.609231,9.981356,3
8,270.95129,2.982395,16.301711,0.381458,0.001,70,1.0,10,0.8,"{'learning_rate': 0.001, 'max_depth': 70, 'max...",-2456.09394,-2433.669172,-2388.440344,-2426.067819,28.137611,14
9,331.826775,3.128227,18.265487,0.108237,0.001,70,1.0,10,1.0,"{'learning_rate': 0.001, 'max_depth': 70, 'max...",-2456.02712,-2433.550866,-2388.447093,-2426.008359,28.100202,13


In [17]:
# Hyperparameter combination with the best mean cross-validated score.
print(grid_search.best_params_)

{'learning_rate': 0.1, 'max_depth': 20, 'max_features': 1.0, 'n_estimators': 50, 'subsample': 0.8}


In [18]:
# Best model from the search; GridSearchCV was built without a `refit`
# argument, so its default (refit on the full training data) applies.
best_gbt = grid_search.best_estimator_

In [19]:
# Tuned-model predictions for both splits, rounded down to whole numbers.
train_y_pred, test_y_pred = [
    np.floor(best_gbt.predict(features)).astype(int)
    for features in (X_train, X_test)
]

In [20]:
# Tuned-model metrics on the training set, one line per metric.
for metric_label, metric_fn in (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
):
    print(metric_label, metric_fn(Y_train, train_y_pred))

train rmse: 14.952137945652625
train mae: 4.751572408432017
train r2 score: 0.909240502953645


In [21]:
# Tuned-model metrics on the test set, one line per metric.
for metric_label, metric_fn in (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
):
    print(metric_label, metric_fn(Y_test, test_y_pred))

test rmse: 28.511713288921644
test mae: 7.577101831472448
test r2 score: 0.6515397997474961


In [22]:
# Persist the tuned model so it can be reloaded without repeating the search.
# open(..., "wb") does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs(MODELS_FOLDER, exist_ok=True)
# The context manager flushes and closes the file even if pickling raises,
# instead of leaving the handle open until garbage collection.
with open(os.path.join(MODELS_FOLDER, "tuned_gbt.pkl"), "wb") as model_file:
    pickle.dump(best_gbt, model_file)