In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    r2_score,
    root_mean_squared_error,
    mean_absolute_error,
    mean_squared_error,
    make_scorer,
)
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
train_df = pd.read_csv("clean_data/train_wo_weather.csv")
test_df = pd.read_csv("clean_data/test_wo_weather.csv")

In [3]:
X_train = train_df[[x for x in train_df.columns if x not in ["On", "Off"]]]
Y_train = train_df["On"]

In [4]:
X_test = test_df[[x for x in test_df.columns if x not in ["On", "Off"]]]
Y_test = test_df["On"]

In [5]:
gbr = GradientBoostingRegressor(random_state = 42, verbose=2)
gbr.fit(X_train, Y_train)

      Iter       Train Loss   Remaining Time 
         1        2325.3998            6.18m
         2        2213.7135            6.00m
         3        2122.8957            5.98m
         4        2047.1944            5.90m
         5        1983.3689            5.85m
         6        1924.6332            5.82m
         7        1875.0695            5.79m
         8        1837.8375            5.73m
         9        1800.9505            5.66m
        10        1772.4494            5.60m
        11        1745.0150            5.55m
        12        1723.6290            5.47m
        13        1695.1193            5.42m
        14        1676.5471            5.36m
        15        1659.8533            5.29m
        16        1644.0544            5.24m
        17        1631.1337            5.17m
        18        1613.6291            5.11m
        19        1602.9795            5.05m
        20        1591.9675            4.98m
        21        1582.3163            4.91m
        2

In [6]:
Y_pred_gbr = np.floor(gbr.predict(X_test)).astype(int)

In [7]:
rmse_gbr = float(format(np.sqrt(mean_squared_error(Y_test, Y_pred_gbr)), '.3f'))

In [8]:
rmse_gbr

32.34

In [9]:
MODELS_FOLDER = "models"

In [10]:
pickle.dump(gbr, open(os.path.join(MODELS_FOLDER, "base_gbt_wo_weather.pkl"), 'wb'))

In [11]:
r2 = r2_score(Y_test, Y_pred_gbr)
rmse = root_mean_squared_error(Y_test, Y_pred_gbr)
mae = mean_absolute_error(Y_test, Y_pred_gbr)
print(f"test rmse: {rmse}, mae: {mae}, r2: {r2}")

test rmse: 32.33998435494458, mae: 12.464130113257495, r2: 0.5516820141097065


In [12]:
Y_pred_gbr = np.floor(gbr.predict(X_train)).astype(int)
r2 = r2_score(Y_train, Y_pred_gbr)
rmse = root_mean_squared_error(Y_train, Y_pred_gbr)
mae = mean_absolute_error(Y_train, Y_pred_gbr)
print(f"train rmse: {rmse}, mae: {mae}, r2: {r2}")

train rmse: 36.00587159176965, mae: 12.98803066279469, r2: 0.47370148216725916


In [13]:
base_gbt = GradientBoostingRegressor(
    random_state=42,
    loss="squared_error",
    criterion="friedman_mse",
    min_samples_split=14,
    min_samples_leaf=7,
    verbose=2,
)
param_grid = [
    {
        "learning_rate": [0.001, 0.1],
        "subsample": [0.8, 1.0],
        "n_estimators": [50],
        "max_depth": [20],
        "max_features": [1.0],
    },
    {
        "learning_rate": [0.001, 0.1],
        "subsample": [0.8, 1.0],
        "n_estimators": [50],
        "max_depth": [100],
        "max_features": ["sqrt"],
    },
    {
        "learning_rate": [0.001, 0.1],
        "subsample": [0.8, 1.0],
        "n_estimators": [10],
        "max_depth": [70],
        "max_features": [1.0],
    },
    {
        "learning_rate": [0.001, 0.1],
        "subsample": [0.8, 1.0],
        "n_estimators": [10],
        "max_depth": [100],
        "max_features": ["sqrt"],
    },
]

In [14]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = GridSearchCV(base_gbt, param_grid, scoring=scorer, n_jobs=-1, verbose=2, cv=3)

In [15]:
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time       Iter       Train Loss      OOB Improve   Remaining Time 

         1        2080.7238         339.5176           13.25m
         1        2446.1562           3.5803           13.37m
         1        2099.8396         348.3623           13.35m
         1        2088.7452         340.4439           13.42m
         1        2467.8591           3.6767           13.42m
         1        2455.5513           3.5918           13.51m
     

In [16]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_max_features,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,824.113388,1.878284,24.542416,0.574201,0.001,20,1.0,50,0.8,"{'learning_rate': 0.001, 'max_depth': 20, 'max...",-2318.018213,-2295.067351,-2251.189625,-2288.09173,27.724953,9
1,976.878546,4.867375,24.139536,0.422584,0.001,20,1.0,50,1.0,"{'learning_rate': 0.001, 'max_depth': 20, 'max...",-2318.340719,-2295.121782,-2251.459101,-2288.3072,27.72624,10
2,848.106457,2.014832,18.75797,0.164608,0.1,20,1.0,50,0.8,"{'learning_rate': 0.1, 'max_depth': 20, 'max_f...",-696.876129,-681.276981,-660.568192,-679.573768,14.8715,2
3,1601.343276,407.546955,18.773268,0.257926,0.1,20,1.0,50,1.0,"{'learning_rate': 0.1, 'max_depth': 20, 'max_f...",-699.300984,-674.536214,-657.729879,-677.189026,17.074684,1
4,465.77458,0.74794,75.713643,0.302435,0.001,100,sqrt,50,0.8,"{'learning_rate': 0.001, 'max_depth': 100, 'ma...",-2319.249347,-2296.04317,-2251.982223,-2289.09158,27.898147,12
5,1175.816116,433.798542,405.982183,430.6137,0.001,100,sqrt,50,1.0,"{'learning_rate': 0.001, 'max_depth': 100, 'ma...",-2319.299791,-2295.984484,-2251.695269,-2288.993181,28.038683,11
6,1425.332277,12.471653,80.113964,2.595886,0.1,100,sqrt,50,0.8,"{'learning_rate': 0.1, 'max_depth': 100, 'max_...",-729.29464,-708.990934,-693.492956,-710.592843,14.659803,3
7,1262.50076,438.808191,93.625358,1.798932,0.1,100,sqrt,50,1.0,"{'learning_rate': 0.1, 'max_depth': 100, 'max_...",-744.741826,-721.977079,-700.45782,-722.392242,18.081253,4
8,199.383369,3.008048,15.855541,0.10467,0.001,70,1.0,10,0.8,"{'learning_rate': 0.001, 'max_depth': 70, 'max...",-2456.511594,-2434.369612,-2388.889528,-2426.590245,28.149303,13
9,232.234825,1.810553,17.998962,0.219986,0.001,70,1.0,10,1.0,"{'learning_rate': 0.001, 'max_depth': 70, 'max...",-2456.561918,-2434.296392,-2388.924054,-2426.594122,28.145028,14


In [17]:
print(grid_search.best_params_)

{'learning_rate': 0.1, 'max_depth': 20, 'max_features': 1.0, 'n_estimators': 50, 'subsample': 1.0}


In [18]:
best_gbt = grid_search.best_estimator_

In [19]:
train_y_pred = np.floor(best_gbt.predict(X_train)).astype(int)
test_y_pred = np.floor(best_gbt.predict(X_test)).astype(int)

In [20]:
print("train rmse:", root_mean_squared_error(Y_train, train_y_pred))
print("train mae:", mean_absolute_error(Y_train, train_y_pred))
print("train r2 score:", r2_score(Y_train, train_y_pred))

train rmse: 20.42498764291646
train mae: 5.685599164283327
train r2 score: 0.8306406823557606


In [21]:
print("test rmse:", root_mean_squared_error(Y_test, test_y_pred))
print("test mae:", mean_absolute_error(Y_test, test_y_pred))
print("test r2 score:", r2_score(Y_test, test_y_pred))

test rmse: 29.32646273815071
test mae: 7.677398422431186
test r2 score: 0.6313400859656315


In [22]:
pickle.dump(best_gbt, open(os.path.join(MODELS_FOLDER, "tuned_gbt_wo_weather.pkl"), 'wb'))