# Linear Regression

## Import packages

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    make_scorer,
)
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Folder (relative to the working directory) holding the cleaned train/test CSVs.
CLEAN_DATA_FOLDER = "clean_data"
# Folder (relative to the working directory) where fitted models are pickled.
MODELS_FOLDER = "models"

## Load the train and test dataframes

In [3]:
# Read the "without weather" train and test splits from the clean-data folder.
train_df, test_df = (
    pd.read_csv(os.path.join(CLEAN_DATA_FOLDER, csv_name))
    for csv_name in ("train_wo_weather.csv", "test_wo_weather.csv")
)

## Split into X and y

In [4]:
# "On" is the regression target; "Off" is excluded from the features as well.
# NOTE(review): presumably "Off" is a sibling target column — confirm.
excluded_columns = ["On", "Off"]
train_feature_columns = [col for col in train_df.columns if col not in excluded_columns]
test_feature_columns = [col for col in test_df.columns if col not in excluded_columns]

train_X, train_y = train_df[train_feature_columns], train_df["On"]
test_X, test_y = test_df[test_feature_columns], test_df["On"]

## Train the Linear Regression Model

In [5]:
# Baseline model with default hyperparameters:
# standardize features -> PCA (all components kept) -> ElasticNet regression.
# Seeds are fixed so the fit is reproducible.
std, pca, reg = StandardScaler(), PCA(random_state=42), ElasticNet(random_state=42)
lr = Pipeline(
    steps=[
        ("standardization", std),
        ("decomposition", pca),
        ("regression", reg),
    ],
    verbose=True,  # print per-step timing while fitting
).fit(X=train_X, y=train_y)

[Pipeline] ... (step 1 of 3) Processing standardization, total=   0.4s
[Pipeline] ..... (step 2 of 3) Processing decomposition, total=   1.6s
[Pipeline] ........ (step 3 of 3) Processing regression, total=   0.5s


In [6]:
# Turn the baseline model's continuous predictions into integers by flooring.
# NOTE(review): flooring (instead of rounding) always moves predictions downward
# and can leave negative values — confirm this is the intended post-processing.
train_y_pred, test_y_pred = (
    np.floor(lr.predict(features)).astype(int) for features in (train_X, test_X)
)

## Report Train and Test results

In [7]:
# Baseline model: error metrics on the training split.
for label, metric in (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
):
    print(label, metric(train_y, train_y_pred))

train rmse: 45.77232239377747
train mae: 17.472760533321363
train r2 score: 0.14946657276625952


In [8]:
# Baseline model: error metrics on the held-out test split.
for label, metric in (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
):
    print(label, metric(test_y, test_y_pred))

test rmse: 43.452505039230516
test mae: 17.146198357110986
test r2 score: 0.19065033970178002


## Export Model

In [9]:
# Persist the fitted baseline pipeline.
# The context manager guarantees the file is flushed and closed even if pickling
# fails; the previous bare open() left the handle to the garbage collector.
os.makedirs(MODELS_FOLDER, exist_ok=True)  # a fresh checkout may lack the folder
with open(os.path.join(MODELS_FOLDER, "base_elastic_net_wo_weather.pkl"), "wb") as model_file:
    pickle.dump(lr, model_file)

## Hyperparameter Tuning with GridSearchCV

### Declare base model and parameters

In [10]:
# Fresh, unfitted copy of the same three-step pipeline to hand to the grid search.
std, pca, reg = StandardScaler(), PCA(random_state=42), ElasticNet(random_state=42)
base_lr = Pipeline(
    steps=[
        ("standardization", std),
        ("decomposition", pca),
        ("regression", reg),
    ],
    verbose=True,
)

# Search space, keyed as "<step name>__<parameter>": 3 x 2 x 3 = 18 candidates.
param_grid = dict(
    decomposition__n_components=[5, 6, 7],
    regression__alpha=[0.5, 1.0],
    regression__l1_ratio=[0.3, 0.5, 0.7],
)

### Declare the scorer and grid search

In [11]:
# greater_is_better=False makes the scorer return negated MSE, so the
# grid search (which maximizes the score) selects the lowest-error candidate.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validated exhaustive search, using all available CPU cores.
grid_search = GridSearchCV(
    estimator=base_lr,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

### Train the models

In [12]:
# Run the cross-validated search on the training split only; the test split
# is never seen during tuning.
grid_search.fit(X=train_X, y=train_y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.6s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.3s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.0s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.2s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.2s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.2s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.1s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.1s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.0s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.2s
[Pipeline] ..... (step 2 of 3) Processing decomposition, total=  18.0s
[Pipeline] ........ (step 3 of 3) Processing regression, total=   0.7s
[CV] END decomposition__n_components=5, regression__alpha=0.5, regression__l1_ratio=0.3

In [13]:
# Tabulate per-candidate fit times and per-fold scores. Scores are negated MSE
# (see the scorer definition), so values closer to zero are better.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decomposition__n_components,param_regression__alpha,param_regression__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,23.712591,1.90877,0.625731,0.175195,5,0.5,0.3,"{'decomposition__n_components': 5, 'regression...",-2204.153736,-2183.548084,-2251.169061,-2097.704925,-2159.518929,-2179.218947,50.69503,15
1,24.824353,0.989462,0.51878,0.112499,5,0.5,0.5,"{'decomposition__n_components': 5, 'regression...",-2197.765966,-2177.382013,-2244.576342,-2091.633526,-2153.399151,-2172.9514,50.51814,14
2,14.459359,0.790159,0.906243,0.370935,5,0.5,0.7,"{'decomposition__n_components': 5, 'regression...",-2192.136673,-2172.014576,-2238.707531,-2086.38526,-2148.078133,-2167.464435,50.308376,13
3,15.418422,0.938389,0.66217,0.174998,5,1.0,0.3,"{'decomposition__n_components': 5, 'regression...",-2233.734442,-2212.496053,-2281.458957,-2126.295011,-2188.180627,-2208.433018,51.276251,18
4,14.13698,0.583181,0.492372,0.298041,5,1.0,0.5,"{'decomposition__n_components': 5, 'regression...",-2220.110595,-2199.141472,-2267.569781,-2113.02411,-2174.901295,-2194.94945,51.064779,17
5,13.382615,0.331518,0.638806,0.178176,5,1.0,0.7,"{'decomposition__n_components': 5, 'regression...",-2206.285414,-2185.669647,-2253.394572,-2099.679764,-2161.514581,-2181.308796,50.782511,16
6,5.748995,0.078564,0.36468,0.082636,6,0.5,0.3,"{'decomposition__n_components': 6, 'regression...",-2095.497934,-2075.652072,-2140.855563,-1991.448421,-2052.460594,-2071.182917,49.342854,6
7,5.381383,0.280719,0.398772,0.067381,6,0.5,0.5,"{'decomposition__n_components': 6, 'regression...",-2085.238296,-2065.722899,-2130.245751,-1981.844515,-2042.627597,-2061.135812,49.005752,4
8,4.695368,0.159486,0.365158,0.0792,6,0.5,0.7,"{'decomposition__n_components': 6, 'regression...",-2075.951391,-2056.836265,-2120.540182,-1973.372601,-2033.846777,-2052.109443,48.593338,2
9,4.787373,0.364542,0.368082,0.108513,6,1.0,0.3,"{'decomposition__n_components': 6, 'regression...",-2140.91793,-2120.164413,-2187.41354,-2034.971813,-2096.51127,-2115.995793,50.365696,12


In [14]:
# Show the hyperparameter combination with the best mean cross-validated score.
print(grid_search.best_params_)

{'decomposition__n_components': 7, 'regression__alpha': 0.5, 'regression__l1_ratio': 0.7}


### Extract the best model

In [15]:
# Pipeline refit by GridSearchCV on the full training set using the best parameters.
best_lr = grid_search.best_estimator_

In [16]:
# Floor the tuned model's predictions to integers, matching the baseline's
# post-processing. This overwrites the baseline predictions stored earlier.
# NOTE(review): flooring (instead of rounding) always moves predictions downward
# and can leave negative values — confirm this is the intended post-processing.
train_y_pred, test_y_pred = (
    np.floor(best_lr.predict(features)).astype(int) for features in (train_X, test_X)
)

### Report Train and Test results

In [17]:
# Tuned model: error metrics on the training split.
for label, metric in (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
):
    print(label, metric(train_y, train_y_pred))

train rmse: 45.30223681789324
train mae: 17.412751000727656
train r2 score: 0.1668469618219387


In [18]:
# Tuned model: error metrics on the held-out test split.
for label, metric in (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
):
    print(label, metric(test_y, test_y_pred))

test rmse: 42.778246439202
test mae: 17.029904762859736
test r2 score: 0.21557304699659363


### Export the Tuned Model

In [19]:
# Persist the tuned pipeline selected by the grid search.
# The context manager guarantees the file is flushed and closed even if pickling
# fails; the previous bare open() left the handle to the garbage collector.
os.makedirs(MODELS_FOLDER, exist_ok=True)  # a fresh checkout may lack the folder
with open(os.path.join(MODELS_FOLDER, "tuned_elastic_net_wo_weather.pkl"), "wb") as model_file:
    pickle.dump(best_lr, model_file)