# Linear Regression

## Import packages

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    make_scorer,
)
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
CLEAN_DATA_FOLDER = "clean_data"
MODELS_FOLDER = "models"

## Load the dataframe

In [3]:
train_df = pd.read_csv(os.path.join(CLEAN_DATA_FOLDER, "train.csv"))
test_df = pd.read_csv(os.path.join(CLEAN_DATA_FOLDER, "test.csv"))

## Split into X and y

In [4]:
train_X = train_df[[x for x in train_df.columns if x not in ["On", "Off"]]]
train_y = train_df["On"]
test_X = test_df[[x for x in test_df.columns if x not in ["On", "Off"]]]
test_y = test_df["On"]

## Train the Linear Regression Model

In [5]:
std = StandardScaler()
pca = PCA(random_state=42)
reg = ElasticNet(random_state=42)
lr = Pipeline(
    [("standardization", std), ("decomposition", pca), ("regression", reg)],
    verbose=True,
)
lr = lr.fit(X=train_X, y=train_y)

[Pipeline] ... (step 1 of 3) Processing standardization, total=   0.8s
[Pipeline] ..... (step 2 of 3) Processing decomposition, total=   2.9s
[Pipeline] ........ (step 3 of 3) Processing regression, total=   0.7s


In [6]:
train_y_pred = np.floor(lr.predict(train_X)).astype(int)
test_y_pred = np.floor(lr.predict(test_X)).astype(int)

## Report Train and Test results

In [7]:
print("train rmse:", root_mean_squared_error(train_y, train_y_pred))
print("train mae:", mean_absolute_error(train_y, train_y_pred))
print("train r2 score:", r2_score(train_y, train_y_pred))

train rmse: 45.784972877057896
train mae: 17.482477732277953
train r2 score: 0.14899636958639972


In [8]:
print("test rmse:", root_mean_squared_error(test_y, test_y_pred))
print("test mae:", mean_absolute_error(test_y, test_y_pred))
print("test r2 score:", r2_score(test_y, test_y_pred))

test rmse: 43.47254827709856
test mae: 17.16333416654789
test r2 score: 0.18990351375539216


## Export Model

In [9]:
pickle.dump(lr, open(os.path.join(MODELS_FOLDER, "base_elastic_net.pkl"), "wb"))

In [10]:
del lr

## Hyperparameter Tuning with GridSearchCV

### Declare base model and parameters

In [11]:
std = StandardScaler()
pca = PCA(random_state=42)
reg = ElasticNet(random_state=42)
base_lr = Pipeline(
    [("standardization", std), ("decomposition", pca), ("regression", reg)],
    verbose=True,
)
param_grid = {
    "decomposition__n_components": [7, 8, 9],
    "regression__alpha": [0.5, 1.0],
    "regression__l1_ratio": [0.3, 0.5, 0.7],
}

### Declare the scorer and grid search

In [12]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = GridSearchCV(base_lr, param_grid, scoring=scorer, n_jobs=-1, verbose=2, cv=5)

### Train the models

In [13]:
grid_search.fit(train_X, train_y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Pipeline] ... (step 1 of 3) Processing standardization, total=   2.7s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   3.0s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.9s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.9s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.8s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   2.0s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   2.0s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   2.0s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   1.5s
[Pipeline] ... (step 1 of 3) Processing standardization, total=   2.1s
[Pipeline] ..... (step 2 of 3) Processing decomposition, total=  45.8s
[Pipeline] ..... (step 2 of 3) Processing decomposition, total=  46.6s
[Pipeline] ........ (step 3 of 3) Processing regression, total=   1.7s
[Pipeline] .....

In [14]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decomposition__n_components,param_regression__alpha,param_regression__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,55.253004,3.250714,1.250892,0.310408,7,0.5,0.3,"{'decomposition__n_components': 7, 'regression...",-2201.405945,-2180.861013,-2248.75013,-2095.560883,-2156.998694,-2176.715333,50.577892,15
1,56.654857,2.743366,1.204062,0.19634,7,0.5,0.5,"{'decomposition__n_components': 7, 'regression...",-2195.009822,-2174.67607,-2242.170508,-2089.513854,-2150.84765,-2170.443581,50.397991,14
2,49.678724,1.369773,1.027392,0.14339,7,0.5,0.7,"{'decomposition__n_components': 7, 'regression...",-2189.377866,-2169.298154,-2236.329829,-2084.308202,-2145.499465,-2164.962703,50.18392,13
3,47.569263,2.133452,1.193236,0.140659,7,1.0,0.3,"{'decomposition__n_components': 7, 'regression...",-2231.590384,-2210.367866,-2279.571329,-2124.6113,-2186.151914,-2206.458558,51.193168,18
4,46.828394,2.354116,0.798564,0.17381,7,1.0,0.5,"{'decomposition__n_components': 7, 'regression...",-2218.069202,-2197.087489,-2265.807441,-2111.474489,-2172.91708,-2193.07114,50.981964,17
5,48.218465,1.462227,0.929625,0.115088,7,1.0,0.7,"{'decomposition__n_components': 7, 'regression...",-2204.376265,-2183.676262,-2251.750015,-2098.227393,-2159.555821,-2179.517151,50.713588,16
6,11.386129,0.790301,0.791549,0.119821,8,0.5,0.3,"{'decomposition__n_components': 8, 'regression...",-2095.421613,-2075.512644,-2140.768976,-1991.359108,-2052.329483,-2071.078365,49.347175,6
7,11.295661,0.778857,0.881678,0.207775,8,0.5,0.5,"{'decomposition__n_components': 8, 'regression...",-2085.260119,-2065.656089,-2130.254465,-1981.844626,-2042.542618,-2061.111584,49.015506,4
8,10.788408,0.535006,0.659726,0.137932,8,0.5,0.7,"{'decomposition__n_components': 8, 'regression...",-2076.073105,-2056.842716,-2120.651628,-1973.470128,-2033.801373,-2052.16779,48.608646,2
9,10.27863,0.643656,0.975826,0.055908,8,1.0,0.3,"{'decomposition__n_components': 8, 'regression...",-2141.072281,-2120.235275,-2187.532668,-2035.082818,-2096.560996,-2116.096808,50.376379,12


In [15]:
print(grid_search.best_params_)

{'decomposition__n_components': 9, 'regression__alpha': 0.5, 'regression__l1_ratio': 0.7}


### Extract the best model

In [16]:
best_lr = grid_search.best_estimator_

In [17]:
train_y_pred = np.floor(best_lr.predict(train_X)).astype(int)
test_y_pred = np.floor(best_lr.predict(test_X)).astype(int)

### Report Train and Test results

In [18]:
print("train rmse:", root_mean_squared_error(train_y, train_y_pred))
print("train mae:", mean_absolute_error(train_y, train_y_pred))
print("train r2 score:", r2_score(train_y, train_y_pred))

train rmse: 45.30069133139723
train mae: 17.420681697259756
train r2 score: 0.1669038069093497


In [19]:
print("test rmse:", root_mean_squared_error(test_y, test_y_pred))
print("test mae:", mean_absolute_error(test_y, test_y_pred))
print("test r2 score:", r2_score(test_y, test_y_pred))

test rmse: 42.78110897101234
test mae: 17.040605243235383
test r2 score: 0.21546806269169627


### Export Models

In [20]:
pickle.dump(best_lr, open(os.path.join(MODELS_FOLDER, "tuned_elastic_net.pkl"), "wb"))