In [None]:
import os
import sys

# Access to "utils"
src_path = os.path.dirname(os.getcwd())
if os.path.exists(src_path) and src_path not in sys.path:
    sys.path.append(src_path)

In [2]:
import csv
import numpy as np
import pickle as pkl
import utils.utils_V2 as utils

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

FileNotFoundError: [Errno 2] No such file or directory: './transformers/scaler.pkl'

In [None]:
with open('./data/original_data.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    original_header = next(reader)
    original_data = [row for row in reader]

In [None]:
X, y = utils.process_data(original_header, original_data)

In [None]:
X.shape

(10687, 7)

In [None]:
y.shape

(10687,)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [None]:
algs = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1),
    'Lasso': Lasso(alpha=5000),
    # 'Elastic Net': ElasticNet(alpha=1.0, l1_ratio=0.5)
    # 'Random Forest': RandomForestRegressor(random_state=13, n_estimators=500)
    # 'Multi-Layer Perceptron': MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', learning_rate='adaptive', max_iter=500, random_state=13)
    }

In [None]:
scaler = StandardScaler()
cv_results, my_pipelines = utils.cross_validate_models(algs, X_train, y_train)

In [None]:
cv_results

{'Model': ['Linear Regression', 'Ridge', 'Lasso'],
 'MSE': [11.145704233276048, 11.145695858783332, 13.31912519114034],
 'RMSE': [3.3325731001417767, 3.332571157501667, 3.644361802454976],
 'MAPE': [2920594469026613.5, 2920105527310637.5, 3893342250074862.5]}

In [None]:
my_pipelines

{'Linear Regression': Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor', LinearRegression())]),
 'Ridge': Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge(alpha=1))]),
 'Lasso': Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Lasso(alpha=5000))])}

In [None]:
gs_params = {
    'Ridge': {
        'regressor__alpha': [0.01, 0.1, 1, 10, 100],
        'regressor__fit_intercept': [True, False],
        'regressor__solver': ['auto', 'svd', 'saga']
        },
    'Lasso': {
        'regressor__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'regressor__fit_intercept': [True, False],
        'regressor__max_iter': [1000, 5000, 10000, 20000]
        },
    'Elastic Net': {
        'regressor__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1]
        },
    'Random Forest': {
        'regressor__n_estimators': [50, 250, 500],
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4],
        'regressor__bootstrap': [True, False]
        },
    'Mulit-Layer Perceptron': {
        'regressor__hidden_layer_sizes': [(64, 32), (128, 64, 32)],
        'regressor__activation': ['relu', 'tanh'],
        'regressor__solver': ['adam', 'sgd'],
        'regressor__alpha': [0.0001, 0.001],
        'regressor__learning_rate': ['constant', 'adaptive']
        }
    }

In [None]:
tuned_models = utils.tune_hyperparameters(my_pipelines, gs_params, X_train, y_train, 'neg_mean_squared_error')

No parameter grid found for Linear Regression. Fitting model directly...
Fitting Linear Regression took: 0.068 seconds
Best Score: 11.14570

Tuning Ridge hyperparameters...


----Hyperparameter tuning complete ----
Tuning Ridge took: 2.705 seconds
Best Score: 11.14568
Best parameters:
{'regressor__alpha': 1, 'regressor__fit_intercept': True, 'regressor__solver': 'saga'}

Tuning Lasso hyperparameters...
----Hyperparameter tuning complete ----
Tuning Lasso took: 3.436 seconds
Best Score: 11.14570
Best parameters:
{'regressor__alpha': 0.0001, 'regressor__fit_intercept': True, 'regressor__max_iter': 1000}



  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
tuned_models

{'Linear Regression': Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor', LinearRegression())]),
 'Ridge': Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor', Ridge(alpha=1, solver='saga'))]),
 'Lasso': Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor', Lasso(alpha=0.0001))])}

In [None]:
evaluation_results = utils.test_evaluation(tuned_models, X_train, y_train, X_test, y_test)

In [None]:
evaluation_results

{'Model': ['Linear Regression', 'Ridge', 'Lasso'],
 'MSE': [14.28627516818812, 14.286531863159563, 14.28644903458906],
 'RMSE': [3.7797189271410274, 3.77975288387476, 3.7797419269824575],
 'MAPE': [2926032464242150.5, 2925431061207781.5, 2925607267521119.5]}

In [None]:
utils.save_best_model(evaluation_results, tuned_models, selection_metric='MSE')

In [None]:
pkl.dump(scaler, open('./transformers/scaler.pkl', 'wb'))