In [2]:
import numpy as np
import pandas as pd
import pickle
from zipfile import ZipFile
from time import time

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_percentage_error

In [3]:
# Zipped training CSV, relative to the notebook's working directory.
ZIP_PATH = '../data/training_data.zip'

# Read the first member of the archive directly into a DataFrame,
# parsing the `date` column as datetimes.
with ZipFile(ZIP_PATH, 'r') as archive:
    first_member = archive.namelist()[0]
    with archive.open(first_member) as csv_file:
        data = pd.read_csv(csv_file, parse_dates=['date'])

In [4]:
def prepare_data(X: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``X`` for modelling.

    Rows containing any missing value are dropped and the ``pressure``
    column is divided by 1000 (presumably Pa -> kPa; confirm against the
    data source). The frame passed in is not modified.

    Args:
        X: Raw frame; must contain a ``pressure`` column.

    Returns:
        A new DataFrame with complete rows only and rescaled ``pressure``.
    """
    complete_rows = X.dropna(axis=0)
    # `assign` returns a new frame, so the caller's data stays untouched.
    return complete_rows.assign(pressure=lambda frame: frame['pressure'] / 1000)

In [5]:
# Overwrites `data` with its cleaned version. Re-running this cell on its own
# divides `pressure` by 1000 a second time, so re-run the loading cell first.
data = prepare_data(data)

In [6]:
# Preview the first rows of the cleaned frame.
data.head()

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,1979-01-01,2.0,7.0,52.0,2.3,-4.1,-7.5,0.4,101.9,9.0
1,1979-01-02,6.0,1.7,27.0,1.6,-2.6,-7.5,0.0,102.53,8.0
2,1979-01-03,5.0,0.0,13.0,1.3,-2.8,-7.2,0.0,102.05,4.0
3,1979-01-04,8.0,0.0,13.0,-0.3,-2.6,-6.5,0.0,100.84,2.0
4,1979-01-05,6.0,2.0,29.0,5.6,-0.8,-1.4,0.0,102.25,1.0


In [7]:
# Predictors are every column except `date` and the target `precipitation`.
features = data.drop(['date', 'precipitation'], axis=1)
target = data['precipitation']

# Hold out 20% of the rows for the final test; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=13,
)

In [8]:
# Candidate regressors keyed by display name. The names double as lookup keys
# into `cv_params` when hyperparameters are tuned.
# NOTE(review): alpha=5000 for Lasso looks unusually large — confirm it is intentional.
algs = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge', Ridge(alpha=1)),
    ('Lasso', Lasso(alpha=5000)),
    # Disabled candidates, kept for reference:
    # ('Elastic Net', ElasticNet(alpha=1.0, l1_ratio=0.5)),
    # ('Random Forest', RandomForestRegressor(random_state=13, n_estimators=500)),
    # ('Multi-Layer Perceptron', MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', learning_rate='adaptive', max_iter=500, random_state=13)),
])

In [9]:
# One StandardScaler instance for the whole notebook: cross_validate_models puts
# this same object into every pipeline it builds, and it is pickled on its own
# at the end of the notebook.
scaler = StandardScaler()

In [10]:
def cross_validate_models(models: dict, X_train, y_train) -> tuple[pd.DataFrame, dict]:
    """Cross-validate each model inside a scaler + regressor pipeline.

    Every estimator in ``models`` is wrapped in a ``Pipeline`` whose first step
    is the module-level ``scaler`` (the same instance is reused in every
    pipeline) and whose second step is the estimator itself. Each pipeline is
    scored with ``cross_validate`` on MSE, RMSE and MAPE; the negated scorer
    outputs are flipped back to positive errors and averaged over the folds.

    NOTE(review): the MAPE values shown in this notebook's output are
    enormous, presumably because the target contains zeros — treat that
    column with caution.

    Args:
        models: Mapping of display name -> scikit-learn regressor.
        X_train: Training features.
        y_train: Training target.

    Returns:
        A ``(results, pipes)`` tuple. ``results`` is a DataFrame with columns
        ``Model``, ``MSE``, ``RMSE`` and ``MAPE`` (one row per model, in
        iteration order of ``models``); ``pipes`` maps each model name to the
        pipeline that was built for it.
    """
    # Output column -> scorer name understood by scikit-learn.
    scorers = {
        'MSE': 'neg_mean_squared_error',
        'RMSE': 'neg_root_mean_squared_error',
        'MAPE': 'neg_mean_absolute_percentage_error',
    }

    rows = []
    pipes = {}

    for name, alg in models.items():
        pipe = Pipeline(steps=[
            ('scaler', scaler),
            ('regressor', alg)
        ])

        fold_scores = cross_validate(pipe, X_train, y_train, scoring=tuple(scorers.values()))

        row = {'Model': name}
        for column, scorer in scorers.items():
            # Scorers are negated errors; flip the sign to report plain errors.
            row[column] = -np.mean(fold_scores[f'test_{scorer}'])
        rows.append(row)
        pipes[name] = pipe

    # Explicit columns keep the schema stable even when `models` is empty.
    results = pd.DataFrame(rows, columns=['Model', *scorers])

    return results, pipes

In [11]:
# Cross-validate every baseline in `algs` and keep the pipelines for the tuning step.
cv_results, pipes = cross_validate_models(algs, X_train, y_train)

In [12]:
# Mean cross-validated error per baseline model.
cv_results

Unnamed: 0,Model,MSE,RMSE,MAPE
0,Linear Regression,11.17583,3.339983,2904131000000000.0
1,Ridge,11.175809,3.339979,2903587000000000.0
2,Lasso,13.294837,3.642841,3848204000000000.0


In [13]:
# Hyperparameter grids keyed by model name. tune_hyperparameters looks each
# grid up with `param_grids.get(name)`, so every key must match the model's
# name in `algs` exactly; a model without a matching key is fitted untuned.
# The `regressor__` prefix targets the pipeline step named 'regressor'.
cv_params = {
    'Ridge': {
        'regressor__alpha': [0.01, 0.1, 1, 10, 100],
        'regressor__fit_intercept': [True, False],
        'regressor__solver': ['auto', 'svd', 'saga']
        },
    'Lasso': {
        'regressor__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'regressor__fit_intercept': [True, False],
        'regressor__max_iter': [1000, 5000, 10000, 20000]
        },
    'Elastic Net': {
        'regressor__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1]
        },
    'Random Forest': {
        'regressor__n_estimators': [50, 250, 500],
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4],
        'regressor__bootstrap': [True, False]
        },
    # Key corrected from 'Mulit-Layer Perceptron' so it matches the model name.
    'Multi-Layer Perceptron': {
        'regressor__hidden_layer_sizes': [(64, 32), (128, 64, 32)],
        'regressor__activation': ['relu', 'tanh'],
        'regressor__solver': ['adam', 'sgd'],
        'regressor__alpha': [0.0001, 0.001],
        'regressor__learning_rate': ['constant', 'adaptive']
        }
    }

In [17]:
def tune_hyperparameters(pipelines: dict, param_grids: dict, X_train, y_train, cv_scoring: str, cv: int = 5) -> dict:
    """Grid-search each pipeline that has a parameter grid; fit the rest as-is.

    For every ``name -> pipeline`` pair, the grid is looked up with
    ``param_grids.get(name)``. When a non-empty grid exists, ``GridSearchCV``
    is run and its ``best_estimator_`` is kept. Otherwise the pipeline is
    cross-validated (to report a comparable score), fitted on the full
    training data and kept unchanged. Progress, elapsed time, best score and
    best parameters are printed along the way.

    Args:
        pipelines: Mapping of model name -> pipeline to tune.
        param_grids: Mapping of model name -> ``GridSearchCV`` parameter grid.
        X_train: Training features.
        y_train: Training target.
        cv_scoring: scikit-learn scorer name. Scores whose name contains
            ``'neg_'`` are sign-flipped before being printed.
        cv: Number of cross-validation folds used for both the grid search
            and the direct-fit score (default 5, as before).

    Returns:
        Mapping of model name -> fitted estimator.
    """
    tuned_models = {}

    for name, pipe in pipelines.items():
        params = param_grids.get(name)

        if params:
            print(f'Tuning {name} hyperparameters...')
            gs = GridSearchCV(pipe, param_grid=params, cv=cv, scoring=cv_scoring)

            start = time()
            gs.fit(X_train, y_train)
            elapsed = time() - start

            best = gs.best_estimator_

            print('----Hyperparameter tuning complete ----')
            print(_elapsed_message('Tuning', name, elapsed))
            print(f'Best Score: {_reported_score(gs.best_score_, cv_scoring):.5f}')
            print(f'Best parameters:\n{gs.best_params_}')
        else:
            print(f'No parameter grid found for {name}. Fitting model directly...')

            start = time()
            fold_scores = cross_validate(pipe, X_train, y_train, scoring=cv_scoring, cv=cv)
            pipe.fit(X_train, y_train)
            best = pipe
            elapsed = time() - start

            print(_elapsed_message('Fitting', name, elapsed))
            # Double quotes inside the f-string keep this valid before Python 3.12.
            mean_score = np.mean(fold_scores["test_score"])
            print(f'Best Score: {_reported_score(mean_score, cv_scoring):.5f}')

        print()
        tuned_models[name] = best

    return tuned_models


def _elapsed_message(action: str, name: str, seconds: float) -> str:
    """Describe an elapsed time in seconds, or in minutes from 60 s upwards."""
    if seconds < 60:
        return f'{action} {name} took: {seconds:.3f} seconds'
    return f'{action} {name} took: {seconds/60:.3f} minutes'


def _reported_score(score: float, cv_scoring: str) -> float:
    """Flip the sign of negated ('neg_*') scorer values so errors print as positive."""
    return -score if 'neg_' in cv_scoring else score

In [18]:
# Tune every pipeline that has a grid in `cv_params` (the others are fitted directly), scoring by MSE.
tuned_models = tune_hyperparameters(pipes, cv_params, X_train, y_train, 'neg_mean_squared_error')

No parameter grid found for Linear Regression. Fitting model directly...
Fitting Linear Regression took: 0.241 seconds
Best Score: 11.17583

Tuning Ridge hyperparameters...
----Hyperparameter tuning complete ----
Tuning Ridge took: 9.264 seconds
Best Score: 11.17581
Best parameters:
{'regressor__alpha': 1, 'regressor__fit_intercept': True, 'regressor__solver': 'auto'}

Tuning Lasso hyperparameters...
----Hyperparameter tuning complete ----
Tuning Lasso took: 5.084 seconds
Best Score: 11.17549
Best parameters:
{'regressor__alpha': 0.001, 'regressor__fit_intercept': True, 'regressor__max_iter': 1000}



In [None]:
def test_evaluation(tuned_models: dict, X_train, y_train, X_test, y_test) -> pd.DataFrame:
    model_names = []
    mse = []
    rmse = []
    mape = []

    if not isinstance(tuned_models, dict):
        tuned_models = {f'{tuned_models.steps[-1][1].__class__.__name__}': tuned_models}
        
    for name, model in tuned_models.items():

        model.fit(X_train, y_train)
        y_preds = model.predict(X_test)

        mse_score = mean_squared_error(y_test, y_preds)
        rmse_score = root_mean_squared_error(y_test, y_preds)
        mape_score = mean_absolute_percentage_error(y_test, y_preds)

        model_names.append(name)
        mse.append(mse_score)
        rmse.append(rmse_score)
        mape.append(mape_score)

    predResultsDF = pd.DataFrame({
        'Model': model_names,
        'MSE': mse,
        'RMSE': rmse,
        'MAPE': mape
    })

    return predResultsDF

In [None]:
# test_evaluation returns a single DataFrame, so it must not be unpacked into two names.
evaluation_results = test_evaluation(tuned_models, X_train, y_train, X_test, y_test)

In [21]:
# Test-set error per tuned model.
evaluation_results

Unnamed: 0,Model,MSE,RMSE,MAPE
0,Linear Regression,13.879052,3.72546,2911235000000000.0
1,Ridge,13.879324,3.725497,2910714000000000.0
2,Lasso,13.88093,3.725712,2907222000000000.0


In [None]:
def save_best_model(test_resultsDF: pd.DataFrame, tuned_models: dict, selection_metric='MSE', save_path='../models/best_model.pkl'):
    """Pickle the model with the lowest value of ``selection_metric``.

    Args:
        test_resultsDF: Evaluation table with a ``Model`` column and one
            column per metric.
        tuned_models: Mapping of model name -> fitted estimator; must contain
            every name listed in ``test_resultsDF['Model']``.
        selection_metric: Name of the metric column to minimise. Defaults to
            ``'MSE'`` (the previous default, an empty string, always raised
            ``KeyError``).
        save_path: Destination of the pickle file.

    Raises:
        KeyError: If ``selection_metric`` is not a column of
            ``test_resultsDF`` or the winning name is missing from
            ``tuned_models``.
    """
    best_row = test_resultsDF[selection_metric].idxmin()
    best_name = test_resultsDF.loc[best_row, 'Model']
    best_model = tuned_models[best_name]

    # The context manager guarantees the file is flushed and closed.
    with open(save_path, 'wb') as model_file:
        pickle.dump(best_model, model_file)

In [20]:
# The keyword is `selection_metric`; `best_metric` is not a parameter of save_best_model.
save_best_model(evaluation_results, tuned_models, selection_metric='MSE')

In [21]:
# Persist the standalone scaler; the context manager closes the file handle.
# NOTE(review): the saved pipelines already carry their own 'scaler' step —
# confirm this separate artifact is needed and that `scaler` is fitted by now.
with open('../transformers/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)