In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from matplotlib import pyplot as plt

In [2]:
X_train = pd.read_csv('./Data/X_train_scaled.csv')
y_train = pd.read_csv('./Data/y_train.csv')
X_test = pd.read_csv('./Data/X_test_scaled.csv')
y_test = pd.read_csv('./Data/y_test.csv')

## Dictionary to store results

In [3]:
evals = dict()

##  Multiple Linear Regression (OLS)

In [4]:
MLR_model = LinearRegression()
scores= cross_val_score(MLR_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print("mean MAE: %.2E" %(mean_mae))

evals['MLR'] =  mean_mae

mean MAE: -3.55E-02


## Ridge without hyperparameter tuning

In [5]:
Ridge_model = Ridge(alpha=1)
scores= cross_val_score(Ridge_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print(mean_mae)

-0.035205563469938195


## Ridge with hyperparameter tuning

In [6]:
Ridge_model = Ridge()
grid = dict()
grid['alpha'] = np.arange(0,50,.1)
search = GridSearchCV(Ridge_model, grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)
print('MAE: %.2E' % results.best_score_)
print('Config: %s' % results.best_params_)

evals['Ridge'] = [results.best_score_,results.best_params_]

MAE: -3.41E-02
Config: {'alpha': 49.900000000000006}


## Lasso without hyperparameter tuning

In [7]:
lasso_model = Lasso(alpha=1)
scores= cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print("mean MAE: %.2E" %(mean_mae))

mean MAE: -4.67E-02


 ## Lasso with hyperparameter tuning

In [8]:
lasso_model = Lasso()
grid = dict()
grid['alpha'] = np.arange(.0001,50,.1)
search = GridSearchCV(lasso_model, grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)
print('MAE: %.2E' % results.best_score_)
print('Config: %s' % results.best_params_)

evals['Lasso'] = [results.best_score_,results.best_params_]

MAE: -3.53E-02
Config: {'alpha': 0.0001}


## Random forest without hyperparameter tuning

In [9]:
RF_model = RandomForestRegressor()
scores= cross_val_score(RF_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print("mean MAE: %.2E" %(mean_mae))

mean MAE: -3.63E-02


## Random forest with hyperparameter tuning

In [10]:
RF_model = RandomForestRegressor()
grid = {
    'bootstrap': [True,False],
    'max_depth': [int(x) for x in np.linspace(50, 110, num = 11)] + [None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
}
search = RandomizedSearchCV(RF_model, grid, n_iter=10,scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)
print('MAE: %.2E' % results.best_score_)
print('Config: %s' % results.best_params_)

evals['RF'] = [results.best_score_,results.best_params_]

  self.best_estimator_.fit(X, y, **fit_params)


MAE: -3.58E-02
Config: {'n_estimators': 1600, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 68, 'bootstrap': True}


In [11]:
evals

{'MLR': -0.035481637900308075,
 'Ridge': [-0.034148234001457334, {'alpha': 49.900000000000006}],
 'Lasso': [-0.03534415609805873, {'alpha': 0.0001}],
 'RF': [-0.035792743578821874,
  {'n_estimators': 1600,
   'min_samples_split': 10,
   'min_samples_leaf': 4,
   'max_features': 'auto',
   'max_depth': 68,
   'bootstrap': True}]}

## Comparing on test sets

In [12]:
MLR_model.fit(X_train, y_train)
y_pred = MLR_model.predict(X_test)

In [13]:
mean_absolute_error(y_test, y_pred)

0.02757990070422869

In [14]:
Ridge_model.fit(X_train, y_train)
y_pred = Ridge_model.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.027458305190783208

In [15]:
lasso_model = Lasso(alpha=evals['Lasso'][1]['alpha'])
lasso_model.fit(X_train, y_train)

y_pred = lasso_model.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.02741278377202675

In [16]:
RF_model = RandomForestRegressor(n_estimators = evals['RF'][1]['n_estimators'], \
                   min_samples_split = evals['RF'][1]['min_samples_split'], \
                   min_samples_leaf = evals['RF'][1]['min_samples_leaf'], \
                   max_features = evals['RF'][1]['max_features'], \
                   max_depth = evals['RF'][1]['max_depth'], \
                   bootstrap = evals['RF'][1]['bootstrap'])

RF_model.fit(X_train, y_train)

y_pred = RF_model.predict(X_test)

mean_absolute_error(y_test, y_pred)

  RF_model.fit(X_train, y_train)


0.02646905822283388

Ridge Regressor is the most accurate model for the training data, but Random Forest is slightly more accurate for the test data.

### Saving Ridge Model

In [17]:
Ridge_model = Ridge(alpha=evals['Ridge'][1]['alpha'])
Ridge_model.fit(X_train, y_train)
with open('./Model/Ridge_Regressor.pkl', 'wb') as f:
    pickle.dump(Ridge_model, f)