# Ensemble

In [12]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data import train, validation
from data import X, y, categorical, numerical
from utils import StandardizedGridSearchCV

In [13]:
pred_train = pd.read_csv('predictions/train.csv', index_col=0)
pred_validation = pd.read_csv('predictions/validation.csv', index_col=0)

In [14]:
from sklearn.linear_model import LinearRegression

meta = StandardizedGridSearchCV(
    estimator = LinearRegression(),
    param_grid = {
        'fit_intercept': [True, False],
        'positive': [True, False],
    },
)

meta.fit(pred_train, train[y])
meta.results[['mean_train_neg_mean_squared_error', 'mean_test_neg_mean_squared_error']]

Fitting 10 folds for each of 4 candidates, totalling 40 fits


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_train_neg_mean_squared_error,mean_test_neg_mean_squared_error
fit_intercept,positive,Unnamed: 2_level_1,Unnamed: 3_level_1
True,False,-62.193629,-76.667645
False,False,-63.129715,-78.329613
True,True,-84.732709,-96.890167
False,True,-87.67964,-101.235888


In [15]:
meta.save('ensemble.p')

# Coefficient analysis

In [18]:
coef = dict(zip(
    pred_train.columns,
    meta.best_estimator_.coef_,
))
coef = pd.Series(coef, name='coef')
coef.to_frame()

Unnamed: 0,coef
SVR_Radial,0.236733
Boosting10,1.086892
RF5,-0.161529
regression,-0.13424


# Model evaluation

The ensemble model was evaluated against the best individual model on the validation set. The ensemble model shows a significant improvement in prediction variance with a small improvement in prediction bias. 

In [26]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

y_true = validation[y]

print('Best indivdual model:')
y_pred = pred_validation.iloc[:, 0]
print('- mse =', mean_squared_error(y_true=y_true, y_pred=y_pred))
print('- mae =', mean_absolute_error(y_true=y_true, y_pred=y_pred))
print('- std =', (y_true - y_pred).std())

print()
print('Ensemble model:')
y_pred = meta.predict(pred_validation)
print('- mse =', mean_squared_error(y_true=y_true, y_pred=y_pred))
print('- mae =', mean_absolute_error(y_true=y_true, y_pred=y_pred))
print('- std =', (y_true - y_pred).std())

Best indivdual model:
- mse = 1265.690846715096
- mae = 18.181575382899513
- std = 125.55516520789725

Ensemble model:
- mse = 1607.2054233630424
- mae = 16.685320301569497
- std = 40.07376946940644
