In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

np.random.seed(0)

In [2]:
os.getcwd()
train = pd.read_csv('C:\\Users\\chris\\Desktop\\210\\W210-Capstone\\data\\TrainingData\\trainingWithItems.csv')

In [3]:
unemp = train[train['INDICATOR'] == "Unemployment"]
rgdp = train[train['INDICATOR'] == "RealGDP"]
cpce = train[train['INDICATOR'] == "Core PCE"]
ccpi = train[train['INDICATOR'] == "Core CPI"]
ngnp = train[train['INDICATOR'] == "NominalGNP"]
rgnp = train[train['INDICATOR'] == "RealGNP"]

In [4]:
np.random.seed(0)
def model_setup(df):
    
    rand = df.sample(frac = 1)
    #80/20 split
    nrow = len(rand)
    train_prop = int(np.round(nrow*0.8))
    train = rand[:train_prop]
    val = rand[train_prop:]
    
    train = train[["FORECASTER ID", "MAX", "pred_average", "pred_var", "HIT", "ACTUAL_CONF", "actual",
                  "chicken", "coffee", "eggs", "electricity", "rice", "unleadedGasoline"]]
    val = val[["FORECASTER ID", "MAX", "pred_average", "pred_var", "HIT", "ACTUAL_CONF", "actual",
              "chicken", "coffee", "eggs", "electricity", "rice", "unleadedGasoline"]]
    
    return train.dropna(), val.dropna()

In [85]:
def evaluate_models(df, models):
    train, val = model_setup(df)
    X_train, Y_train = train[["FORECASTER ID", "MAX", "pred_average", "pred_var", "HIT", "ACTUAL_CONF",
                             "chicken", "coffee", "eggs", "electricity", "rice", "unleadedGasoline"]], train["actual"].tolist()
    X_val, Y_val = val[["FORECASTER ID", "MAX", "pred_average", "pred_var", "HIT", "ACTUAL_CONF",
                       "chicken", "coffee", "eggs", "electricity", "rice", "unleadedGasoline"]], val["actual"].tolist()
    # fit and evaluate the models
    scores = []
        
    for name, model in models:
        model.fit(X_train, Y_train)
        if model == RandomForestRegressor():
            preds = []
            for i in range(len(Y_val)):
                row = X_val.iloc[[i]].values.tolist()
                pred = model.predict(row)
                pred = pred.flatten()[0]
                preds.append(pred)
            scores.append(-mean_absolute_error(Y_val, preds))
        else:
            preds = model.predict(X_val)
            scores.append(-mean_absolute_error(Y_val, preds))

    return scores

In [86]:
train, val = model_setup(unemp)
X_train, Y_train = train[["FORECASTER ID", "MAX", "pred_average", "pred_var", "HIT", "ACTUAL_CONF",
                         "chicken", "coffee", "eggs", "electricity", "rice", "unleadedGasoline"]], train["actual"].tolist()
X_val, Y_val = val[["FORECASTER ID", "MAX", "pred_average", "pred_var", "HIT", "ACTUAL_CONF",
                   "chicken", "coffee", "eggs", "electricity", "rice", "unleadedGasoline"]], val["actual"].tolist()

In [87]:
models = []
models.append(('knn', KNeighborsRegressor()))
# models.append(('svm', SVR()))
models.append(('lr', LinearRegression()))
models.append(('rf', RandomForestRegressor()))
models.append(('xgb', xgb.XGBRegressor(verbosity=0)))

## Model Scoring as Weights

In [88]:
unemp_scores = evaluate_models(unemp, models)
ensemble = VotingRegressor(estimators=models, weights=unemp_scores)
ensemble.fit(X_train, Y_train)
# make predictions on test set
yhat = ensemble.predict(X_val)
# evaluate predictions
score = mean_absolute_error(Y_val, yhat)
print('Weighted Avg MAE: %.3f' % (score))
# evaluate each standalone model
for i in range(len(models)):
    print('>%s: %.3f' % (models[i][0], unemp_scores[i]))
# evaluate equal weighting
ensemble = VotingRegressor(estimators=models)
ensemble.fit(X_train, Y_train)
yhat = ensemble.predict(X_val)
score = mean_absolute_error(Y_val, yhat)
print('Voting MAE: %.3f' % (score))

Weighted Avg MAE: 0.888
>knn: -0.905
>lr: -1.339
>rf: -0.226
>xgb: -0.274
Voting MAE: 0.592


#### With SVR

Weighted Avg MAE: 1.292 <br>
knn: -0.909 <br>
svm: -2.125 <br>
lr: -1.273 <br>
rf: -0.213 <br>
xgb: -0.270 <br>
Voting MAE: 0.808

## Model Ranking as Weights

In [77]:
scores = evaluate_models(unemp)
print(scores)
ranking = 1 + np.argsort(np.argsort(scores))
print(ranking)
# create the ensemble
ensemble = VotingRegressor(estimators=models, weights=ranking)
# fit the ensemble on the training dataset
ensemble.fit(X_train, Y_train)
# make predictions on test set
yhat = ensemble.predict(X_val)
# evaluate predictions
score = mean_absolute_error(Y_val, yhat)
print('Weighted Avg MAE: %.3f' % (score))
# evaluate each standalone model
for i in range(len(models)):
    print('>%s: %.3f' % (models[i][0], scores[i]))
# evaluate equal weighting
ensemble = VotingRegressor(estimators=models)
ensemble.fit(X_train, Y_train)
yhat = ensemble.predict(X_val)
score = mean_absolute_error(Y_val, yhat)
print('Voting MAE: %.3f' % (score))

[-0.891889366541196, -1.353660924116945, -0.24065022183619644, -0.28433062773514056]
[2 1 4 3]
Weighted Avg MAE: 0.423
>knn: -0.892
>lr: -1.354
>rf: -0.241
>xgb: -0.284
Voting MAE: 0.593


#### With SVR

Scores: [-0.9907339846925971, -2.2477322881321817, -1.3626856438899229, -0.25836087141482267, -0.3111017540999072] <br>
Rankings: [3 1 2 5 4] <br>
Weighted Avg MAE: 0.568 <br>
knn: -0.991 <br>
svm: -2.248 <br>
lr: -1.363 <br>
rf: -0.258 <br>
xgb: -0.311 <br>
Voting MAE: 0.849