In [1]:
import preprocessing as PRE
import feature_engineering as FE
import load_data as LD

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import lightgbm as lgb

import sklearn.pipeline as pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV as GSCV

### Setup

In [2]:
def rmse(a, b):
    """Root-mean-squared error between two array-likes.

    Parameters
    ----------
    a, b : array-like
        Values to compare. They are combined element-wise (NumPy
        broadcasting applies), so argument order does not affect the result.

    Returns
    -------
    float
        ``sqrt(mean((b - a) ** 2))``.
    """
    # Cast to float before subtracting: narrow or unsigned integer inputs
    # would otherwise wrap around on subtraction/squaring and silently
    # produce a wrong error value.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    return np.sqrt(np.mean((b - a) ** 2))

In [3]:
def get_score(model, X_test, y_test):
    """Predict on ``X_test`` with ``model`` and score against ``y_test``.

    Returns a ``(score, predictions)`` tuple, where ``score`` is
    ``rmse(predictions, y_test)`` and ``predictions`` is whatever
    ``model.predict(X_test)`` returned.
    """
    predictions = model.predict(X_test)
    return rmse(predictions, y_test), predictions

In [4]:
def get_best_params(model, params, X_train, y_train, cv=5, scoring=None):
    """Grid-search ``params`` for ``model`` and return the winning combination.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GSCV``.
    params : parameter grid
        Grid handed to ``GSCV`` unchanged.
    X_train, y_train
        Data the search is fitted on.
    cv : default 5
        Forwarded as ``GSCV``'s ``cv`` argument.
    scoring : optional
        Forwarded as ``GSCV``'s ``scoring`` argument. The default ``None``
        keeps the previous behaviour (the search ranks candidates with the
        estimator's own ``score`` method); pass e.g.
        ``'neg_mean_squared_error'`` to rank by the error this notebook
        reports.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, suitable for ``Model(**result)``.
    """
    search = GSCV(model, params, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)

    return search.best_params_

### Hyper-parameter Tuning

In [5]:
# Training, Validation, and Testing sets
# Stage 1: load_data returns the training pair plus a second (X, y) pair
# (requested with test_frac=0.4), kept here under hold-out names.
X_train, y_train, X_holdout, y_holdout = LD.load_data(outliers=True, frac=0.2, scale=True, test_frac=0.4)

# Stage 2: re-join the hold-out features and target column-wise and split them
# again (test_frac=0.5) into the validation and final test pairs.
# NOTE(review): presumably this yields an even validation/test split - confirm in load_data.py.
df_val_test = pd.concat([X_holdout, y_holdout], axis=1)
X_val, y_val, X_test, y_test = LD.split_train_val(df_val_test, test_frac=0.5)

In [6]:
# Linear Regression
# One grid of regularisation strengths, searched for Lasso, Ridge and ElasticNet.
alpha_grid = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.6, 0.7, 0.8, 0.9]

params_linear = {'alpha': alpha_grid}
params_linear_net = {
    'alpha': list(alpha_grid),
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
}

# Each model is rebuilt (unfitted) from the best parameters found with 4-fold CV.
lasso_best = get_best_params(Lasso(), params_linear, X_train, y_train, cv=4)
lasso = Lasso(**lasso_best)

ridge_best = get_best_params(Ridge(), params_linear, X_train, y_train, cv=4)
ridge = Ridge(**ridge_best)

enet_best = get_best_params(ElasticNet(), params_linear_net, X_train, y_train, cv=4)
enet = ElasticNet(**enet_best)

In [7]:
# Kernel Ridge Regressions
# NOTE(review): every degree is also tried with the 'linear' kernel; if degree
# only matters for 'polynomial' (check the KernelRidge docs), those fits are
# redundant and the grid could be split per kernel.
params_kernel_ridge = {
    'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.6, 0.7, 0.8, 0.9],
    'kernel': ['linear', 'polynomial'],
    'degree': [1, 2, 3, 4],
}

kernel_ridge_best = get_best_params(KernelRidge(), params_kernel_ridge, X_train, y_train, cv=4)
kernel_ridge = KernelRidge(**kernel_ridge_best)

In [8]:
# Random Forest
# Single-value grid entries (n_jobs, random_state) are fixed settings: putting
# them in the grid makes them part of the best-params dict, so they reach both
# the search candidates and the final estimator built below.
params_rf = {
    'max_depth': [2, 3, 4, 5, 6],
    'n_estimators': [50, 100, 200, 300, 400, 500, 750],
    'n_jobs': [-1],
    # Pin the seed: without it the forest's randomness makes the selected
    # parameters and every downstream score change from run to run.
    'random_state': [42],
}
rf = RandomForestRegressor(**get_best_params(RandomForestRegressor(), params_rf, X_train, y_train, cv=4))

In [None]:
# Gradient Boosting
# Grid size: 5 * 3 * 4 * 4 * 1 = 240 combinations, each evaluated with cv=4.
params_gboost = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'learning_rate': [0.05, 0.1, 0.2],
    'min_samples_leaf': [5, 10, 15, 20],
    'min_samples_split': [5, 10, 15, 20],
    'loss': ['huber'],
}

gboost_best = get_best_params(GradientBoostingRegressor(), params_gboost, X_train, y_train, cv=4)
gboost = GradientBoostingRegressor(**gboost_best)

In [None]:
# XGBoost regressor with hand-picked (not grid-searched) hyper-parameters.
# NOTE(review): `silent` and `nthread` may be deprecated in newer xgboost
# releases - confirm against the installed version before changing them.
xgb_settings = {
    'colsample_bytree': 0.4603,
    'gamma': 0.0468,
    'learning_rate': 0.05,
    'max_depth': 3,
    'min_child_weight': 1.7817,
    'n_estimators': 2200,
    'reg_alpha': 0.4640,
    'reg_lambda': 0.8571,
    'subsample': 0.5213,
    'silent': 1,
    'random_state': 7,
    'nthread': -1,
}
model_xgb = xgb.XGBRegressor(**xgb_settings)

In [None]:
# LightGBM regressor with hand-picked (not grid-searched) hyper-parameters.
lgb_settings = {
    'objective': 'regression',
    'num_leaves': 5,
    'learning_rate': 0.05,
    'n_estimators': 720,
    'max_bin': 55,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.2319,
    'feature_fraction_seed': 9,
    'bagging_seed': 9,
    'min_data_in_leaf': 6,
    'min_sum_hessian_in_leaf': 11,
}
model_lgb = lgb.LGBMRegressor(**lgb_settings)

### Train Models

In [None]:
# All models
# Keep each display name next to its estimator so the two parallel lists
# derived below cannot drift out of order.
named_models = [
    ('Lasso', lasso),
    ('Ridge', ridge),
    ('Kernel Ridge', kernel_ridge),
    ('Random Forest', rf),
    ('Gradient Boosting', gboost),
    ('XG Boost', model_xgb),
    ('Light GB', model_lgb),
]
model_names = [label for label, _ in named_models]
model_list = [estimator for _, estimator in named_models]

In [None]:
# Fit every model on the training split, then record its get_score result
# (score and predictions) on the validation split, in model_list order.
score_list, pred_list = [], []

for model, name in zip(model_list, model_names):
    model.fit(X_train, y_train)

    score, pred = get_score(model, X_val, y_val)
    pred_list.append(pred)
    score_list.append(score)

    print(f'{name} \n{score}')

### Validation

In [7]:
print('Validation Scores')

# Validation-set predictions of the three linear models, kept in the order
# lasso, ridge, enet; they become the meta-model's training features.
pred_list = []
for model in (lasso, ridge, enet):
    model.fit(X_train, y_train)
    score, pred = get_score(model, X_val, y_val)
    print(score)
    pred_list.append(pred)

Validation Scores
0.12547395778163944
0.11972912062438024
0.12174525897677237


### Testing

In [8]:
print('Testing Scores')

# Test-set predictions of the three linear models, kept in the order
# lasso, ridge, enet; they become the meta-model's test features.
pred_list_test = []
for model in (lasso, ridge, enet):
    # Refit on the training split so this cell does not rely on an earlier fit.
    model.fit(X_train, y_train)
    score, pred = get_score(model, X_test, y_test)
    print(score)
    pred_list_test.append(pred)

Testing Scores
0.13596039116052264
0.13700377483737983
0.13533984829670617


### Meta Model

In [9]:
# Train Meta
meta_features = pd.DataFrame({'Lasso':pred_list[0], 'Ridge':pred_list[1], 'ENet':pred_list[2]})
meta_model = Lasso(**get_best_params(Lasso(), params_linear, meta_features, y_val, cv=4))
meta_model.fit(meta_features, y_val)

Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [10]:
# Same three columns as the meta-model's training frame, filled with the base
# models' test-set predictions.
test_columns = {
    'Lasso': pred_list_test[0],
    'Ridge': pred_list_test[1],
    'ENet': pred_list_test[2],
}
X_test_meta = pd.DataFrame(test_columns)

score, pred = get_score(meta_model, X_test_meta, y_test)
print('Meta Model Testing Score')
score

Meta Model Testing Score


0.1329324749907174

### Error

In [13]:
# RMSE after exponentiating both predictions and targets.
# NOTE(review): presumably the target was log-transformed upstream, making this
# the error in original units - confirm the transform (log vs log1p) in load_data.
y = y_test
yhat = meta_model.predict(X_test_meta)

rmse(np.exp(yhat), np.exp(y))

25755.52588186906

In [18]:
# Mean absolute relative error of yhat against y, on the scale the model was
# trained on (no exp applied here, unlike the RMSE cell above).
relative_error = (yhat - y) / y
np.mean(np.abs(relative_error))

0.007584512141128519

### Kernel Reg

In [11]:
# params_kernel_ridge = {'alpha':[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.6, 0.7, 0.8, 0.9], 
#                       'kernel':['linear', 'polynomial'], 'degree':[1, 2, 3, 4]}
# kernel_ridge = KernelRidge(**get_best_params(KernelRidge(), params_kernel_ridge, X_train, y_train, cv=4))