In [None]:
##Import libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# import all libraries and dependencies for data visualization
pd.options.display.float_format='{:.4f}'.format
plt.rcParams['figure.figsize'] = [8,8]
# Notebook display limits for wide / long frames.
pd.set_option('display.max_columns', 350)
# None means "never truncate cell contents". The legacy -1 sentinel was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", 50)
sns.set(style='darkgrid')
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker


# import all libraries and dependencies for machine learning
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

from sklearn.compose import make_column_transformer


import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

from scipy import stats
from scipy.stats import norm, kurtosis, skew

# Import specific libraries
import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

pd.set_option('expand_frame_repr', False)


# Models
import mlxtend
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', None)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000


In [None]:
# Restore the train/validation variables persisted with IPython's %store magic.
# NOTE(review): presumably saved by an upstream notebook — confirm they exist
# in the %store database before running this cell on a fresh kernel.
%store -r X_train
%store -r y_train
%store -r X_validation
%store -r y_validation




In [None]:
# Peek at the first two entries of the restored target (assumes a pandas object — confirm)
y_train.head(2)

In [None]:
#Renaming
# Plain aliases (no copy is made): X and train_labels refer to the same
# objects as X_train and y_train.
X = X_train
train_labels = y_train



In [None]:
# Dimensions of the training feature matrix
X.shape

In [None]:
#Setup cross validation and define error metrics

In [None]:
# Cross-validation splitter: 10 shuffled folds with a fixed seed so the
# fold assignment is repeatable between runs
kf = KFold(shuffle=True, n_splits=10, random_state=42)

In [None]:
#SETUP MODELS

In [None]:
# Error metrics
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No log transform is applied inside this function, so the result is a plain
    RMSE on whatever scale the inputs are supplied in.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=None):
    """Cross-validated RMSE of ``model`` against the notebook's ``train_labels``.

    Parameters
    ----------
    model
        Estimator handed to ``cross_val_score``.
    X : optional
        Feature matrix to evaluate on. When omitted, the notebook-level ``X``
        is looked up at call time.

    Returns
    -------
    The array returned by ``cross_val_score`` (one score per split of ``kf``),
    converted from negative MSE to RMSE.
    """
    if X is None:
        # Resolve the default lazily. A ``X=X`` default is evaluated once, when
        # the ``def`` runs, so rebinding ``X`` in a later cell would leave this
        # function scoring stale features against the current ``train_labels``.
        X = globals()["X"]
    neg_mse = cross_val_score(model, X, train_labels,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

In [None]:
# Light Gradient Boosting Regressor
# Many boosting rounds (7000) at a low learning rate (0.01) with small trees (num_leaves=6).
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

# XGBoost Regressor
# NOTE(review): both `seed=27` and `random_state=42` are passed; which one takes
# effect may depend on the installed xgboost version — confirm and keep only one.
# NOTE(review): `nthread` looks like the older spelling of `n_jobs`, and
# `scale_pos_weight` is normally a class-imbalance setting — verify both are
# still wanted for this regressor.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective ='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

# Ridge Regressor
# Pipeline: RobustScaler, then RidgeCV choosing alpha from `ridge_alphas` using the `kf` folds.
ridge_alphas = [1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

# Support Vector Regressor
# Pipeline: RobustScaler, then an SVR with fixed C / epsilon / gamma.
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# Gradient Boosting Regressor
# Uses the Huber loss and considers sqrt(n_features) candidate features per split.
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)  

# Random Forest Regressor
# NOTE(review): oob_score=True is requested, but no cell in this section reads
# `rf.oob_score_` — confirm the out-of-bag estimate is actually needed.
rf = RandomForestRegressor(n_estimators=1200,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# Stack all the models above. xgboost is also used as the meta-regressor, and
# with use_features_in_secondary=True it is given the original features in
# addition to the base models' predictions.
# random_state pins StackingCVRegressor's internal (shuffled) CV folds so the
# stacked fit is reproducible after a kernel restart.
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True,
                                random_state=42)

In [None]:
#TRAIN INDIVIDUAL MODELS

##url: StackingCV regressor: https://analyticsindiamag.com/stackingcvregressor-in-python/

In [None]:
#Get cross validation scores for each model

In [None]:
# Start a fresh results dict, then record LightGBM's cross-validated RMSE as (mean, std)
scores = {}
score = cv_rmse(lightgbm)
scores['lgb'] = (score.mean(), score.std())
print("lightgbm: {:.4f} ({:.4f})".format(*scores['lgb']))

In [None]:
# Record XGBoost's cross-validated RMSE as (mean, std)
score = cv_rmse(xgboost)
scores['xgb'] = (score.mean(), score.std())
print("xgboost: {:.4f} ({:.4f})".format(*scores['xgb']))


In [None]:
# Record the SVR pipeline's cross-validated RMSE as (mean, std)
score = cv_rmse(svr)
scores['svr'] = (score.mean(), score.std())
print("SVR: {:.4f} ({:.4f})".format(*scores['svr']))

In [None]:
# Record the ridge pipeline's cross-validated RMSE as (mean, std)
score = cv_rmse(ridge)
scores['ridge'] = (score.mean(), score.std())
print("ridge: {:.4f} ({:.4f})".format(*scores['ridge']))

In [None]:
# Record the random forest's cross-validated RMSE as (mean, std)
score = cv_rmse(rf)
scores['rf'] = (score.mean(), score.std())
print("rf: {:.4f} ({:.4f})".format(*scores['rf']))

In [None]:
# Record gradient boosting's cross-validated RMSE as (mean, std)
score = cv_rmse(gbr)
scores['gbr'] = (score.mean(), score.std())
print("gbr: {:.4f} ({:.4f})".format(*scores['gbr']))

In [None]:
# Display the collected (mean, std) score pairs, keyed by model name
scores

In [None]:
#FIT THE MODELS

In [None]:
# Fit the stacked ensemble on the full training data; features and labels are
# converted to NumPy arrays before being passed in.
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(train_labels))


In [None]:
# Fit LightGBM on the full training data (X, train_labels)
print('lightgbm')
lgb_model_full_data = lightgbm.fit(X, train_labels)

In [None]:
# Fit XGBoost on the full training data (X, train_labels)
print('xgboost')
xgb_model_full_data = xgboost.fit(X, train_labels)

In [None]:
# Fit the SVR pipeline on the full training data (X, train_labels)
print('Svr')
svr_model_full_data = svr.fit(X, train_labels)

In [None]:
# Fit the ridge pipeline on the full training data (X, train_labels)
print('Ridge')
ridge_model_full_data = ridge.fit(X, train_labels)

In [None]:
# Fit the random forest on the full training data (X, train_labels)
print('RandomForest')
rf_model_full_data = rf.fit(X, train_labels)

In [None]:
# Fit gradient boosting on the full training data (X, train_labels)
print('GradientBoosting')
gbr_model_full_data = gbr.fit(X, train_labels)

In [None]:
#BLEND MODELS AND MAKE PREDICTIONS
# Print every fitted model's predictions on the training features, one after another
print(
    ridge_model_full_data.predict(X),
    svr_model_full_data.predict(X),
    gbr_model_full_data.predict(X),
    xgb_model_full_data.predict(X),
    lgb_model_full_data.predict(X),
    rf_model_full_data.predict(X),
    stack_gen_model.predict(np.array(X)),
)


In [None]:
# Collect each fitted model's predictions on the training features and
# print their shapes so mismatches are visible before blending
lista = [
    ridge_model_full_data.predict(X),
    svr_model_full_data.predict(X),
    gbr_model_full_data.predict(X),
    xgb_model_full_data.predict(X),
    lgb_model_full_data.predict(X),
    rf_model_full_data.predict(X),
    stack_gen_model.predict(np.array(X)),
]

for l in lista:
    print(l.shape)

In [None]:
# Blend the fitted models so the final prediction does not hinge on any single one
def blended_predictions(X):
    """Return the fixed-weight blend of the seven fitted models' predictions for ``X``.

    Weights: ridge 0.05, SVR 0.05, gradient boosting 0.15, XGBoost 0.2,
    LightGBM 0.05, random forest 0.2, stacked ensemble 0.3.
    The ridge output is reshaped to ``(X.shape[0],)`` before weighting, and the
    stacked model is given ``np.array(X)`` rather than ``X`` itself.
    """
    weighted_terms = (
        0.05 * np.reshape(ridge_model_full_data.predict(X), (X.shape[0],)),
        0.05 * svr_model_full_data.predict(X),
        0.15 * gbr_model_full_data.predict(X),
        0.2 * xgb_model_full_data.predict(X),
        0.05 * lgb_model_full_data.predict(X),
        0.2 * rf_model_full_data.predict(X),
        0.3 * stack_gen_model.predict(np.array(X)),
    )
    # Accumulate left to right, out of place, in the listed order
    blend = weighted_terms[0]
    for term in weighted_terms[1:]:
        blend = blend + term
    return blend


In [None]:
# Blend evaluated on the training features.
# NOTE(review): this displays the full prediction array — consider slicing it.
blended_predictions(X)

In [None]:
# Sanity check: label, blended-prediction, and feature shapes side by side
train_labels.shape, blended_predictions(X).shape, X.shape

In [None]:
# Score the blended model's predictions on the training data itself
blended_score = rmsle(train_labels, blended_predictions(X))
print('RMSLE score on train data:', blended_score, sep='\n')
scores['blended'] = (blended_score, 0)

In [None]:
#IDENTIFY THE BEST PERFORMING MODEL

In [None]:
# Point plot of the first element of every entry in `scores`, one point per model
sns.set_style("white")
fig, ax = plt.subplots(figsize=(24, 12))

sns.pointplot(
    x=list(scores.keys()),
    y=[mean for mean, _ in scores.values()],
    markers=['o'],
    linestyles=['-'],
    ax=ax,
)
# Annotate each point with its value, nudged slightly above the marker
for i, score in enumerate(scores.values()):
    ax.text(
        i,
        score[0] + 0.002,
        '{:.6f}'.format(score[0]),
        horizontalalignment='left',
        size='large',
        color='black',
        weight='semibold',
    )

ax.set_ylabel('Score (RMSE)', size=20, labelpad=12.5)
ax.set_xlabel('Model', size=20, labelpad=12.5)
ax.tick_params(axis='x', labelsize=13.5)
ax.tick_params(axis='y', labelsize=12.5)

ax.set_title('Scores of Models', size=20)

plt.show()

In [None]:
##EVALUATION ON THE VALIDATION SET: we make the prediction using the best-performing model (blended)


In [None]:
#Make a prediction on the validation set
# blended_predictions applies its fixed weights to the already-fitted models.
prediction = blended_predictions(X_validation)


In [None]:
# Results from the blended model on the validation data.
# The score is stored under its own key so the training-set entry in
# `scores['blended']` is not overwritten; otherwise re-running the comparison
# plot after this cell would silently show a different 'blended' point.
blended_score = rmsle(y_validation, prediction)
scores['blended_validation'] = (blended_score, 0)
print('RMSLE score on validation data:')
print(blended_score)

In [None]:
"""Conclusion:
We can see a slight improvement in the score of StackingCVRegressor compared to the individual algorithms;
however, this may not always be the case. StackingCVRegressor may turn out to be more or less accurate
than the individual algorithms, depending on the data and the level-one regressor used.
One thing that is certain, though, is that the predictions from StackingCVRegressor can be deemed stable and are
expected to show less variance, because it combines the strengths of a variety of algorithms."""

In [None]:
#Next steps to refine this module: add a grid search to select the hyperparameters of each model