In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn import metrics
from itertools import combinations

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
pd.options.mode.chained_assignment = None  # default='warn'

In [52]:
# Sample Data: Previously Preprocessed House Prices from Kaggle
X = pd.read_csv('House_Prices_X.csv', index_col = 0)
y = pd.read_csv('House_Prices_y.csv', header = None, index_col = 0)

In [53]:
X.head()

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtUnfSF,TotalBsmtSF,CentralAir,1stFlrSF,2ndFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65,8450,7,5,196,150,856,True,856,854,...,0,0,0,1,0,0,0,0,1,0
2,80,9600,6,8,0,284,1262,True,1262,0,...,0,0,0,1,0,0,0,0,1,0
3,68,11250,7,5,162,434,920,True,920,866,...,0,0,0,1,0,0,0,0,1,0
4,60,9550,7,5,0,540,756,True,961,756,...,0,0,0,1,1,0,0,0,0,0
5,84,14260,8,5,350,490,1145,True,1145,1053,...,0,0,0,1,0,0,0,0,1,0


In [15]:
clf_svm = SVR()
clf_rf = RandomForestRegressor(n_jobs = -1)
clf_xgb = XGBRegressor(n_jobs = -1)
clf_ada = AdaBoostRegressor()
clf_knn = KNeighborsRegressor(n_jobs = -1)
clf_lgbm = lgb.LGBMRegressor(n_jobs = -1,silent = True)

In [16]:
classifiers = [clf_svm, clf_rf, clf_xgb, clf_ada, clf_knn, clf_lgbm]
clf_list=[]
for clf in classifiers:
    clf_list.append(clf.__class__.__name__)

In [None]:
acc_list = []
stacked_results = pd.DataFrame()
skf = StratifiedKFold(n_splits = 5, random_state=None, shuffle=False)
b = pd.DataFrame()
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Scaling
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train))
    X_test = pd.DataFrame(scaler.transform(X_test))
    
    # Skipping Hyperparameter Optimization
    """
    # HYPERPARAMETER OPTIMIZATION
    # SVM HYPERPARAMETER OPTIMIZATION
    print("SVM...")
    Cs = [0.01, 0.1, 1, 10] #, 100, 1000]
    gammas = [0.01, 0.1, 1]#, 3]
    kernel = ['linear','rbf']
    epsilon = [0.001, 0.01, 0.1]#, 1, 5]
    param_grid = {'C': Cs, 'gamma' : gammas, 'kernel':kernel}
    svm_grid_search = GridSearchCV(clf_svm, param_grid, cv=5, n_jobs = -1, verbose = 1, scoring = 'neg_mean_squared_log_error')
    svm_grid_search.fit(X_train, y_train)
    
    # RANDOM FOREST
    print("RF...")
    n_estimators = [100, 250]
    max_features = ['auto', 'sqrt', 'log2']
    max_depth = [3, 5, 7]
    max_depth.append(None)
    min_samples_split = [2, 3, 5]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    criterion = ['gini', 'entropy']
    param_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap} #'criterion': criterion
    rf_grid_search = GridSearchCV(clf_rf, param_grid = param_grid, cv = 5, n_jobs = -1, verbose=1, scoring = "neg_mean_squared_log_error")
    rf_grid_search.fit(X_train, y_train)
    
    # XGBoost
    print("XGB...")
    grid_xgb = {'min_child_weight': [1, 5, 7, 12],
            'gamma': [0.5, 1, 1.5, 2],
            'subsample': [0.5, 0.7, 1.0],
            'colsample_bytree': [0.5, 0.7, 1.0],
            'max_depth': [3, 5, 9, 15, 25]}
    xgb = XGBRegressor()
    xgb_grid_search = GridSearchCV(clf_xgb, grid_xgb, cv=5, n_jobs=-1, verbose = 1, scoring = "neg_mean_squared_log_error")
    xgb_grid_search.fit(X_train, y_train)
    
    # AdaBoost
    print("Ada...")
    param_grid = {'n_estimators': [30, 50, 100, 250], 'learning_rate': [0.08, 0.1, 0.2, 0.4]}
    ada_grid_search = GridSearchCV(clf_ada, param_grid, cv=5, refit=True, n_jobs = -1, verbose=1, scoring = "neg_mean_squared_log_error")
    ada_grid_search.fit(X_train, y_train)
    
    # kNN
    print("kNN...")
    leaf_range = list(range(3, 15, 1))
    k_range = list(range(1, 15, 1))
    weight_options = ['uniform', 'distance']
    param_grid = dict(leaf_size=leaf_range, n_neighbors=k_range, weights=weight_options)
    knn_grid_search = GridSearchCV(clf_knn, param_grid, cv=5, n_jobs = -1, verbose=1, scoring='neg_mean_squared_log_error')
    knn_grid_search.fit(X_train, y_train)
    
    # LGBM
    print("LGBM...")
    param_grid = {"max_depth": [3, 5, 9, 15, 20, 25], "learning_rate" : [0.008, 0.01, 0.012], 
                  "num_leaves": [80, 100, 120], "n_estimators": [200, 250]}
    lgbm_grid_search = GridSearchCV(clf_lgbm, param_grid, cv=5, refit=True, n_jobs = -1, verbose=1,  scoring = "neg_mean_squared_log_error")
    lgbm_grid_search.fit(X_train, y_train)
    
    clf_svm.set_params(**svm_grid_search.best_params_)
    clf_rf.set_params(**rf_grid_search.best_params_)
    clf_xgb.set_params(**xgb_grid_search.best_params_)
    clf_ada.set_params(**ada_grid_search.best_params_)
    clf_knn.set_params(**knn_grid_search.best_params_)
    clf_lgbm.set_params(**lgbm_grid_search.best_params_)
    
    """
    
    a = pd.DataFrame()
    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        preds = pd.Series(clf.predict(X_test).reshape(-1))
        a = pd.concat([a, preds], axis=1).astype(int)
    b = pd.concat([b, pd.Series(y_test.values.reshape(-1))], axis = 0)
    stacked_results = pd.concat([stacked_results,a], axis = 0)
stacked_results.columns = clf_list
y_kfold = b.iloc[:, 0].astype(int)

RMLSE_list = []
for i in range(stacked_results.shape[1]):
    RMLSE_list.append(metrics.mean_squared_log_error(y_kfold, stacked_results.iloc[:, i]))
df_RMLSE = pd.DataFrame({'RMLSE': RMLSE_list}, index = clf_list).sort_values(by = 'RMLSE', ascending = True)

- All combinations for Stacking

In [50]:
# Try all combinations of classifiers
final_ensemble_results = pd.DataFrame()
for i in range(2, len(clf_list), 1):

    ens_clf_list = []
    ens_RMLSE_list = []
    for x in list(combinations(np.linspace(0, len(clf_list)-1, len(clf_list)).astype(int), i)):
        temp_prob_frame = stacked_results.iloc[:, list(x)]
        temp_prob_frame['mean'] = temp_prob_frame.mean(axis=1)
        temp_prob_frame['mean'] = temp_prob_frame['mean'].astype(int)
        ens_clf_list.append(" ".join(stacked_results.columns[list(x)].values))
        ens_RMLSE_list.append(metrics.mean_squared_log_error(y_kfold, temp_prob_frame['mean']))

    ens_clf_df = pd.DataFrame(index = ens_clf_list)
    ens_clf_df['Score'] = ens_RMLSE_list
    final_ensemble_results = pd.concat([final_ensemble_results, ens_clf_df])

final_ensemble_results = final_ensemble_results.sort_values(by = 'Score', ascending = True) # ascending False, depending on the metric

In [51]:
final_ensemble_results

Unnamed: 0,Score
XGBRegressor LGBMRegressor,0.017518
RandomForestRegressor XGBRegressor LGBMRegressor,0.01795
RandomForestRegressor XGBRegressor,0.018486
RandomForestRegressor XGBRegressor KNeighborsRegressor LGBMRegressor,0.018586
XGBRegressor KNeighborsRegressor LGBMRegressor,0.018854
RandomForestRegressor LGBMRegressor,0.018943
RandomForestRegressor XGBRegressor KNeighborsRegressor,0.020007
RandomForestRegressor KNeighborsRegressor LGBMRegressor,0.020132
RandomForestRegressor XGBRegressor AdaBoostRegressor KNeighborsRegressor LGBMRegressor,0.021041
RandomForestRegressor XGBRegressor AdaBoostRegressor LGBMRegressor,0.02106
