# Stacking Ensemble

# Mount Google Drive at /content/drive. The google.colab module is provided by
# the Colab runtime, so this cell is only expected to work there.
from google.colab import drive
drive.mount('/content/drive')

!pip install catboost

## Data

In [1]:
from typing import Dict, Tuple, Union, List
from tqdm import tqdm
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, warnings
from itertools import combinations
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
import optuna
import sklearn
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned train/test CSV files (paths are relative to the working directory).
# NOTE(review): the preview below shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which
# looks like a saved index being re-read as data - confirm whether index_col is wanted.
train_df=pd.read_csv("../data/cleaned_train.csv")
test_df=pd.read_csv("../data/cleaned_test.csv")

# Display the first rows of the training frame as the cell output.
train_df.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month,credit,CODE
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,No job,2.0,6,1.0,F-13899202500.0Commercial associate
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5,1.0,F-11380247500.0Commercial associate
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22,2.0,M-19087450000.0Working
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37,0.0,F-15088202500.0Commercial associate
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26,2.0,F-15037157500.0State servant


## Preprocessing

### Numerical Data: Scaling

In [3]:
def num_data_scale(train_df, test_df, scaling_cols=None):
    """Standardize numerical columns of the train and test frames.

    The scaler is fitted on the training data only and then applied to both
    frames, so no statistics from the test set influence the transformation.

    Args:
        train_df: Training DataFrame containing every column in ``scaling_cols``.
        test_df: Test DataFrame containing the same columns.
        scaling_cols: Names of the columns to scale. When ``None`` (the
            default) the notebook's standard numerical feature columns are used.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple of new DataFrames that hold
        only the scaled columns, each with a fresh 0..n-1 index.
    """
    # Previously the argument was overwritten unconditionally, so callers
    # could never choose their own columns; only fall back when it is omitted.
    if scaling_cols is None:
        scaling_cols = ['child_num', 'income_total', 'days_birth',
                        'days_employed', 'family_size', 'begin_month']
    else:
        scaling_cols = list(scaling_cols)

    std_scaler = StandardScaler()
    std_scaler.fit(train_df[scaling_cols])

    # Apply the train-fitted scaler to both sets (never re-fit on test data).
    train_scaled = pd.DataFrame(std_scaler.transform(train_df[scaling_cols]),
                                columns=scaling_cols)
    test_scaled = pd.DataFrame(std_scaler.transform(test_df[scaling_cols]),
                               columns=scaling_cols)

    return train_scaled, test_scaled

In [4]:
# Scale the default numerical columns of both frames.
train_scaled, test_scaled = num_data_scale(train_df, test_df)

### Categorical Data: One-hot Encoding

In [5]:
def cat_data_encode(train_df, test_df, onehot_cols=None):
    """One-hot encode categorical columns consistently across train and test.

    Both frames are stacked before encoding so that they end up with exactly
    the same dummy columns, even if a category occurs in only one of them.

    Args:
        train_df: Training DataFrame containing every column in ``onehot_cols``.
        test_df: Test DataFrame containing the same columns.
        onehot_cols: Names of the columns to encode. When ``None`` (the
            default) the notebook's standard categorical columns are used.

    Returns:
        A ``(train_encoded, test_encoded)`` tuple of DataFrames with identical
        columns; both are indexed 0..n-1.

    Note:
        ``pd.get_dummies`` only expands object/string/category columns;
        numeric columns in ``onehot_cols`` are passed through unchanged.
    """
    # Previously the argument was overwritten unconditionally, so callers
    # could never choose their own columns; only fall back when it is omitted.
    if onehot_cols is None:
        onehot_cols = ['gender', 'car', 'reality', 'income_type', 'edu_type',
                       'family_type', 'house_type', 'occup_type',
                       'work_phone', 'home_phone', 'email']
    else:
        onehot_cols = list(onehot_cols)

    n_train = train_df.shape[0]

    # Encode the combined data, then split it back by position.
    combined = pd.concat([train_df[onehot_cols], test_df[onehot_cols]],
                         ignore_index=True)
    combined = pd.get_dummies(combined)

    train_encoded = combined.iloc[:n_train]
    test_encoded = combined.iloc[n_train:].reset_index(drop=True)

    return train_encoded, test_encoded

In [6]:
# One-hot encode the default categorical columns of both frames.
train_encoded, test_encoded = cat_data_encode(train_df, test_df)

### Merge scaled numerical data with encoded categorical data

In [7]:
def merge_scaled_encoded(train_scaled, test_scaled, train_encoded, test_encoded, target=None):
    """Join scaled numerical and encoded categorical features column-wise.

    Args:
        train_scaled: Scaled numerical training features.
        test_scaled: Scaled numerical test features.
        train_encoded: Encoded categorical training features.
        test_encoded: Encoded categorical test features.
        target: Optional Series of training labels to append as the last
            column of the training frame; the column takes the Series' name.
            When ``None`` (the default) the ``'credit'`` column of the
            module-level ``train_df`` is used, as before.

    Returns:
        A ``(Train, Test)`` tuple: ``Train`` holds all features plus the
        target column, ``Test`` holds the features only. Frames are aligned
        on their index, so all inputs are expected to share the same index.
    """
    if target is None:
        # Backward-compatible fallback to the notebook-level training frame.
        target = train_df['credit']

    merged_train = pd.concat([train_scaled, train_encoded, target], axis=1)
    merged_test = pd.concat([test_scaled, test_encoded], axis=1)

    return merged_train, merged_test

In [8]:
# Build the final modelling frames; Train additionally carries the 'credit' target.
Train, Test = merge_scaled_encoded(train_scaled, test_scaled, train_encoded, test_encoded)

## Split Training and Validation set

In [9]:
# Separate the feature matrix (x) from the target column (y).
x = Train.drop(['credit'], axis=1)
y = Train['credit']

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation. A fixed seed makes the split (and
# every tuning result computed on it) reproducible across runs, and stratifying
# on y keeps the class proportions the same in both parts.
train_x, valid_x, train_y, valid_y = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

## Model Train


### SVC

In [None]:
def objective(trial):
    """Optuna objective for the SVC: mean 3-fold cross-validation score.

    Samples the regularisation strength ``C`` on a log scale and scores an
    ``SVC(gamma="auto")`` on the module-level ``train_x`` / ``train_y``.
    """
    c_value = trial.suggest_float("C", 1e-10, 1e10, log=True)
    svc_model = SVC(C=c_value, gamma="auto")

    fold_scores = cross_val_score(svc_model, train_x, train_y, n_jobs=-1, cv=3)
    return fold_scores.mean()

# Run 10 trials and keep the one with the highest objective value.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print(f"Number of finished trials: {len(study.trials)}")

# Report the winning trial and its sampled parameters.
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2022-12-13 20:06:25,111] A new study created in memory with name: no-name-6eea1a0b-8650-4624-bc54-bc97b291551d
[I 2022-12-13 20:06:48,335] Trial 0 finished with value: 0.6370083816892328 and parameters: {'C': 0.0001837471058497414}. Best is trial 0 with value: 0.6370083816892328.


In [None]:
# Keep the best C from the SVC study; the stacking cell below reads svc_c.
# This must run before `study` is rebound by the next tuning cell.
svc_c = study.best_params['C']
svc_c

### Random Forest

In [None]:
def objective(trial):
    """Optuna objective for the random forest: mean 3-fold cross-validation score.

    Samples ``max_depth`` on a log scale and scores a 10-tree
    ``RandomForestClassifier`` on the module-level ``train_x`` / ``train_y``.
    """
    depth = trial.suggest_int("max_depth", 2, 32, log=True)
    forest = RandomForestClassifier(max_depth=depth, n_estimators=10)

    fold_scores = cross_val_score(forest, train_x, train_y, n_jobs=-1, cv=3)
    return fold_scores.mean()

# Run 10 trials and keep the one with the highest objective value.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print(f"Number of finished trials: {len(study.trials)}")

# Report the winning trial and its sampled parameters.
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Keep the best random-forest parameters; the stacking cell below unpacks rf_best.
# NOTE(review): only max_depth is sampled by the objective, so the n_estimators=10
# used during tuning is presumably not part of rf_best - confirm this is intended.
rf_best = study.best_params
rf_best

### LightGBM

In [None]:
import lightgbm as lgb

# LightGBM's multiclass objective needs integer class ids in [0, num_class).
# The previous hard-coded `- 4` offset pushes the credit labels shown in the
# data preview (0.0, 1.0, 2.0) below zero, so shift by the smallest label
# actually present in `y` instead (a no-op when labels already start at 0).
label_offset = y.min()
train_y_label = (train_y - label_offset).astype(int)
valid_y_label = (valid_y - label_offset).astype(int)
dtrain = lgb.Dataset(train_x, label=train_y_label)

# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial):
    """Optuna objective for LightGBM: hold-out accuracy of a multiclass booster.

    Trains on the module-level ``dtrain`` with sampled regularisation and
    tree-shape parameters, then scores the arg-max class predictions for
    ``valid_x`` against ``valid_y_label`` using ``cal_acc``.
    """
    # Parameters explored by the study (sampled in this order).
    search_space = {
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    # NOTE(review): num_class is hard-coded to 5 - confirm it matches the
    # number of distinct credit labels in the data.
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        **search_space,
        'num_class': 5,
    }

    booster = lgb.train(param, dtrain)
    class_scores = booster.predict(valid_x)
    # One predicted class id per validation row: the highest-scoring class.
    pred_labels = [np.argmax(row) for row in class_scores]
    return cal_acc(valid_y_label, pred_labels)

def cal_acc(true, pred):
    """Return the fraction of positions at which ``pred`` equals ``true``.

    Args:
        true: Sequence (list, array or Series) of ground-truth labels.
        pred: Sequence of predicted labels with the same length.

    Returns:
        Accuracy as a float between 0 and 1.
    """
    # Convert to arrays first: `==` on two plain Python lists yields a single
    # bool (whole-list equality) rather than an elementwise comparison.
    return np.mean(np.asarray(true) == np.asarray(pred))

# Run 100 trials and keep the one with the highest objective value.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print(f"Number of finished trials: {len(study.trials)}")

# Report the winning trial and its sampled parameters.
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Keep the best LightGBM parameters; the stacking cell below unpacks lgb_best.
lgb_best = study.best_params
lgb_best

### Stacking Ensemble

In [None]:
# get a stacking ensemble of models
def get_stacking():
    """Build the stacking classifier from the Optuna-tuned base models.

    Level 0 holds the SVC, random forest and LightGBM classifiers configured
    from ``svc_c``, ``rf_best`` and ``lgb_best``; a default CatBoost
    classifier is the level-1 meta learner. Stacking uses ``cv=5``.
    """
    base_learners = [
        ('svc', SVC(C=svc_c, gamma="auto")),
        ('rf', RandomForestClassifier(**rf_best)),
        ('lgb', LGBMClassifier(**lgb_best)),
    ]
    meta_learner = CatBoostClassifier()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)
 
# get a list of models to evaluate
def get_models():
    """Return a name -> estimator dict of every model to compare.

    Contains the three tuned base models on their own plus the stacking
    ensemble built by ``get_stacking``.
    """
    return {
        'svc': SVC(C=svc_c, gamma="auto"),
        'rf': RandomForestClassifier(**rf_best),
        'lgb': LGBMClassifier(**lgb_best),
        'stacking': get_stacking(),
    }
 
# evaluate a given model using cross-validation
def evaluate_model(model, x, y):
    """Score ``model`` with repeated stratified 10-fold cross-validation.

    Uses 3 repeats with a fixed seed and returns the array of per-fold
    accuracy scores. Any fitting error is re-raised instead of being scored.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, x, y,
        scoring='accuracy', cv=splitter, n_jobs=-1, error_score='raise',
    )
 
# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, x, y)
    results.append(scores)
    names.append(name)
    # Bare `mean` / `std` are never imported in this file; use NumPy's versions.
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))


## Stacking Ensemble (fixed hyperparameters)

# Fixed LightGBM settings, unpacked into LGBMClassifier(**lgbm_params) below.
lgbm_params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    # NOTE(review): in LightGBM's scikit-learn API `eval_metric` is normally an
    # argument of fit(), not of the constructor - confirm this key has any effect.
    'eval_metric' : 'logloss',
    # NOTE(review): 10000 boosting rounds with no early stopping visible in this
    # file - presumably slow under cross-validation; verify this is intended.
    'n_estimators': 10000,
    'max_depth': -1,
    'max_bin': 255,
    'boost_from_average' : False,
    'bagging_freq' : 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.02272,
    'num_leaves': 64,
    'feature_fraction': 0.89387,
    'bagging_fraction': 0.76326,
    'seed': 2018,
    'verbose': -1,
    'n_jobs': -1,
}

# Fixed SVC settings, unpacked into SVC(**svc_params) below.
svc_params = {
    'kernel' : 'rbf',
    "random_state": 42,
    'C' : 10.0,
    'gamma' : 0.1,
    # Enables predict_proba, which the 'neg_log_loss' scoring used by
    # evaluate_model below needs.
    'probability' : True,
}

# Fixed random-forest settings. Currently unused: the random-forest entries in
# get_stacking / get_models below are commented out.
rf_params = {
  "criterion": "entropy",
  "n_estimators": 300,
  "min_samples_split": 10,
  "min_samples_leaf": 2,
  "max_features": "sqrt",
  "oob_score": True,
  "random_state": 42,
  "n_jobs": -1,
  }

# get a stacking ensemble of models
def get_stacking():
    """Build the stacking classifier from the fixed-parameter base models.

    Level 0 holds the SVC and LightGBM classifiers configured from
    ``svc_params`` and ``lgbm_params`` (the random forest is disabled); a
    default CatBoost classifier is the level-1 meta learner. Stacking uses
    ``cv=5``.
    """
    base_learners = [
        ('svc', SVC(**svc_params)),
        # random forest intentionally left out:
        # ('rf', RandomForestClassifier(**rf_params)),
        ('lgb', LGBMClassifier(**lgbm_params)),
    ]
    meta_learner = CatBoostClassifier()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)
 
# get a list of models to evaluate
def get_models():
    """Return a name -> estimator dict of every model to compare.

    Contains the fixed-parameter SVC and LightGBM models on their own plus
    the stacking ensemble built by ``get_stacking`` (random forest disabled).
    """
    return {
        'svc': SVC(**svc_params),
        # random forest intentionally left out:
        # 'rf': RandomForestClassifier(**rf_params),
        'lgb': LGBMClassifier(**lgbm_params),
        'stacking': get_stacking(),
    }
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score ``model`` with repeated stratified 5-fold cross-validation.

    Uses 3 repeats with a fixed seed and returns the array of per-fold
    negative log-loss scores (higher is better). Any fitting error is
    re-raised instead of being scored.
    """
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y,
        scoring='neg_log_loss', cv=splitter, n_jobs=-1, error_score='raise',
    )

# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = [], []
for name, model in tqdm(models.items()):
    # The feature matrix is bound to lowercase `x` when the target is split
    # off; an uppercase `X` is never defined in this file.
    scores = evaluate_model(model, x, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))


# plot model performance for comparison
# matplotlib.pyplot is imported as `plt` at the top of the file; the bare name
# `pyplot` is never defined.
plt.boxplot(results, labels=names, showmeans=True)
plt.show()