# Stacking Ensemble

## Data

In [1]:
from typing import Dict, Tuple, Union, List
from tqdm import tqdm
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, warnings
from itertools import combinations
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-cleaned train and test tables (paths are relative to the notebook).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/cleaned_train.csv", "../data/cleaned_test.csv")
)

# Preview the first rows of the training table.
train_df.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month,credit,CODE
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,No job,2.0,6,1.0,F-13899202500.0Commercial associate
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5,1.0,F-11380247500.0Commercial associate
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22,2.0,M-19087450000.0Working
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37,0.0,F-15088202500.0Commercial associate
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26,2.0,F-15037157500.0State servant


## Preprocessing

### Numerical Data: Scaling

In [3]:
def num_data_scale(train_df, test_df, scaling_cols=None):
    """Standardize numerical columns of the train and test frames.

    The scaler is fitted on ``train_df`` only and then applied to both
    frames, so test-set statistics never influence the transformation.

    Args:
        train_df: Training frame containing every column in ``scaling_cols``.
        test_df: Test frame containing every column in ``scaling_cols``.
        scaling_cols: Names of the columns to standardize. When omitted, the
            six numerical columns used by this notebook are scaled.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple of new DataFrames holding only
        the scaled columns, each with a fresh default integer index.
    """
    if scaling_cols is None:
        scaling_cols = [
            'child_num',
            'income_total',
            'days_birth',
            'days_employed',
            'family_size',
            'begin_month',
        ]
    else:
        # Accept any iterable of names (a tuple would otherwise be treated
        # as a single column key by pandas).
        scaling_cols = list(scaling_cols)

    std_scaler = StandardScaler()
    std_scaler.fit(train_df[scaling_cols])

    # Reuse the train-fitted scaler for the test set.
    train_scaled = pd.DataFrame(
        std_scaler.transform(train_df[scaling_cols]), columns=scaling_cols
    )
    test_scaled = pd.DataFrame(
        std_scaler.transform(test_df[scaling_cols]), columns=scaling_cols
    )

    return train_scaled, test_scaled

In [4]:
# Standardize the numerical features (scaler is fitted on the training data).
train_scaled, test_scaled = num_data_scale(train_df=train_df, test_df=test_df)

### Categorical Data: One-hot Encoding

In [5]:
def cat_data_encode(train_df, test_df, onehot_cols=None):
    """One-hot encode categorical columns consistently across train and test.

    Train and test rows are stacked before encoding so that both outputs
    share exactly the same dummy columns, even when a category appears in
    only one of the two frames.

    NOTE: ``pd.get_dummies`` is called without ``columns=``, so only
    object/string/category columns are expanded; numeric columns in
    ``onehot_cols`` (e.g. 0/1 flags) are passed through unchanged.

    Args:
        train_df: Training frame containing every column in ``onehot_cols``.
        test_df: Test frame containing every column in ``onehot_cols``.
        onehot_cols: Names of the columns to encode. When omitted, the
            categorical columns used by this notebook are encoded.

    Returns:
        A ``(train_encoded, test_encoded)`` tuple. The train part keeps
        positions ``0..len(train_df)-1``; the test part is re-indexed from 0.
    """
    if onehot_cols is None:
        onehot_cols = [
            'gender', 'car', 'reality', 'income_type', 'edu_type',
            'family_type', 'house_type', 'occup_type',
            'work_phone', 'home_phone', 'email',
        ]
    else:
        # Accept any iterable of names (a tuple would otherwise be treated
        # as a single column key by pandas).
        onehot_cols = list(onehot_cols)

    n_train = train_df.shape[0]

    # Encode train and test together so the dummy columns line up.
    combined = pd.concat(
        [train_df[onehot_cols], test_df[onehot_cols]], ignore_index=True
    )
    combined = pd.get_dummies(combined)

    # Split back by position; iloc also behaves correctly for an empty frame.
    train_encoded = combined.iloc[:n_train]
    test_encoded = combined.iloc[n_train:].reset_index(drop=True)

    return train_encoded, test_encoded

In [6]:
# One-hot encode the categorical features of both frames in one pass.
train_encoded, test_encoded = cat_data_encode(train_df=train_df, test_df=test_df)

### Merge scaled numerical data with encoded categorical data

In [7]:
def merge_scaled_encoded(train_scaled, test_scaled, train_encoded, test_encoded,
                         target=None):
    """Join scaled numerical and encoded categorical features column-wise.

    Args:
        train_scaled: Scaled numerical features of the training rows.
        test_scaled: Scaled numerical features of the test rows.
        train_encoded: Encoded categorical features of the training rows.
        test_encoded: Encoded categorical features of the test rows.
        target: Label column to append to the training frame. When omitted,
            the notebook-level ``train_df['credit']`` is used, which keeps
            existing four-argument calls working.

    Returns:
        A ``(Train, Test)`` tuple: ``Train`` holds the features plus the
        label column, ``Test`` holds the features only. Frames are aligned
        on their index by ``pd.concat``.
    """
    if target is None:
        # Backward-compatible fallback to the global training frame.
        target = train_df['credit']

    train_merged = pd.concat([train_scaled, train_encoded, target], axis=1)
    test_merged = pd.concat([test_scaled, test_encoded], axis=1)

    return train_merged, test_merged

In [8]:
# Assemble the final model inputs: Train carries the label, Test does not.
Train, Test = merge_scaled_encoded(
    train_scaled=train_scaled,
    test_scaled=test_scaled,
    train_encoded=train_encoded,
    test_encoded=test_encoded,
)

## Split Training and Validation set

In [9]:
# Separate the feature matrix from the label column.
X = Train.drop(['credit'], axis=1)
y = Train['credit']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split uses the X / y defined
# above (the previous x_data / y_data names were never defined), is seeded for
# reproducibility, and is stratified to keep class proportions, matching the
# stratified CV used for model training.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Model Train

**For each model, we compared the parameter sets that team members obtained through tuning or chose by hand, and kept the set that performed best.**

# CatBoost
def stratified_kfold_cat(
    params: Dict[str, Union[int, float, str, List[str]]],
    n_fold: int,
    X: pd.DataFrame,
    y: pd.DataFrame,
    X_test: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray]:
    """Train CatBoost with stratified K-fold CV and collect its predictions.

    Args:
        params: Keyword arguments forwarded to ``CatBoostClassifier``.
        n_fold: Number of stratified folds.
        X: Training features.
        y: Training labels; any pandas object supporting ``.iloc`` row
            selection works.
        X_test: Test features, scored by every fold model.

    Returns:
        ``(cat_oof, cat_preds)``: out-of-fold class probabilities for ``X``
        and the fold-averaged class probabilities for ``X_test``. Both arrays
        have 3 columns (presumably one per credit class -- confirm if the
        label set changes).
    """
    # Pool is not among the file-level imports, so bring it into scope here;
    # without it the first fold fails with a NameError.
    from catboost import Pool

    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
    splits = folds.split(X, y)
    cat_oof = np.zeros((X.shape[0], 3))
    cat_preds = np.zeros((X_test.shape[0], 3))
    # Every int64 column is handed to CatBoost as a categorical feature.
    cat_cols = [c for c in X.columns if X[c].dtypes == "int64"]

    for fold, (train_idx, valid_idx) in enumerate(splits):
        print(f"============ Fold {fold} ============\n")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        model = CatBoostClassifier(**params)

        # Early stopping is monitored on the held-out fold and the best
        # iteration is kept.
        model.fit(
            train_data,
            eval_set=valid_data,
            early_stopping_rounds=100,
            use_best_model=True,
            verbose=100,
        )

        cat_oof[valid_idx] = model.predict_proba(X_valid)
        # Each fold contributes an equal share to the test prediction.
        cat_preds += model.predict_proba(X_test) / n_fold

    log_score = log_loss(y, cat_oof)
    print(f"Log Loss Score: {log_score:.5f}\n")
    return cat_oof, cat_preds


# LightGBM
def stratified_kfold_lgbm(
    params: Dict[str, Union[int, float, str]],
    n_fold: int,
    X: pd.DataFrame,
    y: pd.DataFrame,
    X_test: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray]:
    """Train LightGBM in two stages per fold and collect its predictions.

    For every stratified fold a first model is fitted with ``params``; a
    second model with one tenth of the learning rate then continues training
    from it (``init_model``). Predictions come from the second model.

    Args:
        params: Keyword arguments for ``LGBMClassifier``; must contain
            ``"learning_rate"``.
        n_fold: Number of stratified folds.
        X: Training features.
        y: Training labels.
        X_test: Test features, scored by every fold model.

    Returns:
        ``(lgb_oof, lgb_preds)``: out-of-fold probabilities for ``X`` and the
        fold-averaged probabilities for ``X_test`` (3 columns each).
    """
    oof_proba = np.zeros((X.shape[0], 3))
    test_proba = np.zeros((X_test.shape[0], 3))
    cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

    for fold_no, (fit_rows, held_rows) in enumerate(cv.split(X, y)):
        print(f"============ Fold {fold_no} ============\n")
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_held, y_held = X.iloc[held_rows], y.iloc[held_rows]

        # Stage 1: fit at the configured learning rate.
        warmup = LGBMClassifier(**params)
        warmup.fit(
            X_fit,
            y_fit,
            eval_set=[(X_fit, y_fit), (X_held, y_held)],
            early_stopping_rounds=100,
            verbose=100,
        )

        # Stage 2: continue from stage 1 with a 10x smaller learning rate.
        slow_params = params.copy()
        slow_params["learning_rate"] = params["learning_rate"] * 0.1
        refined = LGBMClassifier(**slow_params)
        refined.fit(
            X_fit,
            y_fit,
            eval_set=[(X_fit, y_fit), (X_held, y_held)],
            early_stopping_rounds=100,
            verbose=100,
            init_model=warmup,
        )

        oof_proba[held_rows] = refined.predict_proba(X_held)
        test_proba += refined.predict_proba(X_test) / n_fold

    overall_loss = log_loss(y, oof_proba)
    print(f"Log Loss Score: {overall_loss:.5f}")

    return oof_proba, test_proba


# SVM
def stratified_kfold_svm(
    params: Dict[str, Union[int, float, str]],
    n_fold: int,
    X: pd.DataFrame,
    y: pd.DataFrame,
    X_test: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray]:
    """Train an SVC with stratified K-fold CV and collect its predictions.

    Args:
        params: Keyword arguments for ``SVC``; ``predict_proba`` is called on
            the fitted model, so probability estimates must be enabled.
        n_fold: Number of stratified folds.
        X: Training features.
        y: Training labels.
        X_test: Test features, scored by every fold model.

    Returns:
        ``(svm_oof, svm_preds)``: out-of-fold probabilities for ``X`` and the
        fold-averaged probabilities for ``X_test`` (3 columns each).
    """
    oof_proba = np.zeros((X.shape[0], 3))
    test_proba = np.zeros((X_test.shape[0], 3))
    cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

    for fold_no, (fit_rows, held_rows) in enumerate(cv.split(X, y)):
        print(f"============ Fold {fold_no} ============\n")

        classifier = SVC(**params)
        classifier.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        oof_proba[held_rows] = classifier.predict_proba(X.iloc[held_rows])
        # Each fold contributes an equal share to the test prediction.
        test_proba += classifier.predict_proba(X_test) / n_fold

    overall_loss = log_loss(y, oof_proba)
    print(f"Log Loss Score: {overall_loss:.5f}")

    return oof_proba, test_proba


# Random Forest
def stratified_kfold_rf(
    params: Dict[str, Union[int, float, str, bool]],
    n_fold: int,
    X: pd.DataFrame,
    y: pd.DataFrame,
    X_test: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray]:
    """Train a random forest with stratified K-fold CV and collect predictions.

    Prints the log loss of every fold and, at the end, the log loss over all
    out-of-fold predictions.

    Args:
        params: Keyword arguments for ``RandomForestClassifier``.
        n_fold: Number of stratified folds.
        X: Training features.
        y: Training labels.
        X_test: Test features, scored by every fold model.

    Returns:
        ``(rf_oof, rf_preds)``: out-of-fold probabilities for ``X`` and the
        fold-averaged probabilities for ``X_test`` (3 columns each).
    """
    oof_proba = np.zeros((X.shape[0], 3))
    test_proba = np.zeros((X_test.shape[0], 3))
    cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

    for fold_no, (fit_rows, held_rows) in enumerate(cv.split(X, y)):
        print(f"============ Fold {fold_no} ============\n")
        y_held = y.iloc[held_rows]

        forest = RandomForestClassifier(**params)
        forest.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        oof_proba[held_rows] = forest.predict_proba(X.iloc[held_rows])
        test_proba += forest.predict_proba(X_test) / n_fold

        # Per-fold score on the rows this model never saw.
        fold_loss = log_loss(y_held, oof_proba[held_rows])
        print(f"Log Loss Score: {fold_loss:.5f}")

    overall_loss = log_loss(y, oof_proba)
    print(f"Log Loss Score: {overall_loss:.5f}")

    return oof_proba, test_proba

### LGBM Train

# LightGBM hyper-parameters selected for this task.
lgbm_params = dict(
    objective='multiclass',
    boosting_type='gbdt',
    eval_metric='logloss',
    n_estimators=10000,
    early_stopping_round=100,
    max_depth=-1,
    max_bin=255,
    boost_from_average=False,
    bagging_freq=1,
    min_data_in_leaf=40,
    learning_rate=0.02272,
    num_leaves=64,
    feature_fraction=0.89387,
    bagging_fraction=0.76326,
    seed=2018,
    verbose=-1,
    n_jobs=-1,
)

# 10-fold stratified CV; returns out-of-fold and averaged test probabilities.
lgbm_oof, lgbm_preds = stratified_kfold_lgbm(
    params=lgbm_params, n_fold=10, X=X, y=y, X_test=Test
)

### SVM Train

# SVC hyper-parameters; probability=True is required for predict_proba.
svm_params = dict(
    kernel='rbf',
    random_state=42,
    C=10.0,
    gamma=0.1,
    probability=True,
)

# 5-fold stratified CV; returns out-of-fold and averaged test probabilities.
svm_oof, svm_preds = stratified_kfold_svm(
    params=svm_params, n_fold=5, X=X, y=y, X_test=Test
)

### Random Forest Train

# Random-forest hyper-parameters selected for this task.
rf_params = dict(
    criterion="entropy",
    n_estimators=300,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features="sqrt",
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

# 5-fold stratified CV; returns out-of-fold and averaged test probabilities.
rf_oof, rf_preds = stratified_kfold_rf(
    params=rf_params, n_fold=5, X=X, y=y, X_test=Test
)

## Stacking Ensemble

In [10]:
# Hyper-parameters for the estimators used in the stacking comparison below.

# LightGBM
lgbm_params = {
    "objective": "multiclass",
    "boosting_type": "gbdt",
    "eval_metric": "logloss",
    "n_estimators": 10000,
    "early_stopping_round": 100,
    "max_depth": -1,
    "max_bin": 255,
    "boost_from_average": False,
    "bagging_freq": 1,
    "min_data_in_leaf": 40,
    "learning_rate": 0.02272,
    "num_leaves": 64,
    "feature_fraction": 0.89387,
    "bagging_fraction": 0.76326,
    "seed": 2018,
    "verbose": -1,
    "n_jobs": -1,
}

# Support vector classifier (probability estimates enabled)
svc_params = {
    "kernel": "rbf",
    "random_state": 42,
    "C": 10.0,
    "gamma": 0.1,
    "probability": True,
}

# Random forest
rf_params = {
    "criterion": "entropy",
    "n_estimators": 300,
    "min_samples_split": 10,
    "min_samples_leaf": 2,
    "max_features": "sqrt",
    "oob_score": True,
    "random_state": 42,
    "n_jobs": -1,
}

In [12]:
# get a stacking ensemble of models
def get_stacking():
    """Build the stacking ensemble evaluated in this notebook.

    Base learners are an SVC and a LightGBM classifier configured from the
    notebook-level ``svc_params`` / ``lgbm_params``; a default
    ``CatBoostClassifier`` is the meta learner. Base-learner predictions for
    the meta learner are produced with 5-fold cross-validation.

    Returns:
        An unfitted ``StackingClassifier``.
    """
    # Random forest (rf_params) is currently disabled as a base learner.
    base_learners = [
        ('svc', SVC(**svc_params)),
        ('lgb', LGBMClassifier(**lgbm_params)),
    ]
    meta_learner = CatBoostClassifier()
    return StackingClassifier(
        estimators=base_learners,
        final_estimator=meta_learner,
        cv=5,
    )
 
# get a list of models to evaluate
def get_models():
    """Return the candidate models to compare, keyed by a short name.

    The dict holds the two standalone base learners followed by the stacking
    ensemble built by ``get_stacking``.
    """
    # Random forest (rf_params) is currently excluded from the comparison.
    return {
        'svc': SVC(**svc_params),
        'lgb': LGBMClassifier(**lgbm_params),
        'stacking': get_stacking(),
    }
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score ``model`` with repeated stratified cross-validation.

    Uses 5 folds repeated 3 times (seeded) and the ``neg_log_loss`` scorer;
    fitting errors are raised instead of being recorded as a score.

    Returns:
        The array of per-split scores returned by ``cross_val_score``.
    """
    repeated_folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='neg_log_loss',
        cv=repeated_folds,
        n_jobs=-1,
        error_score='raise',
    )

# get the models to evaluate
models = get_models()
# Score every candidate; keep the per-split scores and names for plotting.
results, names = [], []
for name, model in tqdm(models.items()):
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # Mean and standard deviation of the score across all CV splits.
    print(f'>{name} {np.mean(scores):.3f} ({np.std(scores):.3f})')


  0%|                                                     | 0/3 [21:12<?, ?it/s]


NameError: name 'std' is not defined

# plot model performance for comparison
# matplotlib.pyplot is imported as `plt` at the top of the file; the bare name
# `pyplot` was never defined and raised a NameError.
# One box per model, built from its cross-validation scores; means are marked.
plt.boxplot(results, labels=names, showmeans=True)
plt.show()