# Stacking Ensemble - CODE

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the data-loading cell reads from "../data/..." rather than from
# the mounted Drive path — confirm the working directory before running.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.1 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.5-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 36.6 MB/s 
[?25hCollecting importlib-metadata<5.0.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 10.4 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.9.

## Setup

In [3]:
from typing import Dict, Tuple, Union, List
from tqdm import tqdm
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, warnings
from itertools import combinations
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import RepeatedStratifiedKFold
import optuna
import sklearn

In [4]:
# Read the train/test CSVs and drop the 'Unnamed: 0' / 'index' columns.
# (Presumably leftover row indices from an earlier export — confirm.)
train = pd.read_csv("../data/new_train.csv").drop(columns=['Unnamed: 0'])
test = pd.read_csv("../data/new_test.csv").drop(columns=['index'])

In [5]:
# Show column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23392 entries, 0 to 23391
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             23392 non-null  object 
 1   car                23392 non-null  object 
 2   reality            23392 non-null  object 
 3   income_total       23392 non-null  float64
 4   income_type        23392 non-null  object 
 5   edu_type           23392 non-null  object 
 6   family_type        23392 non-null  object 
 7   house_type         23392 non-null  object 
 8   work_phone         23392 non-null  int64  
 9   home_phone         23392 non-null  int64  
 10  email              23392 non-null  int64  
 11  occup_type         23392 non-null  object 
 12  family_size        23392 non-null  float64
 13  begin_month        23392 non-null  int64  
 14  credit             23392 non-null  float64
 15  days_unemployed    23392 non-null  int64  
 16  income_unemployed  233

## Numerical Scaling

In [6]:
# Standard Scaler
# Every non-object column is scaled, except the target 'credit'.
num_col = [name for name, dtype in train.dtypes.items() if dtype != "object"]
num_col.remove('credit')

# Fit the scaler on the training frame only, then apply the same fitted
# scaler to both the training and the test frame.
scaler = StandardScaler().fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])
test[num_col] = scaler.transform(test[num_col])

## Categorical Encoding

In [7]:
# OrdinalEncoding
from sklearn.preprocessing import OrdinalEncoder

# Object-dtype columns are the ones that get encoded.
cat_col = [name for name, dtype in train.dtypes.items() if dtype == "object"]

# Fit the encoder on the training frame, then transform both frames with it.
# NOTE(review): 'credit' is still passed as `y`, as in the original call;
# OrdinalEncoder is documented to ignore `y` — confirm and drop it if so.
Encoder = OrdinalEncoder()
Encoder.fit(train[cat_col], train['credit'])
train[cat_col] = Encoder.transform(train[cat_col])
test[cat_col] = Encoder.transform(test[cat_col])

## Train Model - Stacking Ensemble

In [8]:
"""
 THIS HYPERPARAMETERS ARE NOT USED:

lgb_best = {'objective': 'multiclass',
            'boosting_type': 'gbdt',
            'eval_metric' : 'logloss',    
            'n_estimators': 10000,
            'early_stopping_round': 100, 
            'max_depth': -1,
            'max_bin': 255,
            'boost_from_average' : False,
            'bagging_freq' : 1,
            'min_data_in_leaf': 40,    
            'learning_rate': 0.02272,    
            'num_leaves': 64,    
            'feature_fraction': 0.89387,
            'bagging_fraction': 0.76326,        
            'seed': 2018,
            'verbose': -1,
            'n_jobs': -1,  
            }
"""

# These hyperparameters come from separate runs of each algorithm:
#
#   CatBoost -> tuned with Optuna
#   RF, LGB  -> tuned manually
rf_best = dict(
    criterion="entropy",
    n_estimators=300,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features="sqrt",
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

In [9]:
# Separate the target column 'credit' from the features; keep a copy of the test frame.
X, y = train.drop(columns="credit"), train["credit"]
X_test = test.copy()

In [10]:
def stratified_kfold_cv(model, n_fold: int, X: pd.DataFrame, y: pd.Series,
                        X_test: pd.DataFrame,
                        random_state: int = 42) -> Tuple[np.ndarray, np.ndarray]:
    """Run stratified K-fold CV and collect out-of-fold and test probabilities.

    For every fold the model is re-fitted on the training part, its class
    probabilities for the validation part are stored in the out-of-fold array,
    and its class probabilities for ``X_test`` are averaged over all folds.
    The log loss of each fold and of the complete out-of-fold array is printed.

    Parameters
    ----------
    model
        Classifier exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
        fitted in place, so after the call it holds the last fold's fit.
    n_fold : int
        Number of stratified folds.
    X : pd.DataFrame
        Training features; rows are selected with ``.iloc``.
    y : pd.Series
        Training target; rows are selected with ``.iloc``.
    X_test : pd.DataFrame
        Features to predict with every fold's model.
    random_state : int, default 42
        Seed for the shuffled fold assignment.

    Returns
    -------
    (oof, preds) : tuple of np.ndarray
        ``oof`` has shape ``(len(X), n_classes)`` and holds the out-of-fold
        probabilities; ``preds`` has shape ``(len(X_test), n_classes)`` and
        holds the fold-averaged test probabilities.
    """
    # Size the probability arrays from the labels present in the full target,
    # instead of assuming exactly three classes.
    classes = np.unique(y)
    n_classes = len(classes)

    # Declaring Stratified K-Fold:
    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=random_state)

    # Initializing arrays:
    oof = np.zeros((X.shape[0], n_classes))
    preds = np.zeros((X_test.shape[0], n_classes))

    # Main loop:
    for fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        print(f"============ Fold {fold} ============")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # Model fit:
        model.fit(X_train, y_train)

        oof[valid_idx] = model.predict_proba(X_valid)  # validation-fold class probabilities
        preds += model.predict_proba(X_test) / n_fold  # running average over folds

        # `labels` keeps the score well-defined even if a validation fold
        # happens to miss one of the classes.
        fold_score = log_loss(y_valid, oof[valid_idx], labels=classes)
        print(f"Log Loss Score: {fold_score:.5f}")

    log_score = log_loss(y, oof, labels=classes)
    print(f"\nLog Loss Score: {log_score:.5f}")

    return oof, preds

In [12]:
from sklearn.linear_model import LogisticRegression

# Extra models
# NOTE(review): GradientBoostingClassifier is imported but not used in this cell.
from sklearn.ensemble import GradientBoostingClassifier

# Setting up the estimators: the base learners of the stack.
# Only the random forest uses tuned hyperparameters (rf_best); LightGBM runs
# with its defaults and logistic regression only raises max_iter.
estimators = [
    ("rf", RandomForestClassifier(**rf_best)),
    ("lgb", LGBMClassifier()),
    ("LR", LogisticRegression(max_iter=4000)),
]

# Stacking classifier: an XGBoost model with default settings is the final estimator.
clf = StackingClassifier(
    estimators=estimators, final_estimator=XGBClassifier()
)

# Apply stratified K-Fold.
# Pass X_test (the copy of `test` prepared together with X and y) instead of
# the raw `test` frame, so the call uses the inputs defined for modelling.
oof, preds = stratified_kfold_cv(clf, n_fold=10, X=X, y=y, X_test=X_test)

Log Loss Score: 0.67848
Log Loss Score: 0.67854
Log Loss Score: 0.65776
Log Loss Score: 0.65436
Log Loss Score: 0.68177
Log Loss Score: 0.66122
Log Loss Score: 0.69934
Log Loss Score: 0.67799
Log Loss Score: 0.67192
Log Loss Score: 0.68849

Log Loss Score: 0.67499
