# Model Stacking
[Logistic Regression, KNN, SVC, Decision Tree, Random Forest, Naive Bayes, XGBoost, LightGBM, LDA]

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import accuracy_score

from tqdm import tqdm
from vecstack import StackingTransformer
from itertools import combinations
import joblib

import warnings
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Read the Case 1 Titanic table and preview its first five rows.
data_df = pd.read_csv(filepath_or_buffer='titanic_data.csv')
data_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,1,0,6,0,1
1,2,1,1,3,1,2,4,3,1,1
2,3,1,3,2,1,1,1,6,0,0
3,4,1,1,3,1,2,4,3,0,1
4,5,0,3,1,0,2,1,6,0,0


In [3]:
# Load the saved preprocessing pipeline for Case 1.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load pipeline files that come from a trusted source.
# (The path is a plain string; the original f-prefix had no placeholders.)
pipe = joblib.load('titanic_pipe.pkl')
pipe

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Pclass', 'Name', 'Sex',
                                                   'Age', 'Fare', 'Cabin',
                                                   'Embarked', 'Family'])]))])

In [4]:
# Features: every column except the passenger identifier and the target.
titanic_X = data_df.drop(columns=['PassengerId', 'Survived'])
# Labels as a 1-D array of shape (n_samples,). The previous (n_samples, 1)
# column vector only worked through an implicit ravel inside the estimators,
# which emits a DataConversionWarning (hidden by the global warnings filter).
titanic_Y = data_df['Survived'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    titanic_X, titanic_Y, test_size=0.3, random_state=0)

# Fit the preprocessing pipeline on the training rows only, then apply the
# fitted transform to the test rows.
x_train = pipe.fit_transform(x_train)
x_test = pipe.transform(x_test)

## Model Fitting

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### 5-Fold

In [6]:
# Stacking members: the combination that reached the highest accuracy when
# every combination was tried in a for loop.
estimator = [
    ('Logistic', LogisticRegression()),
    ('SVC', SVC(C=100, gamma=0.01, kernel='rbf')),
    ('DecisionTree', DecisionTreeClassifier(max_depth=4, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
]

In [7]:
# Stacking transformer over the chosen base models: classification task,
# 5 stratified and shuffled folds, accuracy as the metric.
stack = StackingTransformer(
    estimators=estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [8]:
# Fit the base models, then turn both splits into stacked features.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Meta-learner: a depth-4 decision tree trained on the stacked features.
model = DecisionTreeClassifier(random_state=0, max_depth=4).fit(S_train, y_train)
y_pred = model.predict(S_test)

# Accuracy on the held-out split (displayed as the cell output).
accuracy_score(y_test, y_pred)

0.8544776119402985

### 10-Fold

In [9]:
# Stacking members: the combination that reached the highest accuracy when
# every combination was tried in a for loop.
estimator = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=16)),
    ('DecisionTree', DecisionTreeClassifier(max_depth=4, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
]

In [10]:
# Stacking transformer over the chosen base models: classification task,
# 10 stratified and shuffled folds, accuracy as the metric.
stack = StackingTransformer(
    estimators=estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=10,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [11]:
# Fit the base models, then turn both splits into stacked features.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Meta-learner: a random forest trained on the stacked features.
model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0).fit(S_train, y_train)
y_pred = model.predict(S_test)

# Accuracy on the held-out split (displayed as the cell output).
accuracy_score(y_test, y_pred)

0.8507462686567164

## Stacking Combination

In [12]:
# Pool of candidate base models from which stacking combinations are drawn.
estimators = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=16)),
    ('SVM', SVC(C=100, gamma=0.01, kernel='rbf')),
    ('DecisionTree', DecisionTreeClassifier(max_depth=4, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(n_estimators=200, max_depth=2, seed=0, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=200, max_depth=2, random_state=0)),
]

In [13]:
# Every subset of the candidate pool with at least two members, ordered by
# subset size and then by itertools.combinations order.
comb = [
    subset
    for size in range(2, len(estimators) + 1)
    for subset in combinations(estimators, size)
]

In [14]:
# Settings shared by every candidate stack: classification task, 5 stratified
# and shuffled folds, accuracy as the metric.
stack_options = dict(
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

# One (unfitted) stacking transformer per combination of base models.
stacks = [StackingTransformer(members, **stack_options) for members in comb]

In [16]:
from sklearn.base import clone

# Candidate meta-learners: every base model type plus LDA.
models = estimators + [('LDA', LinearDiscriminantAnalysis())]

# Collected as (test accuracy, (fitted stack, fitted meta-learner)).
scores = []

# NOTE(review): every (stack, meta-learner) pair is scored on the test split
# and the best pair is picked from those scores, so the reported best accuracy
# is optimistically biased -- consider selecting on a validation split instead.
for stack in tqdm(stacks, desc='Stack'):
    stack = stack.fit(x_train, y_train)

    S_train = stack.transform(x_train)
    S_test = stack.transform(x_test)

    for _name, model in models:
        # Fit a fresh clone for each pair. Re-fitting the shared instance would
        # leave every entry in `scores` pointing at the same estimator objects,
        # all last fitted on the final stack's features rather than on the
        # features that produced the recorded score.
        fitted_model = clone(model).fit(S_train, y_train)
        y_pred = fitted_model.predict(S_test)
        scores.append((accuracy_score(y_test, y_pred), (stack, fitted_model)))

Stack: 100%|██████████| 247/247 [04:58<00:00,  1.21s/it]


In [17]:
# Highest-accuracy entry; on ties the earliest entry in `scores` is returned.
max(scores, key=lambda entry: entry[0])

(0.8544776119402985,
 (StackingTransformer(estimators=(('Logistic', LogisticRegression()),
                                  ('SVM', SVC(C=100, gamma=0.01)),
                                  ('RandomForest',
                                   RandomForestClassifier(max_depth=6,
                                                          random_state=0))),
                      metric=<function accuracy_score at 0x13979ab80>, n_folds=5,
                      regression=False, shuffle=True, stratified=True),
  RandomForestClassifier(max_depth=6, random_state=0)))

## Case 2: Numeric Age
수치형 나이 데이터를 모델에 적용한 경우

In [18]:
# Read the Case 2 Titanic table and preview its first five rows.
data_df = pd.read_csv(filepath_or_buffer='titanic_data2.csv')
data_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22,0,6,0,1
1,2,1,1,3,1,38,4,3,1,1
2,3,1,3,2,1,26,1,6,0,0
3,4,1,1,3,1,35,4,3,0,1
4,5,0,3,1,0,35,1,6,0,0


In [19]:
# Load the saved preprocessing pipeline for Case 2.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load pipeline files that come from a trusted source.
# (The path is a plain string; the original f-prefix had no placeholders.)
pipe = joblib.load('titanic_pipe2.pkl')
pipe

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Pclass', 'Name', 'Sex',
                                                   'Fare', 'Cabin', 'Embarked',
                                                   'Family'])]))])

In [20]:
# Features: every column except the passenger identifier and the target.
titanic_X = data_df.drop(columns=['PassengerId', 'Survived'])
# Labels as a 1-D array of shape (n_samples,). The previous (n_samples, 1)
# column vector only worked through an implicit ravel inside the estimators,
# which emits a DataConversionWarning (hidden by the global warnings filter).
titanic_Y = data_df['Survived'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    titanic_X, titanic_Y, test_size=0.3, random_state=0)

# Fit the preprocessing pipeline on the training rows only, then apply the
# fitted transform to the test rows.
x_train = pipe.fit_transform(x_train)
x_test = pipe.transform(x_test)

In [21]:
# Stacking members: the combination that reached the highest accuracy when
# every combination was tried in a for loop.
# NOTE(review): XGBoost uses seed=1 here while the combination-search cell
# uses seed=0 -- confirm whether the difference is intentional.
estimator = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=16)),
    ('SVM', SVC(C=100, gamma=0.01, kernel='rbf')),
    ('DecisionTree', DecisionTreeClassifier(max_depth=4, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
    ('XGBoost', XGBClassifier(n_estimators=200, max_depth=2, seed=1, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=200, max_depth=2, random_state=0)),
]

In [22]:
# Stacking transformer over the chosen base models: classification task,
# 5 stratified and shuffled folds, accuracy as the metric.
stack = StackingTransformer(
    estimators=estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [23]:
# Fit the base models, then turn both splits into stacked features.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Meta-learner: a random forest trained on the stacked features.
model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0).fit(S_train, y_train)
y_pred = model.predict(S_test)

# Accuracy on the held-out split (displayed as the cell output).
accuracy_score(y_test, y_pred)

0.8470149253731343

## Case 3: Drop Cabin
선실 열이 제거된 데이터를 모델에 적용한 경우

In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Case 3 reads the same table as Case 1; 'Cabin' is left out of the encoded
# feature list below.
data_df = pd.read_csv(filepath_or_buffer='titanic_data.csv')

# Columns that are one-hot encoded (categories unseen at fit time are ignored).
categorical_features = ['Pclass', 'Name', 'Sex', 'Age', 'Fare', 'Embarked', 'Family']
categorical_transformer = OneHotEncoder(handle_unknown='ignore', categories='auto')

# Single-step pipeline wrapping the column-wise encoder.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
)
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [25]:
# Features: drop the passenger identifier, the target, and the cabin column.
# (The earlier `titanic_X = data_df.copy()` was overwritten immediately and
# has been removed as dead code.)
titanic_X = data_df.drop(columns=['PassengerId', 'Survived', 'Cabin'])
# Labels as a 1-D array of shape (n_samples,). The previous (n_samples, 1)
# column vector only worked through an implicit ravel inside the estimators,
# which emits a DataConversionWarning (hidden by the global warnings filter).
titanic_Y = data_df['Survived'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    titanic_X, titanic_Y, test_size=0.3, random_state=0)

# Fit the preprocessing pipeline on the training rows only, then apply the
# fitted transform to the test rows.
x_train = pipe.fit_transform(x_train)
x_test = pipe.transform(x_test)

In [26]:
# Stacking members: the combination that reached the highest accuracy when
# every combination was tried in a for loop.
estimator = [
    ('Logistic', LogisticRegression()),
    ('DecisionTree', DecisionTreeClassifier(max_depth=4, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
    ('LightGBM', LGBMClassifier(n_estimators=200, max_depth=2, random_state=0)),
]

In [27]:
# Stacking transformer over the chosen base models: classification task,
# 5 stratified and shuffled folds, accuracy as the metric.
stack = StackingTransformer(
    estimators=estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [28]:
# Fit the base models, then turn both splits into stacked features.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Meta-learner: Bernoulli naive Bayes trained on the stacked features.
model = BernoulliNB(alpha=10).fit(S_train, y_train)
y_pred = model.predict(S_test)

# Accuracy on the held-out split (displayed as the cell output).
accuracy_score(y_test, y_pred)

0.8507462686567164