# Model Stacking
[Logistic Regression, KNN, SVC, Decision Tree, Random Forest, Naive Bayes, XGBoost, LightGBM, LDA]

In [59]:
import warnings
from itertools import combinations

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from vecstack import StackingTransformer

warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Load the Titanic table and the previously saved preprocessing pipeline.
data_df = pd.read_csv('titanic_data.csv')

# NOTE: joblib.load unpickles arbitrary Python objects -- only load a
# pipeline file that comes from a trusted source.
pipe = joblib.load('titanic_pipe.pkl')

# NOTE(review): the pipeline is fitted on every row, including the rows that
# are later held out as the test split and the PassengerId/Survived columns
# that are dropped before transform. Consider fitting on the training
# features only -- confirm what the pipeline needs before changing this.
pipe.fit(data_df)

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,1,0,6,0,1
1,2,1,1,3,1,2,4,3,1,1
2,3,1,3,2,1,1,1,6,0,0
3,4,1,1,3,1,2,4,3,0,1
4,5,0,3,1,0,2,1,6,0,0


In [3]:
# Features: every column except the passenger identifier and the target.
# drop() returns a new frame, so data_df itself is left untouched.
titanic_X = data_df.drop(columns=['PassengerId', 'Survived'])

# Target as a 1-D label vector. Selecting with [['Survived']] would produce
# an (n, 1) column vector, which scikit-learn estimators only accept after
# flattening it with a DataConversionWarning.
titanic_Y = data_df['Survived'].to_numpy()

# 70/30 hold-out split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    titanic_X, titanic_Y, test_size=0.3, random_state=0)

# Apply the already-fitted preprocessing pipeline to both splits.
x_train = pipe.transform(x_train)
x_test = pipe.transform(x_test)

## Model Fitting

In [5]:
from sklearn import linear_model # LogisticRegression()
from sklearn import neighbors # KNeighborsClassifier()
from sklearn.svm import SVC # SVC(gamma='auto')
from sklearn import tree # DecisionTreeClassifier()
from sklearn import ensemble # RandomForestClassifier()
from sklearn import naive_bayes # BernoulliNB()
from sklearn import cluster # KMeans(random_state=0)
from sklearn import decomposition # PCA(n_components=1)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

### 5-Fold

In [48]:
# Base (level-0) models for the 5-fold stack, as (name, estimator) pairs.
estimator = [
    (
        'Logistic',
        linear_model.LogisticRegression(),
    ),
    (
        'SVC',
        SVC(kernel='rbf', C=100, gamma=0.01),
    ),
    (
        'DecisionTree',
        tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    ),
    (
        'RandomForest',
        ensemble.RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0),
    ),
]

In [49]:
# Classification stack over `estimator`: 5 stratified, shuffled folds,
# scored with accuracy, seeded for reproducibility, no progress output.
stack = StackingTransformer(
    estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [50]:
# Fit the base models inside the stack, then map both splits to the
# stacked (level-1) feature space.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Meta-learner: a depth-4 decision tree trained on the stacked features.
model = tree.DecisionTreeClassifier(random_state=0, max_depth=4).fit(S_train, y_train)
y_pred = model.predict(S_test)

# Hold-out accuracy of the stacked model (shown as the cell output).
accuracy_score(y_test, y_pred)

0.8544776119402985

### 10-Fold

In [56]:
# Base (level-0) models for the 10-fold stack, as (name, estimator) pairs.
estimator = [
    (
        'Logistic',
        linear_model.LogisticRegression(),
    ),
    (
        'KNN',
        neighbors.KNeighborsClassifier(16),
    ),
    (
        'DecisionTree',
        tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    ),
    (
        'NaiveBayes',
        naive_bayes.BernoulliNB(alpha=10),
    ),
]

In [57]:
# Classification stack over `estimator`: 10 stratified, shuffled folds,
# scored with accuracy, seeded for reproducibility, no progress output.
stack = StackingTransformer(
    estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=10,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [58]:
# Fit the base models inside the stack, then map both splits to the
# stacked (level-1) feature space.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Meta-learner: a 100-tree random forest (max depth 6) on the stacked features.
model = ensemble.RandomForestClassifier(
    n_estimators=100, max_depth=6, random_state=0
).fit(S_train, y_train)
y_pred = model.predict(S_test)

# Hold-out accuracy of the stacked model (shown as the cell output).
accuracy_score(y_test, y_pred)

0.8507462686567164

## Stacking 조합 탐색

In [39]:
# Pool of candidate models, as (name, estimator) pairs. Combinations of these
# are tried as base models, and the same pool is reused for the meta-learners.
estimators = [
    (
        'Logistic',
        linear_model.LogisticRegression(),
    ),
    (
        'KNN',
        neighbors.KNeighborsClassifier(16),
    ),
    (
        'SVM',
        SVC(kernel='rbf', C=100, gamma=0.01),
    ),
    (
        'DecisionTree',
        tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    ),
    (
        'RandomForest',
        ensemble.RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0),
    ),
    (
        'NaiveBayes',
        naive_bayes.BernoulliNB(alpha=10),
    ),
    (
        'XGBoost',
        XGBClassifier(n_estimators=200, max_depth=2, seed=1, verbosity=0),
    ),
    (
        'LightGBM',
        LGBMClassifier(n_estimators=200, max_depth=2, random_state=0),
    ),
]

In [40]:
# Every subset of the candidate pool with at least two members, ordered by
# subset size and then in the order itertools.combinations yields them.
comb = [
    subset
    for size in range(2, len(estimators) + 1)
    for subset in combinations(estimators, size)
]

In [41]:
# Options shared by every candidate stack: classification, 10 stratified and
# shuffled folds, accuracy as the metric, fixed seed, no progress output.
stack_options = dict(
    regression=False,
    metric=accuracy_score,
    n_folds=10,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

# One unfitted StackingTransformer per base-model combination in `comb`.
stacks = [StackingTransformer(base_models, **stack_options) for base_models in comb]

In [42]:
# Exhaustive search: for every base-model combination (stack) try every
# candidate meta-learner and record its accuracy on the hold-out split.
# Each entry of `scores` is (accuracy, (fitted_stack, fitted_meta_model)).
scores = list()

# Meta-learner candidates: the base-model pool plus LDA. These instances act
# as unfitted templates only; they are never fitted directly.
models = estimators + [('LDA', LinearDiscriminantAnalysis())]

for stack in tqdm(stacks, desc='Stack'):
    stack = stack.fit(x_train, y_train)

    S_train = stack.transform(x_train)
    S_test = stack.transform(x_test)

    for name, model in models:
        # Fit a fresh clone for each (stack, meta-learner) pair. Refitting
        # the shared template would make every entry stored in `scores`
        # point at the same object, which ends up holding the fit from the
        # last stack instead of the one that produced the stored accuracy.
        fitted_model = clone(model).fit(S_train, y_train)
        y_pred = fitted_model.predict(S_test)
        scores.append((accuracy_score(y_test, y_pred), (stack, fitted_model)))
print('end')


Stack: 100%|██████████| 247/247 [45:23<00:00, 11.03s/it]

end





In [None]:
# Rank all (accuracy, (stack, model)) entries from best to worst and show the
# top one. NOTE: the ranking uses accuracy on the hold-out split, so the
# winning score is an optimistic estimate of performance on unseen data.
ranked_scores = sorted(scores, key=lambda entry: entry[0], reverse=True)
ranked_scores[0]