# Model Stacking
[Logistic Regression, KNN, SVC, Decision Tree, Random Forest, Naive Bayes]

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import accuracy_score

import joblib

from vecstack import StackingTransformer

from itertools import combinations

import warnings
warnings.filterwarnings("ignore")

## Load Data

In [11]:
# Load the Titanic table and the persisted preprocessing pipeline.
data_df = pd.read_csv('titanic_data.csv')
# NOTE(review): joblib.load unpickles arbitrary objects; only load pipeline
# files produced by this project, never files from an untrusted source.
# The file name is a plain literal: the former f-prefix had no placeholders.
pipe = joblib.load('titanic_pipe.pkl')

# Preview the first rows as the cell output.
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22,1,0,7.25,6,0,1
1,2,1,1,3,1,38,1,0,71.2833,3,1,1
2,3,1,3,2,1,26,0,0,7.925,6,0,0
3,4,1,1,3,1,35,1,0,53.1,3,0,1
4,5,0,3,1,0,35,0,0,8.05,5,0,0


In [12]:
# Build the feature matrix with the loaded pipeline.
# NOTE(review): the pipeline is fitted on every row before the train/test
# split, so anything it learns also sees the test rows — confirm this is
# acceptable for the reported scores.
titanic_X = pipe.fit_transform(data_df)

# 1-D label vector of shape (n_samples,). The former (n_samples, 1) column
# vector was only accepted by the estimators through a conversion warning,
# which this notebook silences globally.
titanic_Y = data_df['Survived'].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    titanic_X, titanic_Y, test_size=0.3, random_state=0)

## Model Fitting

In [13]:
from sklearn import linear_model # LogisticRegression()
from sklearn import neighbors # KNeighborsClassifier()
from sklearn.svm import SVC # SVC(gamma='auto')
from sklearn import tree # DecisionTreeClassifier()
from sklearn import ensemble # RandomForestClassifier()
from sklearn import naive_bayes # BernoulliNB()
from sklearn import cluster # KMeans(random_state=0)
from sklearn import decomposition # PCA(n_components=1)

In [14]:
# Candidate base learners for the stacking search as (name, estimator) pairs.
# The dict keeps insertion order, so the resulting list has the same order as
# the entries below.
estimators = list({
    'Logistic': linear_model.LogisticRegression(),
    'KNN': neighbors.KNeighborsClassifier(),
    'SVM': SVC(gamma='auto'),
    'DecisionTree': tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    'RandomForest': ensemble.RandomForestClassifier(
        n_estimators=100, max_depth=4, n_jobs=-1, random_state=0),
    'NaiveBayes': naive_bayes.BernoulliNB(),
}.items())

In [15]:
# Every subset of the base learners that contains at least two of them,
# smallest subsets first.
comb = [
    subset
    for size in range(2, len(estimators) + 1)
    for subset in combinations(estimators, size)
]

In [16]:
# One StackingTransformer per learner subset. All of them share the same
# settings: classification, accuracy metric, 4 stratified shuffled folds,
# seed 0, no progress output.
stacks = []
for subset in comb:
    stacks.append(
        StackingTransformer(
            estimators=subset,
            regression=False,
            metric=accuracy_score,
            n_folds=4,
            stratified=True,
            shuffle=True,
            random_state=0,
            verbose=0,
        )
    )

In [17]:
# Score every candidate stack: fit its base learners on the training split,
# turn both splits into stacked features, then train a logistic-regression
# meta-learner on the stacked training features.
# NOTE(review): the accuracy is measured on the held-out split, and the same
# numbers are used afterwards to pick the best stack, so the winning score is
# optimistic.
scores = []

for candidate in stacks:
    fitted = candidate.fit(x_train, y_train)

    meta_train = fitted.transform(x_train)
    meta_test = fitted.transform(x_test)

    meta_model = linear_model.LogisticRegression().fit(meta_train, y_train)
    test_accuracy = accuracy_score(y_test, meta_model.predict(meta_test))
    scores.append((test_accuracy, fitted))

In [18]:
# Display the (accuracy, stack) pair with the highest accuracy. The sort is
# stable, so on ties the pair that appears last in `scores` is shown.
ranked_scores = sorted(scores, key=lambda pair: pair[0])
ranked_scores[-1]

(0.8432835820895522,
 StackingTransformer(estimators=(('Logistic', LogisticRegression()),
                                 ('SVM', SVC(gamma='auto')),
                                 ('DecisionTree',
                                  DecisionTreeClassifier(max_depth=4,
                                                         random_state=0)),
                                 ('RandomForest',
                                  RandomForestClassifier(max_depth=4, n_jobs=-1,
                                                         random_state=0)),
                                 ('NaiveBayes', BernoulliNB())),
                     metric=<function accuracy_score at 0x12f5d95e0>,
                     regression=False, shuffle=True, stratified=True))

## ========== 테스트 코드 ==========

## Load Data
3. 나이 및 선실 결측치 행 제거

4. 나이 및 선실 결측치 평균값 처리
6. 나이 결측치 행 제거 (One-Hot Encoding 적용 안함)

In [2]:
from sklearn.preprocessing import StandardScaler

data_df = pd.read_csv('titanic_data.csv')

# Keep only rows with a positive age (NaN compares False, so rows with a
# missing age are dropped as well) and truncate the age to a whole number.
data_df_age = data_df[data_df['Age'] > 0].copy()
data_df_age['Age'] = data_df_age['Age'].astype(int)

# 1-D label vector of shape (n_samples,), the shape scikit-learn estimators
# expect; a column vector is only accepted through a conversion warning.
y_data_age = data_df_age['Survived'].to_numpy()

# Remove the label and the columns that are not used as features.
data_df_age = data_df_age.drop(columns=['PassengerId', 'Cabin', 'Survived'])
# 'Unnamed: 0' (0, 1, 2, ... in the preview of this CSV) looks like a row index
# that was saved into the file. Like PassengerId it is an identifier, not a
# feature, so it is removed too. errors='ignore' keeps the cell working if the
# CSV is re-exported without that column.
data_df_age = data_df_age.drop(columns=['Unnamed: 0'], errors='ignore')

# Standardise the age column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
scaler = StandardScaler()
data_df_age['Age'] = np.ravel(scaler.fit_transform(data_df_age[['Age']]))

x_data_age = np.array(data_df_age)

# Load the two alternative datasets and the pipeline stored for each of them.
# NOTE(review): data files 3 and 4 are paired with pipeline files 2 and 3 —
# confirm that this numbering offset is intentional.
df_list = [pd.read_csv(f'titanic_data_{i}.csv') for i in [3, 4]]
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
pipe_list = [joblib.load(f'titanic_pipe_{i}.pkl') for i in [2, 3]]

# One (features, labels) pair per dataset.
data_list = list()

# The loop variables use their own names so the module-level `pipe` and
# `titanic_X` / `titanic_Y` from other cells are not overwritten.
for frame, frame_pipe in zip(df_list, pipe_list):
    frame_pipe.fit(frame)
    features = frame_pipe.transform(frame)
    # 1-D label vector of shape (n_samples,) instead of an (n_samples, 1) column.
    labels = frame['Survived'].to_numpy()
    data_list.append((features, labels))

# The age-filtered dataset goes last, giving the dataset order 3, 4, 6.
data_list.append((x_data_age, y_data_age))

# Split every dataset 70/30 with the same seed and keep the four parts as a
# (x_train, x_test, y_train, y_test) tuple, in dataset order.
tdata_list = []

for features, labels in data_list:
    parts = model_selection.train_test_split(
        features, labels, test_size=0.3, random_state=0)
    tdata_list.append(tuple(parts))

## Model Stacking

In [3]:
from sklearn import linear_model # LogisticRegression()
from sklearn import neighbors # KNeighborsClassifier()
from sklearn.svm import SVC # SVC(gamma='auto')
from sklearn import tree # DecisionTreeClassifier()
from sklearn import ensemble # RandomForestClassifier()
from sklearn import naive_bayes # BernoulliNB()

In [4]:
# Base learners for dataset 3 as (name, estimator) pairs; the dict keeps
# insertion order, so the list order matches the entries below.
estimators_3 = list({
    'DecisionTree': tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    'Logistic': linear_model.LogisticRegression(),
    'KNN': neighbors.KNeighborsClassifier(n_neighbors=19),
    'SVM': SVC(gamma='auto'),
    'RandomForest': ensemble.RandomForestClassifier(
        n_estimators=100, max_depth=4, n_jobs=-1, random_state=0),
    'NaiveBayes': naive_bayes.BernoulliNB(),
}.items())

In [5]:
# Base learners for dataset 4 as (name, estimator) pairs; the dict keeps
# insertion order, so the list order matches the entries below.
estimators_4 = list({
    'KNN': neighbors.KNeighborsClassifier(n_neighbors=27),
    'Logistic': linear_model.LogisticRegression(),
    'SVM': SVC(gamma='auto'),
    'DecisionTree': tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    'RandomForest': ensemble.RandomForestClassifier(
        n_estimators=100, max_depth=4, n_jobs=-1, random_state=0),
    'NaiveBayes': naive_bayes.BernoulliNB(),
}.items())

In [6]:
# Base learners for dataset 6 as (name, estimator) pairs; the dict keeps
# insertion order, so the list order matches the entries below. The random
# forest here is limited to depth 3.
estimators_6 = list({
    'Logistic': linear_model.LogisticRegression(),
    'KNN': neighbors.KNeighborsClassifier(n_neighbors=9),
    'SVM': SVC(gamma='auto'),
    'DecisionTree': tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    'RandomForest': ensemble.RandomForestClassifier(
        n_estimators=100, max_depth=3, n_jobs=-1, random_state=0),
    'NaiveBayes': naive_bayes.BernoulliNB(),
}.items())

In [7]:
# Stacking transformer for dataset 3: classification scored by accuracy over
# 4 stratified, shuffled folds (seed 0); verbose=1 prints a progress report.
stack_3 = StackingTransformer(
    estimators=estimators_3,
    regression=False,
    metric=accuracy_score,
    n_folds=4,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=1,
)

In [8]:
# Stacking transformer for dataset 4: classification scored by accuracy over
# 4 stratified, shuffled folds (seed 0); verbose=1 prints a progress report.
stack_4 = StackingTransformer(
    estimators=estimators_4,
    regression=False,
    metric=accuracy_score,
    n_folds=4,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=1,
)

In [9]:
# Stacking transformer for dataset 6: classification scored by accuracy over
# 4 stratified, shuffled folds (seed 0); verbose=1 prints a progress report.
stack_6 = StackingTransformer(
    estimators=estimators_6,
    regression=False,
    metric=accuracy_score,
    n_folds=4,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=1,
)

In [11]:
# Fit each dataset's stacking transformer on its training split and keep the
# stacked (train, test) feature matrices, in dataset order 3, 4, 6.
s_list = []

for split, stacker in zip(tdata_list, (stack_3, stack_4, stack_6)):
    x_train, x_test, y_train, y_test = split
    fitted = stacker.fit(x_train, y_train)
    # Train features are transformed first, then test features.
    s_list.append((fitted.transform(x_train), fitted.transform(x_test)))

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
variant:      [A]
n_estimators: [6]

estimator  0: [DecisionTree: DecisionTreeClassifier]
    MEAN:     [0.68181818] + [0.08767784]

estimator  1: [Logistic: LogisticRegression]
    MEAN:     [0.73603220] + [0.06568098]

estimator  2: [KNN: KNeighborsClassifier]
    MEAN:     [0.68963068] + [0.05087429]

estimator  3: [SVM: SVC]
    MEAN:     [0.70572917] + [0.03157384]

estimator  4: [RandomForest: RandomForestClassifier]
    MEAN:     [0.69767992] + [0.04600058]

estimator  5: [NaiveBayes: BernoulliNB]
    MEAN:     [0.72774621] + [0.09097935]

Train set was detected.
Transforming...

estimator  0: [DecisionTree: DecisionTreeClassifier]
    DONE

estimator  1: [Logistic: LogisticRegression]
    DONE

estimator  2: [KNN: KNeighborsClassifier]
    DONE

estimator  3: [SVM: SVC]
    DONE

estimator  4: [RandomForest: RandomForestClassifier]
    DONE

estimator  5: [NaiveBayes: BernoulliNB]
    DONE

Transfor

In [12]:
# Meta-learners trained on the stacked features, one per dataset (3, 4, 6).
model_3, model_4, model_6 = (
    tree.DecisionTreeClassifier(max_depth=4, random_state=0),
    neighbors.KNeighborsClassifier(n_neighbors=27),
    linear_model.LogisticRegression(),
)

In [14]:
# Train each meta-learner on its stacked training features and print its
# accuracy on the stacked test features, labelled with the dataset number.
model_list = []
labelled_runs = zip([3, 4, 6], tdata_list, s_list, [model_3, model_4, model_6])

for i, split, stacked, meta in labelled_runs:
    y_train, y_test = split[2], split[3]
    S_train, S_test = stacked
    fitted_meta = meta.fit(S_train, y_train)
    test_accuracy = accuracy_score(y_test, fitted_meta.predict(S_test))
    model_list.append(fitted_meta)
    print(f'Final prediction score {i}: [%.8f]' % test_accuracy)

Final prediction score 3: [0.80357143]
Final prediction score 4: [0.80970149]
Final prediction score 6: [0.82790698]


## 모든 조합 (Data 3)

In [28]:
from itertools import combinations

# Every subset of the dataset-3 base learners with at least two members,
# smallest subsets first.
est_3_list = [
    subset
    for size in range(2, len(estimators_3) + 1)
    for subset in combinations(estimators_3, size)
]

In [23]:
# One silent StackingTransformer per dataset-3 learner subset, all with the
# same settings: accuracy metric, 4 stratified shuffled folds, seed 0.
stack_list_3 = []
for subset in est_3_list:
    stack_list_3.append(
        StackingTransformer(
            estimators=subset,
            regression=False,
            metric=accuracy_score,
            n_folds=4,
            stratified=True,
            shuffle=True,
            random_state=0,
            verbose=0,
        )
    )

In [29]:
# Dataset 3: score every learner subset, using a depth-4 decision tree as the
# meta-learner on the stacked features.
# NOTE(review): the accuracy comes from the test split, and the same numbers
# are used afterwards to pick the best stack, so the winning score is optimistic.
x_train, x_test, y_train, y_test = tdata_list[0]
score_list_3 = []

for candidate in stack_list_3:
    fitted = candidate.fit(x_train, y_train)
    meta_train = fitted.transform(x_train)
    meta_test = fitted.transform(x_test)
    meta_model = tree.DecisionTreeClassifier(
        max_depth=4, random_state=0).fit(meta_train, y_train)
    test_accuracy = accuracy_score(y_test, meta_model.predict(meta_test))
    score_list_3.append((test_accuracy, fitted))

In [30]:
# Display the best (accuracy, stack) pair for dataset 3. The sort is stable,
# so on ties the pair that appears last in `score_list_3` is shown.
ranked_3 = sorted(score_list_3, key=lambda pair: pair[0])
ranked_3[-1]

(0.8571428571428571,
 StackingTransformer(estimators=(('DecisionTree',
                                  DecisionTreeClassifier(max_depth=4,
                                                         random_state=0)),
                                 ('Logistic', LogisticRegression()),
                                 ('KNN', KNeighborsClassifier(n_neighbors=19)),
                                 ('RandomForest',
                                  RandomForestClassifier(max_depth=4, n_jobs=-1,
                                                         random_state=0)),
                                 ('NaiveBayes', BernoulliNB())),
                     metric=<function accuracy_score at 0x1678c55e0>,
                     regression=False, shuffle=True, stratified=True))

## 모든 조합 (Data 4)

In [35]:
from itertools import combinations

# Every subset of the dataset-4 base learners with at least two members,
# smallest subsets first.
est_4_list = [
    subset
    for size in range(2, len(estimators_4) + 1)
    for subset in combinations(estimators_4, size)
]

In [36]:
# One silent StackingTransformer per dataset-4 learner subset, all with the
# same settings: accuracy metric, 4 stratified shuffled folds, seed 0.
stack_list_4 = []
for subset in est_4_list:
    stack_list_4.append(
        StackingTransformer(
            estimators=subset,
            regression=False,
            metric=accuracy_score,
            n_folds=4,
            stratified=True,
            shuffle=True,
            random_state=0,
            verbose=0,
        )
    )

In [37]:
# Dataset 4: score every learner subset, using a 27-neighbour KNN classifier
# as the meta-learner on the stacked features.
# NOTE(review): the accuracy comes from the test split, and the same numbers
# are used afterwards to pick the best stack, so the winning score is optimistic.
x_train, x_test, y_train, y_test = tdata_list[1]
score_list_4 = []

for candidate in stack_list_4:
    fitted = candidate.fit(x_train, y_train)
    meta_train = fitted.transform(x_train)
    meta_test = fitted.transform(x_test)
    meta_model = neighbors.KNeighborsClassifier(n_neighbors=27).fit(meta_train, y_train)
    test_accuracy = accuracy_score(y_test, meta_model.predict(meta_test))
    score_list_4.append((test_accuracy, fitted))

In [38]:
# Display the best (accuracy, stack) pair for dataset 4. The sort is stable,
# so on ties the pair that appears last in `score_list_4` is shown.
ranked_4 = sorted(score_list_4, key=lambda pair: pair[0])
ranked_4[-1]

(0.8171641791044776,
 StackingTransformer(estimators=(('Logistic', LogisticRegression()),
                                 ('SVM', SVC(gamma='auto')),
                                 ('DecisionTree',
                                  DecisionTreeClassifier(max_depth=4,
                                                         random_state=0)),
                                 ('RandomForest',
                                  RandomForestClassifier(max_depth=4, n_jobs=-1,
                                                         random_state=0))),
                     metric=<function accuracy_score at 0x1678c55e0>,
                     regression=False, shuffle=True, stratified=True))

## 모든 조합 (Data 6)

In [39]:
from itertools import combinations

# Every subset of the dataset-6 base learners with at least two members,
# smallest subsets first.
est_6_list = [
    subset
    for size in range(2, len(estimators_6) + 1)
    for subset in combinations(estimators_6, size)
]

In [40]:
# One silent StackingTransformer per dataset-6 learner subset, all with the
# same settings: accuracy metric, 4 stratified shuffled folds, seed 0.
stack_list_6 = []
for subset in est_6_list:
    stack_list_6.append(
        StackingTransformer(
            estimators=subset,
            regression=False,
            metric=accuracy_score,
            n_folds=4,
            stratified=True,
            shuffle=True,
            random_state=0,
            verbose=0,
        )
    )

In [41]:
# Dataset 6: score every learner subset, using logistic regression as the
# meta-learner on the stacked features.
# NOTE(review): the accuracy comes from the test split, and the same numbers
# are used afterwards to pick the best stack, so the winning score is optimistic.
x_train, x_test, y_train, y_test = tdata_list[2]
score_list_6 = []

for candidate in stack_list_6:
    fitted = candidate.fit(x_train, y_train)
    meta_train = fitted.transform(x_train)
    meta_test = fitted.transform(x_test)
    meta_model = linear_model.LogisticRegression().fit(meta_train, y_train)
    test_accuracy = accuracy_score(y_test, meta_model.predict(meta_test))
    score_list_6.append((test_accuracy, fitted))

In [42]:
# Display the best (accuracy, stack) pair for dataset 6. The sort is stable,
# so on ties the pair that appears last in `score_list_6` is shown.
ranked_6 = sorted(score_list_6, key=lambda pair: pair[0])
ranked_6[-1]

(0.8372093023255814,
 StackingTransformer(estimators=(('KNN', KNeighborsClassifier(n_neighbors=9)),
                                 ('SVM', SVC(gamma='auto')),
                                 ('RandomForest',
                                  RandomForestClassifier(max_depth=3, n_jobs=-1,
                                                         random_state=0))),
                     metric=<function accuracy_score at 0x1678c55e0>,
                     regression=False, shuffle=True, stratified=True))