In [2]:
# 사용할 모델 : KNeighbor, RandomForest, Adaboost, DecisionTree, Logistic(meta model C=10)
# 사용할 데이터 : cancer data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model 선언
kn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100)
ada_clf = AdaBoostClassifier(n_estimators=100)
dt_clf = DecisionTreeClassifier()

# meta model 선언
lg_clf = LogisticRegression(C=10)

In [3]:
# 개별 모델 학습
kn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)

# 개별 모델 예측
kn_pred = kn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)

# 개별 모델 예측 결과 shape: (114, ) --> 최종 메타 학습 데이터 shape: (114, 4)
print(kn_pred.shape)
print(rf_pred.shape)
print(ada_pred.shape)
print(dt_pred.shape)

(114,)
(114,)
(114,)
(114,)


In [4]:
# 개별 모델 예측 결과 array로 합치기
preds = np.array([kn_pred, rf_pred, ada_pred, dt_pred])
print(preds.shape)

# (114, 4)로 만들기 위해서 transpose 취해주기
meta_X_train = np.transpose(preds)
print(meta_X_train.shape)

(4, 114)
(114, 4)


In [5]:
# 개별 모델의 예측 성능 평가
from sklearn.metrics import accuracy_score

print('kn_clf accuaracy: {:.3f}'.format(accuracy_score(kn_pred, y_test)))
print('rf_clf accuaracy: {:.3f}'.format(accuracy_score(rf_pred, y_test)))
print('ada_clf accuaracy: {:.3f}'.format(accuracy_score(ada_pred, y_test)))
print('dt_clf accuaracy: {:.3f}'.format(accuracy_score(dt_pred, y_test)))

kn_clf accuaracy: 0.939
rf_clf accuaracy: 0.965
ada_clf accuaracy: 0.974
dt_clf accuaracy: 0.939


In [6]:
# 메타 model 적용하기

lg_clf.fit(meta_X_train, y_test) # 여기서 meta_X_train은 사실 X_test 데이터를 보고 예측한 것이기 때문에 테스트셋은 이미 오염되었다.
# y_test가 메타 모델의 라벨인 이유는 개별 모델의 X_test 데이터를 predict로 예측한 것의 정답 레이블이기 때문이다.
meta_pred = lg_clf.predict(meta_X_train)
print('meta model accuaracy: {:.3f}'.format(accuracy_score(meta_pred, y_test)))
# 당연히 과적합이 된다.

meta model accuaracy: 0.974


In [7]:
X_train.shape

(455, 30)

In [33]:
# Cross Validation을 이용해 stacking 구현하기 (KFold 사용)
from sklearn.model_selection import KFold

kf = KFold(n_splits=3, shuffle=False)

# 0으로 모델 trainset, testset 초기화하기
train_data = np.zeros((X_train.shape[0], 1)) # shape (M, 1)
print(train_data.shape)
test_data = np.zeros((X_test.shape[0], 3)) # shape (K, kold number) --> 이후 평균 내는 것
print(test_data.shape)

for iter, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr, y_tr = X_train[train_idx], y_train[train_idx]
    val_train = X_train[val_idx]
    
    print("## {} fold start ##".format(iter))
    kn_clf.fit(X_tr, y_tr)
    pred = kn_clf.predict(val_train)
    train_data[val_idx, :] = pred.reshape(-1, 1)
    test_data[:, iter] = kn_clf.predict(X_test)
    
# test data 평균내기
meta_test = np.mean(test_data, axis=1)
print('평균 전 shape: {0} --> 평균 후 shape: {1}'.format(test_data.shape, meta_test.shape))
    

(455, 1)
(114, 3)
## 0 fold start ##
## 1 fold start ##
## 2 fold start ##
평균 전 shape: (114, 3) --> 평균 후 shape: (114,)


In [31]:
# train set을 폴드 별로 나눠서 pred 값 추가하는 것 연습해보기

zeros = np.zeros((X_train.shape[0], 1)) # shape (M, 1)
print(zeros.shape)
index = [0,1,2,3,4,5]
pred_ex = np.array([0.5, 0.8, 0.6, 0.4, 0.5, 0.3]).reshape(-1,1)
print(pred_ex.shape)
zeros[index, :] = pred_ex
print(zeros)

(455, 1)
(6, 1)
[[0.5]
 [0.8]
 [0.6]
 [0.4]
 [0.5]
 [0.3]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0. ]
 [0.

In [47]:
# cross validation으로 stacking 함수로 만들기
from sklearn.model_selection import KFold
import numpy as np

def get_stacking_train_test(model, n_split: int, X_train, X_test, y_train):
    kf = KFold(n_splits=n_split)
    # train, test dataset 0으로 초기화 하기
    # trainset shape: (M, 1), testset shape: (K, fold 개수)
    train_data = np.zeros((X_train.shape[0], 1))
    test_data = np.zeros((X_test.shape[0], n_split))
    
    print('## {} train start'.format(model.__class__.__name__))
    # KFold를 이용하여 학습 및 검증
    for fold_num, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        # index에 따른 train, val set 재지정
        X_tr, y_tr, X_val = X_train[train_idx], y_train[train_idx], X_train[val_idx]
        
        # 학습 
        print("## {} fold train start".format(fold_num))
        model.fit(X_tr, y_tr)
        pred = model.predict(X_val)
        # testdata에 대해서 학습
        pred_test = model.predict(X_test)
        
        # 초기화했던 것에 값 넣기
        train_data[val_idx, :] = pred.reshape(-1,1) # 길게 한줄로 만들어서 넣기 reshape(-1,1)
        test_data[:, fold_num] = pred_test
        
    # testset 평균 내서 한 줄로 만들기
    test_data = np.mean(test_data, axis=1).reshape(-1, 1)
    return train_data, test_data

In [52]:
# 모델별로 개별 train & test dataset 뽑아내기
kn_train, kn_test = get_stacking_train_test(kn_clf, 7, X_train, X_test, y_train)
rf_train, rf_test = get_stacking_train_test(rf_clf, 7, X_train, X_test, y_train)
ada_train, ada_test = get_stacking_train_test(ada_clf, 7, X_train, X_test, y_train)
dt_train, dt_test = get_stacking_train_test(dt_clf, 7, X_train, X_test, y_train)

# 메타 모델을 위한 train, test dataset 만들기
# 시도했지만 실패한 코드 아래의 예시의 경우 dimension을 추가해서 합하는 경우이다.
# meta_train = np.array([kn_train, rf_train, ada_train, dt_train]) # shape: (4, 455, 1)
# meta_test = np.array([kn_test, rf_test, ada_test, dt_test]) # shape: (4, 114, 1)

meta_train = np.concatenate((kn_train, rf_train, ada_train, dt_train), axis=1)
meta_test = np.concatenate((kn_test, rf_test, ada_test, dt_test), axis=1)
print(meta_train.shape)
print(meta_test.shape)

## KNeighborsClassifier train start
## 0 fold train start
## 1 fold train start
## 2 fold train start
## 3 fold train start
## 4 fold train start
## 5 fold train start
## 6 fold train start
## RandomForestClassifier train start
## 0 fold train start
## 1 fold train start
## 2 fold train start
## 3 fold train start
## 4 fold train start
## 5 fold train start
## 6 fold train start
## AdaBoostClassifier train start
## 0 fold train start
## 1 fold train start
## 2 fold train start
## 3 fold train start
## 4 fold train start
## 5 fold train start
## 6 fold train start
## DecisionTreeClassifier train start
## 0 fold train start
## 1 fold train start
## 2 fold train start
## 3 fold train start
## 4 fold train start
## 5 fold train start
## 6 fold train start
(455, 4)
(114, 4)


In [54]:
# 메타 모델 학습 시키기
# 메타 모델 : Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression(C=10)
lr_clf.fit(meta_train, y_train)
meta_pred = lr_clf.predict(meta_test)
print('메타 모델을 위한 trainset shape: {0}, testset shape: {1}'.format(meta_train.shape, meta_test.shape))
print('final accuracy: {:.3f}'.format(accuracy_score(meta_pred, y_test)))

메타 모델을 위한 trainset shape: (455, 4), testset shape: (114, 4)
final accuracy: 0.965
