In [1]:
import numpy as np

# 사용할 모델 : KNeighbor, RandomForest, Adaboost, DecisionTree, Logistic
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression # 메타 모델

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and pull out the feature matrix and labels.
cancer_data = load_breast_cancer()
X, y = cancer_data.data, cancer_data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)


In [2]:
# Create the individual base classifiers and the final meta classifier.
# Every estimator with internal randomness gets a fixed seed so that
# re-running the notebook reproduces the same scores.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)

# Meta classifier: it is fitted on the base models' predictions, not on the raw features.
lr_clf = LogisticRegression(C=10)

In [4]:
# Fit every base classifier on the training split.
for base_clf in (knn_clf, rf_clf, ada_clf, dt_clf):
    base_clf.fit(X_train, y_train)

In [5]:
# Predict the test split with each fitted base model and keep the prediction
# vectors under their own names.
knn_pred, rf_pred, ada_pred, dt_pred = (
    fitted_clf.predict(X_test)
    for fitted_clf in (knn_clf, rf_clf, ada_clf, dt_clf)
)

# Report each base model's accuracy on the test split.
for message, predicted in (
    ('KNN 정확도: {0:.4f}', knn_pred),
    ('랜덤 포레스트 정확도: {0:.4f}', rf_pred),
    ('결정 트리 정확도: {0:.4f}', dt_pred),
    ('에이다부스트 정확도: {0:.4f}', ada_pred),
):
    print(message.format(accuracy_score(y_test, predicted)))

KNN 정확도: 0.9211
랜덤 포레스트 정확도: 0.9649
결정 트리 정확도: 0.9035
에이다부스트 정확도: 0.9561


In [6]:
# Stack the four base-model prediction vectors row-wise (one row per model)
# as the first step towards a feature matrix for the meta model.
pred = np.stack((knn_pred, rf_pred, ada_pred, dt_pred), axis=0)
print(pred.shape)

(4, 114)


In [7]:
# Swap the axes so that rows are samples and each column is one base model's predictions.
pred = pred.T
print(pred.shape)

(114, 4)


In [9]:
# Train the meta model on the stacked base predictions, then predict and score.
# NOTE(review): the meta model is fitted on y_test and scored on the very same
# rows, so the printed accuracy is an optimistic figure, not a held-out one.
lr_clf.fit(pred, y_test)
final = lr_clf.predict(pred)
# accuracy_score takes (y_true, y_pred); keep the same argument order as the
# other accuracy_score calls in this notebook.
print(accuracy_score(y_test, final))

0.9736842105263158


In [23]:
# CV셋 기반의 stacking
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Build K-fold stacking features for a single base model.

    The training rows are split into ``n_folds`` consecutive folds (no
    shuffling). For every fold the model is refitted on the remaining rows,
    then used to predict both the held-out rows and the full test set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every fold.
    X_train_n, y_train_n : array-like indexable by integer index arrays
        Training features and labels.
    X_test_n : array-like
        Test features, predicted once per fold.
    n_folds : int
        Number of folds passed to ``KFold``.

    Returns
    -------
    train_fold_pred : ndarray of shape (n_train, 1)
        Out-of-fold predictions for every training row.
    test_pred_mean : ndarray of shape (n_test, 1)
        Test-set predictions averaged over the folds.
    """
    kf = KFold(n_splits=n_folds, shuffle=False)
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, ' model start')

    for folder_counter, (train_index, val_index) in enumerate(kf.split(X_train_n)):
        print('fold set', folder_counter, ' start')
        # The validation labels are not needed: the base model is not scored per fold.
        X_tr, y_tr, X_val = X_train_n[train_index], y_train_n[train_index], X_train_n[val_index]
        model.fit(X_tr, y_tr)
        # Store the held-out predictions as a column in the rows of this fold.
        train_fold_pred[val_index, :] = model.predict(X_val).reshape(-1, 1)
        # Predict the original test data with the model fitted on this fold.
        test_pred[:, folder_counter] = model.predict(X_test_n)

    # Average across folds and keep a column shape, (n_test, n_folds) -> (n_test, 1),
    # so the result can be concatenated along axis=1 with other models' columns.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [41]:
from sklearn.model_selection import KFold 

def get_stacking(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Produce stacking features for one base model via K-fold splitting.

    For each of the ``n_folds`` folds the model is refitted on the rows
    outside the fold, and predicts the fold's rows as well as the whole
    test set.

    Returns a pair ``(train_fold_pred, test_pred_mean)``:
    a column of out-of-fold predictions with one row per training sample,
    and a column with the per-fold test predictions averaged per test sample.
    """
    splitter = KFold(n_splits=n_folds)
    oof_column = np.zeros((X_train_n.shape[0], 1))
    per_fold_test = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, 'model start')

    fold_no = 0
    for fit_rows, holdout_rows in splitter.split(X_train_n):
        print('fold set', fold_no, 'start')
        fit_features = X_train_n[fit_rows]
        fit_labels = y_train_n[fit_rows]
        holdout_features = X_train_n[holdout_rows]

        model.fit(fit_features, fit_labels)

        # Held-out predictions go into the rows belonging to this fold.
        holdout_pred = model.predict(holdout_features)
        oof_column[holdout_rows, :] = holdout_pred.reshape(-1, 1)

        # One column of test predictions per fold.
        per_fold_test[:, fold_no] = model.predict(X_test_n)
        fold_no += 1

    # Collapse the per-fold columns into a single averaged column.
    averaged_test = per_fold_test.mean(axis=1).reshape(-1, 1)

    return oof_column, averaged_test

In [24]:
# Alias the split arrays under the parameter names used by the stacking helpers
# and show their shapes.
X_train_n, X_test_n = X_train, X_test
print(X_train_n.shape)
print(X_test_n.shape)

# Allocate zero-filled buffers: a single column with one row per training
# sample, and seven columns with one row per test sample.
train_fold_pred = np.zeros((len(X_train_n), 1))
test_pred = np.zeros((len(X_test_n), 7))
print(train_fold_pred.shape)
print(test_pred.shape)

(455, 30)
(114, 30)
(455, 1)
(114, 7)


In [42]:
# Build 7-fold stacking features for every base model, in the order
# KNN, random forest, AdaBoost, decision tree. get_stacking refits the
# estimator it is given on each fold.
(
    (knn_train, knn_test),
    (rf_train, rf_test),
    (ada_train, ada_test),
    (dt_train, dt_test),
) = (
    get_stacking(base_clf, X_train, y_train, X_test, 7)
    for base_clf in (knn_clf, rf_clf, ada_clf, dt_clf)
)

KNeighborsClassifier model start
fold set 0 start
fold set 1 start
fold set 2 start
fold set 3 start
fold set 4 start
fold set 5 start
fold set 6 start
RandomForestClassifier model start
fold set 0 start
fold set 1 start
fold set 2 start
fold set 3 start
fold set 4 start
fold set 5 start
fold set 6 start
AdaBoostClassifier model start
fold set 0 start
fold set 1 start
fold set 2 start
fold set 3 start
fold set 4 start
fold set 5 start
fold set 6 start
DecisionTreeClassifier model start
fold set 0 start
fold set 1 start
fold set 2 start
fold set 3 start
fold set 4 start
fold set 5 start
fold set 6 start


In [45]:
# Join the per-model columns side by side into the meta model's train/test matrices.
final_train = np.hstack((knn_train, rf_train, ada_train, dt_train))
final_test = np.hstack((knn_test, rf_test, ada_test, dt_test))
print('원본 학습 피처 데이터 Shape:', X_train.shape, '원본 테스트 피처 Shape:', X_test.shape)
print(
    '스태킹 학습 피처 데이터 Shape:', final_train.shape,
    '스태킹 테스트 피처 데이터 Shape:', final_test.shape,
)

# Fit the meta model on the stacked training features and score it on the stacked test features.
lr_clf.fit(final_train, y_train)
final_pred = lr_clf.predict(final_test)
meta_accuracy = accuracy_score(y_test, final_pred)
print("최종 메타 모델의 예측 정확도 : {0:.4f}".format(meta_accuracy))

원본 학습 피처 데이터 Shape: (455, 30) 원본 테스트 피처 Shape: (114, 30)
스태킹 학습 피처 데이터 Shape: (455, 4) 스태킹 테스트 피처 데이터 Shape: (114, 4)
최종 메타 모델의 예측 정확도 : 0.9737
