In [3]:
# stack base model 생성
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# CV기반의 스태킹
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# for step 1
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Build the level-1 (meta-model) features for one base model via K-fold CV.

    The training rows are split into ``n_folds`` consecutive, unshuffled folds.
    For every fold the model is re-fitted on the remaining folds and then used
    to predict (a) the held-out fold, which yields out-of-fold predictions for
    the training rows, and (b) the complete test set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted in
        place once per fold, so on return it holds the fit from the last fold.
    X_train_n, y_train_n : array-like
        Training features and targets. NumPy arrays and pandas
        DataFrame/Series objects are accepted; rows are always selected by
        position, never by index label.
    X_test_n : array-like
        Test features; passed unchanged to ``model.predict``.
    n_folds : int
        Number of folds handed to ``KFold``.

    Returns
    -------
    train_fold_pred : numpy.ndarray of shape (n_train, 1)
        Out-of-fold predictions for the training rows (meta-model train data).
    test_pred_mean : numpy.ndarray of shape (n_test, 1)
        Test-set predictions averaged over the per-fold fits (meta-model test
        data).
    """
    # shuffle=False already gives a deterministic split. Passing random_state
    # together with shuffle=False raises ValueError in scikit-learn >= 0.24,
    # so it is intentionally left out.
    kf = KFold(n_splits=n_folds, shuffle=False)

    # Buffers for the data handed to the meta-model.
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, 'model start ##')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print('\t fold set: ', folder_counter, ' start')
        X_tr = _take_rows(X_train_n, train_index)
        y_tr = _take_rows(y_train_n, train_index)
        X_te = _take_rows(X_train_n, valid_index)

        # Train on the current fold's training part.
        model.fit(X_tr, y_tr)
        # Store the held-out predictions; they become meta-model training data.
        train_fold_pred[valid_index, :] = np.asarray(model.predict(X_te)).reshape(-1, 1)
        # Predict the whole test set with this fold's fit; one column per fold.
        test_pred[:, folder_counter] = model.predict(X_test_n)

    # Average the per-fold test predictions into a single meta-model feature.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean


def _take_rows(data, index):
    """Select rows of ``data`` by integer position for arrays and pandas objects."""
    # pandas objects index by label with plain [], so use .iloc when it exists.
    if hasattr(data, 'iloc'):
        return data.iloc[index]
    return data[index]

In [9]:
# Load the breast-cancer dataset that ships with scikit-learn.
cancer_data = load_breast_cancer()

X_data = cancer_data.data
y_label = cancer_data.target

# Base (level-0) learners. The forest, tree and boosting models are randomized,
# so their seeds are pinned to make a Restart & Run All give the same results.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)

# Final meta-model (level-1), trained on the base models' predictions.
lr_final = LogisticRegression(C=10)

In [10]:
# Hold out 20% of the rows for testing; the seed is fixed so that the split
# (and therefore every downstream score) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_label, test_size=0.2, random_state=0)

In [11]:
# Step 1: for each base model, collect its out-of-fold training predictions
# and its fold-averaged test predictions, using 7 folds each.
base_models = (knn_clf, rf_clf, dt_clf, ada_clf)
stacked_outputs = [
    get_stacking_base_datasets(base_model, X_train, y_train, X_test, 7)
    for base_model in base_models
]
(knn_train, knn_test), (rf_train, rf_test), (dt_train, dt_test), (ada_train, ada_test) = stacked_outputs



KNeighborsClassifier model start ##
	 fold set:  0  start
	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start
RandomForestClassifier model start ##
	 fold set:  0  start
	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start
DecisionTreeClassifier model start ##
	 fold set:  0  start
	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start
AdaBoostClassifier model start ##
	 fold set:  0  start




	 fold set:  1  start
	 fold set:  2  start
	 fold set:  3  start
	 fold set:  4  start
	 fold set:  5  start
	 fold set:  6  start


In [12]:
# for step 2
Stack_final_X_train = np.concatenate((knn_train, rf_train, dt_train, ada_train), axis=1)
Stack_final_X_test = np.concatenate((knn_test, rf_test, dt_test, ada_test), axis=1)
print('Original feature data for training shape : ', X_train.shape)
print('Original feature data for testing shape : ', X_test.shape)
print('Stacking feature data for training shape : ', Stack_final_X_train.shape)
print('Stacking feature data for testing shape : ', Stack_final_X_test.shape)

Original feature data for training shape :  (455, 30)
Original feature data for testing shape :  (114, 30)
Stacking feature data for training shape :  (455, 4)
Stacking feature data for testing shape :  (114, 4)


In [13]:
# The final meta-model therefore trains on 4 features: one column of per-record
# predictions from each base model.
stack_final = lr_final.fit(Stack_final_X_train, y_train).predict(Stack_final_X_test)

final_accuracy = accuracy_score(y_test, stack_final)
print('Accuracy of final meta model: {0:.4f}'.format(final_accuracy))

Accuracy of final meta model: 0.9561
