In [0]:
from copy import deepcopy
from collections import defaultdict

import numpy as np
from sklearn.model_selection import StratifiedKFold

class StackedClassifier:
    def __init__(self, estimators, merge_estimator):
        self.original_clfs = dict(estimators)
        self.m_clf = merge_estimator

        self.clfs_dict = defaultdict(list)
        self.clfs_index = sorted(self.original_clfs.keys())

    def fit(self, X, y):
        self.clfs_dict = defaultdict(list)

        skf = StratifiedKFold(n_splits=15)
        index_list = list(skf.split(X, y))

        merge_feature_list = []
        for clf_name in self.clfs_index:
            clf_origin = self.original_clfs[clf_name]
            preds_tmp_list = []
            for train_index, test_index in index_list:
                clf_copy = deepcopy(clf_origin)
                clf_copy.fit(X[train_index], y[train_index])
                preds_tmp_list.append(
                    clf_copy.predict_proba(X[test_index]))
                self.clfs_dict[clf_name].append(clf_copy)
            merge_feature_list.append(np.vstack(preds_tmp_list))
        
        X_merged = np.hstack(merge_feature_list)
        y_merged = np.hstack([y[test_index] 
                              for _, test_index in index_list])

        self.m_clf.fit(X_merged, y_merged)
        return self

        
    def predict(self, X):
        merge_feature_list = []
        for clf_name in self.clfs_index:
            tmp_proba_list = []
            for clf in self.clfs_dict[clf_name]:
                tmp_proba_list.append(clf.predict_proba(X))
            merge_feature_list.append(
                np.mean(tmp_proba_list, axis=0))
        X_merged = np.hstack(merge_feature_list)

        return self.m_clf.predict(X_merged)

In [0]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn import datasets
from sklearn.utils.validation import check_random_state
#from stacked_generalization.lib.stacking import StackedClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.manifold import TSNE


In [0]:
#irisデータのロード
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

In [0]:
# Stage 1モデル
bclf = xgb.XGBClassifier(
    eta = 0.02,
    min_child_weight= 2,
    max_depth= 4,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    Lambda=0.4,
    alpha=0.4,
    scale_pos_weight=1,
    objective= 'binary:logistic',
    eval_metric = 'rmse',
    random_state=0
)

# Stage 0のモデルたち
clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
        ExtraTreesClassifier(n_estimators=30, criterion = 'gini', random_state=3),
        GradientBoostingClassifier(n_estimators=25, random_state=1),
        GradientBoostingClassifier(n_estimators=30, random_state=2),
        GradientBoostingClassifier(n_estimators=30, random_state=3),
        KNeighborsClassifier(),
        RidgeClassifier(random_state=1),
        TSNE(n_components=2)
        ]


In [14]:
#スタッキングモデルのインスタンシエート
sc = StackedClassifier(bclf,
                       clfs,
                       #n_folds=3,
                       #verbose=0,
                       #stack_by_proba=True,
                       #oob_score_flag=True,
                       )

#単体では一番ハイスコアの勾配ブースティング(比較用)
gb = GradientBoostingClassifier(n_estimators=25, random_state=1)

TypeError: ignored

In [0]:
# クロスバリデーション
sc_score = 0
gb_score = 0
n_folds = 3
for train_idx, test_idx in StratifiedKFold(iris.target, n_folds):
    xs_train = iris.data[train_idx]
    y_train = iris.target[train_idx]
    xs_test = iris.data[test_idx]
    y_test = iris.target[test_idx]

    sc.fit(xs_train, y_train)
    print('oob_score: {0}'.format(sc.oob_score_))
    sc_score += sc.score(xs_test, y_test)
    gb.fit(xs_train, y_train)
    gb_score += gb.score(xs_test, y_test)

#スコアの表示
sc_score /= n_folds
print('Stacked Classfier score: {0}'.format(sc_score))
gb_score /= n_folds
print('Gradient Boosting Classfier score: {0}'.format(gb_score))

  if n_splits == 'warn':


ValueError: ignored

In [0]:
class StackedRegressor(BaseStacked, RegressorMixin):
    def __init__(self,
                 bclf,
                 clfs,
                 n_folds=3,
                 oob_score_flag=False,
                 oob_metrics=mean_squared_error,
                 Kfold=None,
                 verbose=0,
                 save_stage0=False,
                 save_dir=''):
        self.n_folds = n_folds
        self.clfs = clfs
        self.bclf = bclf
        self.all_learner = OrderedDict()
        self.oob_score_flag = oob_score_flag
        self.oob_metrics = oob_metrics
        self.verbose = verbose
        self.stack_by_proba = False
        self.save_stage0 = save_stage0
        self.save_dir = save_dir
        self.MyKfold = Kfold

    def predict(self, X, index=None):
        """
        The predicted value of an input sample is a vote by the StackedRegressor.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.
        Returns
        -------
        y : array of shape = [n_samples]
            The predicted values.
        """
        blend_test = self._make_blend_test(X, index)
        blend_test = self._pre_propcess(blend_test, X)
        return self.bclf.predict(blend_test)

    def _make_kfold(self, X, Y):
        if self.MyKfold is not None:
            return self.MyKfold
        else:
            return list(KFold(self.n_folds).split(X, Y))

    def _get_blend_init(self, y_train, clf):
        if hasattr(clf, 'predict'):
            width = 1
        elif hasattr(clf, 'n_components'):
            width = clf.n_components
        return np.zeros((y_train.size, width))

    def _get_child_predict(self, clf, X, index=None):
        if hasattr(clf, 'predict'):
            if self.save_stage0 and index is not None:
                predict_result = util.saving_predict(clf, X, index)
            else:
                predict_result = clf.predict(X)
            return predict_result.reshape(predict_result.size, 1)
        else:
            return clf.fit_transform(X)