In [10]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import _name_estimators
import numpy as np
import operator

"""
Два класса-наследника нужны для методов get_params, set_params и score
"""
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """
    A majority vote ensemble classifier
    
    Parameters
    ----------
    classifiers : list of classifiers
        The list of classifiers that will be used to make the final prediction
    
    vote : str, = 'classlabel' or 'probability'
        The type of vote that will be used to make the final prediction
        'classlabel' : the class label of the majority of the classifiers
        'probability' : the weighted average probability of the class label of the majority of the classifiers
    
    weights : list of floats, default=None
        The list of weights that will be used to weight the individual classifiers
        If None, then weights will be set to 1 for each classifier
    """

    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        """
        Словарь из пар "строковое_имя_модели" - "модель". Строковое имя - название класса модели в нижнем регистре. 
        Если мы передадим несколько одинаковых моделей, то функция добавит к их именам индексы.
        Или если мы сами дадим уникальные имена, то функция использует их, а не будет создавать свои.
        """
        self.named_estimators = {key: value for key, value in _name_estimators(self.classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """
        Fit the ensemble of classifiers
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples
        y : array-like, shape = [n_samples]
            The target values
        
        Returns
        -------
        self : object
            Returns self.
        """
        
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'classlabel' or 'probability'; gained (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError("The number of weights must be equal to the number of classifiers; gained (weights=%d, classifiers=%d)" % (len(self.weights), len(self.classifiers)))
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            """
            clone создаёт независимую копию модели. Метод fit обучает её на датасете с закодированными метками.
            """
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)

        return self
    
    def predict(self, X):
        """
        Predict class labels for X.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The input samples
        
        Returns
        -------
        maj_vote : array, shape = [n_samples]
            The predicted class labels
        """
        
        if self.vote == 'probability':
            """
            Возвращает массив из наиболее вероятных меток для каждого образца. Пример:
            [[0.1, 0.7, 0.2],
             [0.6, 0.3, 0.1],   ->   [1, 0, 2]
             [0.2, 0.2, 0.6]]
            """
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:
            """
            Возвращает сначала массив из массивов с прогнозами каждой модели, потом это превращается в массив numpy
            и транспонируется, таким образом возврашается массив из массивов с прогнозами для каждого образца от всех
            моделей и там уже определяется какая метка встречается чаще всего для данного образца.
            """
            predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            """
            x - массив с прогнозами для одного образца. bincount подсчитывает кол-во голосов по каждой метке для одного
            образца с учётом весов. apply_along_axis - применить функцию по конкретной оси к матрицы из массивов numpy.
            """
            maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions)
        """
        Обратно в строковые метки для уже сделанных прогнозов.
        """
        maj_vote = self.lablenc_.inverse_transform(maj_vote)

        return maj_vote
    
    def predict_proba(self, X):
        """
        Predict class probabilities for X.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The input samples
        
        Returns
        -------
        avg_proba : array, shape = [n_samples, n_classes]
            The predicted class probabilities
        """

        probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)

        return avg_proba
        
    def get_params(self, deep=True):
        """
        Gains the params names of classifier for GridSearch

        deep: returns all the params of MVclassifier for GridSearch if False.
        Otherwise, returns all the params of every inner classifier of ensemble.
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_estimators.copy()
            for name, step in self.named_estimators.items():
                for key, value in step.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out


In [11]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1, stratify=y)

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


clf1 = LogisticRegression(penalty='l2', C=0.001, solver='lbfgs', random_state=1)
clf2 = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

print("Cross val-validation with k=10:\n")
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Cross val-validation with k=10:

ROC AUC: 0.98 (+/- 0.05) [Logistic Regression]
ROC AUC: 0.87 (+/- 0.18) [Decision Tree]
ROC AUC: 0.83 (+/- 0.15) [KNN]


In [13]:
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
clf_labels += ['Majority Vote']
all_clf = [pipe1, clf2, pipe3, mv_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

ROC AUC: 0.92 (+/- 0.15) [Logistic Regression]
ROC AUC: 0.87 (+/- 0.18) [Decision Tree]
ROC AUC: 0.85 (+/- 0.13) [KNN]
ROC AUC: 0.98 (+/- 0.05) [Majority Vote]


In [14]:
mv_clf.get_params()

{'pipeline-1': Pipeline(steps=[['sc', StandardScaler()],
                 ['clf', LogisticRegression(C=0.001, random_state=1)]]),
 'decisiontreeclassifier': DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0),
 'pipeline-2': Pipeline(steps=[['sc', StandardScaler()],
                 ['clf', KNeighborsClassifier(n_neighbors=1)]]),
 'pipeline-1__memory': None,
 'pipeline-1__steps': [['sc', StandardScaler()],
  ['clf', LogisticRegression(C=0.001, random_state=1)]],
 'pipeline-1__verbose': False,
 'pipeline-1__sc': StandardScaler(),
 'pipeline-1__clf': LogisticRegression(C=0.001, random_state=1),
 'pipeline-1__sc__copy': True,
 'pipeline-1__sc__with_mean': True,
 'pipeline-1__sc__with_std': True,
 'pipeline-1__clf__C': 0.001,
 'pipeline-1__clf__class_weight': None,
 'pipeline-1__clf__dual': False,
 'pipeline-1__clf__fit_intercept': True,
 'pipeline-1__clf__intercept_scaling': 1,
 'pipeline-1__clf__l1_ratio': None,
 'pipeline-1__clf__max_iter': 100,
 'pipeline-1__clf__m

In [17]:
from sklearn.model_selection import GridSearchCV

"""
Указываем параметры для GridSearch в формате: {name_of_classifier}__{name_of_parametr}
Если классификатор входит в пайплайн, то формат: {name_of_pipeline}__{name_of_classifier}__{name_of_parametr}
"""
params = {'decisiontreeclassifier__max_depth': [1, 2], 'pipeline-1__clf__C': [0.001, 0.1, 100.0]}
grid = GridSearchCV(estimator=mv_clf, param_grid=params, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f % r" % (grid.cv_results_['mean_test_score'][r], grid.cv_results_['std_test_score'][r] / 2.0, grid.cv_results_['params'][r]))

0.983 +/- 0.02 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 0.001}
0.983 +/- 0.02 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 0.1}
0.967 +/- 0.05 {'decisiontreeclassifier__max_depth': 1, 'pipeline-1__clf__C': 100.0}
0.983 +/- 0.02 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.001}
0.983 +/- 0.02 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 0.1}
0.967 +/- 0.05 {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 100.0}
