In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [14]:
titanic_data = pd.read_csv('titanic.csv')

In [15]:
titanic_data.shape

(891, 12)

In [16]:
X = titanic_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = titanic_data.Survived
X = pd.get_dummies(X)
X = X.fillna({'Age': X.Age.median()})

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=239)

In [18]:
class StackedClassifier:
    def __init__(self, estimators, ratio=0.25):
        self.estimators_ = estimators
        self.ratio_ = ratio
        
    def fit(self, X, y):
        predictions = []

        for model in self.estimators_:
            rs = np.random.RandomState()
            # обучаем i-ю базовую модель
            ind = rs.choice(range(len(y)), size=round(len(y) * self.ratio_), replace=False)
            model.fit(X.iloc[ind], y.iloc[ind])
            
            predictions.append(model.predict_proba(X)[:, 0])
        # предсказания базовых моделей имеют размерность (n_samples, 1)
        # состыкуем эти предсказания в новый набор признаков размерности (n_samples, n_features)
        new_X = np.stack(predictions, axis=1)
        # обучим мета-модель, предсказывающую итоговый класс по предсказаниям базовых моделей
        self.meta_lr_ = LogisticRegression(penalty="l2", solver='lbfgs', C=1e4, max_iter=1000)
        self.meta_lr_.fit(new_X, y)
        
    def predict_proba(self, X):
        predictions = []
        
        for model in self.estimators_:
            predictions.append(model.predict_proba(X)[:, 0])
        
        new_X = np.stack(predictions, axis=1)
        return self.meta_lr_.predict_proba(new_X)
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    
    def get_params(self, deep=True):
        return {"ratio": self.ratio_}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [20]:
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
print('Balanced accuracy score for GaussianNB:', balanced_accuracy_score(y_test, gaussian.predict(X_test)))

Balanced accuracy score for GaussianNB: 0.7659273545483767


In [21]:
kNN = KNeighborsClassifier()
parameters = {'n_neighbors': range(1, 30), 
              'metric': ['euclidean', 'manhattan', 'chebyshev']}
grid_search_cv_kNN = GridSearchCV(kNN, parameters, cv=5, n_jobs=-1)
grid_search_cv_kNN.fit(X_train, y_train)
print('Best params for KNN :', grid_search_cv_kNN.best_params_)
best_kNN = grid_search_cv_kNN.best_estimator_
print('Balanced accuracy score for KNeighborsClassifier:', balanced_accuracy_score(y_test, best_kNN.predict(X_test)))

Best params for KNN : {'metric': 'manhattan', 'n_neighbors': 7}
Balanced accuracy score for KNeighborsClassifier: 0.7510125361620058


In [22]:
tree = DecisionTreeClassifier()
parameters = {'criterion': ['gini', 'entropy'],
              'max_depth': range(1, 20),
              'min_samples_leaf': range(1, 20),
              'min_samples_split': range(1, 20)}
grid_search_cv_tree = GridSearchCV(tree, parameters, cv=5, n_jobs=-1)
grid_search_cv_tree.fit(X_train, y_train)
print('Best params for DecisionTree:', grid_search_cv_tree.best_params_)
best_tree = grid_search_cv_tree.best_estimator_
print('Balanced accuracy score for DecisionTreeClassifier:', balanced_accuracy_score(y_test, best_tree.predict(X_test)))

Best params for DecisionTree: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 4, 'min_samples_split': 17}
Balanced accuracy score for DecisionTreeClassifier: 0.756702025072324


In [23]:
svc = SVC(probability=True)
grid_search_cv_svc = GridSearchCV(svc, {'C': range(1, 500)}, cv=5, n_jobs=-1)
grid_search_cv_svc.fit(X_train, y_train)
print('Best params for SVC:', grid_search_cv_svc.best_params_)
best_svc = grid_search_cv_svc.best_estimator_
print('Balanced accuracy score for SVC:', balanced_accuracy_score(y_test, best_svc.predict(X_test)))

Best params for SVC: {'C': 311}
Balanced accuracy score for SVC: 0.7524911603985857


In [24]:
estimators = [gaussian, best_kNN, best_tree, best_svc]

In [25]:
best_ratio = 0.1
best_score = 0
for ratio in np.arange(0.25, 1, 0.01):
    slr = StackedClassifier(ratio=float(ratio), estimators=estimators)
    slr.fit(X_train, y_train)
    slr_predictions = slr.predict(X_test)
    score = balanced_accuracy_score(y_test, slr_predictions)
    if (best_score < score):
        best_ratio = ratio
        best_score = score
print('Оптимальный размер данных для одной модели:', round(best_ratio, 2)) 
print('Balanced accuracy score for meta-classifier:', best_score)

Оптимальный размер данных для одной модели: 0.98
Balanced accuracy score for meta-classifier: 0.7959819993571199
