### Задание
Выполнить Стэкинг, Бэгин, Вотинг и Бустинг. При реализации алгоритмов не использовать готовые решения. 
За сровнение взять CatBoostClassifier как базовая метрика качества. Сравнить результат с реализацией своих ансамблей. 
Для однозначности и интерпретируемости результатов использовать приложенный набор данных. 
При реализации бустинга - просто сокращайте набор данных на котором модель отработала хорошо (правильно предсказанные данные). 

### Анализ
Построим как можно больше базовых моделей, работающих на разных принципах. На их основе составим несколько типов ансамблей. Сравним точность моделей с CatBoost 

### Имплементация

Загрузим датасет

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

data = pd.read_csv('winequality-white.csv', sep=';')
print(data.head())
print(data.describe())
X = data.drop('quality', axis=1)
y = data['quality']

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4                 47.0                 186.0   0.9956  3.19       0.40   

   alcohol  quality  
0      8.8        6  
1      9.5        6  
2     10.1        6 

### Создадим и оптимизируем базовые модели для классификации

### KNeighborsClassifier

In [2]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

optuna.logging.set_verbosity(optuna.logging.WARNING)
N_TRIALS = 1000

def objective_knn(trial):
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'p': trial.suggest_int('p', 1, 3)
    }
    model = KNeighborsClassifier(**params)
    return cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

study_knn = optuna.create_study(direction='maximize')
study_knn.optimize(objective_knn, n_trials=N_TRIALS, show_progress_bar=True, n_jobs=-1)
best_knn = KNeighborsClassifier(**study_knn.best_params)

print(study_knn.best_params)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'n_neighbors': 26, 'weights': 'distance', 'p': 1}


### Decision Tree optimizing

In [3]:
def objective_tree(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    }
    model = DecisionTreeClassifier(**params)
    return cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

study_tree = optuna.create_study(direction='maximize')
study_tree.optimize(objective_tree, n_trials=N_TRIALS, show_progress_bar=True, n_jobs=-1)
best_tree = DecisionTreeClassifier(**study_tree.best_params)

print(study_tree.best_params)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'max_depth': 26, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}


### SVC

In [4]:
def objective_svm(trial):
    params = {
        'C': trial.suggest_float('C', 0.1, 10),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid']),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        'degree': trial.suggest_int('degree', 2, 5)
    }
    model = SVC(**params, probability=True)
    return cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

study_svm = optuna.create_study(direction='maximize')
study_svm.optimize(objective_svm, n_trials=N_TRIALS, show_progress_bar=True, n_jobs=-1)
best_svm = SVC(**study_svm.best_params, probability=True)

print(study_svm.best_params)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'C': 8.065678596217653, 'kernel': 'rbf', 'gamma': 'auto', 'degree': 5}


### Gradient Boosting

In [5]:
def objective_gb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0)
    }
    model = GradientBoostingClassifier(**params)
    return cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

study_gb = optuna.create_study(direction='maximize')
study_gb.optimize(objective_gb, n_trials=N_TRIALS / 5, show_progress_bar=True, n_jobs=-1)
best_gb = GradientBoostingClassifier(**study_gb.best_params)

print(study_gb.best_params)

  0%|          | 0/200.0 [00:00<?, ?it/s]

{'n_estimators': 176, 'learning_rate': 0.057458558420938666, 'max_depth': 8, 'min_samples_split': 16, 'min_samples_leaf': 6, 'subsample': 0.9308140603990446}


### MLPClassifier

In [6]:
def objective_mlp(trial):
    n_layers = trial.suggest_int('n_layers', 1, 6)
    layers = [trial.suggest_int(f'n_units_l{i}', 50, 1000) for i in range(n_layers)]
    params = {
        'hidden_layer_sizes': tuple(layers),
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic']),
        'solver': trial.suggest_categorical('solver', ['adam', 'sgd']),
        'alpha': trial.suggest_float('alpha', 0.0001, 0.05),
        'learning_rate_init': trial.suggest_float('learning_rate_init', 0.001, 0.1),
        'batch_size': trial.suggest_int('batch_size', 32, 128)
    }
    model = MLPClassifier(**params, max_iter=N_TRIALS ** 2)
    return cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

study_mlp = optuna.create_study(direction='maximize')
study_mlp.optimize(objective_mlp, n_trials=N_TRIALS / 10, show_progress_bar=True)

best_params = study_mlp.best_params
n_layers = best_params['n_layers']
layers = [best_params[f'n_units_l{i}'] for i in range(n_layers)]
best_params['hidden_layer_sizes'] = tuple(layers)
best_params = {k: v for k, v in best_params.items() if k not in ['n_layers'] + [f'n_units_l{i}' for i in range(n_layers)]}
best_mlp = MLPClassifier(**best_params, max_iter=N_TRIALS ** 2)

print(best_params)

  0%|          | 0/100.0 [00:00<?, ?it/s]

{'activation': 'relu', 'solver': 'sgd', 'alpha': 0.004919926382476044, 'learning_rate_init': 0.03717539393212284, 'batch_size': 65, 'hidden_layer_sizes': (885, 893, 221)}


### Создадим ансамбли моделей

### VotingClassifier

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone

class VotingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        self.models_ = [(name, clone(model).fit(X, y)) for name, model in self.models]
        return self

    def predict(self, X):
        predictions = np.array([model.predict(X) for _, model in self.models_]).T
        return np.apply_along_axis(
            lambda x: np.argmax(np.bincount(x, minlength=len(np.unique(y)))),
            axis=1,
            arr=predictions.astype(int)
        )

### BaggingClassifier

In [8]:
class BaggingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, models, n_bags=50):
        self.models = models
        self.n_bags = n_bags

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)
        self.bagged_models_ = []
        for _ in range(self.n_bags):
            indices = np.random.choice(X.shape[0], X.shape[0], replace=True)
            bagged_models = [(name, clone(model).fit(X[indices], y[indices])) for name, model in self.models]
            self.bagged_models_.append(bagged_models)
        return self

    def predict(self, X):
        X = np.asarray(X)
        bagged_predictions = []
        for bagged_models in self.bagged_models_:
            predictions = np.array([model.predict(X) for _, model in bagged_models]).T
            bagged_predictions.append(predictions)
        bagged_predictions = np.mean(bagged_predictions, axis=0)
        return np.round(np.mean(bagged_predictions, axis=1)).astype(int)

### StackingClassifier

In [9]:
class StackingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, models, meta_model):
        self.models = models
        self.meta_model = meta_model

    def fit(self, X, y):
        self.models_ = [(name, clone(model).fit(X, y)) for name, model in self.models]
        base_predictions = np.array([model.predict(X) for _, model in self.models_]).T
        self.meta_model_ = self.meta_model.fit(base_predictions, y)
        return self

    def predict(self, X):
        base_predictions = np.array([model.predict(X) for _, model in self.models_]).T
        return self.meta_model_.predict(base_predictions)

### В качестве референсной модели будем использовать модель CatBoostClassifier

In [13]:
from catboost import CatBoostClassifier

catboost = CatBoostClassifier(verbose=0, allow_writing_files=False)
catboost.fit(X_train, y_train)
catboost_accuracy = accuracy_score(y_test, catboost.predict(X_test))

print(f"CatBoost accuracy: {catboost_accuracy}")


CatBoost accuracy: 0.6693877551020408


### Выведем метрику точности для всех моделей

In [12]:
models = [
    ('KNN', best_knn),
    ('Decision Tree', best_tree),
    ('SVM', best_svm),
    ('Gradient Boosting', best_gb),
    ('MLP', best_mlp)
]

ensembles = [
    ('Voting', VotingClassifier(models)),
    ('Bagging', BaggingClassifier(models)),
    ('Stacking', StackingClassifier(models, GradientBoostingClassifier()))
]

all_models = models + ensembles

scores = {}

for name, model in all_models:
    model.fit(X_train, y_train)
    scores[name] = accuracy_score(y_test, model.predict(X_test))

scores_df = pd.DataFrame(list(scores.items()), columns=['Model', 'Accuracy'])
print(scores_df)

               Model  Accuracy
0                KNN  0.663265
1      Decision Tree  0.620408
2                SVM  0.595918
3  Gradient Boosting  0.684694
4                MLP  0.644898
5             Voting  0.687755
6            Bagging  0.682173
7           Stacking  0.679592


### Выводы
- Обучили 5 базовых моделей классификаторов. KNN и GradientBoostingClassifier сравнимы по точности с CatBoostClassifier. Остальные модели по метрикам хуже.
- Построили 3 различных ансамбля из базовых моделей. Все ансамбли по метрикам лучше CatBoostClassifier.
- На подбор гиперпараметров и обучение моделей с хорошими метриками потребовалось много вычислительных ресурсов. CatBoostClassifier имеет метрики не сильно хуже, однако дает результат из коробки и очень быстро.