**Градиентный бустинг, случайный лес, логистическая регрессия**


* Подготовим данные к бинарной классификации. Условно разделим вино на хорошее и плохое.
* Обучим на тренировочной выборке модель случайного леса
* Сделаем предсказание качества вина для тренировочного и тестового наборов данных.
* Обучим также модели градиентного бустинга и логистической регрессии и сравним их метрики.

In [None]:
import numpy as np #для матричных вычислений
import pandas as pd #для анализа и предобработки данных
import matplotlib.pyplot as plt #для визуализации
import seaborn as sns #для визуализации

from sklearn import metrics #метрики
from sklearn import linear_model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

import warnings # для игнорирования предупреждений
#Игнорируем варнинги
warnings.filterwarnings('ignore')


# Устанавливаем стиль визуализаций в matplotlib
%matplotlib inline
plt.style.use('seaborn')

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn import ensemble #ансамбли

In [None]:
# Read the semicolon-separated wine dataset and preview the first rows
data = pd.read_csv(filepath_or_buffer='drive/MyDrive/wine.csv', delimiter=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
# Count missing values per column
data.isna().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


Разделим вино на хорошее и остальное

In [None]:
# Step 1: binarize the target - a wine with a quality score of 6 or more is "good"
data['good_quality'] = data['quality'].ge(6)
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,good_quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,False
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,False
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,False
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,True
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,False
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,True
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,True
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,False


Удалим целевые столбцы

In [None]:
# Feature matrix: every column except the raw score and the derived binary label
X = data.drop(columns=['quality', 'good_quality'])
# Target vector: the binary "good wine" flag
y = data['good_quality']
X.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [None]:
# Step 2: hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of the resulting feature matrices
print(
    f'Размерность обучающей выборки {X_train.shape}',
    f'Размерность тестовой выборки {X_test.shape}',
    sep='\n',
)

Размерность обучающей выборки (1279, 11)
Размерность тестовой выборки (320, 11)


In [None]:
# Step 3: configure the random forest classifier
rf_clf = ensemble.RandomForestClassifier(
    random_state=42,      # seed for reproducible results
    n_estimators=500,     # number of trees in the forest
    max_depth=8,          # maximum depth of each tree
    min_samples_leaf=10,  # minimum number of samples in a leaf
    max_features='sqrt',  # features tried per split: square root of the total
    criterion='gini',     # split quality criterion
)
# Fit the forest on the training split
rf_clf.fit(X=X_train, y=y_train)

Посмотрим на метрики

На тестовых данных:

In [None]:
# Step 4: predict on the held-out test split and print per-class metrics
rf_clf_pred = rf_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=rf_clf_pred))

              precision    recall  f1-score   support

       False       0.70      0.74      0.72       141
        True       0.79      0.75      0.77       179

    accuracy                           0.75       320
   macro avg       0.75      0.75      0.75       320
weighted avg       0.75      0.75      0.75       320



На обучающих данных:

In [None]:
# Same report on the training split, to compare against the test metrics
rf_clf_pred_train = rf_clf.predict(X_train)
print(classification_report(y_true=y_train, y_pred=rf_clf_pred_train))

              precision    recall  f1-score   support

       False       0.82      0.86      0.84       603
        True       0.87      0.84      0.85       676

    accuracy                           0.85      1279
   macro avg       0.85      0.85      0.85      1279
weighted avg       0.85      0.85      0.85      1279



Посмотрим также на другие модели, в частности на бустинговый ансамбль решающих деревьев (градиентный бустинг):

In [None]:
# Configure the gradient boosting classifier
gb = GradientBoostingClassifier(
    random_state=42,      # seed for reproducible results
    n_estimators=600,     # number of boosting stages (trees)
    max_depth=2,          # maximum depth of each tree
    min_samples_leaf=10,  # minimum number of samples in a leaf
    loss='log_loss',      # loss function to optimize
    # learning_rate is not passed, so the library default applies
)

In [None]:
# Fit the gradient boosting model on the training split
gb.fit(X=X_train, y=y_train)

In [None]:
# Gradient boosting: per-class metrics on the held-out test split
gb_pred = gb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=gb_pred))

              precision    recall  f1-score   support

       False       0.80      0.79      0.80       141
        True       0.84      0.84      0.84       179

    accuracy                           0.82       320
   macro avg       0.82      0.82      0.82       320
weighted avg       0.82      0.82      0.82       320



In [None]:
# Gradient boosting: per-class metrics on the training split
gb_pred_train = gb.predict(X_train)
print(classification_report(y_true=y_train, y_pred=gb_pred_train))

              precision    recall  f1-score   support

       False       0.93      0.93      0.93       603
        True       0.93      0.93      0.93       676

    accuracy                           0.93      1279
   macro avg       0.93      0.93      0.93      1279
weighted avg       0.93      0.93      0.93      1279



Заметно некоторое переобучение, однако на тестовых данных метрики градиентного бустинга оказались лучше, чем у случайного леса

Также посмотрим на модель логистической регрессии

In [None]:
# Configure the logistic regression classifier
log_reg = linear_model.LogisticRegression(
    random_state=42,  # seed for reproducible results
    max_iter=1000,    # iteration budget for the solver to converge
)
# Fit on the training split
log_reg.fit(X=X_train, y=y_train)
# Predict class labels for the test split
y_pred = log_reg.predict(X=X_test)

In [None]:
# Logistic regression: per-class metrics on the held-out test split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       False       0.66      0.64      0.65       141
        True       0.72      0.74      0.73       179

    accuracy                           0.69       320
   macro avg       0.69      0.69      0.69       320
weighted avg       0.69      0.69      0.69       320



In [None]:
# Logistic regression: per-class metrics on the training split
y_pred_train = log_reg.predict(X=X_train)
print(classification_report(y_true=y_train, y_pred=y_pred_train))

              precision    recall  f1-score   support

       False       0.74      0.71      0.72       603
        True       0.75      0.78      0.76       676

    accuracy                           0.74      1279
   macro avg       0.74      0.74      0.74      1279
weighted avg       0.74      0.74      0.74      1279



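Здесь же метрики оказались несколько ниже, чем у изначальной модели случайного леса (accuracy на тестовой выборке 0.69 против 0.75), зато разрыв между обучающей и тестовой выборками меньше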