In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import roc_auc_score

In [None]:
# Seed passed to the train/test splits and the bias-variance runs below.
RANDOM_STATE = 111

# Location of the flight-delay training CSV; the literal is split only to keep lines short.
DATASET_PATH = (
    'https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml'
    '/main/datasets/flight_delays_train.csv'
)

In [None]:
# Read the training set from DATASET_PATH into a DataFrame.
data = pd.read_csv(DATASET_PATH)

# Preview the first rows with the notebook's rich table rendering
# instead of printing the whole frame as plain text.
data.head()

## Обучение моделей

In [None]:
# Baseline feature set: only the two numeric columns.
X = data[['DepTime', 'Distance']]

# Binary target: 'Y' -> 1, 'N' -> 0.
y = data['dep_delayed_15min'].map({'Y': 1, 'N': 0})

# Preview the features as a rich table rather than printing the full frame.
X.head()

In [None]:
# Baseline decision tree with default hyperparameters.
# Seeded so that the reported score is the same on every run.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Mean ROC-AUC across 3 cross-validation folds.
cross_val_score(dt, X, y, cv=3, scoring='roc_auc').mean()

In [None]:
# Baseline random forest with default hyperparameters.
# Seeded so that bootstrap sampling and feature subsampling repeat across runs.
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Mean ROC-AUC across 3 cross-validation folds.
cross_val_score(rf, X, y, cv=3, scoring='roc_auc').mean()

In [None]:
# Baseline gradient boosting with default hyperparameters.
# Seeded so that the reported score is the same on every run.
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Mean ROC-AUC across 3 cross-validation folds.
cross_val_score(gb, X, y, cv=3, scoring='roc_auc').mean()

Выведем *bias* и *variance* каждой из моделей при помощи функции `bias_variance_decomp` из библиотеки `mlxtend`. Функция принимает на вход тренировочные и тестовые данные, поэтому разобьём все данные на train и test.

In [None]:
!pip install mlxtend --upgrade

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

In [None]:
from mlxtend.evaluate import bias_variance_decomp

# bias_variance_decomp returns three values (average expected loss, average bias,
# average variance), so a single call is unpacked instead of running the
# decomposition twice and binding the whole tuple to each name.
# random_seed receives the integer seed directly: np.random.seed(...) returns None,
# which would leave the bootstrap sampling unseeded.
# Only the first 1000 training rows are used to keep the 50 refits fast.
avg_loss, avg_bias, avg_var = bias_variance_decomp(
    dt,
    Xtrain[:1000].values, ytrain[:1000].values,
    Xtest.values, ytest.values,
    loss='0-1_loss',
    random_seed=RANDOM_STATE,
    num_rounds=50,
)
print(avg_var)
print(avg_bias)

In [None]:
from mlxtend.evaluate import bias_variance_decomp

# bias_variance_decomp returns three values (average expected loss, average bias,
# average variance), so a single call is unpacked instead of running the
# decomposition twice and binding the whole tuple to each name.
# random_seed receives the integer seed directly: np.random.seed(...) returns None,
# which would leave the bootstrap sampling unseeded.
# Only the first 1000 training rows are used to keep the 50 refits fast.
avg_loss, avg_bias, avg_var = bias_variance_decomp(
    rf,
    Xtrain[:1000].values, ytrain[:1000].values,
    Xtest.values, ytest.values,
    loss='0-1_loss',
    random_seed=RANDOM_STATE,
    num_rounds=50,
)
print(avg_var)
print(avg_bias)

In [None]:
from mlxtend.evaluate import bias_variance_decomp

# bias_variance_decomp returns three values (average expected loss, average bias,
# average variance), so a single call is unpacked instead of running the
# decomposition twice and binding the whole tuple to each name.
# random_seed receives the integer seed directly: np.random.seed(...) returns None,
# which would leave the bootstrap sampling unseeded.
# Only the first 1000 training rows are used to keep the 50 refits fast.
avg_loss, avg_bias, avg_var = bias_variance_decomp(
    gb,
    Xtrain[:1000].values, ytrain[:1000].values,
    Xtest.values, ytest.values,
    loss='0-1_loss',
    random_seed=RANDOM_STATE,
    num_rounds=50,
)
print(avg_var)
print(avg_bias)

## Подбор гиперпараметров

### Гиперпараметры для решающего дерева

In [None]:
# Search space for the decision tree.
params = {
    'max_depth': np.arange(2, 30, 3),
    # Candidates are capped at the number of available columns: larger values
    # cannot select more features than exist (older scikit-learn releases
    # reject them outright -- NOTE(review): confirm for the installed version).
    'max_features': np.arange(1, Xtrain.shape[1] + 1),
    'criterion': ('gini', 'entropy', 'log_loss'),
    # An integer min_samples_split must be at least 2; the value 1 makes every
    # fit with that setting fail.
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}

# n_jobs=-1 spreads the independent fits over all CPU cores; results are unchanged.
gs = GridSearchCV(dt, params, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

# Tune on the first 10000 training rows only, to keep the search affordable.
gs.fit(Xtrain[:10000], ytrain[:10000])

Теперь обучите на **всех** тренировочных данных решающее дерево с лучшими  
найденными гиперпараметрами и выведите *ROC-AUC* на тесте.

In [None]:
# Refit on the whole training split with the hyperparameters selected by the
# grid search; the seed makes the refit repeatable.
modeldt = DecisionTreeClassifier(**gs.best_params_, random_state=RANDOM_STATE)
modeldt.fit(Xtrain, ytrain)

# Column 1 of predict_proba holds the probability of class 1.
pred = modeldt.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, pred)

### Гиперпараметры для случайного леса:

In [None]:
# Search space for the random forest.
params = {
    'n_estimators': np.arange(10, 200, 20),
    'max_depth': np.arange(2, 30, 3),
    # Candidates are capped at the number of available columns: larger values
    # cannot select more features than exist (older scikit-learn releases
    # reject them outright -- NOTE(review): confirm for the installed version).
    'max_features': np.arange(1, Xtrain.shape[1] + 1),
    'criterion': ('gini', 'entropy', 'log_loss'),
    # An integer min_samples_split must be at least 2; the value 1 makes every
    # fit with that setting fail.
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}

# n_jobs=-1 spreads the independent fits over all CPU cores; results are unchanged.
gsrf = GridSearchCV(rf, params, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

# Tune on the first 10000 training rows only, to keep the search affordable.
gsrf.fit(Xtrain[:10000], ytrain[:10000])

In [None]:
# Refit on the whole training split with the hyperparameters selected by the
# grid search; the seed makes the refit repeatable.
modelrf = RandomForestClassifier(**gsrf.best_params_, random_state=RANDOM_STATE)
modelrf.fit(Xtrain, ytrain)

# Column 1 of predict_proba holds the probability of class 1.
pred = modelrf.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, pred)

### Гиперпараметры для градиентного бустинга:

In [None]:
# Search space for gradient boosting.
params = {
    'n_estimators': np.arange(10, 200, 20),
    'max_depth': np.arange(2, 30, 3),
    # Candidates are capped at the number of available columns: larger values
    # cannot select more features than exist (older scikit-learn releases
    # reject them outright -- NOTE(review): confirm for the installed version).
    'max_features': np.arange(1, Xtrain.shape[1] + 1),
    # GradientBoostingClassifier grows regression trees, so only these two split
    # criteria are accepted; 'gini' / 'entropy' / 'log_loss' make every fit fail.
    'criterion': ('friedman_mse', 'squared_error'),
    # An integer min_samples_split must be at least 2; the value 1 makes every
    # fit with that setting fail.
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}

# n_jobs=-1 spreads the independent fits over all CPU cores; results are unchanged.
gsgb = GridSearchCV(gb, params, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

# Tune on the first 10000 training rows only, to keep the search affordable.
gsgb.fit(Xtrain[:10000], ytrain[:10000])

Теперь обучите на **всех** тренировочных данных градиентный бустинг с лучшими  
найденными гиперпараметрами и выведите *ROC-AUC* на тесте.

In [None]:
# Refit on the whole training split with the hyperparameters selected by the
# grid search; the seed makes the refit repeatable.
modelgb = GradientBoostingClassifier(**gsgb.best_params_, random_state=RANDOM_STATE)
modelgb.fit(Xtrain, ytrain)

# Column 1 of predict_proba holds the probability of class 1.
pred = modelgb.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, pred)

## Размышления

Мы использовали для обучения очень много объектов и очень мало признаков, поэтому ансамбли моделей не смогли себя показать в полную мощь. Если задействовать больше характеристик полета, то и качество моделей может стать выше.

# Улучшение модели

In [None]:
# Encode the target in place: 'Y' -> 1, 'N' -> 0.
# The guard makes the cell safe to re-run: mapping an already-numeric column
# through the same dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['dep_delayed_15min']):
    data['dep_delayed_15min'] = data['dep_delayed_15min'].map({'Y': 1, 'N': 0})

In [None]:
# Keep only the integer that follows the '-' in the Month strings.
# The guard makes the cell safe to re-run: once the column is numeric there is
# nothing left to split, and the string parsing would raise.
if not pd.api.types.is_numeric_dtype(data['Month']):
    data['Month'] = data['Month'].str.split('-').str[1].astype(int)

# Overall mean of the target, for comparison with the per-month means below.
print('average:', data['dep_delayed_15min'].mean())

data.groupby('Month')['dep_delayed_15min'].mean()

In [None]:
# Overall mean of the target, for comparison with the per-carrier figures below.
print('average:', data['dep_delayed_15min'].mean())

# Mean of the target and number of rows for every carrier.
(
    data
    .groupby('UniqueCarrier')
    .agg({"dep_delayed_15min": ["mean", "count"]})
)

In [None]:
# Target column.
y = data['dep_delayed_15min']

# Extended feature set with the carrier one-hot encoded.
# drop_first=True drops one dummy column to avoid linear dependence between them.
X = pd.get_dummies(
    data[['Month', 'UniqueCarrier', 'DepTime', 'Distance']],
    columns=['UniqueCarrier'],
    drop_first=True,
)
X.head()

In [None]:
# Mean 3-fold ROC-AUC of each model family with default hyperparameters on the
# extended feature set. Every estimator is seeded so the printed scores repeat
# across runs.
print('Decision Tree:',
      cross_val_score(DecisionTreeClassifier(random_state=RANDOM_STATE),
                      X, y, cv=3, scoring='roc_auc').mean())
print('Random Forest:',
      cross_val_score(RandomForestClassifier(random_state=RANDOM_STATE),
                      X, y, cv=3, scoring='roc_auc').mean())
print('Gradient Boosting:',
      cross_val_score(GradientBoostingClassifier(random_state=RANDOM_STATE),
                      X, y, cv=3, scoring='roc_auc').mean())

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

### Оптимальные гиперпараметры для Decision Tree.

In [None]:
# Search space for the decision tree on the extended feature set.
params = {
    'max_depth': np.arange(2, 30, 3),
    'max_features': np.arange(2, 12, 3),
    'criterion': ('gini', 'entropy', 'log_loss'),
    # An integer min_samples_split must be at least 2; the value 1 makes every
    # fit with that setting fail.
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}

# n_jobs=-1 spreads the independent fits over all CPU cores; results are unchanged.
gs = GridSearchCV(dt, params, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

gs.fit(Xtrain, ytrain)

In [None]:
# Refit on the training split with the hyperparameters selected by the grid
# search; the seed makes the refit repeatable.
modeldt = DecisionTreeClassifier(**gs.best_params_, random_state=RANDOM_STATE)
modeldt.fit(Xtrain, ytrain)

# Column 1 of predict_proba holds the probability of class 1.
pred = modeldt.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, pred)

### Оптимальные гиперпараметры для RandomForest.

In [None]:
# Search space for the random forest on the extended feature set.
params = {
    'n_estimators': np.arange(10, 200, 20),
    'max_depth': np.arange(2, 30, 3),
    'max_features': np.arange(2, 12, 3),
    'criterion': ('gini', 'entropy', 'log_loss'),
    # An integer min_samples_split must be at least 2; the value 1 makes every
    # fit with that setting fail.
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}

# n_jobs=-1 spreads the independent fits over all CPU cores; results are unchanged.
gsrf = GridSearchCV(rf, params, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

gsrf.fit(Xtrain, ytrain)

In [None]:
# Refit on the training split with the hyperparameters selected by the grid
# search; the seed makes the refit repeatable.
modelrf = RandomForestClassifier(**gsrf.best_params_, random_state=RANDOM_STATE)
modelrf.fit(Xtrain, ytrain)

# Column 1 of predict_proba holds the probability of class 1.
pred = modelrf.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, pred)

### Оптимальные гиперпараметры для GradientBoosting.

In [None]:
# Search space for gradient boosting on the extended feature set.
params = {
    'n_estimators': np.arange(10, 200, 20),
    'max_depth': np.arange(2, 30, 3),
    'max_features': np.arange(2, 12, 3),
    # GradientBoostingClassifier grows regression trees, so only these two split
    # criteria are accepted; 'gini' / 'entropy' / 'log_loss' make every fit fail.
    'criterion': ('friedman_mse', 'squared_error'),
    # An integer min_samples_split must be at least 2; the value 1 makes every
    # fit with that setting fail.
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}

# n_jobs=-1 spreads the independent fits over all CPU cores; results are unchanged.
gsgb = GridSearchCV(gb, params, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

gsgb.fit(Xtrain, ytrain)

In [None]:
# Refit on the training split with the hyperparameters selected by the grid
# search; the seed makes the refit repeatable.
modelgb = GradientBoostingClassifier(**gsgb.best_params_, random_state=RANDOM_STATE)
modelgb.fit(Xtrain, ytrain)

# Column 1 of predict_proba holds the probability of class 1.
pred = modelgb.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, pred)

Посмотрим на важность признаков у полученного бустинга.

In [None]:
# The plotting API (subplots / show) lives in matplotlib.pyplot, not in the
# top-level matplotlib package, so pyplot is bound to `plt` here before use.
import matplotlib.pyplot as plt

importances = modelgb.feature_importances_
# Positions of the features in ascending order of importance; the chart below
# keeps the original column order and does not use this.
indices = np.argsort(importances)

# One bar per feature, labelled with the matching column name of X.
positions = list(range(len(importances)))
fig, ax = plt.subplots()
ax.bar(positions, importances)
ax.set_title("Feature importances")
ax.set_xticks(positions)
ax.set_xticklabels(X.columns, rotation='vertical')
plt.show()