In [None]:
!pip install catboost lightgbm optuna -q

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import optuna

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# Seed passed as `random_state` to the train/test and train/validation splits.
RANDOM_STATE = 111
# Location of the CSV that is read with pd.read_csv in the data-loading cell.
DATASET_PATH = 'https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/flight_delays_train.csv'

In [None]:
# Load the raw table from DATASET_PATH.
data = pd.read_csv(DATASET_PATH)

# Boolean target: True where the label column equals 'Y'.
target_column = 'dep_delayed_15min'
y = data[target_column] == 'Y'

# Feature matrix: every column except the label.
X = data.drop(columns=[target_column])

X.head()

In [None]:
# Positional indices of the object-dtype columns of X; passed to
# CatBoost's `cat_features` argument when the model is fitted.
is_object_column = X.dtypes == object
cat_features = np.where(is_object_column)[0]

In [None]:
# Hold out 25% of the rows as the test set; the split is seeded.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

## Модели с параметрами по умолчанию

In [None]:
# Baseline CatBoost model with default hyperparameters.
# The seed is pinned to the notebook-wide RANDOM_STATE so the run does not
# depend on library defaults, and the per-iteration training log is silenced
# to keep the cell output readable.
model = CatBoostClassifier(random_seed=RANDOM_STATE, verbose=False)
model.fit(Xtrain, ytrain, cat_features=cat_features)

# Column 1 of predict_proba is the probability of the positive class.
pred = model.predict_proba(Xtest)[:, 1]

roc_auc_score(ytest, pred)

In [None]:
# Pair every importance score with its column name and list them from the
# highest score to the lowest.
f_import = model.get_feature_importance()
f_names = X.columns
ranked_features = sorted(zip(f_import, f_names), reverse=True)
for importance, feature in ranked_features:
    print(f'{feature}: {importance}')

In [None]:
# Cast every object/category column to pandas `category` dtype in both splits
# (the LightGBM cells below are fitted on these frames).
# A single `astype` with a column -> dtype mapping returns new frames, so the
# frames produced by train_test_split are not modified through column
# assignment (which can raise SettingWithCopyWarning) and the cell stays
# idempotent on re-run.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
category_dtypes = {column: 'category' for column in categorical_columns}

Xtrain = Xtrain.astype(category_dtypes)
Xtest = Xtest.astype(category_dtypes)

In [None]:
# Baseline LightGBM model with default hyperparameters, seeded with the
# notebook-wide RANDOM_STATE for consistency with the data splits.
# NOTE(review): presumably relies on the preceding cell having cast the
# string columns of Xtrain/Xtest to `category` dtype — confirm run order.
lgbm = LGBMClassifier(random_state=RANDOM_STATE)
lgbm.fit(Xtrain, ytrain)

# Column 1 of predict_proba is the probability of the positive class.
pred = lgbm.predict_proba(Xtest)[:, 1]

roc_auc_score(ytest, pred)


## Optuna

In [None]:
# Carve a validation set (25% of the training rows) out of the training data;
# the hyperparameter search is scored on it.
Xtrain_new, Xval, ytrain_new, yval = train_test_split(
    Xtrain,
    ytrain,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

In [None]:
def objective_lgbm(trial):
    """Optuna objective: validation ROC-AUC of an LGBMClassifier.

    Samples `num_leaves` (10..100) and `n_estimators` (10..1000) from the
    trial, fits a classifier on (Xtrain_new, ytrain_new) and scores it on
    (Xval, yval).

    Parameters
    ----------
    trial : optuna trial object
        Used only through `trial.suggest_int`.

    Returns
    -------
    float
        ROC-AUC of the positive-class probabilities on the validation split.
    """
    num_leaves = trial.suggest_int("num_leaves", 10, 100)
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)

    # Seed the model with the notebook-wide RANDOM_STATE so trials differ only
    # in the sampled hyperparameters.
    lgbm1 = LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        random_state=RANDOM_STATE,
    )
    lgbm1.fit(Xtrain_new, ytrain_new)

    # Column 1 of predict_proba is the probability of the positive class.
    pred1 = lgbm1.predict_proba(Xval)[:, 1]

    return roc_auc_score(yval, pred1)

# Seed the sampler: without a fixed seed the sequence of suggested
# hyperparameters (and therefore `study.best_params`) changes on every run.
# TPESampler is the sampler Optuna uses by default for single-objective
# studies, so only the seeding changes.
sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective_lgbm, n_trials=30)



In [None]:
# Display the hyperparameter values of the best trial recorded in `study`.
study.best_params

Обучим модель с найденными гиперпараметрами на Xtrain, ytrain и оценим ROC-AUC на тестовых данных.

In [None]:
# Refit LightGBM on the full training split with the hyperparameters found by
# the study, seeded with the notebook-wide RANDOM_STATE, and score it on the
# held-out test split.
lgbm2 = LGBMClassifier(**study.best_params, random_state=RANDOM_STATE)
lgbm2.fit(Xtrain, ytrain)

# Column 1 of predict_proba is the probability of the positive class.
pred2 = lgbm2.predict_proba(Xtest)[:, 1]

roc_auc_score(ytest, pred2)