# Создание и обучение моделей МО

In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, brier_score_loss, roc_curve

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier

import os
import warnings
warnings.filterwarnings('ignore')

# mlflow.sklearn.autolog()
# import mlflow

## Выделение выборок

In [2]:
# Load the preprocessed training and test datasets from the data/ directory.

df_train, df_test = map(
    pd.read_csv,
    ('data/train_data_processed.csv', 'data/test_data_processed.csv'),
)

In [3]:
# Carve a validation subset (20% of the rows) out of the training data;
# the fixed random_state keeps the split reproducible.
# NOTE(review): df_train is rebound from itself, so re-running this cell
# splits the already reduced frame again.

df_train, df_valid = train_test_split(
    df_train,
    random_state=42,
    test_size=0.2,
)

In [4]:
# Persist the validation split so it can be reused outside this notebook.
# NOTE(review): no `index=` argument is passed, so pandas' default index
# handling applies (presumably the row index is written as an extra,
# unnamed column) - confirm that this is intended.

df_valid.to_csv(path_or_buf='data/valid_data_processed.csv')

In [5]:
# Name of the target column the models are trained to predict.
target_col = 'res'

# Input features passed to every model, in the order used for training.
feature_columns = [
    'Client_age',
    'Gender',
    'Numb_of_Prod',
    'Salary',
    'HasCrCard',
    'Numb_of_years',
    'CreditScore',
    'Balance',
    'IsActiveMember',
]

### Параметры моделей

___Параметры для поиска оптимальной комбинации значений при обучении модели___

In [6]:
# Hyper-parameter grid for the XGBoost gradient-boosting classifier.
# Every key maps to the array of candidate values tried by the grid search.
# Candidates that were considered but are currently disabled:
#   grow_policy: [0, 1]
#   booster: ['gbtree', 'gblinear', 'dart']
#   subsample / colsample_bytree / colsample_bylevel / colsample_bynode:
#       np.arange(0.6, 0.8, 0.1)
params_xgb = dict(
    n_estimators=np.arange(1, 100, 25),
    max_depth=np.arange(1, 10, 3),
    max_leaves=np.arange(0, 20, 3),
    learning_rate=np.arange(0.1, 1, 0.3),
    reg_alpha=np.arange(0, 1, 0.5),
    reg_lambda=np.arange(0, 1, 0.5),
)

# Hyper-parameter grid for the LightGBM gradient-boosting classifier.
# Every key maps to the array of candidate values tried by the grid search.
params_lgb = {
    'n_estimators': np.arange(1, 100, 25),
    'max_depth': np.arange(1, 10, 3),
    # LightGBM requires num_leaves > 1, so the grid starts at 3 instead of 0:
    # a value of 0 makes every fit with that candidate fail, and the failure
    # is easy to miss because warnings are silenced in this notebook.
    'num_leaves': np.arange(3, 20, 3),
    'learning_rate': np.arange(0.1, 1, 0.3),
    # Candidates that were considered but are currently disabled:
    # 'boosting_type': ['gbdt', 'goss', 'dart'],
    # 'subsample': np.arange(0.6, 0.8, 0.1),
    # 'colsample_bytree': np.arange(0.6, 0.8, 0.1),
    'reg_alpha': np.arange(0, 1, 0.5),
    'reg_lambda': np.arange(0, 1, 0.5),
}

# Hyper-parameter grid for the XGBoost random-forest classifier.
# NOTE(review): learning_rate presumably cannot change the ROC AUC ranking of
# a single-round forest, which would make this axis redundant - confirm.
params_rf = dict(
    n_estimators=np.arange(1, 100, 50),
    max_leaves=np.arange(0, 20, 10),
    learning_rate=np.arange(0.1, 1, 0.3),
    reg_alpha=np.arange(0, 1, 0.5),
    reg_lambda=np.arange(0, 1, 0.5),
)

# Hyper-parameter grid for the elastic-net logistic regression:
# the L1/L2 mixing ratio and the inverse regularisation strength C.
params_log = dict(
    l1_ratio=np.arange(0, 1, 0.2),
    C=np.arange(0.001, 0.01, 0.001),
)

___Описание моделей___

In [7]:
# Model definitions.
# The best hyper-parameters are searched over the grids defined earlier (params_*).
# Candidates are scored by ROC AUC using 4-fold cross-validation.
# verbose=2 makes the search print the progress of the fits to the console.


model_xgb = GridSearchCV(
    XGBClassifier(
        n_jobs=-1,
        random_state=42,
        # Use the built-in 'auc' metric instead of the sklearn callable
        # roc_auc_score: XGBoost treats a callable eval_metric as a cost to
        # MINIMISE, so together with early_stopping_rounds the callable would
        # stop on (and keep) the iteration with the lowest AUC.
        eval_metric='auc',
        early_stopping_rounds=5,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        colsample_bynode=0.8,
        subsample=0.8,
    ),
    param_grid=params_xgb,
    scoring='roc_auc',
    cv=4,
    verbose=2,
)

# LightGBM gradient boosting tuned over params_lgb
# (scored by ROC AUC, 4-fold cross-validation).
model_lgb = GridSearchCV(
    LGBMClassifier(
        n_jobs=-1,
        random_state=42,
        subsample=0.8,
        # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
        # enabled with a positive `subsample_freq` (bagging_freq); re-sample
        # the rows at every iteration so that subsample=0.8 takes effect.
        subsample_freq=1,
        colsample_bytree=0.8,
    ),
    param_grid=params_lgb,
    scoring='roc_auc',
    cv=4,
    verbose=2,
)

# XGBoost random forest tuned over params_rf
# (scored by ROC AUC, 4-fold cross-validation, progress printed via verbose=2).
model_rf = GridSearchCV(
    estimator=XGBRFClassifier(
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        colsample_bynode=0.8,
        eval_metric=roc_auc_score,
        random_state=42,
        n_jobs=-1,
    ),
    param_grid=params_rf,
    cv=4,
    scoring='roc_auc',
    verbose=2,
)

# Elastic-net logistic regression tuned over params_log
# (scored by ROC AUC, 4-fold cross-validation).
model_log = GridSearchCV(
    LogisticRegression(
        penalty='elasticnet',
        max_iter=300,
        solver='saga',
        # 'saga' shuffles the data, so without a seed the fitted coefficients
        # (and the selected grid point) can differ between runs; use the same
        # seed as the other models in this notebook.
        random_state=42,
    ),
    param_grid=params_log,
    scoring='roc_auc',
    cv=4,
    verbose=2,
)

### Обучение и сохранение моделей

___Обучение моделей___

In [8]:
# Train the XGBoost gradient-boosting grid search.
# NOTE: the fit call is wrapped in a triple-quoted string, so it is NOT
# executed - the cell only evaluates to (and echoes) that string.
# The disabled call passes the validation split as eval_set.
"""model_xgb.fit(
    X=df_train[feature_columns],
    y=df_train[target_col],
    eval_set=[(df_valid[feature_columns], df_valid[target_col])],
)"""

'model_xgb.fit(\n    X=df_train[feature_columns],\n    y=df_train[target_col],\n    eval_set=[(df_valid[feature_columns], df_valid[target_col])],\n)'

In [9]:
# Train the LightGBM gradient-boosting grid search.
# NOTE: the fit call is wrapped in a triple-quoted string, so it is NOT
# executed - the cell only evaluates to (and echoes) that string.
# The disabled call passes the validation split as eval_set.
"""model_lgb.fit(
    X=df_train[feature_columns],
    y=df_train[target_col],
    eval_set=[(df_valid[feature_columns], df_valid[target_col])],
)"""

'model_lgb.fit(\n    X=df_train[feature_columns],\n    y=df_train[target_col],\n    eval_set=[(df_valid[feature_columns], df_valid[target_col])],\n)'

In [10]:
# Train the XGBoost random-forest grid search.
# NOTE: the fit call is wrapped in a triple-quoted string, so it is NOT
# executed - the cell only evaluates to (and echoes) that string.
# The disabled call passes the validation split as eval_set.
"""model_rf.fit(
    X=df_train[feature_columns],
    y=df_train[target_col],
    eval_set=[(df_valid[feature_columns], df_valid[target_col])],
)"""

'model_rf.fit(\n    X=df_train[feature_columns],\n    y=df_train[target_col],\n    eval_set=[(df_valid[feature_columns], df_valid[target_col])],\n)'

In [11]:
# Train the logistic-regression grid search.
# NOTE: the fit call is wrapped in a triple-quoted string, so it is NOT
# executed - the cell only evaluates to (and echoes) that string.
"""model_log.fit(
    X=df_train[feature_columns],
    y=df_train[target_col],
)"""

'model_log.fit(\n    X=df_train[feature_columns],\n    y=df_train[target_col],\n)'

___Сохранение моделей___

In [12]:
# Save the models.
# NOTE: the code below is wrapped in a triple-quoted string, so it is NOT
# executed - the cell only evaluates to (and echoes) that string.
# The disabled code pickles the best_estimator_ of each grid search
# into the models/ directory, one .pkl file per model.

"""with open('models/model_xgb.pkl','wb') as f:
     pickle.dump(model_xgb.best_estimator_, f)
with open('models/model_lgb.pkl','wb') as f:
     pickle.dump(model_lgb.best_estimator_, f)
with open('models/model_rf.pkl','wb') as f:
     pickle.dump(model_rf.best_estimator_, f)
with open('models/model_log.pkl','wb') as f:
     pickle.dump(model_log.best_estimator_, f)"""

"with open('models/model_xgb.pkl','wb') as f:\n     pickle.dump(model_xgb.best_estimator_, f)\nwith open('models/model_lgb.pkl','wb') as f:\n     pickle.dump(model_lgb.best_estimator_, f)\nwith open('models/model_rf.pkl','wb') as f:\n     pickle.dump(model_rf.best_estimator_, f)\nwith open('models/model_log.pkl','wb') as f:\n     pickle.dump(model_log.best_estimator_, f)"