In [57]:
import os
import psycopg
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import optuna
from optuna.integration.mlflow import MLflowCallback
import mlflow
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    log_loss
)

In [58]:
load_dotenv()

True

In [59]:
TABLE_NAME = 'users_churn'

TRACKING_SERVER_HOST = '127.0.0.1'
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_fio'
RUN_NAME = 'model_bayesian_search'
REGISTRY_MODEL_NAME = 'churn_model_maximpetrov'

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model_gb"

In [60]:
connection = {'sslmode' : 'require', 'target_session_attrs' : 'read-write'}

postgres_credetials = {
    'dbname' : os.getenv('DB_DESTINATION_NAME'),
    'host' : os.getenv('DB_DESTINATION_HOST'),
    'port' : os.getenv('DB_DESTINATION_PORT'),
    'user' : os.getenv('DB_DESTINATION_USER'),
    'password' : os.getenv('DB_DESTINATION_PASSWORD')
}

connection.update(postgres_credetials)

In [61]:
mlflow.set_tracking_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')
mlflow.set_registry_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')

In [62]:
with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        cur.execute(f'SELECT * FROM {TABLE_NAME}')
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

        df = pd.DataFrame(data, columns=columns)

In [63]:
columns_without_datetime = df.select_dtypes(exclude='datetime').columns
df = df.dropna(subset=columns_without_datetime)

In [64]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = 'total_charges' # ваш код здесь
stratify_column = 'target' # ваш код здесь
test_size = 0.25 # ваш код здесь

df = df.sort_values(by=[split_column])

In [65]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (3624, 3)
Размер выборки для теста: (1208, 3)


In [66]:
# Функция оптимизации
def objective(trial: optuna.Trial) -> float:
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 12),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500)
    }

    model = GradientBoostingClassifier(**param)
    skf = StratifiedKFold(n_splits=2)
    metrics = defaultdict(list)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        # ваш код здесь #
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]

        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, probas)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


        # ваш код здесь #

    metrics["err1"] = np.mean(metrics["err1"])
    metrics["err2"] = np.mean(metrics["err2"])
    metrics["auc"] = np.mean(metrics["auc"])
    metrics["precision"] = np.mean(metrics["precision"])
    metrics["recall"] = np.mean(metrics["recall"])
    metrics["f1"] = np.mean(metrics["f1"])
    metrics["logloss"] = np.mean(metrics["logloss"])

    return auc

In [67]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

In [68]:
# Определяем родительский run ID
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as parent_run:
    parent_run_id = parent_run.info.run_id

    # Создаем Optuna callback для использования MLflow
    mlflc = MLflowCallback(
        tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
        metric_name="AUC",
        create_experiment=False,
        mlflow_kwargs={
            "experiment_id": experiment_id,
            "tags": {'MLFLOW_PARENT_RUN_ID': parent_run_id},
            "nested": True,  # Используйте вложенные запуски
        },
    )

    # Создаем Optuna study
    study = optuna.create_study(
        direction="maximize",
        study_name=STUDY_NAME,
        storage=STUDY_DB_NAME,
        sampler=optuna.samplers.TPESampler(),
        load_if_exists=True,  # Убедитесь, что не создается новая study
    )

    # Оптимизируем с указанием Optuna callback
    study.optimize(
        objective, 
        n_trials=10, 
        callbacks=[mlflc]
    )
    print(study.best_params)
    # Логирование лучшей модели в MLflow
    best_params = {**study.best_params}
    best_model = GradientBoostingClassifier(**best_params)
    best_model.fit(X_train, y_train)

    metrics = {}

    prediction = best_model.predict(X_test)
    probas = best_model.predict_proba(X_test)[:, 1]

    _, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
    auc = roc_auc_score(y_test, probas)
    precision = precision_score(y_test, prediction)
    recall = recall_score(y_test, prediction)
    f1 = f1_score(y_test, prediction)
    logloss = log_loss(y_test, probas)

    metrics["err1"] = err1
    metrics["err2"] = err2
    metrics["auc"] = auc
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["f1"] = f1
    metrics["logloss"] = logloss

    signature = mlflow.models.infer_signature(X_test, best_model.predict(X_test))
    input_example = X_test[:10]

    mlflow.sklearn.log_model(
        best_model,
        artifact_path="cv",
        signature=signature,
        input_example=input_example,
    )
    mlflow.log_params(best_params)
    mlflow.log_metrics(metrics)

  mlflc = MLflowCallback(
[I 2025-02-13 07:56:52,139] A new study created in RDB with name: churn_model_gb
  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-02-13 07:56:54,704] Trial 0 finished with value: 0.3901401487715425 and parameters: {'learning_rate': 0.019561606107542383, 'max_depth': 11, 'subsample': 0.7901793340078577, 'n_estimators': 136}. Best is trial 0 with value: 0.3901401487715425.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-02-13 07:56:55,151] Trial 1 finished with value: 0.6213166396486508 and parameters: {'learning_rate': 0.03584558192753426, 'max_depth': 1, 'subsample': 0.706037769666984, 'n_estimators': 95}. Best is trial 1 with value: 0.6213166396486508.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-02-13 07:56:56,855] Trial 2 finished with value: 0.6188039003028828 and parameters: {'learning_rate': 0.09502932665982165, 'max_depth': 2, 'subsample': 0.8795652491610209, 'n_estimators': 466}. Best is trial 1 with value

{'learning_rate': 0.04866311391547348, 'max_depth': 1, 'subsample': 0.6620849831564267, 'n_estimators': 113}


  _warn_prf(average, modifier, msg_start, len(result))
  inputs = _infer_schema(model_input) if model_input is not None else None
