In [1]:
import os
from dotenv import load_dotenv

import psycopg
import pandas as pd
import mlflow
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score,
    confusion_matrix,
    log_loss)

In [2]:
# Load variables from a local .env file; presumably this is where the
# DB_DESTINATION_* values read via os.getenv further down come from — confirm.
load_dotenv()

True

In [3]:
# Source table with the churn data.
TABLE_NAME = 'users_churn'

# Address of the MLflow tracking server.
TRACKING_SERVER_HOST, TRACKING_SERVER_PORT = '127.0.0.1', 5000

# Names used when logging to MLflow: experiment, run and registered model.
EXPERIMENT_NAME = 'churn_fio'
RUN_NAME = 'model_grid_search'
REGISTRY_MODEL_NAME = 'churn_model_maximpetrov'

In [4]:
# Tracking and model registry are pointed at the same server address.
tracking_server_uri = f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}'

mlflow.set_tracking_uri(tracking_server_uri)
mlflow.set_registry_uri(tracking_server_uri)

In [5]:
# Connection keyword -> environment variable that holds its value.
_credential_env_vars = {
    'dbname': 'DB_DESTINATION_NAME',
    'host': 'DB_DESTINATION_HOST',
    'port': 'DB_DESTINATION_PORT',
    'user': 'DB_DESTINATION_USER',
    'password': 'DB_DESTINATION_PASSWORD',
}

# Credentials are read from the environment; a missing variable yields None.
postgres_credetials = {
    option: os.getenv(env_var) for option, env_var in _credential_env_vars.items()
}

# Fixed session options first, then the credentials, as keyword arguments
# for psycopg.connect.
connection = {
    'sslmode': 'require',
    'target_session_attrs': 'read-write',
    **postgres_credetials,
}

In [6]:
# Read the whole table: fetch the rows and the column names while the
# connection and cursor are open, then build the DataFrame from them.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f'SELECT * FROM {TABLE_NAME}')
    data = cur.fetchall()
    # The first item of each cursor description entry is the column name.
    columns = [column_meta[0] for column_meta in cur.description]

df = pd.DataFrame(data, columns=columns)

In [7]:
# Label column and the model inputs.
target = "target"
features = ["monthly_charges", "total_charges", "senior_citizen"]

# Split settings.
test_size = 0.25                # share of rows held out for the test set
split_column = 'total_charges'  # column the frame is sorted by before the split
stratify_column = 'target'      # declared here; not used in this cell

# Order the rows by the split column.
df = df.sort_values(by=split_column)

In [8]:
# Ordered split: with shuffle=False the last `test_size` share of the rows
# (in their current order) becomes the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

print(
    f"Размер выборки для обучения: {X_train.shape}",
    f"Размер выборки для теста: {X_test.shape}",
    sep="\n",
)

Размер выборки для обучения: (5282, 3)
Размер выборки для теста: (1761, 3)


In [9]:

# Fixed CatBoost settings shared by every candidate in the search.
iterations = 300
random_seed = 0
task_type = 'CPU'
loss_function = "Logloss"
verbose = False

# Hyper-parameter grid to search over.
params = dict(
    learning_rate=[0.01, 0.1, 0.2],   # learning rate
    depth=[4, 6, 8, 10],              # tree depth
    l2_leaf_reg=[1, 3, 5, 10],        # L2 regularization
    bagging_temperature=[0, 0.5, 1],  # bootstrap
)

# Base estimator handed to the grid search.
model = CatBoostClassifier(
    iterations=iterations,
    random_seed=random_seed,
    task_type=task_type,
    loss_function=loss_function,
    verbose=verbose,
)

In [10]:
# Exhaustive search over `params`, scored by ROC AUC with 2-fold
# cross-validation, using all available cores.
cv = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
)

# fit() returns the fitted search object, so `clf` refers to the same object as `cv`.
clf = cv.fit(X_train, y_train)

In [11]:
# Winning hyper-parameter combination found by the search.
best_params = clf.best_params_

# Per-candidate cross-validation results as a table.
cv_results = pd.DataFrame(data=clf.cv_results_)

In [13]:
# Final model: the same fixed settings as the base estimator plus the
# hyper-parameters selected by the grid search.
model_best = CatBoostClassifier(
    **best_params,
    iterations=iterations,
    random_seed=random_seed,
    task_type=task_type,
    loss_function=loss_function,
    verbose=verbose,
)

# Train on the full training split.
model_best.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f912450e080>

In [14]:
# Probability of the second class (column index 1) for each test row.
probas = model_best.predict_proba(X_test)[:, 1]

# Hard class labels for the same rows.
prediction = model_best.predict(X_test)

In [15]:
# Quality metrics on the hold-out set.
#
# confusion_matrix(...).ravel() on a binary problem yields (tn, fp, fn, tp);
# with normalize='all' each value is a share of all test samples.
# err1 is the false-positive share (type I error) and err2 is the
# false-negative share (type II error).
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()

# Probability-based metrics are computed on the predicted probabilities,
# label-based metrics on the hard class predictions.
auc = roc_auc_score(y_test, probas)
logloss = log_loss(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

# Collect everything in one dict so it can be logged in a single call.
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

# Extra metrics from the cross-validation results, averaged over all
# grid candidates.
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()      # mean fit time
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()        # std of fit time
metrics['mean_test_score'] = cv_results['mean_test_score'].mean()  # mean validation score
metrics['std_test_score'] = cv_results['std_test_score'].mean()    # std of validation score
metrics['best_score'] = clf.best_score_                            # best cross-validation score

In [16]:
# Requirements file passed along when the model is logged.
pip_requirements = '../requirements.txt'

# First ten test rows serve as the input example stored with the model.
input_example = X_test.head(10)

# Model signature inferred from the test features and the predicted labels.
signature = mlflow.models.infer_signature(X_test, prediction)

  inputs = _infer_schema(model_input) if model_input is not None else None


In [18]:
# Resolve the experiment id, creating the experiment on first use.
# get_experiment_by_name returns None when no experiment has this name,
# so reading .experiment_id from it directly would raise AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

In [24]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Keep the run id so it is still available after the run is closed.
    run_id = run.info.run_id

    # Parameters of the final model and the metrics computed earlier.
    mlflow.log_params(model_best.get_params())
    mlflow.log_metrics(metrics)

    # Store the grid-search object as an artifact under "cv".
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

    # Log the final CatBoost model under "models" and register it under
    # REGISTRY_MODEL_NAME, waiting up to 60 seconds for the registration.
    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
        await_registration_for=60,
    )

Registered model 'churn_model_maximpetrov' already exists. Creating a new version of this model...
2025/02/11 11:03:54 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_maximpetrov, version 17
Created version '17' of model 'churn_model_maximpetrov'.


In [25]:
# Display the id of the MLflow run (taken from run.info.run_id in the logging cell).
run_id

'c844dbf2fc1d4c0580eb18dcb91cf046'