In [7]:
import os

import psycopg
import pandas as pd
import numpy as np
import mlflow
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.metrics import (precision_score, recall_score, f1_score, 
                             roc_auc_score, log_loss, confusion_matrix)

from dotenv import load_dotenv, find_dotenv

In [8]:
# подгружаем .env
load_dotenv()

True

In [9]:
# Считываем все креды
src_host = os.environ.get('DB_SOURCE_HOST')
src_port = os.environ.get('DB_SOURCE_PORT')
src_username = os.environ.get('DB_SOURCE_USER')
src_password = os.environ.get('DB_SOURCE_PASSWORD')
src_db = os.environ.get('DB_SOURCE_NAME') 

dst_host = os.environ.get('DB_DESTINATION_HOST')
dst_port = os.environ.get('DB_DESTINATION_PORT')
dst_username = os.environ.get('DB_DESTINATION_USER')
dst_password = os.environ.get('DB_DESTINATION_PASSWORD')
dst_db = os.environ.get('DB_DESTINATION_NAME')

s3_bucket = os.environ.get('S3_BUCKET_NAME')
s3_access_key = os.environ.get('AWS_ACCESS_KEY_ID')
s3_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

In [10]:
TABLE_NAME = "users_churn"

connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,6023,5025-GOOKI,2018-06-01,NaT,Month-to-month,Yes,Credit card (automatic),18.9,347.65,,...,,,,,Female,0,No,No,No,0
1,6024,4698-KVLLG,2015-11-01,NaT,Two year,No,Credit card (automatic),19.6,967.9,,...,,,,,Female,1,No,No,No,0


In [11]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = s3_access_key
os.environ["AWS_SECRET_ACCESS_KEY"] = s3_secret_access_key

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [8]:
### GRID SEARCH

In [None]:
EXPERIMENT_NAME = 'model_grid_search_mmakarov'# ваш код здесь
RUN_NAME = 'model_grid_search' # ваш код здесь
REGISTRY_MODEL_NAME = 'Model_churn_mmakarov_GS'# ваш код здесь

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "monthly_charges"
stratify_column = 'target'
test_size = 2000

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                    test_size=test_size, 
                                                    shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

params = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': np.logspace(-3, -1, 5),
    'iterations': [10, 20, 30],
    'l2_leaf_reg': np.logspace(-2, 0, 3)
} 

model = CatBoostClassifier(verbose=verbose, 
                           loss_function=loss_function, 
                           random_seed=random_seed, 
                           task_type=task_type, 
                           iterations=iterations)# ваш код здесь

cv = GridSearchCV(cv=2, param_grid=params, n_jobs=-1, estimator=model)# ваш код здесь

clf = cv.fit(X_train, y_train) # ваш код здесь

cv_results = pd.DataFrame(clf.cv_results_)
best_params = clf.best_params_

model_best = CatBoostClassifier(**best_params, 
                                verbose=verbose, 
                                loss_function=loss_function, 
                                random_seed=random_seed, 
                                task_type=task_type)# ваш код здесь (объявите вашу модель с подобранными best_params)

model_best.fit(X_train, y_train)

prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()# ошибки первого и второго рода
auc = roc_auc_score(y_test, probas)# площадь под ROC-кривой
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()# среднее время обучения
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()# стандартное отклонение времени обучения
metrics['mean_test_score'] = cv_results['mean_test_score'].mean() # средний результат на тесте
metrics['std_test_score'] = cv_results['std_test_score'].mean()# стандартное отклонение результата на тесте
metrics['best_score'] = clf.best_score_# лучший результат кросс-валидации

# настройки для логирования в MLFlow
pip_requirements = '../requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:

    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model(
        await_registration_for=60, 
        cb_model=model_best, 
        signature=signature, 
        artifact_path='models',
        registered_model_name=REGISTRY_MODEL_NAME, 
        input_example=input_example, 
        pip_requirements=pip_requirements)



Размер выборки для обучения: (5043, 3)
Размер выборки для теста: (2000, 3)


  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'Model_churn_mmakarov_GS'.
2025/03/23 10:54:34 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: Model_churn_mmakarov_GS, version 1
Created version '1' of model 'Model_churn_mmakarov_GS'.


In [6]:
### RANDOM SEARCH

In [None]:
EXPERIMENT_NAME = 'model_random_search_mmakarov'# ваш код здесь
RUN_NAME = 'model_random_search' # ваш код здесь
REGISTRY_MODEL_NAME = 'Model_churn_mmakarov_RS'# ваш код здесь

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "monthly_charges"
stratify_column = 'target'
test_size = 2000

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
                                                    test_size=test_size, 
                                                    shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

params = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': np.logspace(-3, -1, 5),
    'iterations': [10, 20, 30],
    'l2_leaf_reg': np.logspace(-2, 0, 3)
} 

model = CatBoostClassifier(verbose=verbose, 
                           loss_function=loss_function, 
                           random_seed=random_seed, 
                           task_type=task_type, 
                           iterations=iterations)# ваш код здесь

cv = RandomizedSearchCV(cv=2, 
                        n_jobs=-1, n_iter=20, 
                        param_distributions=params, 
                        estimator=model)

clf = cv.fit(X_train, y_train) 

cv_results = pd.DataFrame(clf.cv_results_)
best_params = clf.best_params_

model = CatBoostClassifier(**best_params, 
                           verbose=verbose, 
                           loss_function=loss_function, 
                           random_seed=random_seed, 
                           iterations=iterations, 
                           task_type=task_type)

model.fit(X_train, y_train)

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)# площадь под ROC-кривой
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()# среднее время обучения
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()# стандартное отклонение времени обучения
metrics['mean_test_score'] = cv_results['mean_test_score'].mean() # средний результат на тесте
metrics['std_test_score'] = cv_results['std_test_score'].mean()# стандартное отклонение результата на тесте
metrics['best_score'] = clf.best_score_# лучший результат кросс-валидации

# настройки для логирования в MLFlow
pip_requirements = '../requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:

    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model(
        await_registration_for=60, cb_model=model, signature=signature, artifact_path='models', registered_model_name=REGISTRY_MODEL_NAME, input_example=input_example, pip_requirements=pip_requirements)

Размер выборки для обучения: (5043, 3)
Размер выборки для теста: (2000, 3)


TypeError: RandomizedSearchCV.__init__() got an unexpected keyword argument 'param_grid'