In [1]:
import os
import psycopg
import mlflow
import pandas as pd
from catboost import CatBoostClassifier
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the
# os.getenv lookups used for credentials can find them.
load_dotenv()

# --- Data source -----------------------------------------------------------
# Table that is read in full to build the working DataFrame.
TABLE_NAME = "users_churn"

# --- MLflow tracking server -------------------------------------------------
# Host and port used to build both the tracking URI and the registry URI.
TRACKING_SERVER_HOST, TRACKING_SERVER_PORT = "127.0.0.1", 5000

# --- MLflow naming ----------------------------------------------------------
# Experiment that receives the run, the run's display name, and the name the
# final model is registered under.
EXPERIMENT_NAME = "churn_marselkamilov_HYPERSEARCH_cv"
RUN_NAME = "model_random_search"
REGISTRY_MODEL_NAME = "churn_marselkamilov_HYPERSEARCH_train_rs"


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Database credentials are read from environment variables; os.getenv returns
# None for any variable that is not set.
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

# Fixed connection options come first, followed by the credentials, which
# yields the same keys in the same order as updating the options dict.
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

# Fetch every row of the table together with the column names reported by
# the cursor; both context managers are closed when the block exits.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    columns = [column_meta[0] for column_meta in cur.description]

df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,No,Male,0,No,No,No,0
2,3,3668-QPYBK,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,...,No,No,No,No,Male,0,No,No,No,1
3,4,7795-CFOCW,2016-05-01,NaT,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,...,Yes,Yes,No,No,Male,0,No,No,,0
4,5,9237-HQITU,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,...,No,No,No,No,Female,0,No,No,No,1


In [3]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Columns fed to the model and the label column.
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

# Hold-out settings.
test_size = 0.2
split_column = "begin_date"
stratify_column = ["type"]  # NOTE(review): defined here but not used by the split below

# Order rows by the split column; with shuffle=False the split then takes the
# leading rows for training and the trailing `test_size` share for testing.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

# Report the shapes of both parts.
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


In [5]:
import numpy as np

# Fixed CatBoost settings shared by every candidate evaluated in the search.
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Base estimator that the search clones for each candidate.
model = CatBoostClassifier(
    loss_function=loss_function,
    iterations=iterations,
    task_type=task_type,
    random_seed=random_seed,
    verbose=verbose,
)

# Search space. NOTE: 'iterations' is part of the space, so the values listed
# here replace the `iterations` value set on the base estimator for every
# candidate that the search fits.
params = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': np.logspace(-3, -1, 5),
    'iterations': [10, 20, 30],
    'l2_leaf_reg': np.logspace(-2, 0, 3)
}

# random_state pins which parameter combinations are sampled; without it the
# candidates (and therefore best_params_ / best_score_) differ between runs
# even though the model itself is seeded.
cv = RandomizedSearchCV(
    model,
    params,
    cv=2,
    n_jobs=-1,
    verbose=False,
    random_state=random_seed,
)

# Fit the search on the training part; `clf` is the fitted search object.
clf = cv.fit(X_train, y_train)


In [6]:
# Print the best hyperparameters found by the search and the best
# cross-validated score, one caption/value pair per line.
search_summary = {
    "Лучшие гиперпараметры:": clf.best_params_,
    "Лучший счет:": clf.best_score_,
}
for caption, value in search_summary.items():
    print(caption, value)

Лучшие гиперпараметры: {'learning_rate': 0.0031622776601683794, 'l2_leaf_reg': 0.09999999999999999, 'iterations': 30, 'depth': 4}
Лучший счет: 0.7405040823571175


In [7]:
# Endpoint of the S3-compatible bucket (Yandex Cloud) that MLflow uses for artifacts.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The bucket credentials must already be present in the environment.
# Re-assigning os.environ[name] = os.getenv(name) does nothing when the
# variable is set and raises an opaque TypeError when it is not, so check
# for presence explicitly and fail with a readable message instead.
missing_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if os.getenv(name) is None
]
if missing_credentials:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_credentials)
    )

# Tracking and model registry are served by the same MLflow server.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Per-candidate results of the search, kept as a DataFrame for later aggregation.
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# Final model built from the best parameters found by the search.
# `iterations` is not passed explicitly because it is one of the searched
# parameters and therefore already contained in best_params.
model_best = CatBoostClassifier(
    **best_params,
    loss_function=loss_function,
    task_type=task_type,
    random_seed=random_seed,
    verbose=verbose,
)

model_best.fit(X_train, y_train)

# Hard class labels and positive-class probabilities for the hold-out set.
prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, roc_auc_score, confusion_matrix

# For binary labels confusion_matrix(...).ravel() is ordered (tn, fp, fn, tp).
# With normalize='all' each entry is a share of all test samples, so
# err1 is the false-positive share (type I error) and err2 is the
# false-negative share (type II error).
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()

# Ranking and loss metrics are computed from predicted probabilities;
# precision, recall and F1 are computed from hard labels.
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# log_loss expects probabilities; passing hard 0/1 labels would only measure
# the clipped penalty of misclassified rows.
logloss = log_loss(y_test, probas)

metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

# Additional metrics from the cross-validation results: each value is the
# mean of the corresponding cv_results column over all evaluated candidates.
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()
metrics['mean_test_score'] = cv_results['mean_test_score'].mean()
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics["best_score"] = clf.best_score_

In [9]:
# Settings for logging to MLflow.
# Path to the requirements file recorded with the model; adjust it if
# requirements.txt is not in the notebook's working directory.
pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

# Reuse the experiment if it already exists, otherwise create it. An explicit
# lookup replaces a bare `except:` that would also hide unrelated failures
# (for example an unreachable tracking server).
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # The fitted search object and the final CatBoost model are stored under
    # different artifact paths so that their model files do not overwrite
    # each other inside the run.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'churn_marselkamilov_HYPERSEARCH_train_rs'.
2024/10/29 19:08:39 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_marselkamilov_HYPERSEARCH_train_rs, version 1
Created version '1' of model 'churn_marselkamilov_HYPERSEARCH_train_rs'.
