In [1]:
import os
import psycopg
import mlflow
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, roc_auc_score, confusion_matrix
from statistics import median
from collections import defaultdict
from optuna.integration.mlflow import MLflowCallback
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split

load_dotenv()


* 'schema_extra' has been renamed to 'json_schema_extra'
  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
TABLE_NAME = "users_churn"
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}
connection.update(postgres_credentials)
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,No,Male,0,No,No,No,0
2,3,3668-QPYBK,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,...,No,No,No,No,Male,0,No,No,No,1
3,4,7795-CFOCW,2016-05-01,NaT,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,...,Yes,Yes,No,No,Male,0,No,No,,0
4,5,9237-HQITU,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,...,No,No,No,No,Female,0,No,No,No,1


In [3]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "begin_date"
stratify_column = ["type"]
test_size = 0.2
df = df.sort_values(by=[split_column])

X_train = df[features]
y_train = df[target]

In [4]:
def objective(trial: optuna.Trial) -> float:

    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
    
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]
        
        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]
        
        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)

    auc = median(metrics['auc'])
    return auc

In [5]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_marselkamilov_OPTUNA_50"
RUN_NAME = 'model_bayesian_search'

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model_optuned19"

REGISTRY_MODEL_NAME = "OPTUNA_model"


# os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
# os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("S3_ACCESS_KEY")
# os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("S3_SECRET_KEY")

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [6]:

# experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
# if not experiment:
#     experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
# else:
#     experiment_id = experiment.experiment_id
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

In [7]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

mlflc = MLflowCallback(
    tracking_uri=f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}',
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {'mlflow.parentRunId': run_id}}
)

study = optuna.create_study(direction='maximize', 
                            study_name=STUDY_NAME, 
                            storage=STUDY_DB_NAME, 
                            sampler=optuna.samplers.TPESampler(), 
                            load_if_exists=True,
                            )

study.optimize(objective, n_trials=10, callbacks=[mlflc]) 

  mlflc = MLflowCallback(
[I 2024-11-02 10:01:31,336] A new study created in RDB with name: churn_model_optuned19
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-11-02 10:01:32,265] Trial 0 finished with value: 0.7843215277684671 and parameters: {'learning_rate': 0.006237796033631818, 'depth': 1, 'l2_leaf_reg': 3.369899590602935, 'random_strength': 4.866860635297008}. Best is trial 0 with value: 0.7843215277684671.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-11-02 10:01:33,313] Trial 1 finished with value: 0.7704577178147163 and parameters: {'learning_rate': 0.010018739340919514, 'depth': 1, 'l2_leaf_reg': 2.5560547256944335, 'random_strength': 1.134085928216683}. Best is trial 0 with value: 0.7843215277684671.
[I 2024-11-02 10:01:34,922] Trial 2 finished with value: 0.8209064645826171 and parameters: {'learning_rate': 0.04240174215735648, 'depth': 6, 'l2_leaf_reg': 0.13543169357323853, 'random_strength': 4.431133096149785}. Best is trial 2 with value:

In [10]:
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

model_best = CatBoostClassifier(**study.best_params)

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)
model_best.fit(X_train, y_train)
prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]
pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

with mlflow.start_run(run_id='ef93f34d013a41d4ba5ef429cca1c3ff') as run:
    run_id = run.info.run_id

    mlflow.log_params(study.best_params)
    mlflow.log_metric(key="best_value", value =study.best_value)

    cv_info = mlflow.sklearn.log_model(model_best,"cv")
                                           
    model_info = mlflow.catboost.log_model(cb_model=model_best,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements)

0:	learn: 0.6881311	total: 1.39ms	remaining: 1.39s
1:	learn: 0.6811907	total: 3.06ms	remaining: 1.53s
2:	learn: 0.6747003	total: 4.46ms	remaining: 1.48s
3:	learn: 0.6684645	total: 6.1ms	remaining: 1.52s
4:	learn: 0.6637177	total: 7.47ms	remaining: 1.49s
5:	learn: 0.6580374	total: 8.99ms	remaining: 1.49s
6:	learn: 0.6524582	total: 10.6ms	remaining: 1.5s
7:	learn: 0.6471929	total: 12.2ms	remaining: 1.51s
8:	learn: 0.6416332	total: 13.9ms	remaining: 1.53s
9:	learn: 0.6357566	total: 15.6ms	remaining: 1.54s
10:	learn: 0.6310403	total: 19.3ms	remaining: 1.73s
11:	learn: 0.6274180	total: 20.7ms	remaining: 1.71s
12:	learn: 0.6223383	total: 22.4ms	remaining: 1.7s
13:	learn: 0.6174854	total: 24ms	remaining: 1.69s
14:	learn: 0.6134294	total: 25.7ms	remaining: 1.69s
15:	learn: 0.6088117	total: 27.3ms	remaining: 1.68s
16:	learn: 0.6043216	total: 29.2ms	remaining: 1.69s
17:	learn: 0.6002244	total: 33.4ms	remaining: 1.82s
18:	learn: 0.5972675	total: 35.2ms	remaining: 1.82s
19:	learn: 0.5939494	total:

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'OPTUNA_model' already exists. Creating a new version of this model...
2024/11/02 10:04:40 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: OPTUNA_model, version 8
Created version '8' of model 'OPTUNA_model'.


In [None]:
# model_info.model_uri

In [None]:
# # loaded_model = mlflow.catboost.load_model(model_uri=model_info.model_uri) 
# loaded_model = mlflow.catboost.load_model(model_uri='runs:/9c1ce26fa90d465bb64f6a570135fb81/artifacts/cv')

In [8]:
run = mlflow.get_run(run_id) # ваш код здесь

print("EXPERIMENT_NAME: ", EXPERIMENT_NAME)
print("experiment_id: ", experiment_id)
print("run_id: ", run_id)

EXPERIMENT_NAME:  churn_marselkamilov_OPTUNA_50
experiment_id:  65
run_id:  ef93f34d013a41d4ba5ef429cca1c3ff
