In [1]:
# Загрузка данных
import psycopg
import pandas as pd
from dotenv import load_dotenv
import os
load_dotenv()

connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.environ["DB_DESTINATION_HOST"], 
    "port": os.environ["DB_DESTINATION_PORT"],
    "dbname": os.environ["DB_DESTINATION_NAME"],
    "user": os.environ["DB_DESTINATION_USER"],
    "password": os.environ["DB_DESTINATION_PASSWORD"],
}

assert all([var_value != "" for var_value in list(postgres_credentials.values())])
connection.update(postgres_credentials)

TABLE_NAME = "clean_users_churn"
with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

data = pd.DataFrame(data, columns=columns)

In [2]:
# Обучение модели
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders import CatBoostEncoder

cat_features = data.select_dtypes(include='object')
potential_binary_features = cat_features.nunique() == 2

binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = data.select_dtypes(['float'])

binary_cols = binary_cat_features.columns.tolist()
non_binary_cat_cols = other_cat_features.columns.tolist()
num_cols = num_features.columns.tolist()

X_tr, X_val, y_tr, y_test = train_test_split(data, data['target'], stratify=data['target'])

preprocessor = ColumnTransformer(
    [
    ('binary', OneHotEncoder(drop='if_binary'), binary_cols),
    ('cat', CatBoostEncoder(), non_binary_cat_cols),
    ('num', StandardScaler(), num_cols)
    ],
    remainder='drop',
    verbose_feature_names_out=False
)
model = CatBoostClassifier(auto_class_weights='Balanced')

# создайте пайплайн
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)]
)

# обучите пайплайн
pipeline.fit(X_tr, y_tr)

# получите предсказания для тестовой выборки
prediction = pipeline.predict(X_val)
probas = pipeline.predict_proba(X_val)

Learning rate set to 0.020938
0:	learn: 0.6842135	total: 49ms	remaining: 49s
1:	learn: 0.6771797	total: 52.8ms	remaining: 26.4s
2:	learn: 0.6685509	total: 55.7ms	remaining: 18.5s
3:	learn: 0.6598897	total: 58.6ms	remaining: 14.6s
4:	learn: 0.6521227	total: 61.6ms	remaining: 12.3s
5:	learn: 0.6453220	total: 64.4ms	remaining: 10.7s
6:	learn: 0.6392787	total: 67.1ms	remaining: 9.52s
7:	learn: 0.6339055	total: 70.2ms	remaining: 8.71s
8:	learn: 0.6268729	total: 73.1ms	remaining: 8.05s
9:	learn: 0.6217045	total: 76ms	remaining: 7.53s
10:	learn: 0.6151740	total: 79.1ms	remaining: 7.11s
11:	learn: 0.6088583	total: 82.1ms	remaining: 6.76s
12:	learn: 0.6027748	total: 85.8ms	remaining: 6.51s
13:	learn: 0.5978981	total: 88.5ms	remaining: 6.24s
14:	learn: 0.5935511	total: 91.4ms	remaining: 6s
15:	learn: 0.5882329	total: 94.4ms	remaining: 5.81s
16:	learn: 0.5840079	total: 97.3ms	remaining: 5.63s
17:	learn: 0.5800999	total: 100ms	remaining: 5.47s
18:	learn: 0.5769146	total: 103ms	remaining: 5.33s
19:

In [3]:
model = pipeline['model']
preprocessor = pipeline['preprocessor']

In [4]:
# Метрики
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, roc_auc_score, confusion_matrix

metrics = {}
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test.tolist(), probas[:,1])
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [17]:
import mlflow

EXPERIMENT_NAME = "churn_kamilovmarsel"
RUN_NAME = "train_users_churn_2"
REGISTRY_MODEL_NAME = "train_users_churn_model"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.set_experiment(EXPERIMENT_NAME).experiment_id
if not experiment_id: 
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_val, prediction)
input_example = X_val[:10]
metadata = {'model_type': 'monthly'}

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics) # ваш код здесь
    
    # # логируем файлы как артефакты эксперимента — 'columns.txt' и 'users_churn.csv'
    # mlflow.log_artifact('columns.txt', "dataframe") # ваш код здесь
    # mlflow.log_artifact('users_churn.csv', "dataframe") # ваш код здесь

    mlflow.log_artifact('test_artifact.txt', "dataframe") # ваш код здесь

    model_info = mlflow.catboost.log_model(cb_model=model,
        artifact_path="models",
        signature=signature,
        metadata=metadata,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        # await_registration_for=60
		)
    
run = mlflow.get_run(run_id) # ваш код здесь

print("EXPERIMENT_NAME: ", EXPERIMENT_NAME)
print("experiment_id: ", experiment_id)
print("run_id: ", run_id)

Registered model 'train_users_churn_model' already exists. Creating a new version of this model...
2024/10/19 07:57:01 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: train_users_churn_model, version 2
Created version '2' of model 'train_users_churn_model'.


EXPERIMENT_NAME:  churn_kamilovmarsel
experiment_id:  2
run_id:  00fa46db336146b49b73ab41ebc6e9b1


In [13]:
loaded_model = mlflow.catboost.load_model(model_uri=model_info.model_uri) 
model_predictions = loaded_model.predict(preprocessor.transform(X_val))
assert model_predictions.dtype == int
print(model_predictions[:10])

Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 31.93it/s]


[1 0 1 1 0 1 0 1 0 0]


In [15]:
print("Tracking URI: ", mlflow.get_tracking_uri())
print("model_uri: ", model_info.model_uri)
print("Run id:", run.info.run_id)
print("Experiment:", run.info.experiment_id)

assert "FINISHED" == run.info.status
# os.remove('columns.txt')
# os.remove('users_churn.csv')

Tracking URI:  http://127.0.0.1:5000
model_uri:  runs:/ffb73657b7ec41f6837d6df6ee2ce658/models
Run id: ffb73657b7ec41f6837d6df6ee2ce658
Experiment: 2
