In [1]:
import os
import psycopg
from dotenv import load_dotenv
from datetime import date

import pandas as pd
import numpy as np
import mlflow
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score,
    confusion_matrix,
    log_loss)

In [2]:
load_dotenv()

True

In [26]:
TABLE_NAME = 'users_churn' # таблица с данными

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "EDA_experiment" # название эксперимента
RUN_NAME = "model_validation" 
REGISTRY_MODEL_NAME = 'churn_model' # название зарегистрированной модели 

In [4]:
connection = {'sslmode' : 'require', 'target_session_attrs' : 'read-write'}

postgres_credetials = {
    'dbname' : os.getenv('DB_DESTINATION_NAME'),
    'host' : os.getenv('DB_DESTINATION_HOST'),
    'port' : os.getenv('DB_DESTINATION_PORT'),
    'user' : os.getenv('DB_DESTINATION_USER'),
    'password' : os.getenv('DB_DESTINATION_PASSWORD'),
}

connection.update(postgres_credetials)

In [5]:
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f'SELECT * FROM {TABLE_NAME}')
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

        df = pd.DataFrame(data, columns=columns)

In [6]:
columns_without_datetime = df.select_dtypes(exclude='datetime').columns
df = df.dropna(subset=columns_without_datetime)
y = df['target']

In [7]:
model = LogisticRegression()

In [8]:
model_uri = 's3://s3-student-mle-20241219-a60d0b01a0/4/24906d24e7e74b5da4a8b81daca6e18c/artifacts/column_transformer'

preprocessor = mlflow.sklearn.load_model(model_uri)
encoded_features = preprocessor.fit_transform(df)
df = pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out())


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

In [10]:
model.fit(X_train, y_train)

In [11]:
prediction = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

In [12]:
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, proba)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [13]:
metrics

{'err1': 0.09768211920529801,
 'err2': 0.17798013245033112,
 'auc': 0.7868012476119436,
 'precision': 0.6456456456456456,
 'recall': 0.5555555555555556,
 'f1': 0.5972222222222222,
 'logloss': 8.652863810301303}

In [14]:
mlflow.set_registry_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')
mlflow.set_registry_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')

In [22]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(
        EXPERIMENT_NAME
    )
else:
    experiment_id = experiment.experiment_id

In [23]:
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///home/mle-user/mle_projects/mle-mlflow/notebooks/mlruns/399505771290086027', creation_time=1738916588304, experiment_id='399505771290086027', last_update_time=1738916588304, lifecycle_stage='active', name='EDA_experiment', tags={}>

In [20]:
with mlflow.start_run(run_name=RUN_NAME) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)

    # Логируем модель
    model_name = REGISTRY_MODEL_NAME  # Имя зарегистрированной модели
    artifact_path = "models"  # Путь к артефактам
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        registered_model_name=model_name  # Это создаст новую версию
    )


Registered model 'churn_model_maximpetrov' already exists. Creating a new version of this model...


RestException: RESOURCE_DOES_NOT_EXIST: Run with id=d3420944888e4d64b1d790516f0004d7 not found

In [24]:
pip_requirements = '../requirements.txt'
signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
input_example = X_train.iloc[:1].to_dict(orient="records")
metadata = {
    "model_type": "LogisticRegression",
    "task": "binary_classification"
    }

In [None]:
!echo $AWS_BUCKET_NAME

In [28]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)

    mlflow.sklearn.log_model(
    sk_model=model,                     # Ваша новая модель
    artifact_path="models/v2",             # Путь, где будет сохранена модель
    registered_model_name=REGISTRY_MODEL_NAME,  # Имя зарегистрированной модели
    pip_requirements=pip_requirements,  # Зависимости
    signature=signature,                # Подпись модели
    input_example=input_example,        # Пример ввода
    metadata=metadata,                  # Дополнительные метаданные
    await_registration_for=60           # Ожидание регистрации в секундах
    )

Registered model 'churn_model' already exists. Creating a new version of this model...


RestException: RESOURCE_DOES_NOT_EXIST: Run with id=8c96a138a80f4233b0a4dab9b0a11da0 not found

In [34]:
client = mlflow.MlflowClient()

# Укажите нужный run_id

# Получите информацию о запуске
run_info = client.get_run(run_id)
print("Artifact URI:", run_info.info.artifact_uri)

Artifact URI: file:///home/mle-user/mle_projects/mle-mlflow/notebooks/mlruns/985234804674562658/76920c87918a4e8383d7ddd15b0c5575/artifacts


In [18]:
client = mlflow.MlflowClient()
experiments = client.search_experiments()  # Используйте search_experiments()
for exp in experiments:
    print(f"ID: {exp.experiment_id}, Name: {exp.name}")

ID: 985234804674562658, Name: churn_fio
ID: 0, Name: Default
