In [1]:
import os
import psycopg
from dotenv import load_dotenv
from datetime import date

import pandas as pd
import numpy as np
import mlflow
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score,
    confusion_matrix,
    log_loss)

In [2]:
load_dotenv()

True

In [35]:
TABLE_NAME = 'users_churn' # таблица с данными

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_fio" # название эксперимента
RUN_NAME = "model_validation" 
REGISTRY_MODEL_NAME = 'churn_model_maximpetrov' # название зарегистрированной модели 

In [4]:
connection = {'sslmode' : 'require', 'target_session_attrs' : 'read-write'}

postgres_credetials = {
    'dbname' : os.getenv('DB_DESTINATION_NAME'),
    'host' : os.getenv('DB_DESTINATION_HOST'),
    'port' : os.getenv('DB_DESTINATION_PORT'),
    'user' : os.getenv('DB_DESTINATION_USER'),
    'password' : os.getenv('DB_DESTINATION_PASSWORD'),
}

connection.update(postgres_credetials)

In [5]:
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f'SELECT * FROM {TABLE_NAME}')
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

        df = pd.DataFrame(data, columns=columns)

In [6]:
columns_without_datetime = df.select_dtypes(exclude='datetime').columns
df = df.dropna(subset=columns_without_datetime)
y = df['target']

In [7]:
model = LogisticRegression()

In [8]:
model_uri = 's3://s3-student-mle-20241219-a60d0b01a0/4/24906d24e7e74b5da4a8b81daca6e18c/artifacts/column_transformer'

preprocessor = mlflow.sklearn.load_model(model_uri)
encoded_features = preprocessor.fit_transform(df)
df = pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out())


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

In [10]:
model.fit(X_train, y_train)

In [11]:
prediction = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

In [12]:
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, proba)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [13]:
metrics

{'err1': 0.14486754966887416,
 'err2': 0.19619205298013245,
 'auc': 0.7503396612693095,
 'precision': 0.5752427184466019,
 'recall': 0.5954773869346733,
 'f1': 0.5851851851851851,
 'logloss': 10.025387035383579}

In [31]:
mlflow.set_tracking_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')
mlflow.set_registry_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')

In [36]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(
        EXPERIMENT_NAME
    )
else:
    experiment_id = experiment.experiment_id

In [37]:
pip_requirements = '../requirements.txt'
signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
input_example = X_train.iloc[:1].to_dict(orient="records")
metadata = {
    "model_type": "LogisticRegression",
    "task": "binary_classification"
    }

In [38]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)

    model_info = mlflow.sklearn.log_model(
    sk_model=model,
    pip_requirements=pip_requirements,        
    signature=signature,
    input_example=input_example,
    metadata=metadata,
    artifact_path="models",
    registered_model_name=REGISTRY_MODEL_NAME,
    await_registration_for=60
    )

Registered model 'churn_model_maximpetrov' already exists. Creating a new version of this model...
2025/02/07 11:43:48 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_maximpetrov, version 8
Created version '8' of model 'churn_model_maximpetrov'.


In [39]:
run_id

'86390ea93c9a4769877cb7dc2bb0ee3e'