In [4]:
import pandas as pd
import os
import mlflow
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostClassifier
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, log_loss


In [5]:
# Populate the process environment via python-dotenv so that the
# os.environ / os.getenv lookups further down can find their settings.
load_dotenv()

True

In [6]:
def create_connection():
    """Build a SQLAlchemy engine for the destination PostgreSQL database.

    Connection settings are read from the environment variables
    ``DB_DESTINATION_HOST``, ``DB_DESTINATION_PORT``, ``DB_DESTINATION_NAME``,
    ``DB_DESTINATION_USER`` and ``DB_DESTINATION_PASSWORD``.

    Returns:
        The engine returned by ``create_engine``, configured with
        ``sslmode=require``.

    Raises:
        RuntimeError: if any of the required environment variables is unset.
    """
    # Function-scope import keeps this definition self-contained.
    from urllib.parse import quote_plus

    required = (
        "DB_DESTINATION_HOST",
        "DB_DESTINATION_PORT",
        "DB_DESTINATION_NAME",
        "DB_DESTINATION_USER",
        "DB_DESTINATION_PASSWORD",
    )
    settings = {name: os.environ.get(name) for name in required}

    # Fail fast: an unset variable would otherwise be interpolated into the
    # URL as the literal text "None" and only blow up (or mis-authenticate)
    # later, far from the real cause.
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            "Missing required environment variables: " + ", ".join(missing)
        )

    host = settings["DB_DESTINATION_HOST"]
    port = settings["DB_DESTINATION_PORT"]
    db = settings["DB_DESTINATION_NAME"]
    # Credentials may contain URL-reserved characters ('@', ':', '/', ...);
    # they must be percent-encoded or the URL is parsed incorrectly.
    username = quote_plus(settings["DB_DESTINATION_USER"])
    password = quote_plus(settings["DB_DESTINATION_PASSWORD"])

    conn = create_engine(
        f"postgresql://{username}:{password}@{host}:{port}/{db}",
        connect_args={"sslmode": "require"},
    )
    return conn

def get_data():
    """Load the whole ``clean_users_churn`` table into a DataFrame.

    Opens an engine via ``create_connection``, runs the query with
    ``pd.read_sql`` and always disposes of the engine afterwards.

    Returns:
        pd.DataFrame: the result of ``select * from clean_users_churn``.
    """
    conn = create_connection()
    try:
        return pd.read_sql("select * from clean_users_churn", conn)
    finally:
        # Release pooled connections even when the query raises.
        conn.dispose()

def division_into_data_types(data: pd.DataFrame):
    """Split the columns of a DataFrame into groups by data type.

    Args:
        data: the frame whose columns are to be grouped.

    Returns:
        A 3-tuple of DataFrames (column subsets of ``data``):
        ``object`` columns with exactly two distinct values,
        the remaining ``object`` columns, and the ``float`` columns.
    """
    object_columns = data.select_dtypes(include="object")

    # True for every object column that holds exactly two distinct values.
    has_two_values = object_columns.nunique() == 2
    mask = has_two_values.to_numpy()

    two_valued_names = has_two_values.index[mask]
    multi_valued_names = has_two_values.index[~mask]

    return (
        object_columns[two_valued_names],
        object_columns[multi_valued_names],
        data.select_dtypes(["float"]),
    )

In [7]:
# Pull the churn table from the database, then group its columns by dtype
# (two-valued object columns, other object columns, float columns).
df = get_data()
(
    binary_cat_features,
    other_cat_features,
    num_features,
) = division_into_data_types(df)

In [8]:
# Column-wise preprocessing: each column group gets its own transformer and
# every column not listed in a group is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        # Two-valued object columns -> a single dense 0/1 indicator column each.
        (
            "binary",
            OneHotEncoder(drop='if_binary', sparse_output=False),
            list(binary_cat_features.columns),
        ),
        # Remaining object columns -> CatBoost encoding (needs y at fit time).
        (
            "cat",
            CatBoostEncoder(return_df=False),
            list(other_cat_features.columns),
        ),
        # Float columns -> standardised.
        (
            "num",
            StandardScaler(),
            list(num_features.columns),
        ),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [9]:
# Fit the preprocessor on the full table (the target is required by the
# target-based encoder) and wrap the resulting array in a labelled DataFrame.
# NOTE(review): this fit sees every row, including rows that are later held
# out for evaluation — anything evaluated on a hold-out set should use a
# preprocessor fitted on the training rows only.
data_array = preprocessor.fit_transform(df, df["target"])
feature_names = preprocessor.get_feature_names_out()
clean_data = pd.DataFrame(data=data_array, columns=feature_names)


In [10]:
model = LogisticRegression()
X = clean_data  # kept so code that refers to the fully-encoded frame still works
y = df['target']

# Split the RAW rows first, then fit the preprocessor on the training rows
# only. Fitting the target-based encoder and the scaler on all rows before
# splitting lets information from the test rows (including their targets)
# leak into the features and makes the hold-out metrics optimistic.
# random_state makes the split reproducible; stratify keeps the class balance
# of the target identical in both parts.
df_train, df_test, y_train, y_test = train_test_split(
    df, y, train_size=0.75, random_state=42, stratify=y
)

X_train = pd.DataFrame(
    preprocessor.fit_transform(df_train, y_train),
    columns=preprocessor.get_feature_names_out(),
    index=df_train.index,
)
# Test rows are only transformed with the statistics learned on train.
X_test = pd.DataFrame(
    preprocessor.transform(df_test),
    columns=preprocessor.get_feature_names_out(),
    index=df_test.index,
)

In [11]:
# Train the logistic-regression model on the training split.
model.fit(X_train, y_train)


In [12]:
# Hard class labels for the test split.
prediction = model.predict(X_test)

# Per-class probabilities; the second column is kept as the score.
class_probabilities = model.predict_proba(X_test)
proba = class_probabilities[:, 1]

In [13]:
# For binary labels confusion_matrix(...).ravel() yields (tn, fp, fn, tp).
# With normalize='all' each cell is a share of all test samples, so
# err1 is the false-positive share (type I error) and
# err2 is the false-negative share (type II error).
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()

# Ranking / probability-based metrics use the predicted probabilities ...
auc = roc_auc_score(y_test, proba)
# log loss on hard 0/1 labels only measures the error rate at a clipped
# extreme, so it is computed from the probabilities as well.
logloss = log_loss(y_test, proba)

# ... while threshold-based metrics use the hard class labels.
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

In [14]:
# Names used when logging to MLflow below.
EXPERIMENT_NAME = "churn_fio"  # experiment that is looked up / created
RUN_NAME = "model_0_registry"  # name given to the logged run
REGISTRY_MODEL_NAME = "churn_model_maximpetrov"  # name in the model registry

In [15]:
# Address of the MLflow server, used for both tracking and the registry.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Re-export the bucket name under AWS_BUCKET_NAME.
# Note: if STUDENT_S3_BUCKET is unset, os.getenv returns None and this
# assignment raises TypeError.
os.environ["AWS_BUCKET_NAME"] = os.getenv("STUDENT_S3_BUCKET")

tracking_server_url = f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}'
mlflow.set_registry_uri(tracking_server_url)
mlflow.set_tracking_uri(tracking_server_url)

In [16]:
# Reuse the experiment when it already exists; otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    experiment.experiment_id
    if experiment is not None
    else mlflow.create_experiment(EXPERIMENT_NAME)
)

In [17]:
# Everything that accompanies the model artifact when it is logged.
pip_requirements = '../requirements.txt'

# Input/output schema inferred from the training features and the model's
# predictions on them.
train_predictions = model.predict(X_train)
signature = mlflow.models.infer_signature(X_train, train_predictions)

# One training row, as a list with a single {column: value} record.
input_example = X_train.head(1).to_dict(orient="records")

metadata = dict(
    model_type="LogisticRegression",
    task="binary_classification",
)

In [18]:
# Close a run that is still active (e.g. left over from a previous execution
# of this cell) before starting a new one.
if mlflow.active_run():
    mlflow.end_run()

# Log the metrics and the fitted model in a single run; passing
# registered_model_name also registers the model under that name.
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)

    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
        metadata=metadata,
    )


Registered model 'churn_model_maximpetrov' already exists. Creating a new version of this model...
2025/02/01 18:00:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_maximpetrov, version 4
Created version '4' of model 'churn_model_maximpetrov'.


In [20]:
# Load back exactly the model that was just logged. The previous hard-coded
# "models:/churn_model_maximpetrov/3" assignment was overwritten on the next
# line and never used, so it has been removed.
model_uri = model_info.model_uri
loaded_model = mlflow.sklearn.load_model(model_uri)

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

In [21]:
# Predict on the test features with the model reloaded from MLflow.
predictions = loaded_model.predict(X_test)