In [1]:
import pandas as pd
import os
import mlflow
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostClassifier
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, log_loss


In [2]:
load_dotenv()

True

In [3]:
def create_connection():
    host = os.environ.get("DB_DESTINATION_HOST")
    port = os.environ.get("DB_DESTINATION_PORT")
    db = os.environ.get("DB_DESTINATION_NAME")
    username = os.environ.get("DB_DESTINATION_USER")
    password = os.environ.get("DB_DESTINATION_PASSWORD")
    
    conn = create_engine(
        f"postgresql://{username}:{password}@{host}:{port}/{db}",
        connect_args={"sslmode": "require"},
    )
    return conn

def get_data():
    conn = create_connection()
    data = pd.read_sql("select * from clean_users_churn", conn)
    conn.dispose()
    return data

def division_into_data_types(data: pd.DataFrame):
    """разделения ДФ на столбцы по типу данных"""

    cat_features = data.select_dtypes(include="object")
    potential_binary_features = cat_features.nunique() == 2

    binary_cat_features = cat_features[
        potential_binary_features[potential_binary_features].index
    ]
    other_cat_features = cat_features[
        potential_binary_features[~potential_binary_features].index
    ]
    num_features = data.select_dtypes(["float"])
    return binary_cat_features, other_cat_features, num_features

In [4]:
df = get_data()
binary_cat_features, other_cat_features, num_features = division_into_data_types(df)

In [5]:
preprocessor = ColumnTransformer(
        [
            (
                "binary",
                OneHotEncoder(drop='if_binary', sparse_output=False),
                binary_cat_features.columns.tolist(),
            ),
            (
                "cat",
                CatBoostEncoder(return_df=False),
                other_cat_features.columns.tolist(),
            ),
            ("num", StandardScaler(), num_features.columns.tolist()),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )

In [6]:
data_array = preprocessor.fit_transform(df, df["target"])
feature_names = preprocessor.get_feature_names_out()
clean_data = pd.DataFrame(data_array, columns=feature_names)


In [7]:
model = LogisticRegression()
X = clean_data
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

In [8]:
model.fit(X_train, y_train)


In [9]:
prediction = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

In [10]:
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, proba)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [5]:
EXPERIMENT_NAME = "churn_fio"
RUN_NAME = "model_0_registry"
REGISTRY_MODEL_NAME = "churn_model_maximpetrov"

In [6]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
os.environ["AWS_BUCKET_NAME"] = os.getenv("STUDENT_S3_BUCKET")

mlflow.set_registry_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')
mlflow.set_tracking_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')

In [7]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(
        EXPERIMENT_NAME
    )
else:
    experiment_id = experiment.experiment_id

In [17]:
pip_requirements = '../requirements.txt'
signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
input_example = X_train.iloc[:1].to_dict(orient="records")
metadata = {
    "model_type": "LogisticRegression",
    "task": "binary_classification"
    }

In [18]:
if mlflow.active_run():
    mlflow.end_run()

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)

    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        pip_requirements=pip_requirements,        
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60
    )


Registered model 'churn_model_maximpetrov' already exists. Creating a new version of this model...
2025/02/01 18:00:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_maximpetrov, version 4
Created version '4' of model 'churn_model_maximpetrov'.


In [20]:
model_uri = "models:/churn_model_maximpetrov/3"
model_uri = model_info.model_uri
loaded_model = mlflow.sklearn.load_model(model_uri)

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

In [21]:
predictions = loaded_model.predict(X_test)

In [None]:
filter_string = f"name = '{REGISTRY_MODEL_NAME}'"

client = mlflow.MlflowClient()
models = mlflow.search_model_versions(filter_string=filter_string)

[print(f"Model info:\n {model}") for model in models]

Model info:
 <ModelVersion: aliases=[], creation_timestamp=1738436081153, current_stage='Archived', description='', last_updated_timestamp=1738568596819, name='churn_model_maximpetrov', run_id='4003503b60d54a8ea7a6b744475e2230', run_link='', source='s3://s3-student-mle-20241219-a60d0b01a0/3/4003503b60d54a8ea7a6b744475e2230/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='6'>
Model info:
 <ModelVersion: aliases=[], creation_timestamp=1738430946582, current_stage='Staging', description='', last_updated_timestamp=1738568596819, name='churn_model_maximpetrov', run_id='d44c3a48f97a4b67833bc8b33f16766f', run_link='', source='s3://s3-student-mle-20241219-a60d0b01a0/3/d44c3a48f97a4b67833bc8b33f16766f/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>
Model info:
 <ModelVersion: aliases=[], creation_timestamp=1738436310733, current_stage='Archived', description='', last_updated_timestamp=1738568588144, name='churn_model_max

[None, None, None, None, None, None, None]

In [29]:
experiment_runs = mlflow.search_runs(experiment_ids=[experiment_id])
runs = experiment_runs[[
	"run_id", "start_time", 'metrics.err1', 'metrics.err2', 'metrics.logloss', 'metrics.recall', 'metrics.auc',
'metrics.f1', 'metrics.precision']]
runs

Unnamed: 0,run_id,start_time,metrics.err1,metrics.err2,metrics.logloss,metrics.recall,metrics.auc,metrics.f1,metrics.precision
0,ac922914d7ef4b359f72778de9e8ab67,2025-02-01 18:58:30.341000+00:00,0.060399,0.14302,7.39357,0.49703,0.848524,0.582367,0.703081
1,4003503b60d54a8ea7a6b744475e2230,2025-02-01 18:54:40.722000+00:00,0.060399,0.14302,7.39357,0.49703,0.848524,0.582367,0.703081
2,203d1e3bcb7c4928be527935a02dceca,2025-02-01 18:54:28.507000+00:00,0.060399,0.14302,7.39357,0.49703,0.848524,0.582367,0.703081
3,ac8437e659594923a41c06ef661a9594,2025-02-01 18:50:17.202000+00:00,0.060399,0.14302,7.39357,0.49703,0.848524,0.582367,0.703081
4,aa8abaf8a96945ce9091abdecffafd0c,2025-02-01 18:31:41.983000+00:00,,,,,,,
5,765ab65c30b9455c9ba2dbce0e1a9b97,2025-02-01 18:31:36.986000+00:00,,,,,,,
6,1187d59bdcd448c698b100757e35ff4b,2025-02-01 18:30:56.547000+00:00,,,,,,,
7,a57949bb43a94033b0d94c37b6cf0ab0,2025-02-01 18:30:20.109000+00:00,,,,,,,
8,092b21c5740b4d2abcfe1142290676b8,2025-02-01 18:00:06.757000+00:00,0.063818,0.140171,7.023891,0.516807,0.848644,0.589928,0.687151
9,23dd8dc3cae047839317ea5e3c6df028,2025-02-01 17:35:00.424000+00:00,0.076923,0.128205,7.537334,0.492341,0.811221,0.550796,0.625


In [34]:
run_id = 'ac922914d7ef4b359f72778de9e8ab67'
metric = 'logloss'
client.get_metric_history(run_id, metric)

[<Metric: key='logloss', step=0, timestamp=1738436310387, value=7.393569925972749>]

In [38]:
run = client.get_run(run_id)
artifact_uri = run.info.artifact_uri
print(f"Путь артефакта в S3: '{artifact_uri}'") 

Путь артефакта в S3: 's3://s3-student-mle-20241219-a60d0b01a0/3/ac922914d7ef4b359f72778de9e8ab67/artifacts'


In [39]:
mlflow.artifacts.download_artifacts(artifact_uri=artifact_uri, dst_path='./artifacts_local')

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

'/home/mle-user/mle_projects/mle-mlflow/notebooks/artifacts_local/artifacts'

In [11]:
REGISTRY_MODEL_VERSION = 7

model_uri = client.get_model_version_download_uri(REGISTRY_MODEL_NAME, REGISTRY_MODEL_VERSION)

model = mlflow.sklearn.load_model(model_uri)
print(f"Путь до модели {REGISTRY_MODEL_NAME} версии {REGISTRY_MODEL_VERSION} в S3: '{model_uri}'") 


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Путь до модели churn_model_maximpetrov версии 7 в S3: 's3://s3-student-mle-20241219-a60d0b01a0/3/ac922914d7ef4b359f72778de9e8ab67/artifacts/models'
