In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from typing import Dict, List, Optional
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow 
from mlflow import MlflowClient
from mlflow.models import infer_signature
from mlflow_utils import create_dataset, register_model_with_client

load_dotenv()

True

In [2]:
def get_classification_metrics(
    y_true: pd.Series, y_pred: pd.Series, prefix: str) -> Dict[str, float]:
    """
    Get the classification metrics.

    :param y_true: The true target values.
    :param y_pred: The predicted target values.
    :param prefix: The prefix of the metric names.
    :return: The classification metrics.
    """

    return {
        f"{prefix}_accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
        f"{prefix}_precision": precision_score(y_true=y_true, y_pred=y_pred),
        f"{prefix}_recall": recall_score(y_true=y_true, y_pred=y_pred),
        f"{prefix}_f1": f1_score(y_true=y_true, y_pred=y_pred),
    }


def get_sklearn_pipeline(
    numerical_features: List[str], categorical_features: Optional[List[str]] = []) -> Pipeline:
    """
    Get the sklearn pipeline.

    :param numerical_features: The numerical features.
    :param categorical_features: The categorical features.
    :return: The sklearn pipeline.
    """

    preprocessing = ColumnTransformer(
        transformers=[
            ("numerical", SimpleImputer(strategy="median"), numerical_features),
            ("categorical", OneHotEncoder(), categorical_features),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing),
            ("model", RandomForestClassifier())
        ]
    )

    return pipeline


def objective_function(
    params: Dict,
    x_train: pd.DataFrame,
    x_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    numerical_features: List[str],
    categorical_features: List[str],
) -> float:
    """
    Objective function to minimize.

    :param params: The hyperparameter values to evaluate.
    :param x_train: The training data.
    :param x_test: The test data.
    :param y_train: The training target.
    :param y_test: The test target.
    :param numerical_features: The numerical features.
    :param categorical_features: The categorical features.
    :return: The score of the model.
    """

    pipeline = get_sklearn_pipeline(numerical_features=numerical_features)
    params.update({"model__max_depth": int(params["model__max_depth"])})
    params.update({"model__n_estimators": int(params["model__n_estimators"])})
    pipeline.set_params(**params)
    with mlflow.start_run(nested=True) as run:
        pipeline.fit(x_train, y_train)
        y_pred = pipeline.predict(x_test)
        metrics = get_classification_metrics(
            y_true=y_test, y_pred=y_pred, prefix="test"
        )

        mlflow.log_params(pipeline["model"].get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(pipeline, f"{run.info.run_id}-model")

    return -metrics["test_f1"]


In [3]:
# conectar con mlflow y minio (por http)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://127.0.0.1:9000"
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('ACCESS_KEY')

# Pipeline



In [4]:
df = create_dataset()

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    df.drop("target", axis=1),
    df["target"],
    test_size=0.2,
    random_state=42,
)

In [6]:
numerical_features = [f for f in x_train.columns if f.startswith("feature")]
print(numerical_features)

['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49']


In [7]:
pipeline = get_sklearn_pipeline(numerical_features=numerical_features)
pipeline.fit(x_train, y_train)
pipeline

In [8]:
pipeline.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(transformers=[('numerical', SimpleImputer(strategy='median'),
                                    ['feature_0', 'feature_1', 'feature_2',
                                     'feature_3', 'feature_4', 'feature_5',
                                     'feature_6', 'feature_7', 'feature_8',
                                     'feature_9', 'feature_10', 'feature_11',
                                     'feature_12', 'feature_13', 'feature_14',
                                     'feature_15', 'feature_16', 'feature_17',
                                     'feature_18', 'feature_19', 'feature_20',
                                     'feature_21', 'feature_22', 'feature_23',
                                     'feature_24', 'feature_25', 'feature_26',
                                     'feature_27', 'feature_28', 'feature_29', ...]),
                                   ('categorical', OneHotEncoder(), [])])),
  ('mode

In [9]:
y_pred = pipeline.predict(x_test)
metrics = get_classification_metrics(y_true=y_test, y_pred=y_pred, prefix="test")
# Falta plots

In [10]:
mlflow.set_experiment("mlflow_tracking")

<Experiment: artifact_location='s3://mlflow/3', creation_time=1718807838682, experiment_id='3', last_update_time=1718807838682, lifecycle_stage='active', name='mlflow_tracking', tags={}>

In [11]:
model_name = "r_model"
tags = {"autor": "oecorrechag", "version": "1.0"}
description = "Classification model"

In [12]:
client = MlflowClient()
client.create_registered_model(model_name, tags=tags, description=description)

with mlflow.start_run(run_name="logging_artifacts26") as run:

    # log metric
    mlflow.log_metrics(metrics)

    # log params
    mlflow.log_params(pipeline.get_params())

    tags = {
        "type": "classifier",
        "mlflow.note.content": "This is a classifier for the house pricing dataset",
    }
    mlflow.set_tags(tags)


    # # log tags
    # mlflow.set_tags({"type": "classifier"})

    # # log description
    # mlflow.set_tag({'mlflow.note.content': 'This is a classifier for the house pricing dataset'})

    # log plots (faltan)

    # print info about the run
    print("run_id: {}".format(run.info.run_id))

RestException: RESOURCE_ALREADY_EXISTS: Registered Model (name=registed model) already exists. Error: (sqlite3.IntegrityError) UNIQUE constraint failed: registered_models.name
[SQL: INSERT INTO registered_models (name, creation_time, last_updated_time, description) VALUES (?, ?, ?, ?)]
[parameters: ('registed model', 1718833911675, 1718833911675, 'Classification model')]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [12]:
print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', mlflow.get_artifact_uri())

tracking uri: http://127.0.0.1:5000
artifact uri: s3://mlflow/3/b7148149b1544ea6b3c336d31d33dfef/artifacts


In [13]:
mlflow.end_run()

## Ejemplo 2

In [14]:
mlflow.set_experiment("mlflow_tracking")

<Experiment: artifact_location='s3://mlflow/3', creation_time=1718807838682, experiment_id='3', last_update_time=1718807838682, lifecycle_stage='active', name='mlflow_tracking', tags={}>

In [18]:
model_name = "r_model2"
tags = {"autor": "oecorrechag", "version": "1.0"}
description = "Classification model"
run_id = '4fcf2259550d4f838be46088dd8d0bc2'

In [19]:
register_model_with_client(model_name=model_name, run_id=run_id, artifact_path='model_26')

with mlflow.start_run(run_name="logging_artifacts26B") as run:

    # log metric
    mlflow.log_metrics(metrics)

    # log params
    mlflow.log_params(pipeline.get_params())

    tags = {
        "type": "classifier",
        "mlflow.note.content": "This is a classifier for the house pricing dataset",
    }
    mlflow.set_tags(tags)


    # # log tags
    # mlflow.set_tags({"type": "classifier"})

    # # log description
    # mlflow.set_tag({'mlflow.note.content': 'This is a classifier for the house pricing dataset'})

    # log plots (faltan)

RestException: RESOURCE_ALREADY_EXISTS: Registered Model (name=registed model) already exists. Error: (sqlite3.IntegrityError) UNIQUE constraint failed: registered_models.name
[SQL: INSERT INTO registered_models (name, creation_time, last_updated_time, description) VALUES (?, ?, ?, ?)]
[parameters: ('registed model', 1718834135266, 1718834135266, '')]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [27]:
mlflow.end_run()

# Ejemplo 3

In [7]:
model_name = "r_model3"
artifact_path = "random_forest_classifier2"

df = create_dataset()

x_train, x_test, y_train, y_test = train_test_split(
    df.drop("target", axis=1),
    df["target"],
    test_size=0.2,
    random_state=42,
)

numerical_features = [f for f in x_train.columns if f.startswith("feature")]

pipeline = get_sklearn_pipeline(numerical_features=numerical_features)
pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_test)
metrics = get_classification_metrics(y_true=y_test, y_pred=y_pred, prefix="test")

In [8]:
# infer signature
model_signature = infer_signature(model_input=x_train, model_output=y_pred)
mlflow.sklearn.log_model(sk_model=pipeline, artifact_path=artifact_path, signature=model_signature, registered_model_name=model_name)

Successfully registered model 'r_model3'.
2024/06/19 19:02:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: r_model3, version 1
Created version '1' of model 'r_model3'.


<mlflow.models.model.ModelInfo at 0x7fabae3eb790>

In [12]:
with mlflow.start_run(run_name="logging_artifacts26C") as run:

    # log metric
    mlflow.log_metrics(metrics)

    # log params
    mlflow.log_params(pipeline.get_params())

    tags = {
        "type": "classifier",
        "mlflow.note.content": "This is a classifier for the house pricing dataset",
    }
    mlflow.set_tags(tags)

    # print info about the run
    print("run_id: {}".format(run.info.run_id))

run_id: 4c834c85048a4a59b13b9601279a31d0


In [13]:
mlflow.end_run()

In [14]:
print('ok_')

ok_
