In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from functools import partial
from typing import Dict, List, Optional
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from hyperopt import hp, fmin, tpe, Trials
import mlflow 
from mlflow_utils import create_dataset

# Load settings from a local .env file into the process environment.
# NOTE(review): a later cell reads KEY_ID and ACCESS_KEY via os.getenv, so the
# .env file presumably defines them — confirm.
load_dotenv()

True

In [2]:
def get_classification_metrics(
    y_true: pd.Series, y_pred: pd.Series, prefix: str) -> Dict[str, float]:
    """
    Compute accuracy, precision, recall and F1 for a set of predictions.

    :param y_true: The true target values.
    :param y_pred: The predicted target values.
    :param prefix: Text prepended (followed by an underscore) to each metric name.
    :return: Mapping of "<prefix>_<metric>" to the corresponding score.
    """

    # (metric suffix, scoring function) pairs, in the order they are reported.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    return {
        f"{prefix}_{metric_name}": scorer(y_true=y_true, y_pred=y_pred)
        for metric_name, scorer in scorers
    }


def get_sklearn_pipeline(
    numerical_features: List[str], categorical_features: Optional[List[str]] = None) -> Pipeline:
    """
    Build the preprocessing + random-forest pipeline.

    Numerical columns are imputed with the median and categorical columns are
    one-hot encoded before being passed to a RandomForestClassifier.

    :param numerical_features: The numerical features.
    :param categorical_features: The categorical features. ``None`` (the
        default) means there are no categorical columns.
    :return: The (unfitted) sklearn pipeline with steps "preprocessing" and "model".
    """

    # Use a fresh list on every call instead of a shared mutable default, and
    # accept None as advertised by the Optional annotation.
    if categorical_features is None:
        categorical_features = []

    preprocessing = ColumnTransformer(
        transformers=[
            ("numerical", SimpleImputer(strategy="median"), numerical_features),
            ("categorical", OneHotEncoder(), categorical_features),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing),
            ("model", RandomForestClassifier())
        ]
    )

    return pipeline


def objective_function(
    params: Dict,
    x_train: pd.DataFrame,
    x_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    numerical_features: List[str],
    categorical_features: List[str],
) -> float:
    """
    Objective function to minimize.

    Fits a pipeline with the given hyperparameters inside a nested MLflow run,
    logs the model parameters, the test metrics and the fitted pipeline, and
    returns the negated test F1 so that minimizing it maximizes F1.

    :param params: The hyperparameter values to evaluate. Must contain
        "model__max_depth" and "model__n_estimators". The dict is not modified.
    :param x_train: The training data.
    :param x_test: The test data.
    :param y_train: The training target.
    :param y_test: The test target.
    :param numerical_features: The numerical features.
    :param categorical_features: The categorical features; ``None`` or an
        empty list means there are none.
    :return: The negated test F1 score of the model.
    :raises KeyError: If one of the required hyperparameters is missing.
    """

    # Work on a copy so the dict owned by the caller is left untouched.
    model_params = dict(params)
    # The search space samples these with hp.quniform, so they are cast to int
    # before being handed to the model.
    for name in ("model__max_depth", "model__n_estimators"):
        model_params[name] = int(model_params[name])

    # Forward the categorical columns as well (they used to be silently dropped).
    pipeline = get_sklearn_pipeline(
        numerical_features=numerical_features,
        categorical_features=categorical_features or [],
    )
    pipeline.set_params(**model_params)

    with mlflow.start_run(nested=True) as run:
        pipeline.fit(x_train, y_train)
        y_pred = pipeline.predict(x_test)
        metrics = get_classification_metrics(
            y_true=y_test, y_pred=y_pred, prefix="test"
        )

        mlflow.log_params(pipeline["model"].get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(pipeline, f"{run.info.run_id}-model")

    return -metrics["test_f1"]


In [3]:
# Connect to MLflow and MinIO (over HTTP).
mlflow.set_tracking_uri("http://127.0.0.1:5000")

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://127.0.0.1:9000"

# Re-export the project-specific credential variables under the AWS names.
# Fail with an explicit message when one is missing: assigning None to
# os.environ would otherwise raise an opaque TypeError.
for _aws_name, _source_name in (
    ("AWS_ACCESS_KEY_ID", "KEY_ID"),
    ("AWS_SECRET_ACCESS_KEY", "ACCESS_KEY"),
):
    _credential = os.getenv(_source_name)
    if _credential is None:
        raise RuntimeError(
            f"Environment variable {_source_name!r} is not set; "
            f"define it (e.g. in .env) so {_aws_name} can be configured."
        )
    os.environ[_aws_name] = _credential

# Pipeline



In [4]:
# Build the modelling dataset with the create_dataset helper from mlflow_utils.
# NOTE(review): the following cells expect a "target" column plus columns whose
# names start with "feature" — confirm against create_dataset.
df = create_dataset()

In [5]:
# Separate predictors from the label, then hold out 20% of the rows for
# testing; the fixed seed keeps the split reproducible.
features = df.drop(columns=["target"])
labels = df["target"]

x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [6]:
# Every training column whose name starts with "feature" is used as a
# numerical input.
numerical_features = list(
    filter(lambda column: column.startswith("feature"), x_train.columns)
)
print(numerical_features)

['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49']


In [7]:
# Hyperopt search space: each hyperparameter is sampled with hp.quniform from
# its (low, high, step) bounds. The pipeline parameter name doubles as the
# hyperopt label.
space = {
    name: hp.quniform(name, low, high, step)
    for name, (low, high, step) in {
        "model__n_estimators": (20, 200, 10),
        "model__max_depth": (10, 100, 10),
    }.items()
}

In [8]:
# Select the MLflow experiment that the runs started below are recorded under.
mlflow.set_experiment("mlflow_tracking")

<Experiment: artifact_location='s3://mlflow/3', creation_time=1718807838682, experiment_id='3', last_update_time=1718807838682, lifecycle_stage='active', name='mlflow_tracking', tags={}>

In [9]:
# Parent run: the hyperparameter search logs one nested run per trial, then the
# best configuration is refitted and logged on the parent run itself.
with mlflow.start_run(run_name="logging_artifacts24") as run:
    # Bind the data and feature lists so hyperopt only has to supply `params`.
    objective = partial(
        objective_function,
        x_train=x_train,
        x_test=x_test,
        y_train=y_train,
        y_test=y_test,
        numerical_features=numerical_features,
        categorical_features=None,
    )

    # The objective returns the negated test F1, so minimizing it maximizes F1.
    best_params = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
    )

    # Cast the tuned values to int before configuring the model.
    for int_param in ("model__max_depth", "model__n_estimators"):
        best_params[int_param] = int(best_params[int_param])

    # Refit a fresh pipeline with the best hyperparameters found.
    pipeline = get_sklearn_pipeline(numerical_features=numerical_features)
    pipeline.set_params(**best_params)
    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_test)
    metrics = get_classification_metrics(
        y_true=y_test, y_pred=y_pred, prefix="best_model_test"
    )

    # Record the final model's parameters, metrics and fitted pipeline.
    mlflow.log_params(pipeline["model"].get_params())
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(pipeline, f"{run.info.run_id}-best-model")

100%|██████████| 10/10 [02:02<00:00, 12.29s/trial, best loss: -0.905337361530715]


In [10]:
# Report the tracking server in use and where the tuning run stored its artifacts.
# The artifact URI is read from the `run` handle of the tuning run above:
# that run has already been closed, and mlflow.get_artifact_uri() would
# implicitly start (and leave open) a new, empty run when none is active.
print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', run.info.artifact_uri)

tracking uri: http://127.0.0.1:5000
artifact uri: s3://mlflow/3/0471b98581f8448a99dbd46a724c74a2/artifacts


In [11]:
# End the currently active MLflow run, if there is one, so nothing is left open.
mlflow.end_run()

In [12]:
# Completion marker: printed once every cell above has run.
print('ok_')

ok_
