In [1]:
import mlflow 

In [2]:
experiment_name = "classic_evaluation"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/457969554144424460', creation_time=1723773638391, experiment_id='457969554144424460', last_update_time=1723773638391, lifecycle_stage='active', name='classic_evaluation', tags={}>

In [3]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [4]:
x , y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

x_df = pd.DataFrame(x, columns=[f'feature_{i+1}' for i in range(x.shape[1])])
y_df = pd.DataFrame(y, columns=['target'])

x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=42)

In [5]:
rfc = RandomForestClassifier(n_estimators=1)

rfc.fit(x_train, y_train)

predictions = rfc.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


In [15]:
predictions

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1])

In [16]:
eval_df = pd.DataFrame({
    "target": y_test.to_numpy().flatten(),
    "predicted": predictions
})
print(eval_df.head())

   target  predicted
0       1          0
1       1          1
2       1          0
3       1          1
4       1          1


In [21]:
result = mlflow.evaluate(
    data = eval_df,
    model_type  = "classifier",
    targets="target",
    predictions="predicted")

2024/08/15 21:31:52 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/15 21:31:52 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/15 21:31:52 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


In [22]:
result.artifacts

{'confusion_matrix': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/457969554144424460/6f42e5756b8e4511a63559505ebaf4ea/artifacts/confusion_matrix.png')}

## Adding extra metrics

In [23]:
from mlflow.metrics import make_metric

In [24]:
def custom_accuracy(df, __builtin_metrics):
    targets = df["target"]
    predictions = df["prediction"]
    return sum(targets == predictions) / len(targets)

def custom_f1_score(df, __builtin_metrics):
    from sklearn.metrics import f1_score
    targets = df["target"]
    predictions = df["prediction"]
    return f1_score(targets, predictions, average="weighted")

In [25]:
custom_metric_accuracy = make_metric(
    eval_fn = custom_accuracy,
    name = "custom_accuracy",
    greater_is_better=True,
)

custom_metric_f1_score = make_metric(
    eval_fn = custom_f1_score,
    name = "custom_f1_score",
    greater_is_better=True,
)

In [26]:
result = mlflow.evaluate(
    data = eval_df,
    model_type  = "classifier",
    targets="target",
    predictions="predicted",
    extra_metrics=[custom_metric_accuracy, custom_metric_f1_score]
)


2024/08/15 21:31:59 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/15 21:31:59 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/15 21:32:00 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


In [27]:
result.metrics

{'true_negatives': 83,
 'false_positives': 10,
 'false_negatives': 27,
 'true_positives': 80,
 'example_count': 200,
 'accuracy_score': 0.815,
 'recall_score': 0.7476635514018691,
 'precision_score': 0.8888888888888888,
 'f1_score': 0.8121827411167513,
 'custom_accuracy': 0.815,
 'custom_f1_score': 0.814764071916181}

In [None]:
with mlflow.start_run(run_name="custom-metrics") as run:
    print("Logging custom metrics")
    mlflow.log_metrics(result.metrics)

## Adding Custom Artifacts

In [105]:
# Custom precision recal curve
from sklearn.metrics import PrecisionRecallDisplay
import matplotlib.pyplot as plt
def custom_precision_recall_curve(df, _builtin_metrics, _artifacts_dir):
    targets = df["target"]
    predictions = df["prediction"]
    # Precision-recall curve
    pr_display = PrecisionRecallDisplay.from_predictions(targets, predictions)
    return {"precision_recall_curve": pr_display.figure_}


In [106]:
result = mlflow.evaluate(
        data = eval_df,
        model_type  = "classifier",
        targets="target",
        predictions="predicted",
        extra_metrics=[custom_metric_accuracy, custom_metric_f1_score],
        custom_artifacts=[custom_precision_recall_curve]
)

2024/08/15 22:00:03 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/15 22:00:03 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/15 22:00:04 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.
2024/08/15 22:00:04 INFO mlflow.models.evaluation.base: Validating generated model metrics
2024/08/15 22:00:04 INFO mlflow.models.evaluation.base: Model validation passed!
