# Evaluating ML models

In [1]:
import mlflow 

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import pandas as pd

In [2]:
experiment_name = "classic_evaluation"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714', creation_time=1723858156380, experiment_id='919999149587451714', last_update_time=1723858156380, lifecycle_stage='active', name='classic_evaluation', tags={}>

## Example Dataset

In [3]:
x , y = make_classification(n_samples=1000, n_features=5, n_classes=2, random_state=42)

x_df = pd.DataFrame(x, columns=[f'feature_{i+1}' for i in range(x.shape[1])])
y_df = pd.DataFrame(y, columns=['target'])

x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=42)


print(x_train.head())

     feature_1  feature_2  feature_3  feature_4  feature_5
29   -1.358062   0.388926  -2.219300   0.629189   0.502890
535  -1.692785   0.161613  -0.451659   0.630933   1.416180
695   3.291478  -0.557601  -0.580053  -1.342261  -2.159247
557  -2.076136   1.416809  -0.181224   1.351993  -1.239513
836  -1.348164   0.336725   0.038238   0.601181   0.619803


In [4]:
# instantiate the model
rfc = RandomForestClassifier(n_estimators=1)

# fit the model
rfc.fit(x_train, y_train)

# make predictions
predictions = rfc.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


In [5]:
eval_df = pd.DataFrame({
    "target": y_test.to_numpy().flatten(),
    "predictions": predictions
})
print(eval_df.head())

   target  predictions
0       1            1
1       1            1
2       1            1
3       1            1
4       0            0


### Evaluating the model using the evaluation dataframe

In [6]:
result = mlflow.evaluate(
    data = eval_df,
    model_type  = "classifier",
    targets= "target",
    predictions="predictions"
)

2024/08/18 16:07:15 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/18 16:07:15 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/18 16:07:16 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


<Figure size 1050x700 with 0 Axes>

### Evaluating the model providing the estimator.

1. Provide the model as a function

In [7]:
def random_forest_clf(model_input):
    return rfc.predict(model_input)

In [8]:
eval_df_for_model = x_test.copy()
eval_df_for_model['target'] = y_test

In [9]:
eval_df_for_model.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
521,1.250932,-1.064291,-2.238231,-0.914547,1.261286,1
737,-0.196283,0.19082,-0.243384,0.154804,-0.256094,1
740,2.659138,-0.265773,1.072978,-0.996758,-2.195564,1
660,0.087778,-0.021011,-0.66778,-0.038709,-0.042586,1
411,-0.662457,0.741043,-0.35834,0.568499,-1.101298,0


In [10]:
result = mlflow.evaluate(
    model = random_forest_clf,
    data = eval_df_for_model,
    model_type  = "classifier",
    targets= "target",
    predictions="predictions"
)

2024/08/18 16:07:37 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/18 16:07:37 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/18 16:07:37 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/18 16:07:37 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


In [14]:
result.metrics

{'true_negatives': 86,
 'false_positives': 11,
 'false_negatives': 13,
 'true_positives': 90,
 'example_count': 200,
 'accuracy_score': 0.88,
 'recall_score': 0.8737864077669902,
 'precision_score': 0.8910891089108911,
 'f1_score': 0.8823529411764706}

## Adding extra metrics

In [15]:
from mlflow.metrics import make_metric
from sklearn.metrics import f1_score

In [17]:
def custom_accuracy(df, __builtin_metrics):
    targets = df["target"]
    predictions = df["prediction"]
    return sum(targets == predictions) / len(targets)

def custom_f1_score(df, __builtin_metrics):
    targets = df["target"]
    predictions = df["prediction"]
    return f1_score(targets, predictions, average="weighted")

In [18]:
custom_metric_accuracy = make_metric(
    eval_fn = custom_accuracy,
    name = "custom_accuracy",
    greater_is_better=True,
)

custom_metric_f1_score = make_metric(
    eval_fn = custom_f1_score,
    name = "custom_f1_score",
    greater_is_better=True,
)

In [19]:
result = mlflow.evaluate(
    model = random_forest_clf,
    data = eval_df_for_model,
    model_type  = "classifier",
    targets="target",
    predictions="predictions",
    extra_metrics=[custom_metric_accuracy, custom_metric_f1_score]
)


2024/08/18 16:10:37 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/18 16:10:37 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/18 16:10:37 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/18 16:10:37 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


In [20]:
result.artifacts

{'confusion_matrix': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/2e01853f0cda4216b61129f7839eb881/artifacts/confusion_matrix.png'),
 'shap_beeswarm_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/2e01853f0cda4216b61129f7839eb881/artifacts/shap_beeswarm_plot.png'),
 'shap_summary_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/2e01853f0cda4216b61129f7839eb881/artifacts/shap_summary_plot.png'),
 'shap_feature_importance_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/2e01853f0cda4216b61129f7839eb881/artifacts/shap_feature_importance_plot.png')}

## Adding Custom Artifacts

In [35]:
from sklearn.metrics import PrecisionRecallDisplay
import matplotlib.pyplot as plt

# Custom precision recal curve
def custom_precision_recall_curve(df, _builtin_metrics, _artifacts_dir):
    targets = df["target"]
    predictions = df["prediction"]
    pr_display = PrecisionRecallDisplay.from_predictions(targets, predictions)
    return {"precision_recall_curve": pr_display.figure_}


In [36]:
result = mlflow.evaluate(
        data = eval_df,
        model_type  = "classifier",
        targets="target",
        predictions="predictions",
        extra_metrics=[custom_metric_accuracy, custom_metric_f1_score],
        custom_artifacts=[custom_precision_recall_curve]
)

2024/08/18 16:12:43 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/18 16:12:43 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/18 16:12:44 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


In [37]:
result.artifacts

{'precision_recall_curve': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/a36801e417c3435b9dae664480796d66/artifacts/precision_recall_curve.png'),
 'confusion_matrix': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/a36801e417c3435b9dae664480796d66/artifacts/confusion_matrix.png')}

In [38]:
result = mlflow.evaluate(
    model = random_forest_clf,
    data = eval_df_for_model,
    model_type  = "classifier",
    targets="target",
    predictions="predictions",
    extra_metrics=[custom_metric_accuracy, custom_metric_f1_score],
    custom_artifacts=[custom_precision_recall_curve]
)

2024/08/18 16:12:48 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/18 16:12:48 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/18 16:12:48 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/18 16:12:49 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


In [39]:
result.artifacts

{'precision_recall_curve': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/c748e2cf7f8e49e2a14340e85efc6118/artifacts/precision_recall_curve.png'),
 'confusion_matrix': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/c748e2cf7f8e49e2a14340e85efc6118/artifacts/confusion_matrix.png'),
 'shap_beeswarm_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/c748e2cf7f8e49e2a14340e85efc6118/artifacts/shap_beeswarm_plot.png'),
 'shap_summary_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/c748e2cf7f8e49e2a14340e85efc6118/artifacts/shap_summary_plot.png'),
 'shap_feature_importance_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_m