# Evaluating ML models

In [1]:
import mlflow 

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import pandas as pd

In [2]:
experiment_name = "classic_evaluation"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714', creation_time=1723858156380, experiment_id='919999149587451714', last_update_time=1723858156380, lifecycle_stage='active', name='classic_evaluation', tags={}>

## Example Dataset

In [3]:
x , y = make_classification(n_samples=1000, n_features=5, n_classes=2, random_state=42)

x_df = pd.DataFrame(x, columns=[f'feature_{i+1}' for i in range(x.shape[1])])
y_df = pd.DataFrame(y, columns=['target'])

x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=42)


print(x_train.head())

     feature_1  feature_2  feature_3  feature_4  feature_5
29   -1.358062   0.388926  -2.219300   0.629189   0.502890
535  -1.692785   0.161613  -0.451659   0.630933   1.416180
695   3.291478  -0.557601  -0.580053  -1.342261  -2.159247
557  -2.076136   1.416809  -0.181224   1.351993  -1.239513
836  -1.348164   0.336725   0.038238   0.601181   0.619803


In [4]:
# instantiate the model
rfc = RandomForestClassifier(n_estimators=1)

# fit the model
rfc.fit(x_train, y_train)

# make predictions
predictions = rfc.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


In [5]:
eval_df = pd.DataFrame({
    "target": y_test.to_numpy().flatten(),
    "predictions": predictions
})
print(eval_df.head())

   target  predictions
0       1            1
1       1            1
2       1            1
3       1            1
4       0            0


### Evaluating the model using the evaluation dataframe

In [6]:
result = mlflow.evaluate(
    data = eval_df,
    model_type  = "classifier",
    targets= "target",
    predictions="predictions"
)

2024/08/16 21:06:42 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/16 21:06:42 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/16 21:06:45 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


<Figure size 1050x700 with 0 Axes>

### Evaluating the model providing the estimator.

1. Provide the model as a function

In [7]:
def random_forest_clf(model_input):
    print(model_input)
    return rfc.predict(model_input)

In [8]:
eval_df_for_model = x_test.copy()
eval_df_for_model['target'] = y_test

In [9]:
eval_df_for_model.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
521,1.250932,-1.064291,-2.238231,-0.914547,1.261286,1
737,-0.196283,0.19082,-0.243384,0.154804,-0.256094,1
740,2.659138,-0.265773,1.072978,-0.996758,-2.195564,1
660,0.087778,-0.021011,-0.66778,-0.038709,-0.042586,1
411,-0.662457,0.741043,-0.35834,0.568499,-1.101298,0


In [10]:
result = mlflow.evaluate(
    model = random_forest_clf,
    data = eval_df_for_model,
    model_type  = "classifier",
    targets= "target",
    predictions="predictions"
)

2024/08/16 21:06:45 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/16 21:06:45 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/16 21:06:45 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


     feature_1  feature_2  feature_3  feature_4  feature_5
521   1.250932  -1.064291  -2.238231  -0.914547   1.261286
737  -0.196283   0.190820  -0.243384   0.154804  -0.256094
740   2.659138  -0.265773   1.072978  -0.996758  -2.195564
660   0.087778  -0.021011  -0.667780  -0.038709  -0.042586
411  -0.662457   0.741043  -0.358340   0.568499  -1.101298
..         ...        ...        ...        ...        ...
408  -0.197984  -0.085901  -0.502975   0.024067   0.421614
332   1.375817  -1.199700  -0.838586  -1.019683   1.458419
208   2.045861  -0.117525   0.695538  -0.725620  -1.901583
613  -0.778229  -0.532867  -0.479493   0.001984   2.134059
78   -0.120428   0.544656   1.194109   0.297849  -1.201483

[200 rows x 5 columns]
     feature_1  feature_2  feature_3  feature_4  feature_5
521   1.250932  -1.064291  -2.238231  -0.914547   1.261286
737  -0.196283   0.190820  -0.243384   0.154804  -0.256094
740   2.659138  -0.265773   1.072978  -0.996758  -2.195564
660   0.087778  -0.021011  -0.66

2024/08/16 21:06:45 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


      feature_1  feature_2  feature_3  feature_4  feature_5
0     -0.222842   0.432536  -0.354041   0.278185  -0.818068
1     -0.880206   0.413875   0.789057   0.484566  -0.069252
2     -0.055715   0.059618   1.476934   0.046529  -0.086013
3      0.656203  -0.618955  -0.413606  -0.508526   0.809792
4      1.094111  -0.973531   0.985450  -0.820138   1.207371
...         ...        ...        ...        ...        ...
3195   1.250932  -1.978614   0.183835  -1.685418   2.393225
3196   1.250932   0.285794   0.261251  -0.183900  -1.741946
3197   1.250932  -0.205288  -0.784762   0.301110   1.803470
3198   1.250932  -0.501734  -0.230525  -0.446208   0.545373
3199   1.250932  -1.585530   0.253165  -1.339404   1.954297

[3200 rows x 5 columns]
      feature_1  feature_2  feature_3  feature_4  feature_5
0     -0.222842   0.432536  -0.354041   0.278185  -0.818068
1     -0.880206   0.413875   0.789057   0.484566  -0.069252
2     -0.055715   0.059618   1.476934   0.046529  -0.086013
3      0.656203



In [11]:
result.metrics

{'true_negatives': 83,
 'false_positives': 14,
 'false_negatives': 15,
 'true_positives': 88,
 'example_count': 200,
 'accuracy_score': 0.855,
 'recall_score': 0.8543689320388349,
 'precision_score': 0.8627450980392157,
 'f1_score': 0.8585365853658536}

## Adding extra metrics

In [24]:
from mlflow.metrics import make_metric
from sklearn.metrics import f1_score

In [25]:
def custom_accuracy(df, __builtin_metrics):
    targets = df["target"]
    predictions = df["prediction"]
    return sum(targets == predictions) / len(targets)

def custom_f1_score(df, __builtin_metrics):
    targets = df["target"]
    predictions = df["prediction"]
    return f1_score(targets, predictions, average="weighted")

In [26]:
custom_metric_accuracy = make_metric(
    eval_fn = custom_accuracy,
    name = "custom_accuracy",
    greater_is_better=True,
)

custom_metric_f1_score = make_metric(
    eval_fn = custom_f1_score,
    name = "custom_f1_score",
    greater_is_better=True,
)

In [31]:
result = mlflow.evaluate(
    model = random_forest_clf,
    data = eval_df_for_model,
    model_type  = "classifier",
    targets="target",
    predictions="predictions",
    extra_metrics=[custom_metric_accuracy, custom_metric_f1_score]
)


2024/08/16 21:15:22 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/16 21:15:22 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/16 21:15:22 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


     feature_1  feature_2  feature_3  feature_4  feature_5
521   1.250932  -1.064291  -2.238231  -0.914547   1.261286
737  -0.196283   0.190820  -0.243384   0.154804  -0.256094
740   2.659138  -0.265773   1.072978  -0.996758  -2.195564
660   0.087778  -0.021011  -0.667780  -0.038709  -0.042586
411  -0.662457   0.741043  -0.358340   0.568499  -1.101298
..         ...        ...        ...        ...        ...
408  -0.197984  -0.085901  -0.502975   0.024067   0.421614
332   1.375817  -1.199700  -0.838586  -1.019683   1.458419
208   2.045861  -0.117525   0.695538  -0.725620  -1.901583
613  -0.778229  -0.532867  -0.479493   0.001984   2.134059
78   -0.120428   0.544656   1.194109   0.297849  -1.201483

[200 rows x 5 columns]
     feature_1  feature_2  feature_3  feature_4  feature_5
521   1.250932  -1.064291  -2.238231  -0.914547   1.261286
737  -0.196283   0.190820  -0.243384   0.154804  -0.256094
740   2.659138  -0.265773   1.072978  -0.996758  -2.195564
660   0.087778  -0.021011  -0.66

2024/08/16 21:15:23 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


      feature_1  feature_2  feature_3  feature_4  feature_5
0     -0.222842   0.432536  -0.354041   0.278185  -0.818068
1     -0.880206   0.413875   0.789057   0.484566  -0.069252
2     -0.055715   0.059618   1.476934   0.046529  -0.086013
3      0.656203  -0.618955  -0.413606  -0.508526   0.809792
4      1.094111  -0.973531   0.985450  -0.820138   1.207371
...         ...        ...        ...        ...        ...
3195   1.250932  -1.978614   0.183835  -1.685418   2.393225
3196   1.250932   0.285794   0.261251  -0.183900  -1.741946
3197   1.250932  -0.205288  -0.784762   0.301110   1.803470
3198   1.250932  -0.501734  -0.230525  -0.446208   0.545373
3199   1.250932  -1.585530   0.253165  -1.339404   1.954297

[3200 rows x 5 columns]
      feature_1  feature_2  feature_3  feature_4  feature_5
0     -0.222842   0.432536  -0.354041   0.278185  -0.818068
1     -0.880206   0.413875   0.789057   0.484566  -0.069252
2     -0.055715   0.059618   1.476934   0.046529  -0.086013
3      0.656203



In [34]:
result.artifacts

{'confusion_matrix': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/75e9143eb31740559b5476f171d40a80/artifacts/confusion_matrix.png'),
 'shap_beeswarm_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/75e9143eb31740559b5476f171d40a80/artifacts/shap_beeswarm_plot.png'),
 'shap_summary_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/75e9143eb31740559b5476f171d40a80/artifacts/shap_summary_plot.png'),
 'shap_feature_importance_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/75e9143eb31740559b5476f171d40a80/artifacts/shap_feature_importance_plot.png')}

## Adding Custom Artifacts

In [35]:
from sklearn.metrics import PrecisionRecallDisplay
import matplotlib.pyplot as plt

# Custom precision recal curve
def custom_precision_recall_curve(df, _builtin_metrics, _artifacts_dir):
    targets = df["target"]
    predictions = df["prediction"]
    pr_display = PrecisionRecallDisplay.from_predictions(targets, predictions)
    return {"precision_recall_curve": pr_display.figure_}


In [37]:
result = mlflow.evaluate(
        data = eval_df,
        model_type  = "classifier",
        targets="target",
        predictions="predictions",
        extra_metrics=[custom_metric_accuracy, custom_metric_f1_score],
        custom_artifacts=[custom_precision_recall_curve]
)

2024/08/16 21:16:12 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/16 21:16:12 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/16 21:16:12 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


In [38]:
result.artifacts

{'precision_recall_curve': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/1b4442864d884fbda348e3ae8ed64950/artifacts/precision_recall_curve.png'),
 'confusion_matrix': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/1b4442864d884fbda348e3ae8ed64950/artifacts/confusion_matrix.png')}

In [39]:
result = mlflow.evaluate(
    model = random_forest_clf,
    data = eval_df_for_model,
    model_type  = "classifier",
    targets="target",
    predictions="predictions",
    extra_metrics=[custom_metric_accuracy, custom_metric_f1_score],
    custom_artifacts=[custom_precision_recall_curve]
)

2024/08/16 21:16:43 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/16 21:16:43 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/08/16 21:16:43 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


     feature_1  feature_2  feature_3  feature_4  feature_5
521   1.250932  -1.064291  -2.238231  -0.914547   1.261286
737  -0.196283   0.190820  -0.243384   0.154804  -0.256094
740   2.659138  -0.265773   1.072978  -0.996758  -2.195564
660   0.087778  -0.021011  -0.667780  -0.038709  -0.042586
411  -0.662457   0.741043  -0.358340   0.568499  -1.101298
..         ...        ...        ...        ...        ...
408  -0.197984  -0.085901  -0.502975   0.024067   0.421614
332   1.375817  -1.199700  -0.838586  -1.019683   1.458419
208   2.045861  -0.117525   0.695538  -0.725620  -1.901583
613  -0.778229  -0.532867  -0.479493   0.001984   2.134059
78   -0.120428   0.544656   1.194109   0.297849  -1.201483

[200 rows x 5 columns]
     feature_1  feature_2  feature_3  feature_4  feature_5
521   1.250932  -1.064291  -2.238231  -0.914547   1.261286
737  -0.196283   0.190820  -0.243384   0.154804  -0.256094
740   2.659138  -0.265773   1.072978  -0.996758  -2.195564
660   0.087778  -0.021011  -0.66

2024/08/16 21:16:44 INFO mlflow.models.evaluation.default_evaluator: Shap explainer ExactExplainer is used.


      feature_1  feature_2  feature_3  feature_4  feature_5
0     -0.222842   0.432536  -0.354041   0.278185  -0.818068
1     -0.880206   0.413875   0.789057   0.484566  -0.069252
2     -0.055715   0.059618   1.476934   0.046529  -0.086013
3      0.656203  -0.618955  -0.413606  -0.508526   0.809792
4      1.094111  -0.973531   0.985450  -0.820138   1.207371
...         ...        ...        ...        ...        ...
3195   1.250932  -1.978614   0.183835  -1.685418   2.393225
3196   1.250932   0.285794   0.261251  -0.183900  -1.741946
3197   1.250932  -0.205288  -0.784762   0.301110   1.803470
3198   1.250932  -0.501734  -0.230525  -0.446208   0.545373
3199   1.250932  -1.585530   0.253165  -1.339404   1.954297

[3200 rows x 5 columns]
      feature_1  feature_2  feature_3  feature_4  feature_5
0     -0.222842   0.432536  -0.354041   0.278185  -0.818068
1     -0.880206   0.413875   0.789057   0.484566  -0.069252
2     -0.055715   0.059618   1.476934   0.046529  -0.086013
3      0.656203



In [40]:
result.artifacts

{'precision_recall_curve': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/abd5b411ef12474e9dfb7aeb25885636/artifacts/precision_recall_curve.png'),
 'confusion_matrix': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/abd5b411ef12474e9dfb7aeb25885636/artifacts/confusion_matrix.png'),
 'shap_beeswarm_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/abd5b411ef12474e9dfb7aeb25885636/artifacts/shap_beeswarm_plot.png'),
 'shap_summary_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_ml_dev/traditional_ml_evaluation/mlruns/919999149587451714/abd5b411ef12474e9dfb7aeb25885636/artifacts/shap_summary_plot.png'),
 'shap_feature_importance_plot': ImageEvaluationArtifact(uri='file:///c:/Users/manue/projects/mlflow_for_m