# MLflow AutoLogging

Enables (or disables) and configures autologging for scikit-learn estimators.

>```python
>mlflow.sklearn.autolog(log_input_examples=False, log_model_signatures=True, log_models=True, log_datasets=True, disable=False, exclusive=False, disable_for_unsupported_versions=False, silent=False, max_tuning_runs=5, log_post_training_metrics=True, serialization_format='cloudpickle', registered_model_name=None, pos_label=None, extra_tags=None)
>```


In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from pprint import pprint
import mlflow 

from mlflow_for_ml_dev.experiments.exp_utils import get_or_create_experiment

In [2]:
# Obtain the "sklearn_autologging" experiment through the project helper.
# NOTE(review): `experiment` is not referenced again in the cells shown here;
# presumably the helper also makes it the active experiment — confirm in exp_utils.
experiment = get_or_create_experiment("sklearn_autologging")

In [3]:
# Switch on scikit-learn autologging with its default settings
mlflow.sklearn.autolog()

# Iris loaded as pandas objects, split into features and target
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# Fit a default random forest inside a named run so the fit is tied to that run
rf = RandomForestClassifier()
with mlflow.start_run(run_name="autologged_run") as run:
    print(f"MLflow run_id: {run.info.run_id}")
    rf.fit(X, y)



MLflow run_id: ecb6ad9951ff46c2b8d1a1e58099f6d2




In [4]:
# Re-read the run from the tracking store by id and show its logged parameters.
# `run` is rebound to the fetched run, so its `data` reflects what was stored.
run_id = run.info.run_id
run = mlflow.get_run(run_id)
pprint(run.data.params)


{'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'class_weight': 'None',
 'criterion': 'gini',
 'max_depth': 'None',
 'max_features': 'sqrt',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '1',
 'min_samples_split': '2',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '100',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': 'None',
 'verbose': '0',
 'warm_start': 'False'}


In [5]:
# Metrics stored on the fetched run (no metric was logged by hand in this notebook)
run.data.metrics

{'training_accuracy_score': 1.0,
 'training_f1_score': 1.0,
 'training_log_loss': 0.026832977589816096,
 'training_precision_score': 1.0,
 'training_recall_score': 1.0,
 'training_roc_auc': 1.0,
 'training_score': 1.0}

In [6]:
# Tags stored on the fetched run
run.data.tags

{'estimator_class': 'sklearn.ensemble._forest.RandomForestClassifier',
 'estimator_name': 'RandomForestClassifier',
 'mlflow.log-model.history': '[{"run_id": "ecb6ad9951ff46c2b8d1a1e58099f6d2", "artifact_path": "model", "utc_time_created": "2024-03-30 14:31:12.548332", "flavors": {"python_function": {"model_path": "model.pkl", "predict_fn": "predict", "loader_module": "mlflow.sklearn", "python_version": "3.11.8", "env": {"conda": "conda.yaml", "virtualenv": "python_env.yaml"}}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "1.4.1.post1", "serialization_format": "cloudpickle", "code": null}}, "model_uuid": "dee5d8496ec449319b51a1056efebb61", "mlflow_version": "2.11.3", "signature": {"inputs": "[{\\"type\\": \\"double\\", \\"name\\": \\"sepal length (cm)\\", \\"required\\": true}, {\\"type\\": \\"double\\", \\"name\\": \\"sepal width (cm)\\", \\"required\\": true}, {\\"type\\": \\"double\\", \\"name\\": \\"petal length (cm)\\", \\"required\\": true}, {\\"type\\": \\"double

In [7]:
# Create the tracking client once and reuse it for both listings
# (the original built a separate MlflowClient for each call).
client = mlflow.MlflowClient()

# Paths stored under the run's "model" directory, then the run's top-level entries
model_artifacts = [info.path for info in client.list_artifacts(run_id, "model")]
run_artifacts = [info.path for info in client.list_artifacts(run_id)]

print("Model artifacts:")
pprint(model_artifacts)
print("All artifacts:")
pprint(run_artifacts)

Model artifacts:
['model/MLmodel',
 'model/conda.yaml',
 'model/metadata',
 'model/model.pkl',
 'model/python_env.yaml',
 'model/requirements.txt']
All artifacts:
['estimator.html', 'model', 'training_confusion_matrix.png']


## Logging Input examples

In [8]:
# Reconfigure autologging so that input examples are logged as well
mlflow.sklearn.autolog(log_input_examples=True)

# Iris loaded as pandas objects, split into features and target
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# Fit a fresh default random forest inside its own run
rf = RandomForestClassifier()
with mlflow.start_run(run_name="autologged_run") as run:
    print(f"MLflow run_id: {run.info.run_id}")
    rf.fit(X, y)

MLflow run_id: 9e03eb9960564bea98e4f2d2c5325cdf




In [9]:
# Turn autologging on (input examples included) before touching sklearn.metrics
mlflow.sklearn.autolog(log_input_examples=True)

# The metric functions are deliberately imported here, after autolog():
# metric APIs imported from sklearn.metrics before autologging is enabled
# do not log metrics to MLflow runs.
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    recall_score,
)

# Iris data with 20% of the rows held out for evaluation.
# NOTE(review): neither the split nor the forest is seeded, so the printed
# score changes from one execution to the next.
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train and evaluate inside a single run
rf = RandomForestClassifier()
with mlflow.start_run(run_name="autologged_run") as run:
    print(f"MLflow run_id: {run.info.run_id}")
    rf.fit(x_train, y_train)

    # Held-out predictions, scored with four metrics; only the balanced
    # accuracy is printed, the other three scores are just assigned
    y_pred = rf.predict(x_test)

    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    print(f"Balanced accuracy: {balanced_accuracy}")

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")



MLflow run_id: ffd1be1c71ff42a492cc375bf898f8c5




Balanced accuracy: 0.8727272727272727
