In [24]:
import json
import logging
import os
import sys
import warnings

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.entities import ViewType
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

##### You can attach files and tags to an MLflow Experiment Run

In [3]:
# Sample content that is written to disk so it can later be logged as run artifacts
features = "rooms, zipcode, median_price, school_rating, transport"
data = {"state": "TX", "Available": 25, "Type": "Detached"}

# Create a couple of artifact files under the directory "data"
os.makedirs("data", exist_ok=True)
with open("data/data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)
# Use the same explicit UTF-8 encoding as the JSON file instead of the
# platform-default encoding, so both artifacts are written consistently.
with open("data/features.txt", "w", encoding="utf-8") as f:
    f.write(features)

# Custom tags that are attached to the experiment run
tags = {
    "engineering": "ML Platform",
    "release.candidate": "RC1",
    "release.version": "2.2.0",
}

The Experiment Run context allows you to access Experiment Run Metadata

In [25]:
def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Args:
        actual: True target values.
        pred: Predicted target values.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and the R2 score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

In [30]:
def run_experiment():
    """Train an ElasticNet model on the wine-quality data and track it with MLflow.

    Downloads the wine-quality CSV, splits it into train and test sets, fits an
    ElasticNet regressor, prints RMSE/MAE/R2 and logs the hyperparameters,
    metrics, fitted model, the module-level ``tags`` dict and every file in the
    local ``data`` directory to a run of the "wine-quality-test" experiment.

    Raises:
        Exception: whatever ``pd.read_csv`` raised when the CSV could not be
            downloaded or parsed. The error is logged and re-raised, because
            nothing below can run without the dataset.
    """
    logger = logging.getLogger(__name__)
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    mlflow.set_experiment("wine-quality-test")

    # Read the wine-quality csv file from the URL
    csv_url = (
        "https://raw.githubusercontent.com/mlflow/mlflow-example/master/wine-quality.csv"
    )
    try:
        wine_df = pd.read_csv(csv_url, sep=",")
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e
        )
        # Re-raise: continuing would only fail later on the missing dataset.
        raise

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(wine_df)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    alpha = 0.8
    l1_ratio = 0.7

    with mlflow.start_run():

        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha={:f}, l1_ratio={:f}):".format(alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(lr, "model")

        # Set a batch of tags (the module-level `tags` dict)
        mlflow.set_tags(tags)

        # Write all files in "data" to root artifact_uri/states
        mlflow.log_artifacts("data", artifact_path="states")

In [31]:
# Execute the training run: fits the ElasticNet model and logs its params,
# metrics, model, tags and artifacts to the "wine-quality-test" experiment.
run_experiment()

Elasticnet model (alpha=0.800000, l1_ratio=0.700000):
  RMSE: 0.8595414376735744
  MAE: 0.6480061258882328
  R2: 0.045769026631650944


The MLflow Tracking Client allows you to parse Experiment Run metadata in bulk for further analysis

In [32]:
# Resolve the experiment ID from the experiment name
experiment_id = mlflow.get_experiment_by_name("wine-quality-test").experiment_id

# Load the experiment entity using that ID
experiment = mlflow.get_experiment(experiment_id)

# Show the key experiment metadata, one field per line
print(
    "Name: {}".format(experiment.name),
    "Experiment_id: {}".format(experiment.experiment_id),
    "Artifact Location: {}".format(experiment.artifact_location),
    "Lifecycle_stage: {}".format(experiment.lifecycle_stage),
    sep="\n",
)

Name: wine-quality-test
Experiment_id: 3ci6-qptz-yt57-wiro
Artifact Location: /home/cdsw/.experiments/3ci6-qptz-yt57-wiro
Lifecycle_stage: active


You can parse all runs into a pandas DataFrame

In [34]:
# Fetch the experiment's active (non-deleted) runs as a pandas DataFrame.
# ViewType.ACTIVE_ONLY replaces the magic number 1, and the experiment ID is
# wrapped in a list because `experiment_ids` is documented as a list of IDs.
runs_df = mlflow.search_runs(
    experiment_ids=[experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
)
runs_df

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.rmse,metrics.r2,metrics.mae,params.l1_ratio,params.alpha,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.git.commit,tags.mlflow.source.name,tags.mlflow.log-model.history,tags.engineering,tags.release.version,tags.release.candidate
0,okr9-rhho-k5mj-0qyd,3ci6-qptz-yt57-wiro,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/3ci6-qptz-yt57-wiro/ok...,2023-03-28 21:12:46.216679168+00:00,2023-03-28 21:12:46.263000064+00:00,0.822243,0.126787,0.627876,0.5,0.5,LOCAL,pauldefusco,d9e9966eb1924eba9f18d793023c70a1bb85c88c,/usr/local/bin/ipython3,,,,
1,jejk-j913-ihjs-qdnf,3ci6-qptz-yt57-wiro,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/3ci6-qptz-yt57-wiro/je...,2023-03-28 21:14:00.662723072+00:00,2023-03-28 21:14:04.694000128+00:00,0.849898,0.06706,0.642003,0.6,0.6,LOCAL,pauldefusco,d9e9966eb1924eba9f18d793023c70a1bb85c88c,/usr/local/bin/ipython3,"[{""run_id"": ""jejk-j913-ihjs-qdnf"", ""artifact_p...",,,
2,kz06-ao8g-xh97-zi2t,3ci6-qptz-yt57-wiro,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/3ci6-qptz-yt57-wiro/kz...,2023-03-28 23:31:43.656272896+00:00,2023-03-28 23:31:48.371000064+00:00,0.849898,0.06706,0.642003,0.6,0.6,LOCAL,pauldefusco,d9e9966eb1924eba9f18d793023c70a1bb85c88c,/usr/local/lib/python3.7/site-packages/ipykern...,"[{""run_id"": ""kz06-ao8g-xh97-zi2t"", ""artifact_p...",,,
3,laat-3irl-slkc-ggz5,3ci6-qptz-yt57-wiro,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/3ci6-qptz-yt57-wiro/la...,2023-03-28 23:33:40.862132992+00:00,2023-03-28 23:33:44.364000+00:00,0.859541,0.045769,0.648006,0.7,0.8,LOCAL,pauldefusco,d9e9966eb1924eba9f18d793023c70a1bb85c88c,/usr/local/lib/python3.7/site-packages/ipykern...,"[{""run_id"": ""laat-3irl-slkc-ggz5"", ""artifact_p...",ML Platform,2.2.0,RC1


##### MLflow Autologging

In [37]:
from mlflow.tracking import MlflowClient

In [38]:
def yield_artifacts(run_id, path=None):
    """Generate the path of every artifact file logged under a run.

    Args:
        run_id: ID of the run whose artifacts are listed.
        path: Artifact sub-path to start from; ``None`` is passed through to
            ``list_artifacts`` unchanged.

    Yields:
        The ``path`` of each non-directory artifact; directories are walked
        recursively rather than yielded themselves.
    """
    tracking_client = MlflowClient()
    for entry in tracking_client.list_artifacts(run_id, path):
        if not entry.is_dir:
            yield entry.path
            continue
        # Directory entry: descend and relay every file found beneath it
        for nested_path in yield_artifacts(run_id, entry.path):
            yield nested_path

In [39]:
def fetch_logged_data(run_id):
    """Collect what was logged to a run into a single dictionary.

    Args:
        run_id: ID of the run to inspect.

    Returns:
        A dict with the keys ``"params"``, ``"metrics"``, ``"tags"`` (tags
        whose key starts with ``"mlflow."`` are left out) and ``"artifacts"``
        (a list of artifact file paths).
    """
    run_data = MlflowClient().get_run(run_id).data

    # Exclude system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
    user_tags = {}
    for tag_key, tag_value in run_data.tags.items():
        if tag_key.startswith("mlflow."):
            continue
        user_tags[tag_key] = tag_value

    artifact_paths = [artifact for artifact in yield_artifacts(run_id)]

    return {
        "params": run_data.params,
        "metrics": run_data.metrics,
        "tags": user_tags,
        "artifacts": artifact_paths,
    }

In [41]:
from pprint import pprint
import pandas as pd
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [44]:
def run_iris():
    """Grid-search an SVC on the iris dataset with MLflow sklearn autologging.

    Enables ``mlflow.sklearn.autolog()``, fits a ``GridSearchCV`` over the
    kernel and ``C`` values inside the "auto_experiments" experiment, then
    prints the data logged on the parent run followed by a table of the child
    runs (run ID, searched parameters and mean test score).
    """
    mlflow.sklearn.autolog()

    mlflow.set_experiment("auto_experiments")
    iris = datasets.load_iris()
    parameters = {"kernel": ("linear", "rbf"), "C": [1, 5, 7, 10]}
    grid_search = GridSearchCV(svm.SVC(), parameters)

    grid_search.fit(iris.data, iris.target)
    parent_run_id = mlflow.last_active_run().info.run_id

    # Parent run: dump each category of logged data under its own heading
    print("========== parent run ==========")
    logged = fetch_logged_data(parent_run_id)
    for section in logged:
        print("\n---------- logged {} ----------".format(section))
        pprint(logged[section])

    # Child runs: look them up through their parent-run tag
    child_filter = "tags.mlflow.parentRunId = '{}'".format(parent_run_id)
    child_runs = mlflow.search_runs(filter_string=child_filter)
    selected_cols = ["run_id"]
    selected_cols += ["params.{}".format(name) for name in parameters]
    selected_cols += ["metrics.mean_test_score"]

    print("\n========== child runs ==========\n")
    pd.set_option("display.max_columns", None)  # prevent truncating columns
    print(child_runs[selected_cols])

In [43]:
# Run the autologged grid search and print the parent and child run data
run_iris()

2023/03/28 23:36:21 INFO mlflow.tracking.fluent: Experiment with name 'auto_experiments' does not exist. Creating a new experiment.
2023/03/28 23:36:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'wvyo-j8y4-k61t-fu77', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/03/28 23:36:28 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.



---------- logged params ----------
{'best_C': '1',
 'best_kernel': 'linear',
 'cv': 'None',
 'error_score': 'nan',
 'estimator': 'SVC()',
 'n_jobs': 'None',
 'param_grid': "{'kernel': ('linear', 'rbf'), 'C': [1, 10]}",
 'pre_dispatch': '2*n_jobs',
 'refit': 'True',
 'return_train_score': 'False',
 'scoring': 'None',
 'verbose': '0'}

---------- logged metrics ----------
{'best_cv_score': 0.9800000000000001,
 'training_accuracy_score': 0.9933333333333333,
 'training_f1_score': 0.9933326665999933,
 'training_precision_score': 0.9934640522875816,
 'training_recall_score': 0.9933333333333333,
 'training_score': 0.9933333333333333}

---------- logged tags ----------
{'estimator_class': 'sklearn.model_selection._search.GridSearchCV',
 'estimator_name': 'GridSearchCV'}

---------- logged artifacts ----------
['best_estimator/MLmodel',
 'best_estimator/conda.yaml',
 'best_estimator/model.pkl',
 'best_estimator/python_env.yaml',
 'best_estimator/requirements.txt',
 'cv_results.csv',
 'model/M