In [5]:
#****************************************************************************
# (C) Cloudera, Inc. 2020-2023
#  All rights reserved.
#
#  Applicable Open Source License: GNU Affero General Public License v3.0
#
#  NOTE: Cloudera open source products are modular software products
#  made up of hundreds of individual components, each of which was
#  individually copyrighted.  Each Cloudera open source product is a
#  collective work under U.S. Copyright Law. Your license to use the
#  collective work is as provided in your written agreement with
#  Cloudera.  Used apart from the collective work, this file is
#  licensed for your use pursuant to the open source license
#  identified above.
#
#  This code is provided to you pursuant a written agreement with
#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
#  this code. If you do not have a written agreement with Cloudera nor
#  with an authorized and properly licensed third party, you do not
#  have any rights to access nor to use this code.
#
#  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
#  DATA.
#
# #  Author(s): Paul de Fusco
#***************************************************************************/

In [6]:
import json
import logging
import os
import sys
import warnings

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

##### You can attach files and tags to an MLflow Experiment Run

In [7]:
# Create some files to preserve as artifacts
features = "rooms, zipcode, median_price, school_rating, transport"
data = {"state": "TX", "Available": 25, "Type": "Detached"}

# Create a couple of artifact files under the directory "data".
# Both files are written with an explicit UTF-8 encoding so their bytes do
# not depend on the platform's default locale encoding.
os.makedirs("data", exist_ok=True)
with open("data/data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)
with open("data/features.txt", "w", encoding="utf-8") as f:
    f.write(features)

# Custom tags; run_experiment() attaches them to its run via mlflow.set_tags
tags = {
    "engineering": "ML Platform",
    "release.candidate": "RC1",
    "release.version": "2.2.0",
}

The Experiment Run context allows you to access Experiment Run Metadata

In [8]:
def eval_metrics(actual, pred):
    """Score predictions against the observed values.

    Args:
        actual: Observed target values.
        pred: Predicted target values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: the square root of the mean squared
        error, the mean absolute error, and the R2 score.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [9]:
def run_experiment():
    """Train an ElasticNet model on the wine-quality data and log the run to MLflow.

    Selects the "wine-quality-test" experiment, downloads the wine-quality
    CSV, fits the model on the training split, prints RMSE / MAE / R2 for the
    test split, and logs the hyperparameters, metrics, fitted model, the
    module-level ``tags`` and every file in the local "data" directory to a
    new run.

    Raises:
        Exception: Whatever ``pd.read_csv`` raised when the CSV could not be
            loaded. The error is logged and re-raised, because nothing below
            can run without the data.
    """
    logger = logging.getLogger(__name__)

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    mlflow.set_experiment("wine-quality-test")

    # Read the wine-quality csv file from the URL
    csv_url = (
        "https://raw.githubusercontent.com/mlflow/mlflow-example/master/wine-quality.csv"
    )
    try:
        wine_df = pd.read_csv(csv_url, sep=",")
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e
        )
        # Propagate the failure instead of continuing with no data loaded.
        raise

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(wine_df)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    alpha = 0.8
    l1_ratio = 0.7

    # Everything logged inside this context is attached to the same run.
    with mlflow.start_run():

        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha={:f}, l1_ratio={:f}):".format(alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(lr, "model")

        # Set a batch of tags
        mlflow.set_tags(tags)

        # Write all files in "data" to root artifact_uri/states
        mlflow.log_artifacts("data", artifact_path="states")

In [10]:
run_experiment()

Elasticnet model (alpha=0.800000, l1_ratio=0.700000):
  RMSE: 0.8595414376735744
  MAE: 0.6480061258882328
  R2: 0.045769026631650944


The MLflow Tracking API allows you to retrieve Experiment and Experiment Run metadata in bulk for further analysis

In [11]:
#client = mlflow.tracking.MlflowClient()
#data = client.get_run(mlflow.latest_active_run().info.run_id).data

# Look up the experiment by name to obtain its ID
experiment_id = mlflow.get_experiment_by_name("wine-quality-test").experiment_id

# Fetch the same experiment again, this time by its ID
experiment = mlflow.get_experiment(experiment_id)

# Print important Experiment metadata, one field per line
print(
    "Name: {}".format(experiment.name),
    "Experiment_id: {}".format(experiment.experiment_id),
    "Artifact Location: {}".format(experiment.artifact_location),
    "Lifecycle_stage: {}".format(experiment.lifecycle_stage),
    sep="\n",
)

Name: wine-quality-test
Experiment_id: 2tv3-479z-x02f-jndi
Artifact Location: /home/cdsw/.experiments/2tv3-479z-x02f-jndi
Lifecycle_stage: active


You can load all runs of an Experiment into a pandas DataFrame

In [12]:
# Collect the active runs of the experiment into a pandas DataFrame.
# ViewType.ACTIVE_ONLY is the named constant for the value 1 used previously,
# and the experiment ID is passed as a list, which is the documented type.
runs_df = mlflow.search_runs(
    experiment_ids=[experiment_id],
    run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
)
runs_df

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.rmse,metrics.mae,metrics.r2,params.alpha,params.l1_ratio,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.source.git.commit,tags.engineID,tags.release.version,tags.release.candidate,tags.engineering
0,1ip4-kluf-v7zb-ag2q,2tv3-479z-x02f-jndi,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/2tv3-479z-x02f-jndi/1i...,2023-11-05 19:43:33.726781952+00:00,2023-11-05 19:43:38.683000064+00:00,0.859162,0.648351,0.046612,0.7,0.6,/usr/local/bin/ipython3,LOCAL,pauldefusco,"[{""run_id"": ""1ip4-kluf-v7zb-ag2q"", ""artifact_p...",06b7bed0031ad636f6b3ade4189ea15b164906f3,e4xbmrv1l9b1fkqf,,,
1,1ge2-g8tr-2nlb-jjcx,2tv3-479z-x02f-jndi,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/2tv3-479z-x02f-jndi/1g...,2023-11-05 19:43:46.718746880+00:00,2023-11-05 19:43:51.279000064+00:00,0.795855,0.619685,0.181935,0.4,0.2,/usr/local/bin/ipython3,LOCAL,pauldefusco,"[{""run_id"": ""1ge2-g8tr-2nlb-jjcx"", ""artifact_p...",06b7bed0031ad636f6b3ade4189ea15b164906f3,e4xbmrv1l9b1fkqf,,,
2,z96r-6hy8-k4w7-q65d,2tv3-479z-x02f-jndi,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/2tv3-479z-x02f-jndi/z9...,2023-11-05 19:46:02.406344960+00:00,2023-11-05 19:46:08.438000128+00:00,0.859541,0.648006,0.045769,0.8,0.7,/usr/local/lib/python3.9/site-packages/ipykern...,LOCAL,pauldefusco,"[{""run_id"": ""z96r-6hy8-k4w7-q65d"", ""artifact_p...",06b7bed0031ad636f6b3ade4189ea15b164906f3,3fw9f74t2egfv60p,2.2.0,RC1,ML Platform


##### MLflow Autologging

In [13]:
from mlflow.tracking import MlflowClient

In [14]:
def yield_artifacts(run_id, path=None):
    """Walk a run's artifact tree depth-first and yield every file path.

    Args:
        run_id: ID of the run whose artifacts are listed.
        path: Artifact directory to start from; ``None`` is passed through
            to ``list_artifacts`` unchanged.

    Yields:
        The ``path`` of each non-directory artifact, in listing order, with
        a directory's contents emitted at the position of that directory.
    """
    entries = MlflowClient().list_artifacts(run_id, path)
    for entry in entries:
        if not entry.is_dir:
            yield entry.path
            continue
        # Directories are expanded in place by recursing into them.
        yield from yield_artifacts(run_id, entry.path)

In [15]:
def fetch_logged_data(run_id):
    """Collect what was logged on a run into a single dictionary.

    Args:
        run_id: ID of the run to inspect.

    Returns:
        A dict with the keys ``"params"``, ``"metrics"``, ``"tags"`` and
        ``"artifacts"``. Tags whose key starts with ``"mlflow."`` are left
        out, and artifacts is the list of file paths produced by
        ``yield_artifacts``.
    """
    run_data = MlflowClient().get_run(run_id).data

    # Exclude system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
    user_tags = {}
    for key, value in run_data.tags.items():
        if key.startswith("mlflow."):
            continue
        user_tags[key] = value

    artifact_paths = [artifact for artifact in yield_artifacts(run_id)]

    return dict(
        params=run_data.params,
        metrics=run_data.metrics,
        tags=user_tags,
        artifacts=artifact_paths,
    )

In [16]:
from pprint import pprint
import pandas as pd
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [17]:
def run_iris():
    """Grid-search an SVC on the iris dataset with scikit-learn autologging on.

    Enables ``mlflow.sklearn.autolog()``, selects the "auto_experiments"
    experiment, fits a ``GridSearchCV`` over the kernel and ``C`` values,
    then prints the data logged on the last active run followed by a table of
    the runs tagged with that run as their parent.
    """
    mlflow.sklearn.autolog()
    mlflow.set_experiment("auto_experiments")

    parameters = {"kernel": ("linear", "rbf"), "C": [1, 5, 7, 10]}
    iris = datasets.load_iris()
    clf = GridSearchCV(svm.SVC(), parameters)
    clf.fit(iris.data, iris.target)

    run_id = mlflow.last_active_run().info.run_id

    # show data logged in the parent run
    print("========== parent run ==========")
    logged = fetch_logged_data(run_id)
    for section in logged:
        print("\n---------- logged {} ----------".format(section))
        pprint(logged[section])

    # show data logged in the child runs
    child_filter = "tags.mlflow.parentRunId = '{}'".format(run_id)
    runs = mlflow.search_runs(filter_string=child_filter)

    columns = ["run_id"]
    columns.extend("params.{}".format(name) for name in parameters)
    columns.append("metrics.mean_test_score")

    print("\n========== child runs ==========\n")
    pd.set_option("display.max_columns", None)  # prevent truncating columns
    print(runs[columns])

In [18]:
run_iris()

2023/11/05 19:46:26 INFO mlflow.tracking.fluent: Experiment with name 'auto_experiments' does not exist. Creating a new experiment.
2023/11/05 19:46:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'clpf-hea2-70n5-6af1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/11/05 19:46:35 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



---------- logged params ----------
{'best_C': '5',
 'best_kernel': 'rbf',
 'cv': 'None',
 'error_score': 'nan',
 'estimator': 'SVC()',
 'n_jobs': 'None',
 'param_grid': "{'kernel': ('linear', 'rbf'), 'C': [1, 5, 7, 10]}",
 'pre_dispatch': '2*n_jobs',
 'refit': 'True',
 'return_train_score': 'False',
 'scoring': 'None',
 'verbose': '0'}

---------- logged metrics ----------
{'best_cv_score': 0.9866666666666667,
 'training_accuracy_score': 0.9866666666666667,
 'training_f1_score': 0.9866613311991462,
 'training_precision_score': 0.9871794871794873,
 'training_recall_score': 0.9866666666666667,
 'training_score': 0.9866666666666667}

---------- logged tags ----------
{'engineID': '3fw9f74t2egfv60p',
 'estimator_class': 'sklearn.model_selection._search.GridSearchCV',
 'estimator_name': 'GridSearchCV'}

---------- logged artifacts ----------
['best_estimator/MLmodel',
 'best_estimator/conda.yaml',
 'best_estimator/model.pkl',
 'best_estimator/python_env.yaml',
 'best_estimator/requirement