# MLflow: 
### `Log data, dependencies, model, metrics etc. to Unity Catalog`

In [0]:
# %sql
# DROP TABLE IF EXISTS mmt_demos.dependencies.iris_data;

spark.sql("DROP TABLE IF EXISTS mmt_demos.dependencies.iris_data")

DataFrame[]

In [0]:
import mlflow
from mlflow.tracking import MlflowClient

# Set registry to Unity Catalog 
mlflow.set_registry_uri("databricks-uc")

# Initialize the MLflow client
client = MlflowClient()

# Define the model name
model_name = "mmt_demos.dependencies.iris_rfclassifier"

# List all versions of the model
versions = [mv.version for mv in client.search_model_versions(f"name='{model_name}'")]

# Delete all versions of the model
for version in versions:
    client.delete_model_version(name=model_name, version=version)

# Delete the registered model
client.delete_registered_model(name=model_name)

In [0]:
# import mlflow

# # Set registry to Unity Catalog 
# mlflow.set_registry_uri("databricks-uc")

# # Delete version 3 of the model
# model_name = "mmt_demos.dependencies.iris_rfclassifier"
# version = 3

# mlflow.registered_model.delete_model_version(name=model_name, version=version)

In [0]:
mlflow.__version__ 
#'2.9.2'

'2.9.2'

In [0]:
import mlflow
from mlflow.models import infer_signature
import pandas as pd
import pyspark.pandas as ps
from sklearn.datasets import load_iris
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
import os

In [0]:
# Create the volume in Unity Catalog
spark.sql("CREATE VOLUME IF NOT EXISTS mmt_demos.dependencies.iris_rfclassifier")

# Create the directory within the volume
volume_path = "/Volumes/mmt_demos/dependencies/iris_rfclassifier"
dbutils.fs.mkdirs(volume_path)

True

In [0]:
dbutils.fs.ls("/dbfs/Volumes/mmt_demos/dependencies/")

[FileInfo(path='dbfs:/dbfs/Volumes/mmt_demos/dependencies/iris_rfclassifier/', name='iris_rfclassifier/', size=0, modificationTime=1745284722422)]

In [0]:
# Get the user's home directory path
user_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()

# Write a table to Unity Catalog
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df.rename(
  columns={col: col.replace(' (cm)', '').replace(' ', '_') for col in iris_df.columns},
  inplace=True
)
iris_df['species'] = iris.target
ps.from_pandas(iris_df).to_table("mmt_demos.dependencies.iris_data", mode="overwrite")
# table version could be specified during model logging

# Define the conda environment
conda_env = """
name: mlflow-env
channels:
  - defaults
dependencies:
  - python=3.8.5
  - scikit-learn=0.24.1
  - mlflow=2.9.2
  - pip
  - pip:
    - mlflow
    - pandas
    - pyspark
"""

# Write the conda_env to a UC Volume for subsequent reference in model logging
# Create the volume in Unity Catalog
# spark.sql("CREATE VOLUME IF NOT EXISTS mmt_demos.dependencies.iris_rfclassifier")

# Create the volume path
volume_path = "/Volumes/mmt_demos/dependencies/iris_rfclassifier"
dbutils.fs.mkdirs(volume_path)

# Write the conda_env to the volume
conda_env_volume_path = f"{volume_path}/conda_env.yaml"
os.makedirs(os.path.dirname(conda_env_volume_path), exist_ok=True)
with open(f"{conda_env_volume_path}", "w") as f:
    f.write(conda_env)

# Load a Unity Catalog table, train a model, and log the input table
dataset = mlflow.data.load_delta(table_name="mmt_demos.dependencies.iris_data", version="0") ## table version could be specified during model logging
pd_df = dataset.df.toPandas()
X = pd_df.drop("species", axis=1)
y = pd_df["species"]

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert integer columns to float64 to handle missing values
X_train = X_train.astype(float)
X_test = X_test.astype(float)

In [0]:
# Set registry to Unity Catalog 
mlflow.set_registry_uri("databricks-uc")

# Set the experiment explicitly
experiment_path = f"/Users/{user_path}/mlflow_experiments/dependencies/iris_data_rfclassifier"
os.makedirs(os.path.dirname(experiment_path), exist_ok=True)
mlflow.set_experiment(experiment_path)

# Define the parameters for the Random Forest model
# params = {"n_estimators": 3, "random_state": 432}
params = {
    "n_estimators": 5,
    "random_state": 432,
    "max_depth": 3,
    "min_samples_split": 20,
    "min_samples_leaf": 10,
    "max_features": "log2", #"sqrt",
    "bootstrap": True
}

# Train a model, log input table, parameters, metrics etc.
with mlflow.start_run() as run:
    # Define the model
    rfc = RandomForestClassifier(**params).fit(X_train, y_train)

    # Specify the required model input and output schema 
    signature = infer_signature(X_train, rfc.predict(X_train))
    mlflow.log_input(dataset, "training")
    # Take the first row of the training dataset as the model input example.
    input_example = X_train.iloc[[0]]
    # Log the model and register it as a new version in UC.
    mlflow.log_params(params)
    
    ## Track model metrics with experiment run for subsequent comparisons 
    # Calculate and log training metrics
    train_predictions = rfc.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_predictions)
    train_precision = precision_score(y_train, train_predictions, average='weighted')
    train_recall = recall_score(y_train, train_predictions, average='weighted')
    # Log the test metrics
    mlflow.log_metric("train_accuracy", train_accuracy)
    mlflow.log_metric("train_precision", train_precision)
    mlflow.log_metric("train_recall", train_recall)
    
    # Calculate and log test metrics
    test_predictions = rfc.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_precision = precision_score(y_test, test_predictions, average='weighted')
    test_recall = recall_score(y_test, test_predictions, average='weighted')\
    # Log the test metrics
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("test_precision", test_precision)
    mlflow.log_metric("test_recall", test_recall)
    
    # Log the model and register it as a new version in UC 
    mlflow.sklearn.log_model(
        sk_model=rfc,        
        artifact_path="sklearn-rfclassifier-model",  
        signature=signature,
        input_example=input_example,
        conda_env=conda_env_volume_path,
        registered_model_name="mmt_demos.dependencies.iris_rfclassifier",
    )

In [0]:
# [Alternatively] Register outside of model logging
model_uri = f"runs:/{run.info.run_id}/sklearn-rfclassifier-model"
print(model_uri)

catalog_name = "mmt_demos"
schema_name = "dependencies"

mv = mlflow.register_model(model_uri,                
                           f"{catalog_name}.{schema_name}.iris_rfclassifier"
                           )