### Train, log, and register models. 
**Relevant docs**:
 - [MLflow tracking APIs](https://mlflow.org/docs/latest/ml/tracking/tracking-api/)
 - [MLflow Python APIs](https://mlflow.org/docs/latest/api_reference/python_api/)
 - [MLflow evaluation](https://mlflow.org/docs/latest/ml/evaluation/)

In [0]:
%pip install mlflow==3.0.1 xgboost==3.0.0
%restart_python

In [0]:
import requests
import re
from io import StringIO
from typing import Iterable, Tuple
from sklearn.datasets import load_iris
import pandas as pd
import mlflow
from mlflow.models import infer_signature

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

import pyspark.sql.functions as func
from pyspark.sql.functions import col

In [0]:
# Start with scikit-learn autologging switched off.
mlflow.sklearn.autolog(disable=True)

# Experiment that receives the runs logged from this notebook.
# NOTE(review): this is a hard-coded personal user folder — point it at your own
# workspace path before running.
exerpiment_path = "/Users/marshall.carter@databricks.com/workshop_experiment_mlc"
mlflow.set_experiment(exerpiment_path)

# Use Unity Catalog as the model registry.
mlflow.set_registry_uri("databricks-uc")

In [0]:
# Text widgets for the catalog, schema and model name; each starts empty.
for widget_name, widget_label in (
    ("catalog_name", "Enter catalog name"),
    ("schema_name", "Enter schema name"),
    ("model_name", "Enter model name"),
):
    dbutils.widgets.text(widget_name, "", widget_label)

In [0]:
# Kept commented out on purpose: uncomment and run to remove all notebook widgets.
#dbutils.widgets.removeAll()

In [0]:
# Read the widget values entered at the top of the notebook.
catalog_name = dbutils.widgets.get("catalog_name")
schema_name = dbutils.widgets.get("schema_name")
table_name = "advanced_churn_bronze_customers"

# Three-part <catalog>.<schema>.<name> identifiers for the table and the registered model.
uc_location = f"{catalog_name}.{schema_name}.{table_name}"
uc_model_name = f"{catalog_name}.{schema_name}.{dbutils.widgets.get('model_name')}"

print(f"UC location: {uc_location}\nModel name: {uc_model_name}")

In [0]:
# Source CSV for the Telco customer-churn dataset.
TELCO_CHURN_CSV_URL = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"


def to_snake_case(name: str) -> str:
    """Convert a raw CSV header (e.g. ``SeniorCitizen``) to snake_case (``senior_citizen``)."""
    # Put "_" in front of every capital letter except a leading one, then lowercase.
    name = re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower().replace("__", "_")
    # Drop parentheses, then turn spaces and hyphens into underscores.
    name = re.sub(r'[\(\)]', '', name)
    return re.sub(r'[ -]', '_', name)


# Fail fast on a hung connection or an HTTP error instead of silently parsing an
# error page as CSV.
response = requests.get(TELCO_CHURN_CSV_URL, timeout=30)
response.raise_for_status()
csv = response.text

df = pd.read_csv(StringIO(csv), sep=",")
df.columns = [to_snake_case(name) for name in df.columns]

# Non-numeric entries in total_charges become NaN.
df["total_charges"] = pd.to_numeric(df["total_charges"], errors="coerce")

# The column names `customer_i_d` and `streaming_t_v` are left exactly as the regex
# produces them: the Spark schema and the feature split later in this notebook
# reference these names.
df.head()

Convert the pandas DataFrame to a Spark DataFrame and save it as a Unity Catalog table

In [0]:
# (column name, Spark type, nullable) for every column of the churn table, in schema order.
telco_columns = [
    ("customer_i_d", StringType(), False),
    ("gender", StringType(), True),
    ("senior_citizen", IntegerType(), True),
    ("partner", StringType(), True),
    ("dependents", StringType(), True),
    ("tenure", IntegerType(), True),
    ("phone_service", StringType(), True),
    ("multiple_lines", StringType(), True),
    ("internet_service", StringType(), True),
    ("online_security", StringType(), True),
    ("online_backup", StringType(), True),
    ("device_protection", StringType(), True),
    ("tech_support", StringType(), True),
    ("streaming_t_v", StringType(), True),
    ("streaming_movies", StringType(), True),
    ("contract", StringType(), True),
    ("paperless_billing", StringType(), True),
    ("payment_method", StringType(), True),
    ("monthly_charges", DoubleType(), True),
    ("total_charges", DoubleType(), True),
    ("churn", StringType(), True),
]

telco_schema = StructType(
    [
        StructField(column_name, column_type, nullable=is_nullable)
        for column_name, column_type, is_nullable in telco_columns
    ]
)

# Apply the explicit schema and overwrite the table on every run.
spark_df = spark.createDataFrame(df, schema=telco_schema)
spark_df.write.mode("overwrite").saveAsTable(uc_location)

In [0]:
# Read the saved table back and convert it to pandas; the cells below build X and y from it.
df = spark.table(uc_location).toPandas()
df.head()

Split features and target

In [0]:
# Binary target: "Yes" -> 1, "No" -> 0, stored as int8.
churn_to_label = {"Yes": 1, "No": 0}
y = df["churn"].map(churn_to_label).astype("int8")

# Features: every column except the target and the customer identifier.
X = df.drop(columns=["churn", "customer_i_d"])

Create feature pre-processor

In [0]:
def create_preprocessor(
    numeric_features: Iterable[str], categorical_features: Iterable[str]
) -> ColumnTransformer:
    """Return a ColumnTransformer that prepares the feature columns for XGBoost.

    Args:
        numeric_features: Columns that are coerced to numbers, median-imputed
            and standard-scaled.
        categorical_features: Columns that are imputed with the most frequent
            value and one-hot encoded (unknown categories are ignored).

    Returns:
        An unfitted ColumnTransformer with a "numeric" and a "categorical" branch.
    """
    # NOTE(review): a lambda cannot be serialized with the standard `pickle` module;
    # a named module-level function would be more portable if the pipeline is ever
    # saved that way.
    coerce_numeric = FunctionTransformer(
        lambda data: data.apply(pd.to_numeric, errors="coerce"), validate=False
    )

    numeric_steps = [
        ("coerce_numeric", coerce_numeric),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    branches = [
        ("numeric", Pipeline(steps=numeric_steps), list(numeric_features)),
        ("categorical", Pipeline(steps=categorical_steps), list(categorical_features)),
    ]
    return ColumnTransformer(transformers=branches)

Combine feature pre-processor with model

In [0]:
# Columns treated as numeric (kept only when present in X); every other column is categorical.
numeric_candidates = ["senior_citizen", "tenure", "monthly_charges", "total_charges"]
numeric_features = [name for name in numeric_candidates if name in X.columns]
categorical_features = [name for name in X.columns if name not in numeric_features]

preprocessor = create_preprocessor(numeric_features, categorical_features)

xgb_classifier = XGBClassifier(objective="binary:logistic", eval_metric="logloss")

# Chain preprocessing and the classifier into a single estimator.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", xgb_classifier),
    ]
)

Train model, logging fitted model, parameters, and other artifacts to an MLflow Experiment Run

In [0]:
def train_model(run_name, disable_autolog=True, evaluate=True):
    """Fit the notebook-level ``pipeline`` inside an MLflow run and log the result.

    Uses the notebook-level ``X``, ``y`` and ``pipeline`` objects. The data is split
    80/20 (stratified, fixed seed), the pipeline is fitted on the training part, and
    the validation ROC AUC, the fitted pipeline and two tags are logged to the run.

    Args:
        run_name: Name given to the MLflow run.
        disable_autolog: Forwarded to ``mlflow.autolog(disable=...)``; ``True``
            switches autologging off for this call.
        evaluate: When true, additionally run ``mlflow.models.evaluate`` on the
            validation split and print the resulting metrics and the names of
            SHAP artifacts.

    Returns:
        dict with the keys ``"run_id"`` and ``"model_id"``.
    """
    # Use the caller's run name (it was previously ignored in favour of a fixed name).
    with mlflow.start_run(run_name=run_name) as run:

        mlflow.autolog(disable=disable_autolog)

        run_id = run.info.run_id

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        pipeline.fit(X_train, y_train)

        # ROC AUC is a ranking metric, so score it on the positive-class probability
        # rather than on hard 0/1 predictions.
        val_scores = pipeline.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, val_scores)

        mlflow.log_metric("val_roc_auc", auc)
        signature = infer_signature(X, y)

        model_info = mlflow.sklearn.log_model(
            sk_model=pipeline,
            signature=signature,
            input_example=X.iloc[:3],
            name="pipeline",
        )

        # Record how this run was configured.
        tags = {"disable_autolog": disable_autolog, "evaluate": evaluate}
        mlflow.set_tags(tags)

        if evaluate:
            # The evaluator takes features and target in one frame.
            eval_dataset = X_val.copy()
            eval_dataset["label"] = y_val

            val_results = mlflow.models.evaluate(
                model_info.model_uri,
                data=eval_dataset,
                targets="label",
                model_type="classifier",
                evaluator_config={
                    "log_explainer": True,
                    "explainer_type": "exact",
                    "log_model_explanations": True,
                    "metric_prefix": "val_",
                },
            )

            for metric_name, value in val_results.metrics.items():
                print(f"{metric_name}: {value}")

            for artifact_name in val_results.artifacts:
                if "shap" in artifact_name.lower():
                    print(f"Generated: {artifact_name}")

        model_id = model_info.model_id

        return {"run_id": run_id, "model_id": model_id}

Basic logging, no autologging or evaluation

In [0]:
# Manual logging only: autologging off, evaluation skipped.
basic_logger = train_model(
    run_name="func_run",
    disable_autolog=True,
    evaluate=False,
)

Add autologging

In [0]:
# Same training call with autologging switched on; evaluation still skipped.
auto_logger = train_model(
    run_name="func_run",
    disable_autolog=False,
    evaluate=False,
)

Add autologging and evaluation

In [0]:
# Autologging on and the evaluation step enabled.
auto_eval_logger = train_model(
    run_name="func_run",
    disable_autolog=False,
    evaluate=True,
)

Get run and model ids

In [0]:
# Show the run_id / model_id pair returned by the autologging + evaluation run.
print(auto_eval_logger)

#### Inference using a pandas DataFrame

In [0]:
# Single quotes inside the replacement field keep this f-string valid on
# Python < 3.12, where reusing the outer quote character is a syntax error.
model_uri = f"models:/{auto_eval_logger['model_id']}"
pyfunc_model = mlflow.pyfunc.load_model(model_uri)

# Score the input example that was logged together with the model.
input_data = pyfunc_model.input_example
predictions_pandas = pyfunc_model.predict(input_data)
predictions_pandas

#### Distributed inference over a cluster using a pandas UDF

In [0]:
# Keep only the feature columns (the same columns as X) from the saved table.
feature_columns = [name for name in X.columns]
spark_df = spark.table(uc_location).selectExpr(feature_columns)
display(spark_df)

In [0]:
# DoubleType, func and col come from the import cell at the top of the notebook.
# Wrap the logged model as a Spark UDF that returns one double per row.
loaded_model = mlflow.pyfunc.spark_udf(
    spark,
    model_uri=model_uri,
    env_manager="local",
    result_type=DoubleType(),
)

# Hand every column of spark_df to the UDF as a single struct argument.
feature_struct = func.struct(*[col(name) for name in spark_df.columns])
predictions_spark = spark_df.withColumn("predictions", loaded_model(feature_struct))
display(predictions_spark)

### Registering models

In [0]:
# Client used for the model-version and alias calls in the registration cell below.
from mlflow.tracking import MlflowClient
client = MlflowClient()

If no model version is registered, set the current model as the production model. Otherwise, compare the current model with the production model and swap them if the current model is better.

View the changes to the model versions and aliases in Unity Catalog

In [0]:
# Version numbers that exist before this cell registers anything.
versions = sorted(
    int(mv.version) for mv in client.search_model_versions(f"name='{uc_model_name}'")
)

# Version currently carrying the "production" alias. Stays None when the model has no
# versions yet, or when versions exist but none of them holds the alias (a plain
# ["production"] lookup would raise KeyError in that case).
production_model_version = None
if versions:
    model_aliases = client.get_registered_model(uc_model_name).aliases
    production_model_version = model_aliases.get("production")

if production_model_version is None:
    # Nothing in production: register the current model and promote it directly.
    # NOTE(review): 0.75 is a hard-coded stand-in score, not a measured metric.
    model_info = mlflow.register_model(
        model_uri,
        name=uc_model_name,
        tags={"val_auc_score": 0.75},
    )

    client.set_registered_model_alias(
        name=uc_model_name,
        alias="production",
        version=model_info.version,
    )

# If a production model already exists, compare the models and swap them
else:
    current_production_model = client.get_model_version(
        name=uc_model_name,
        version=production_model_version,
    )
    production_auc_score = float(current_production_model.tags["val_auc_score"])

    # NOTE(review): the new score is simulated as "production + 0.01", so the swap
    # below always happens; replace it with a measured metric for real use.
    new_model_auc_score = production_auc_score + 0.01

    model_info = mlflow.register_model(
        model_uri,
        name=uc_model_name,
        tags={"val_auc_score": new_model_auc_score},
    )

    client.set_registered_model_alias(
        name=uc_model_name,
        alias="staging",
        version=model_info.version,
    )

    # Swap staging model as production model if new model is better
    if new_model_auc_score >= production_auc_score:
        # Set staging model as production model
        client.set_registered_model_alias(
            name=uc_model_name,
            alias="production",
            version=model_info.version,
        )

        # Archive current production model
        client.set_registered_model_alias(
            name=uc_model_name,
            alias="archived",
            version=production_model_version,
        )

        # Remove staging alias - no model is currently in staging
        client.delete_registered_model_alias(uc_model_name, "staging")

        # Delete the version just before the previous production version, if it exists
        prior_version = int(production_model_version) - 1
        if prior_version in versions:
            client.delete_model_version(
                name=uc_model_name,
                version=str(prior_version),
            )