In [0]:
%pip install scikit-learn==1.7.0 databricks-sdk>=0.28.0
%restart_python

In [0]:
import uuid
from sklearn.datasets import make_blobs
import pandas as pd
import numpy as np
import mlflow
from pyspark.sql.functions import col
import pyspark.sql.functions as func

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.catalog import MonitorTimeSeries

In [0]:
# Text widgets (name, default value, label) for the catalog, schema and table suffix
# that the following cells read to build table names.
dbutils.widgets.text('catalog_name','','Enter catalog name')
dbutils.widgets.text('schema_name','','Enter schema name')
dbutils.widgets.text('table_suffix','','Enter table suffix')

In [0]:
# Read the widget values and combine catalog and schema into a "catalog.schema" prefix.
catalog_name = dbutils.widgets.get('catalog_name')
schema_name = dbutils.widgets.get('schema_name')
table_suffix = dbutils.widgets.get('table_suffix')
uc_location = f"{catalog_name}.{schema_name}"

print(f"UC location: {uc_location}")

In [0]:
# Sizes of the synthetic datasets: rows in the baseline and rows per drifted period.
N_SAMPLES = 10000
N_DRIFT_SAMPLES = 2000
# Number of clusters and number of feature columns passed to the data generator.
CENTERS = 4
N_FEATURES = 20
# Timestamp stamped on the baseline rows, and one timestamp per drifted period.
TRAIN_TIMESTAMP = "2026-01-01 00:00:00"
FEATURE_DRIFT_TIMESTAMPS = ["2026-01-02 00:00:00", "2026-01-03 00:00:00", "2026-01-04 00:00:00"]

# Fully qualified table names; the suffix comes from the `table_suffix` widget.
BASELINE_UC_TABLE = f"{uc_location}.baseline_features_{table_suffix}"
DRIFT_UC_TABLE = f"{uc_location}.drifted_features_{table_suffix}"

print(f"Baseline UC table: {BASELINE_UC_TABLE}\nDrift UC table: {DRIFT_UC_TABLE}")

Generate a baseline training dataset

In [0]:
def generate_dataset(n_samples, centers, n_features, timestamp, random_state=42, cluster_std=1):
  """
  Build a synthetic clustered dataset and return it as a Spark DataFrame.

  The points and their cluster labels come from `make_blobs`. Every row gets
  the same `timestamp` value and a freshly generated UUID string as
  `customer_id`.

  Args:
    n_samples: forwarded to `make_blobs` as `n_samples`.
    centers: forwarded to `make_blobs` as `centers`.
    n_features: forwarded to `make_blobs`; also sets how many `feature_<i>`
      column names are created.
    timestamp: value converted with `pd.Timestamp` and written to every row.
    random_state: forwarded to `make_blobs`.
    cluster_std: forwarded to `make_blobs`.

  Returns:
    Spark DataFrame with columns `customer_id`, `timestamp`,
    `feature_0` ... `feature_<n_features - 1>`, `cluster` (in that order).
  """
  feature_names = [f'feature_{idx}' for idx in range(n_features)]
  label_name = "cluster"

  points, labels = make_blobs(n_samples=n_samples,
                              centers=centers,
                              cluster_std=cluster_std,
                              random_state=random_state,
                              n_features=n_features)

  # Start from the feature matrix and attach the remaining columns one by one.
  dataset_pdf = pd.DataFrame(points, columns=feature_names)
  dataset_pdf[label_name] = labels
  dataset_pdf['timestamp'] = pd.Timestamp(timestamp)
  dataset_pdf['customer_id'] = [str(uuid.uuid4()) for _ in range(len(dataset_pdf))]

  # Identifier and timestamp first, then the features, then the label.
  ordered_columns = ["customer_id", "timestamp", *feature_names, label_name]
  return spark.createDataFrame(dataset_pdf[ordered_columns])

In [0]:
# Generate the baseline dataset, persist it (overwriting any previous table),
# then re-read it from the table so later cells work from the saved copy.
baseline_features = generate_dataset(n_samples = N_SAMPLES, 
                                     centers = CENTERS,
                                     n_features = N_FEATURES,
                                     timestamp = TRAIN_TIMESTAMP)

baseline_features.write.mode('overwrite').saveAsTable(BASELINE_UC_TABLE)
baseline_features = spark.table(BASELINE_UC_TABLE)
display(baseline_features)

In [0]:
# Row count of the saved baseline table.
baseline_features.count()

Generate drifted features

In [0]:
def generate_drift(baseline_df, n_samples, timestamp, loc=2.0, scale=2.0, random_state=42, cluster_std=2.0):
  """
  Generate a drifted dataset whose cluster centers are shifted away from
  those observed in `baseline_df`.

  The per-cluster mean of every `feature_*` column is used as an approximate
  center. A normally distributed offset (mean `loc`, standard deviation
  `scale`) is added to each center coordinate, and a new dataset is drawn
  around the shifted centers with `generate_dataset`.

  Args:
    baseline_df: Spark DataFrame with a `cluster` column and `feature_*` columns.
    n_samples: number of rows to generate.
    timestamp: timestamp written to every generated row.
    loc: mean of the offset added to each center coordinate.
    scale: standard deviation of that offset.
    random_state: forwarded to `generate_dataset`.
    cluster_std: forwarded to `generate_dataset`.

  Returns:
    The Spark DataFrame produced by `generate_dataset`.
  """
  feature_cols = [c for c in baseline_df.columns if c.startswith("feature_")]

  # One row per cluster (ordered by cluster id), one column per feature.
  approx_centers = (
    baseline_df
    .groupBy("cluster")
    .agg(*[func.avg(c).alias(c) for c in feature_cols])
    .orderBy("cluster")
    .toPandas()
    .drop("cluster", axis=1)
    .to_numpy()
  )

  # The offset generator uses a fixed seed, independent of `random_state`,
  # so the same offsets are drawn on every call. The offset matrix takes its
  # shape from the aggregated centers, which avoids a second Spark action
  # just to count clusters.
  rng = np.random.default_rng(seed=100)
  drift_shift = rng.normal(loc=loc, scale=scale, size=approx_centers.shape)
  drifted_centers = approx_centers + drift_shift

  return generate_dataset(n_samples=n_samples,
                          centers=drifted_centers,
                          n_features=len(feature_cols),
                          timestamp=timestamp,
                          random_state=random_state,
                          cluster_std=cluster_std)

In [0]:
# Each period is generated from the previous period's data, so drift compounds
# from one timestamp to the next.
current_df = baseline_features

for i, ts in enumerate(FEATURE_DRIFT_TIMESTAMPS):
    drifted_features = generate_drift(
        baseline_df=current_df,
        n_samples=N_DRIFT_SAMPLES,
        timestamp=ts,
    )

    # Replace the table on the first period and append afterwards, so that
    # re-running the notebook does not stack duplicate rows on earlier runs.
    write_mode = 'overwrite' if i == 0 else 'append'
    drifted_features.write.mode(write_mode).saveAsTable(DRIFT_UC_TABLE)

    # next period should drift from the prior period's drift
    current_df = drifted_features

drifted_features = spark.table(DRIFT_UC_TABLE)
display(drifted_features)

In [0]:
# Row count per timestamp in the drift table, ordered by timestamp.
display(drifted_features.groupBy("timestamp").count().orderBy("timestamp"))

#### Create a monitor using the [TimeSeries profile](https://docs.databricks.com/aws/en/lakehouse-monitoring/create-monitor-api#timeseries-profile).

In [0]:
# Show the baseline table that the monitor compares against.
display(spark.table(BASELINE_UC_TABLE))

In [0]:
# Show the drift table that the monitor is attached to.
display(spark.table(DRIFT_UC_TABLE))

In [0]:
### UPDATED API

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.dataquality import Monitor, DataProfilingConfig, TimeSeriesConfig, AggregationGranularity, DataProfilingStatus, RefreshState, Refresh

w = WorkspaceClient()

# This API takes the ids of the output schema and the monitored table rather
# than their names, so look both objects up first.
monitor_schema = w.schemas.get(full_name=uc_location)
monitored_table = w.tables.get(full_name=DRIFT_UC_TABLE)

# NOTE(review): assets_dir points at one specific user's workspace folder;
# other users will likely need to change it.
config = DataProfilingConfig(
 output_schema_id=monitor_schema.schema_id,
 assets_dir=f"/Workspace/Users/marshall.carter@databricks.com/mlops_workshop/04_lakehouse_monitor/{DRIFT_UC_TABLE}",
 # Same baseline as the legacy quality_monitors cell, so that drift against
 # the baseline table is computed.
 baseline_table_name=BASELINE_UC_TABLE,
 time_series=TimeSeriesConfig(
    timestamp_column="timestamp",
    granularities=[AggregationGranularity.AGGREGATION_GRANULARITY_1_DAY])
)

info = w.data_quality.create_monitor(
   monitor=Monitor(
     object_type="table",     # object_type is always "table" for data profiling
     object_id=monitored_table.table_id,
     data_profiling_config=config,
   ),
)

In [0]:
# Create a time-series monitor on the drift table through the quality_monitors
# API, using the baseline table for comparison and 1-day windows on `timestamp`.
# NOTE(review): assets_dir points at one specific user's workspace folder.
# NOTE(review): presumably fails if a monitor already exists on this table
# (e.g. created by the cell above) — confirm.
w = WorkspaceClient()

info = w.quality_monitors.create(
  table_name=DRIFT_UC_TABLE,
  baseline_table_name=BASELINE_UC_TABLE,
  assets_dir=f"/Workspace/Users/marshall.carter@databricks.com/mlops_workshop/04_lakehouse_monitor/{DRIFT_UC_TABLE}",
  output_schema_name=uc_location,
  time_series=MonitorTimeSeries(timestamp_col="timestamp", granularities=["1 day"])
)

In [0]:
# Fetch the monitor attached to the drift table.
w.quality_monitors.get(table_name=DRIFT_UC_TABLE)

View the monitor update status. Creating a monitor also triggers an update. After this, updates can be triggered either manually or by [setting a schedule](https://docs.databricks.com/aws/en/lakehouse-monitoring/create-monitor-api#schedule).

In [0]:
# List the refreshes recorded for the drift table's monitor.
w.quality_monitors.list_refreshes(table_name=DRIFT_UC_TABLE).refreshes

In [0]:
# Manually trigger a refresh of the monitor on the drift table.
refresh = w.quality_monitors.run_refresh(table_name=DRIFT_UC_TABLE)

In [0]:
# List the refreshes again to see the one triggered above.
w.quality_monitors.list_refreshes(table_name=DRIFT_UC_TABLE).refreshes

#### View Lakehouse Monitor tables

#### Profile metrics

In [0]:
# NOTE(review): this cell held a bare table name, which raises NameError when run
# (`shared` is not a Python variable). It looks like a profile metrics table name
# pasted from an earlier run: shared.mlc_schema.drifted_features_mlc_profile_metrics

In [0]:
# Look up the profile metrics table generated by the monitor on the drift table.
PROFILE_METRICS_TABLE = (w.quality_monitors.get(table_name=DRIFT_UC_TABLE)
                                           .profile_metrics_table_name)

print(PROFILE_METRICS_TABLE)

profile_metrics_df = spark.table(PROFILE_METRICS_TABLE)

# Every column of the drift table except the id and timestamp. The loop variable
# is not named `col`, to avoid confusion with pyspark's `col` function used below.
features_and_prediction = [name for name in spark.table(DRIFT_UC_TABLE).columns if name not in ["customer_id", "timestamp"]]

# Keep only INPUT rows for those columns; each metric is selected once.
display(profile_metrics_df.filter((col("log_type") == "INPUT") & (col("column_name").isin(features_and_prediction)))
                          .select(["window", "log_type", "granularity", "column_name", "count",
                                   "data_type", "num_nulls", "avg", "median", "quantiles", "min", "max",
                                   "num_zeros", "num_nan", "percent_nan", "percent_null", "percent_distinct"]))

#### Drift metrics overview
**KS Test**: A non-parametric test that measures the maximum distance between the cumulative distribution functions of two distributions. It returns a statistic and a p-value; a statistic close to 0 means the distributions are similar, while a statistic close to 1 means they are very different. This interpretation assumes a small p-value (<0.05). This statistic expects continuous variables.

**Wasserstein Distance**: Measures the minimal amount of work needed to transform one distribution into another (how much mass must be moved and how far?). Identical distributions have a value of 0.0, while higher values indicate larger differences. Its value is always >= 0. This statistic expects continuous variables.

**Population Stability Index (PSI)**: Compares the binned distributions of a feature across two datasets. Roughly speaking, values of < 0.1 mean no significant drift, >= 0.1 and <= 0.25 mean moderate drift, and > 0.25 mean significant drift. PSI works well for detecting changes in the frequency distribution of categorical variables.


Below results:
 - The high population stability index score (5.02) on the cluster's column indicates **significant drift** in the predicted cluster distribution.  
 - For feature columns:
   - The **KS Test** indicates moderate difference in data distributions compared to baseline.
   - **Wasserstein Distance** is frequently above 1; a strong signal for drift.
   - The **PSI** is typically > 0.25, indicating significant drift.

In [0]:
# NOTE(review): this cell held a bare table name, which raises NameError when run
# (`shared` is not a Python variable). It looks like a profile metrics table name
# pasted from an earlier run: shared.mlc_schema.drifted_features_mlc_profile_metrics

In [0]:
# Look up the monitor's profile metrics table name and display the whole table.
PROFILE_METRICS_TABLE = (w.quality_monitors.get(table_name=DRIFT_UC_TABLE)
                                           .profile_metrics_table_name)

print(PROFILE_METRICS_TABLE)

display(spark.table(PROFILE_METRICS_TABLE))

In [0]:
# Look up the monitor's drift metrics table name and print it.
DRIFT_METRICS_TABLE = (w.quality_monitors.get(table_name=DRIFT_UC_TABLE)
                                         .drift_metrics_table_name)

#display(spark.table(DRIFT_METRICS_TABLE))
print(DRIFT_METRICS_TABLE)

In [0]:
# Look up the drift metrics table generated by the monitor on the drift table.
DRIFT_METRICS_TABLE = (w.quality_monitors.get(table_name=DRIFT_UC_TABLE)
                                         .drift_metrics_table_name)

print(DRIFT_METRICS_TABLE)

# Stored under its own name so the profile metrics DataFrame loaded earlier
# is not overwritten with drift data.
drift_metrics_df = spark.table(DRIFT_METRICS_TABLE)

# Every column of the drift table except the id and timestamp. The loop variable
# is not named `col`, to avoid confusion with pyspark's `col` function used below.
features_and_prediction = [name for name in spark.table(DRIFT_UC_TABLE).columns if name not in ["customer_id", "timestamp"]]
display(drift_metrics_df.filter(col("column_name").isin(features_and_prediction))
                        .select(["window", "granularity", "column_name", "data_type", "drift_type",
                                 "ks_test", "wasserstein_distance", "population_stability_index"])
                        .orderBy("column_name"))

#### Create a monitor query
Return rows that indicate drift

In [0]:
# Rows from the most recent window whose drift against the baseline crosses the
# thresholds below. The table name comes from the monitor (DRIFT_METRICS_TABLE)
# rather than being hard-coded, so the query reads the metrics this notebook
# produced.
drift_alerts_df = spark.sql(f"""
SELECT window,
       column_name,
       data_type,
       drift_type,
       count_delta,
       ks_test,
       wasserstein_distance,
       population_stability_index
FROM {DRIFT_METRICS_TABLE}
WHERE drift_type = 'BASELINE' AND
      window.end = (SELECT MAX(window.end) FROM {DRIFT_METRICS_TABLE}) AND
      ((ks_test.pvalue < 0.05 AND ks_test.statistic > 0.2) OR wasserstein_distance >= 0.1)
""")

display(drift_alerts_df)

#### View full profile metrics table
Percent distinct is calculated in this table

In [0]:
# Read the profile metrics table reported by the monitor instead of a hard-coded
# name, so this cell shows the metrics for the table monitored in this notebook.
profile_metrics = spark.table(PROFILE_METRICS_TABLE)
display(profile_metrics)

#### View full drift metrics table

In [0]:

# Read the drift metrics table reported by the monitor instead of a hard-coded
# name, so this cell shows the metrics for the table monitored in this notebook.
metrics = spark.table(DRIFT_METRICS_TABLE)
display(metrics)

#### After model retraining, recalculate Lakehouse Monitor's metrics against a new baseline table.
 - Either overwrite the monitor's baseline table or update it to a different table
 - Continue to write predictions to the same inference table

In [0]:
# Example (kept inside a string so it is not executed; replace the placeholder first):
"""
monitor_update = w.quality_monitors.update(table_name=DRIFT_UC_TABLE, # Drift table stays the same
                                           baseline_table_name=<Updated baseline table>, # New or overwritten baseline table
                                           output_schema_name=uc_location,
                                           time_series=MonitorTimeSeries(timestamp_col="timestamp", granularities=["1 day"]))

refresh = w.quality_monitors.run_refresh(table_name=DRIFT_UC_TABLE)
"""