# 08 - Model Monitoring

This notebook covers configuring model monitoring jobs for skew and drift detection:
1. Set skew and drift threshold.
2. Create a monitoring job for all the models under and endpoint.
3. List the monitoring jobs.
4. Simulate skewed prediction requests.
5. Pause and delete the monitoring job.

## Setup

### Import libraries

In [3]:
import copy
from datetime import datetime
import time
import random

from google.protobuf.duration_pb2 import Duration
from google.cloud import aiplatform as vertex_ai
from google.cloud import aiplatform_v1beta1 as vertex_ai_beta

### Setup Google Cloud project

In [1]:
PROJECT = 'pbalm-cxb-aa'
REGION = 'europe-west4'
BUCKET =  PROJECT + '-eu'
SERVICE_ACCOUNT = "188940921537-compute@developer.gserviceaccount.com"

if PROJECT == "" or PROJECT is None or PROJECT == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT = shell_output[0]
    
if BUCKET == "" or BUCKET is None or BUCKET == "[your-bucket-name]":
    # Get your bucket name to GCP project id
    BUCKET = PROJECT
    # Try to create the bucket if it doesn't exists
    ! gsutil mb -l $REGION gs://$BUCKET
    print("")

PARENT = f"projects/{PROJECT}/locations/{REGION}"

print("Project ID:", PROJECT)
print("Region:", REGION)
print("Bucket name:", BUCKET)
print("Vertex API Parent URI:", PARENT)

Project ID: pbalm-cxb-aa
Region: europe-west4
Bucket name: pbalm-cxb-aa-eu
Vertex API Parent URI: projects/pbalm-cxb-aa/locations/europe-west4


### Set configurations

In [4]:
DATASET_DISPLAY_NAME = 'creditcards'
ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'
MONITORING_JOB_NAME = f"monitor-{ENDPOINT_DISPLAY_NAME}"
NOTIFY_EMAILS = ["pbalm@google.com"] # Change to your email address.

LOG_SAMPLE_RATE = 0.8
MONITOR_INTERVAL = 3600
TARGET_FEATURE_NAME = 'Class'

vertex_ai.init(project=PROJECT, location=REGION)

## Create Job Service Client

In [5]:
job_client_beta = vertex_ai_beta.JobServiceClient(
    client_options={"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
)

## 1. Set Skew and Drift Thresholds

In [6]:
SKEW_THRESHOLDS = {
    'V1': 0.05,
    'V2': 0.05,
    'V3': 0.05,
    'V4': 0.05,
    'V5': 0.05,
    'V6': 0.05,
    'V7': 0.05,
    'V8': 0.05,
    'V9': 0.05,
    'V10': 0.05,
    'V11': 0.05,
    'V12': 0.05,
    'V13': 0.05,
    'V14': 0.05,
    'V15': 0.05,
    'V16': 0.05,
    'V17': 0.05,
    'V18': 0.05,
    'V19': 0.05,
    'V20': 0.05,
    'V21': 0.05,
    'V22': 0.05,
    'V23': 0.05,
    'V24': 0.05,
    'V25': 0.05,
    'V26': 0.05,
    'V27': 0.05,
    'V28': 0.05,
    'Amount': 10
}

DRIFT_THRESHOLDS = {
    'V1': 0.05,
    'V2': 0.05,
    'V3': 0.05,
    'V4': 0.05,
    'V5': 0.05,
    'V6': 0.05,
    'V7': 0.05,
    'V8': 0.05,
    'V9': 0.05,
    'V10': 0.05,
    'V11': 0.05,
    'V12': 0.05,
    'V13': 0.05,
    'V14': 0.05,
    'V15': 0.05,
    'V16': 0.05,
    'V17': 0.05,
    'V18': 0.05,
    'V19': 0.05,
    'V20': 0.05,
    'V21': 0.05,
    'V22': 0.05,
    'V23': 0.05,
    'V24': 0.05,
    'V25': 0.05,
    'V26': 0.05,
    'V27': 0.05,
    'V28': 0.05,
    'Amount': 10
}

## 2. Create Monitoring Job

### Retrieve the Vertex dataset and endpoint models to monitor

In [7]:
dataset = vertex_ai.TabularDataset.list(
    filter=f"display_name={DATASET_DISPLAY_NAME}", 
    order_by="update_time")[-1]

bq_source_uri = dataset.gca_resource.metadata["inputConfig"]["bigquerySource"]["uri"]
    
endpoint = vertex_ai.Endpoint.list(
    filter=f'display_name={ENDPOINT_DISPLAY_NAME}', 
    order_by="update_time")[-1]

endpoint_uri = endpoint.gca_resource.name

model_ids = [model.id for model in endpoint.list_models()]

### Configure the monitoring job

In [8]:
skew_thresholds = {
    feature: vertex_ai_beta.ThresholdConfig(value=float(value))
    for feature, value in SKEW_THRESHOLDS.items()
}

drift_thresholds = {
    feature: vertex_ai_beta.ThresholdConfig(value=float(value))
    for feature, value in DRIFT_THRESHOLDS.items()
}

skew_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(
    skew_thresholds=skew_thresholds
)

drift_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(
    drift_thresholds=drift_thresholds
)

sampling_config = vertex_ai_beta.SamplingStrategy(
    random_sample_config=vertex_ai_beta.SamplingStrategy.RandomSampleConfig(
        sample_rate=LOG_SAMPLE_RATE
    )
)

schedule_config = vertex_ai_beta.ModelDeploymentMonitoringScheduleConfig(
    monitor_interval=Duration(seconds=MONITOR_INTERVAL)
)

training_dataset = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingDataset(
    target_field=TARGET_FEATURE_NAME,
    bigquery_source = vertex_ai_beta.types.io.BigQuerySource(
        input_uri=bq_source_uri
    )
)


objective_template = vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(
    objective_config=vertex_ai_beta.ModelMonitoringObjectiveConfig(
        training_dataset=training_dataset,
        training_prediction_skew_detection_config=skew_config,
        prediction_drift_detection_config=drift_config,
    )
)

deployment_objective_configs = []
for model_id in model_ids:
    objective_config = copy.deepcopy(objective_template)
    objective_config.deployed_model_id = model_id
    deployment_objective_configs.append(objective_config)

alerting_config = vertex_ai_beta.ModelMonitoringAlertConfig(
    email_alert_config=vertex_ai_beta.ModelMonitoringAlertConfig.EmailAlertConfig(
        user_emails=NOTIFY_EMAILS
    )
)


### Instantiate a monitoring job

In [9]:
job = vertex_ai_beta.ModelDeploymentMonitoringJob(
    display_name=MONITORING_JOB_NAME,
    endpoint=endpoint_uri,
    model_deployment_monitoring_objective_configs=deployment_objective_configs,
    logging_sampling_strategy=sampling_config,
    model_deployment_monitoring_schedule_config=schedule_config,
    model_monitoring_alert_config=alerting_config,
)

### Submit the job for creation

In [10]:
response = job_client_beta.create_model_deployment_monitoring_job(
    parent=PARENT, model_deployment_monitoring_job=job
)
response

name: "projects/188940921537/locations/europe-west4/modelDeploymentMonitoringJobs/7098676866852061184"
display_name: "monitor-creditcards-classifier"
endpoint: "projects/188940921537/locations/europe-west4/endpoints/831318751528878080"
state: JOB_STATE_PENDING
schedule_state: OFFLINE
model_deployment_monitoring_objective_configs {
  deployed_model_id: "7049021822230069248"
  objective_config {
    training_dataset {
      bigquery_source {
        input_uri: "bq://pbalm-cxb-aa.vertex_eu.creditcards_ml"
      }
      target_field: "Class"
    }
    training_prediction_skew_detection_config {
      skew_thresholds {
        key: "Amount"
        value {
          value: 10.0
        }
      }
      skew_thresholds {
        key: "V1"
        value {
          value: 0.05
        }
      }
      skew_thresholds {
        key: "V10"
        value {
          value: 0.05
        }
      }
      skew_thresholds {
        key: "V11"
        value {
          value: 0.05
        }
      }
    

## 3. List Monitoring Jobs

In [21]:
monitoring_jobs = job_client_beta.list_model_deployment_monitoring_jobs(parent=PARENT)
monitoring_job = [entry for entry in monitoring_jobs if entry.display_name == MONITORING_JOB_NAME][0]
monitoring_job.state

<JobState.JOB_STATE_PENDING: 2>

## 4. Simulate skewed prediction requests

In [15]:
num_requests = 100

print("Simulation started...")
for idx in range(num_requests):
    request = [{
        'V1': [int(random.uniform(-1.0, 1.0))],
        'V2': [int(random.uniform(-1.0, 1.0))],
        'V3': [int(random.uniform(-1.0, 1.0))],
        'V4': [int(random.uniform(-1.0, 1.0))],
        'V5': [int(random.uniform(-1.0, 1.0))],
        'V6': [int(random.uniform(-1.0, 1.0))],
        'V7': [int(random.uniform(0.0, 5.0))],
        'V8': [int(random.uniform(-1.0, 1.0))],
        'V9': [int(random.uniform(-1.0, 1.0))],
        'V10': [int(random.uniform(-1.0, 1.0))],
        'V11': [int(random.uniform(-1.0, 1.0))],
        'V12': [int(random.uniform(-1.0, 1.0))],
        'V13': [int(random.uniform(-1.0, 1.0))],
        'V14': [int(random.uniform(-5.0, 0.0))],
        'V15': [int(random.uniform(-1.0, 1.0))],
        'V16': [int(random.uniform(-1.0, 1.0))],
        'V17': [int(random.uniform(-1.0, 1.0))],
        'V18': [int(random.uniform(-1.0, 1.0))],
        'V19': [int(random.uniform(-1.0, 1.0))],
        'V20': [int(random.uniform(-1.0, 1.0))],
        'V21': [int(random.uniform(-1.0, 1.0))],
        'V22': [int(random.uniform(-1.0, 1.0))],
        'V23': [int(random.uniform(-1.0, 1.0))],
        'V24': [int(random.uniform(-1.0, 1.0))],
        'V25': [int(random.uniform(-1.0, 1.0))],
        'V26': [int(random.uniform(-1.0, 1.0))],
        'V27': [int(random.uniform(-1.0, 1.0))],
        'V28': [int(random.uniform(-1.0, 1.0))],
        'Amount': [int(random.uniform(10, 100))]
    }]
    
    endpoint.predict(request)
    time.sleep(0.5)
    
    if idx % 10 == 0:
        print(f'{idx + 1} of {num_requests} prediction requests were invoked.')
print("Simulation finished.")

Simulation started...
1 of 100 prediction requests were invoked.
11 of 100 prediction requests were invoked.
21 of 100 prediction requests were invoked.
31 of 100 prediction requests were invoked.
41 of 100 prediction requests were invoked.
51 of 100 prediction requests were invoked.
61 of 100 prediction requests were invoked.
71 of 100 prediction requests were invoked.
81 of 100 prediction requests were invoked.
91 of 100 prediction requests were invoked.
Simulation finished.


## 5. Pause Monitoring Job

In [16]:
job_client_beta.pause_model_deployment_monitoring_job(name=monitoring_job.name)

FailedPrecondition: 400 Job (188940921537, 7098676866852061184) can only be paused under running / pending state, the current state is: PAUSED.

## Delete Monitoring Job

In [None]:
job_client_beta.delete_model_deployment_monitoring_job(name=monitoring_job.name)

In [17]:
job_client_beta.resume_model_deployment_monitoring_job(name=monitoring_job.name)