In [None]:
%store -r
import time
import os,json, random, string
import requests, operator
from dkube.sdk import *
from dkube.sdk.api import DkubeApi
from dkube.sdk.rsrcs import DkubeModelmonitor
from dkube.sdk.rsrcs.operator import DkubeCluster
from dkube.sdk.rsrcs.modelmonitor import DatasetClass,ModelType,DriftAlgo
from dkube.sdk.rsrcs.modelmonitor import DatasetFormat,DkubeModelmonitorAlert, TimeZone
from dkube.sdk.rsrcs.modelmonitor import DataType, ChannelOrder, ImageDataSavedFileFormat

In [None]:
# Run settings, all read from `image_exp_config`.
# NOTE(review): `image_exp_config` is not defined in this notebook; presumably
# it is restored by the `%store -r` magic in the first cell - confirm.

# Monitor name, serving-side credentials, schedule and dataset names.
MONITOR_NAME = image_exp_config["MONITOR_NAME"]
INPUT_TRAIN_TYPE = image_exp_config["INPUT_TRAIN_TYPE"]
SERVING_DKUBE_USERNAME = image_exp_config["SERVING_DKUBE_USERNAME"]
SERVING_DKUBE_TOKEN = image_exp_config["SERVING_DKUBE_TOKEN"]
SERVING_DKUBE_URL = image_exp_config["SERVING_DKUBE_URL"]
RUN_FREQUENCY = image_exp_config["RUN_FREQUENCY"]
LIVE_DATASET = image_exp_config["LIVE_DATASET"]
TRAINING_DATASET = image_exp_config["TRAINING_DATASET"]
SERVING_DEPLOYMENT_ID = image_exp_config["SERVING_DEPLOYMENT_ID"]

# Monitoring-side settings. A falsy MONITORING_DKUBE_URL makes later cells
# reuse the serving API client; a falsy SERVING_DKUBE_CLUSTER_NAME makes the
# setup cell generate a cluster name.
MONITORING_DKUBE_USERNAME = image_exp_config["MONITORING_DKUBE_USERNAME"]
MONITORING_DKUBE_TOKEN = image_exp_config["MONITORING_DKUBE_TOKEN"]
MONITORING_DKUBE_URL = image_exp_config["MONITORING_DKUBE_URL"]
SERVING_DKUBE_CLUSTER_NAME = image_exp_config["SERVING_DKUBE_CLUSTER_NAME"]

In [None]:
# API client bound to the serving cluster's URL and token.
serving_api = DkubeApi(URL=SERVING_DKUBE_URL, token=SERVING_DKUBE_TOKEN)

## Define Functions

In [None]:
def get_dataset_version(username, dataset_name, version, api=None):
    """Resolve a dataset version name to its ``"<version>:<uuid>"`` reference.

    Args:
        username: Owner of the dataset.
        dataset_name: Name of the dataset to inspect.
        version: Version name to look for (for example ``"v1"``).
        api: Optional client exposing ``get_dataset_versions(username, name)``.
            Defaults to the notebook-level ``monitoring_api``.

    Returns:
        The string ``f"{version}:{uuid}"`` for the first entry whose
        ``["version"]["name"]`` equals ``version``.

    Raises:
        ValueError: If no entry matches. The message lists the version names
            that were found. Raising (instead of returning the message) keeps
            an error text from being passed on as if it were a version.
    """
    client = monitoring_api if api is None else api
    dataset_versions = client.get_dataset_versions(username, dataset_name) or []
    versions = []
    for each_version in dataset_versions:
        details = each_version["version"]
        if details["name"] == version:
            uuid = details["uuid"]
            return f"{version}:{uuid}"
        versions.append(details["name"])
    raise ValueError(
        f"dataset version {version} not found, available version are {versions}"
    )

## Checking for a separate monitoring cluster and adding the serving cluster and deployment to it

In [None]:
if MONITORING_DKUBE_URL:
    # A separate monitoring cluster is configured: use its own API client and
    # make the serving cluster and deployment known to it.
    monitoring_api = DkubeApi(URL=MONITORING_DKUBE_URL, token=MONITORING_DKUBE_TOKEN)
    DKUBEUSERNAME = MONITORING_DKUBE_USERNAME

    cluster_exists = False
    if SERVING_DKUBE_CLUSTER_NAME:
        # A cluster name was supplied: it must already be listed on the
        # monitoring side, otherwise stop with an explanatory error.
        cluster_exists = any(
            each_cluster["name"] == SERVING_DKUBE_CLUSTER_NAME
            for each_cluster in monitoring_api.get_clusters()["data"]
        )
        if not cluster_exists:
            msg = f'''
            Cluster {SERVING_DKUBE_CLUSTER_NAME} not found on monitoring cluster,
            Kindly verify the cluster name.
            '''
            raise Exception(msg)
        print(f"Cluster {SERVING_DKUBE_CLUSTER_NAME} already exists")
    else:
        # No cluster name provided: generate one (serving username plus a
        # random 4-character suffix) and try to add a cluster under that name.
        res = ''.join(random.choices(string.ascii_lowercase + string.digits, k=4))
        SERVING_DKUBE_CLUSTER_NAME = f"{SERVING_DKUBE_USERNAME}-{res}"
        print(f"Attempt to create cluster with name {SERVING_DKUBE_CLUSTER_NAME} on monitoring setup will be taken")

        # The cluster is only added when the token's role contains "operator".
        op_permission = "operator" in monitoring_api.validate_token()["role"]
        if not op_permission:
            msg = f'''
            User {MONITORING_DKUBE_USERNAME} doesn't have operator permissions,
            Ask the operator to add cluster first.
            '''
            raise Exception(msg)

        # Describe the serving cluster and authenticate with the serving token.
        pcluster = DkubeCluster(name=SERVING_DKUBE_CLUSTER_NAME)
        pcluster.update_kind("dkube-remote")
        pcluster.update_class("monitoring")
        pcluster.update_authtype("jwt")
        pcluster.update_url(url=SERVING_DKUBE_URL)
        pcluster.update_jwt_details(jwt_token=SERVING_DKUBE_TOKEN)
        try:
            monitoring_api.configure_clusters(pcluster.cluster)
            print("Cluster added")
        except Exception as e:
            # Kept best-effort as before: the error is reported and the
            # deployment import below fails loudly if the cluster is missing.
            print(e)

    # Import the serving deployment. Every later cell needs DEPLOYMENT_ID, so a
    # failure here must stop the notebook instead of leaving the name undefined
    # (or holding a stale value from an earlier session).
    try:
        DEPLOYMENT_ID = monitoring_api.import_deployment(name=MONITOR_NAME,
                                                         cluster=SERVING_DKUBE_CLUSTER_NAME,
                                                         namespace=SERVING_DKUBE_USERNAME)
    except Exception as e:
        raise RuntimeError(
            f"Importing deployment {MONITOR_NAME} from cluster "
            f"{SERVING_DKUBE_CLUSTER_NAME} failed; DEPLOYMENT_ID is not set."
        ) from e
    print("Deployment Imported")
else:
    # No separate monitoring cluster: monitor on the serving cluster itself.
    monitoring_api = serving_api
    DKUBEUSERNAME = SERVING_DKUBE_USERNAME
    DEPLOYMENT_ID = SERVING_DEPLOYMENT_ID

## Initializing model monitor

In [None]:
# Model monitor definition for the deployment selected above.
# NOTE(review): the keyword is spelled `deployemnt_id`; confirm it matches the
# SDK constructor before changing it.
mm = DkubeModelmonitor(deployemnt_id=DEPLOYMENT_ID)

In [None]:
# Schema with a single feature entry: class "categorical", label "prediction",
# type "prediction_output". It is assigned directly onto the monitor object.
schema = {
    "features": [
        {
            "class": "categorical",
            "label": "prediction",
            "type": "prediction_output",
        },
    ],
}
mm.modelmonitor.schema = schema

In [None]:
# Basic monitor properties: classification model, image input, UTC timestamps.
mm.update_modelmonitor_basics(
    model_type=ModelType.Classification.value,
    input_data_type=DataType.Image.value,
    data_timezone=TimeZone.UTC.value,
)

In [None]:
# Image shape is hard-coded here.
# NOTE(review): presumably 200x200 with one channel matches the images the
# model was trained on - confirm.
mm.update_image_data_shape(
    height=200,
    width=200,
    channel=1,
)

In [None]:
# Load thresholds from `thresholds.json` (path relative to the working
# directory) and register them on the monitor.
with open('thresholds.json') as thresholds_file:
    thresholds = json.load(thresholds_file)

mm.add_thresholds(thresholds=thresholds)

In [None]:
# Dataset references are built as "<user>:<dataset name>".
training_data = f"{DKUBEUSERNAME}:{TRAINING_DATASET}"
labelled_data = f"{DKUBEUSERNAME}:{LIVE_DATASET}"

# Version "v1" of the training dataset; on a match the helper yields
# "<version>:<uuid>".
train_data_version = get_dataset_version(DKUBEUSERNAME, TRAINING_DATASET, "v1")

# Format name passed for the prediction data source.
predict_data_format = "cloudeventlogs"

## Adding train, predict, and labelled datasets

In [None]:
# Training data: the versioned training dataset, with the deployment id as
# the s3 sub-path.
mm.add_datasources(
    data_class=DatasetClass.Train.value,
    name=training_data,
    version=train_data_version,
    s3_subpath=DEPLOYMENT_ID,
)

# Prediction data. `.value` is used for the data class like the other sources
# in this cell; `str()` of an Enum member only equals its value when the enum
# overrides `__str__`.
if MONITORING_DKUBE_URL:
    # Separate monitoring cluster: name the live dataset and point at the
    # serving deployment's sub-path.
    mm.add_datasources(
        data_class=DatasetClass.Predict.value,
        name=labelled_data,
        data_format=predict_data_format,
        s3_subpath=SERVING_DEPLOYMENT_ID,
    )
else:
    # Same cluster: only the class and the format are passed.
    mm.add_datasources(
        data_class=DatasetClass.Predict.value,
        data_format=predict_data_format,
    )

# Labelled data: the live dataset under "<serving deployment id>/livedata",
# with the prediction, ground-truth and timestamp column names.
# NOTE(review): `data_format` receives `DataType.Tabular.value` although
# `DatasetFormat` is imported - confirm this is the intended enum.
mm.add_datasources(
    data_class=DatasetClass.Labelled.value,
    name=labelled_data,
    data_format=DataType.Tabular.value,
    s3_subpath=SERVING_DEPLOYMENT_ID + "/livedata",
    predict_col="output",
    groundtruth_col="label",
    timestamp_col="timestamp",
)

## Health Monitoring

In [None]:
# Enable deployment-health monitoring. The frequency is the literal 1 here,
# whereas drift and performance monitoring use RUN_FREQUENCY; the unit is not
# visible in this notebook.
mm.update_deployment_monitoring_details(
    enabled=True,
    frequency=1,
)

## Drift monitoring

In [None]:
# Enable drift monitoring at RUN_FREQUENCY.
# NOTE(review): the saved-file format is passed as a plain string although
# `ImageDataSavedFileFormat` is imported - presumably an enum member exists
# for it; confirm.
mm.update_drift_monitoring_details(
    enabled=True,
    frequency=RUN_FREQUENCY,
    image_train_data_savedfile_format="images_in_labelled_folder",
)

## Performance Monitoring

In [None]:
# Enable performance monitoring at RUN_FREQUENCY, with "labelled_data" as the
# source type.
mm.update_performance_monitoring_details(
    enabled=True,
    source_type="labelled_data",
    frequency=RUN_FREQUENCY,
)

## Creating model monitor

In [None]:
# Create the monitor and wait for completion. The result is kept in `id`
# (which shadows the builtin) because the alert cells below pass it on.
id = monitoring_api.modelmonitor_create(mm, wait_for_completion=True)

### Add alerts

#### Deployment Health Alert

In [None]:
# Deployment-health alert: condition on metric "latency_avg" with threshold
# 300 and `operator.gt` (presumably "greater than 300"; unit not shown here).
alert = DkubeModelmonitorAlert(name="latency_alert", alert_class="deployment_health")
alert.add_alert_condition(metric="latency_avg", threshold=300, op=operator.gt)
monitoring_api.modelmonitor_add_alert(id, alert)

#### Data Drift Alert

In [None]:
# Feature-drift alert: condition on feature "image" with threshold 0.42 and
# `operator.gt`.
alert = DkubeModelmonitorAlert(name="image_drift_alert", alert_class="feature_drift")
alert.add_alert_condition(feature="image", threshold=0.42, op=operator.gt)
monitoring_api.modelmonitor_add_alert(id, alert)

#### Performance Alert

In [None]:
# Performance-decay alert: metric "accuracy", threshold 0.9, `operator.lt`.
# NOTE(review): this alert is configured through `update_alert`, while the two
# alerts above use `add_alert_condition` with the same keywords - confirm
# which method the SDK expects here.
alert = DkubeModelmonitorAlert(name="accuracy_alert", alert_class="performance_decay")
alert.update_alert(metric="accuracy", threshold=0.9, op=operator.lt)
monitoring_api.modelmonitor_add_alert(id, alert)

### Start the model monitor

In [None]:
# Start the monitor, addressed by DEPLOYMENT_ID.
# NOTE(review): the alert cells address the monitor by the `id` returned from
# `modelmonitor_create`; presumably both refer to the same monitor - confirm.
monitoring_api.modelmonitor_start(DEPLOYMENT_ID)

### Cleanup

In [None]:
# Set CLEANUP to True to stop and then delete the model monitor.
CLEANUP = False
if CLEANUP:
    RETRIES = 4
    for _attempt in range(RETRIES):
        # A dedicated name is used for the fetched state: rebinding `mm` here
        # would replace the DkubeModelmonitor object built by earlier cells.
        monitor_info = monitoring_api.modelmonitor_get(DEPLOYMENT_ID)
        status = monitor_info["status"]
        if status:
            if status["state"].lower() != "active":
                # No longer active: safe to delete.
                break
            # Still active: request a stop, then poll again.
            monitoring_api.modelmonitor_stop(DEPLOYMENT_ID)
        time.sleep(5)
    else:
        # Loop ended without ever seeing a non-active state.
        raise TimeoutError("modelmonitor failed to stopped")
    monitoring_api.modelmonitor_delete(DEPLOYMENT_ID)