In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets
sns.set(color_codes=True)

# Load the Iris data bundled with scikit-learn: feature matrix plus integer class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Put the label in column 0 so each row reads [label, feature, feature, ...].
dataset = np.insert(X, 0, y, axis=1)

column_names = ['iris_id'] + iris.feature_names
df = pd.DataFrame(data=dataset, columns=column_names)


def _species_name(label):
    """Translate a numeric class label into its species name."""
    if label == 0:
        return 'setosa'
    if label == 1:
        return 'versicolor'
    return 'virginica'


df['species'] = df['iris_id'].map(_species_name)

df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows; stratify on the label and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Label goes in the first column, followed by the feature columns.
yX_train = np.column_stack([y_train, X_train])
yX_test = np.column_stack([y_test, X_test])

# Write both splits as header-less CSV with three decimals per value.
for csv_name, rows in (("iris_train.csv", yX_train), ("iris_test.csv", yX_test)):
    np.savetxt(csv_name, rows, delimiter=",", fmt='%0.3f')

In [None]:
import sagemaker
import boto3

from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split

# Execution role handed to the estimator and monitor created further down.
role = get_execution_role()

prefix='mlops/iris'
# Retrieve the default bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Compare the major version as an integer: a plain string comparison is
# lexicographic, so e.g. "10.0.0" >= "2.0" would wrongly evaluate to False.
sdk_major_version = int(sagemaker.__version__.split('.')[0])
assert sdk_major_version >= 2, (
    'This notebook requires SageMaker Python SDK >= 2.0, found %s' % sagemaker.__version__
)

In [None]:
# Upload both CSV splits to S3 under the "<prefix>/data" key prefix
data_key_prefix = '{}/data'.format(prefix)
input_train = sagemaker_session.upload_data(path='iris_train.csv', key_prefix=data_key_prefix)
input_test = sagemaker_session.upload_data(path='iris_test.csv', key_prefix=data_key_prefix)

In [None]:
# Wrap the uploaded S3 locations as CSV-typed inputs for the training job
TrainingInput = sagemaker.inputs.TrainingInput
train_data = TrainingInput(s3_data=input_train, content_type="csv")
test_data = TrainingInput(s3_data=input_test, content_type="csv")

In [None]:
# Look up the XGBoost 1.0-1 container image for the region of the current boto3 session
region_name = boto3.Session().region_name
container_uri = sagemaker.image_uris.retrieve('xgboost', region_name, version='1.0-1')

# Build the estimator; model artifacts are written under "<prefix>/output" in the bucket
training_output_path = 's3://{}/{}/output'.format(bucket, prefix)
xgb = sagemaker.estimator.Estimator(
    container_uri,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=training_output_path,
    sagemaker_session=sagemaker_session,
)

# Hyperparameters, collected in one mapping so they are easy to review and tweak
hyperparameters = {
    'eta': 0.1,
    'max_depth': 5,
    'gamma': 4,
    # one class per distinct label value in y
    'num_class': len(np.unique(y)),
    'alpha': 10,
    'min_child_weight': 6,
    'silent': 0,
    'objective': 'multi:softmax',
    'num_round': 30,
}
xgb.set_hyperparameters(**hyperparameters)

In [None]:
%%time
# Launch the training job: the training split feeds the 'train' channel and the
# held-out split feeds the 'validation' channel.
# takes around 3min 11s
xgb.fit({'train': train_data, 'validation': test_data, })

In [None]:
%%time
# Capture 100% of the endpoint traffic and store it under "<prefix>/monitoring"
capture_destination = 's3://{}/{}/monitoring'.format(bucket, prefix)
data_capture_configuration = sagemaker.model_monitor.data_capture_config.DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=capture_destination,
    sagemaker_session=sagemaker_session,
)

# Deploy the trained model to a single-instance endpoint with capturing switched on
deploy_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
    'data_capture_config': data_capture_configuration,
}
xgb_predictor = xgb.deploy(**deploy_settings)

In [None]:
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import f1_score
csv_serializer = CSVSerializer()

endpoint_name = xgb_predictor.endpoint_name

# Ask the endpoint which config it uses rather than assuming the config is
# named after the endpoint, then read the model name from that config.
sm_client = boto3.client('sagemaker')
endpoint_config_name = sm_client.describe_endpoint(
    EndpointName=endpoint_name
)['EndpointConfigName']
model_name = sm_client.describe_endpoint_config(
    EndpointConfigName=endpoint_config_name
)['ProductionVariants'][0]['ModelName']

# Persist both names for the later cells. Plain Python file writes behave the
# same on every platform, unlike a shell `echo` redirect.
for file_name, value in (('model_name.txt', model_name), ('endpoint_name.txt', endpoint_name)):
    with open(file_name, 'w') as handle:
        handle.write('%s\n' % value)

# Requests sent through the predictor are serialized as CSV
xgb_predictor.serializer = csv_serializer

In [None]:
def _predict_label(features):
    """Send one feature row to the endpoint and parse the reply as a float class label."""
    raw_reply = xgb_predictor.predict(features)
    return float(raw_reply.decode('utf-8'))


# One endpoint call per held-out row
predictions_test = [_predict_label(row) for row in X_test]
score = f1_score(y_test, predictions_test, labels=[0.0, 1.0, 2.0], average='micro')

print('F1 Score(micro): %.1f' % (score * 100.0))

In [None]:
from sagemaker.serializers import CSVSerializer
csv_serializer = CSVSerializer()

# Call the endpoint through the low-level runtime client, using the first held-out row
first_sample = X_test[0]
sm = boto3.client('sagemaker-runtime')
resp = sm.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=csv_serializer.serialize(first_sample),
)

# The response body is a stream holding the predicted class as text
reply_bytes = resp['Body'].read()
prediction = float(reply_bytes.decode('utf-8'))
print('Predicted class: %.1f for [%s]' % (prediction, csv_serializer.serialize(first_sample)))

# Model Monitoring

In [None]:
import sagemaker
import numpy as np
import boto3
import os
import pandas as pd
from sklearn import datasets
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer

role = get_execution_role()

# Reload Iris and write label + features, with a header row, to 'full_dataset.csv'
iris = datasets.load_iris()
X, y = iris.data, iris.target

dataset = np.insert(X, 0, y, axis=1)
baseline_columns = ['iris_id'] + iris.feature_names
baseline_frame = pd.DataFrame(data=dataset, columns=baseline_columns)
baseline_frame.to_csv('full_dataset.csv', index=None)

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

prefix='mlops/iris'


def _read_saved_name(path):
    """Return the stripped contents of `path`, or None when the file does not exist."""
    if not os.path.isfile(path):
        return None
    # Context manager so the file handle is closed right away
    with open(path, 'r') as handle:
        return handle.read().strip()


endpoint_name = _read_saved_name('endpoint_name.txt')
endpoint_name2 = _read_saved_name('endpoint_name2.txt')

# Check explicitly: a missing (or empty) file leaves no endpoint name, and
# merely constructing a Predictor is not guaranteed to reject that.
if not endpoint_name:
    raise Exception("You must run Part 1 before this. There, you will train/deploy a Model and use it here")

try:
    xgb_predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker_session)
    xgb_predictor.serializer = CSVSerializer()
except Exception as e:
    # Chain the original error so the real cause stays visible in the traceback
    raise Exception("You must run Part 1 before this. There, you will train/deploy a Model and use it here") from e

In [None]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Resources for the monitoring jobs: one ml.m5.xlarge, 20 GB volume, at most 3600 s per run
monitor_resources = {
    'instance_count': 1,
    'instance_type': 'ml.m5.xlarge',
    'volume_size_in_gb': 20,
    'max_runtime_in_seconds': 3600,
}
endpoint_monitor = DefaultModelMonitor(role=role, **monitor_resources)

# Compute the baseline from the full dataset (CSV with header) and block until the job is done
baseline_output_uri = 's3://{}/{}/monitoring/baseline'.format(bucket, prefix)
endpoint_monitor.suggest_baseline(
    baseline_dataset='full_dataset.csv',
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_output_uri,
    wait=True,
    logs=False,
)

In [None]:
baseline_job = endpoint_monitor.latest_baselining_job

# Flatten the "features" entries of the statistics and of the suggested constraints
statistics_features = baseline_job.baseline_statistics().body_dict["features"]
constraint_features = baseline_job.suggested_constraints().body_dict["features"]
schema_df = pd.json_normalize(statistics_features)
constraints_df = pd.json_normalize(constraint_features)

# Join the two tables on the columns they share
report_df = schema_df.merge(constraints_df)

# Leave the KLL distribution columns out of the preview
kll_columns = [
    'numerical_statistics.distribution.kll.buckets',
    'numerical_statistics.distribution.kll.sketch.data',
    'numerical_statistics.distribution.kll.sketch.parameters.c',
]
report_df.drop(columns=kll_columns).head(10)

In [None]:
from sagemaker.model_monitor import CronExpressionGenerator
from time import gmtime, strftime

# Hourly schedule that checks the endpoint against the baseline statistics and
# constraints, writing its reports under "<prefix>/monitoring/reports"
reports_s3_uri = 's3://{}/{}/monitoring/reports'.format(bucket, prefix)
schedule_settings = {
    'endpoint_input': endpoint_name,
    'output_s3_uri': reports_s3_uri,
    'statistics': endpoint_monitor.baseline_statistics(),
    'constraints': endpoint_monitor.suggested_constraints(),
    'schedule_cron_expression': CronExpressionGenerator.hourly(),
    'enable_cloudwatch_metrics': True,
}
endpoint_monitor.create_monitoring_schedule(**schedule_settings)

In [None]:
# This is how you can list all the monitoring schedules you created in your account
# (this shells out to the AWS CLI, so the `aws` command must be available to the kernel)
!aws sagemaker list-monitoring-schedules

In [None]:
import random
import time 
from threading import Thread

# Module-level switch read by the loop below; set it to False to stop the traffic
traffic_generator_running=True


def invoke_endpoint_forever():
    """Send deliberately invalid samples to the endpoint until the switch is turned off.

    Each sample has four values drawn from 5.00-20.00, one of which is replaced
    by None. The idea is to violate two monitoring constraints: not_null and
    data_drift. The loop pauses half a second between calls and re-checks
    `traffic_generator_running` on every iteration.
    """
    print('Invoking endpoint forever!')
    while traffic_generator_running:
        # Pick the position to blank out first, then draw the four feature values
        missing_position = random.randint(0,3)
        sample = [random.randint(500,2000) / 100.0 for _ in range(4)]
        sample[missing_position] = None
        xgb_predictor.predict(sample)
        time.sleep(0.5)
    print('Endpoint invoker has stopped')


# Run the generator in the background so the notebook stays usable
traffic_thread = Thread(target=invoke_endpoint_forever)
traffic_thread.start()

In [None]:
import time
import datetime
import boto3

def process_monitoring_logs(endpoint_monitor):
    """Run an on-demand monitoring job over the current hour of captured traffic.

    The job reuses the processing resources, app specification and role of the
    monitor's latest baselining job, and compares the captured data with the
    baseline statistics and constraints produced by that job.

    Relies on the module-level names `bucket` and `prefix` to locate the
    capture data under "s3://<bucket>/<prefix>/monitoring/".

    Args:
        endpoint_monitor: monitor object exposing `describe_latest_baselining_job()`
            and `describe_schedule()`.

    Returns:
        Tuple `(processing_job_name, s3_output_uri)`; the URI is where the job
        writes its results.

    Raises:
        Whatever boto3 raises when the job cannot be created, or when the waiter
        gives up (it polls every 30 s, at most 20 times).
    """
    sm = boto3.client('sagemaker')
    # Use UTC, not local time: the start/end timestamps below are emitted with a
    # "Z" (UTC) suffix, so the hour window and the yyyy/mm/dd/hh partition must
    # be computed in UTC as well or they disagree on non-UTC kernels.
    now = datetime.datetime.now(datetime.timezone.utc)
    suffix = now.strftime("%Y/%m/%d/%H")
    start_time = datetime.datetime(now.year, now.month, now.day, now.hour)
    end_time = start_time + datetime.timedelta(hours=1)

    # get the monitoring metadata
    base_desc = endpoint_monitor.describe_latest_baselining_job()
    sche_desc = endpoint_monitor.describe_schedule()
    baseline_path = base_desc['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']
    endpoint_name = sche_desc['EndpointName']

    variant_name = sm.describe_endpoint(EndpointName=endpoint_name)['ProductionVariants'][0]['VariantName']
    logs_path = "%s/%s/%s" % (endpoint_name,variant_name,suffix)

    # Results are written next to the captured data of the same hour
    s3_output = {
        "S3Uri": 's3://{}/{}/monitoring/{}'.format(bucket, prefix, logs_path),
        "LocalPath": "/opt/ml/processing/output",
        "S3UploadMode": "Continuous"
    }
    # values for the processing job input: [input name, S3 source, local mount path]
    values = [
        [ 'input_1', 's3://{}/{}/monitoring/{}'.format(bucket, prefix, logs_path),
            '/opt/ml/processing/input/endpoint/{}'.format(logs_path) ],
        [ 'baseline', '%s/statistics.json' % baseline_path,
            '/opt/ml/processing/baseline/stats'],
        [ 'constraints', '%s/constraints.json' % baseline_path,
            '/opt/ml/processing/baseline/constraints']
    ]
    job_params = {
        'ProcessingJobName': 'model-monitoring-%s' % time.strftime("%Y%m%d%H%M%S"),
        'ProcessingInputs': [{
            'InputName': o[0],
            'S3Input': {
                'S3Uri': o[1], 'LocalPath': o[2], 'S3DataType': 'S3Prefix', 'S3InputMode': 'File',
                'S3CompressionType': 'None', 'S3DataDistributionType': 'FullyReplicated'
            }} for o in values],
        'ProcessingOutputConfig': { 'Outputs': [ {'OutputName': 'result','S3Output': s3_output } ] },
        'ProcessingResources': base_desc['ProcessingResources'],
        'AppSpecification': base_desc['AppSpecification'],
        'RoleArn': base_desc['RoleArn'],
        'Environment': {
            'baseline_constraints': '/opt/ml/processing/baseline/constraints/constraints.json',
            'baseline_statistics': '/opt/ml/processing/baseline/stats/statistics.json',
            'dataset_format': '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}',
            'dataset_source': '/opt/ml/processing/input/endpoint',
            'output_path': '/opt/ml/processing/output',
            'publish_cloudwatch_metrics': 'Enabled',
            'sagemaker_monitoring_schedule_name': sche_desc['MonitoringScheduleName'],
            'sagemaker_endpoint_name': endpoint_name,
            'start_time': start_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
            'end_time': end_time.strftime("%Y-%m-%dT%H:%M:%SZ")
        }
    }
    sm.create_processing_job(**job_params)
    # Block until the job completes or stops
    waiter = sm.get_waiter('processing_job_completed_or_stopped')
    waiter.wait( ProcessingJobName=job_params['ProcessingJobName'], WaiterConfig={'Delay': 30,'MaxAttempts': 20} )
    return job_params['ProcessingJobName'], s3_output['S3Uri']

In [None]:
import json

import pandas as pd
## The processing job takes something like 5mins to run
job_name, s3_output = process_monitoring_logs(endpoint_monitor)

# 's3://<bucket>/<key prefix>' splits into ['s3:', '', <bucket>, <key prefix>]
tokens = s3_output.split('/', 3)
report_bucket, report_prefix = tokens[2], tokens[3]

# Parse the report with the json module instead of handing a raw JSON string
# to pd.read_json (deprecated for literal strings, and it needs every
# top-level key to line up as a table column).
violations_report = json.loads(
    sagemaker_session.read_s3_file(report_bucket, '%s/constraint_violations.json' % report_prefix)
)
df = pd.json_normalize(violations_report['violations'])
df.head()

# Clean-up

In [None]:
# Turn off the flag checked by the traffic-generator loop, then pause so the
# background thread can finish its current iteration and exit.
traffic_generator_running=False
time.sleep(3)
# Remove the monitoring schedule before the endpoint itself is deleted.
endpoint_monitor.delete_monitoring_schedule()
time.sleep(10) # wait for 10 seconds before trying to delete the endpoint

In [None]:
def _delete_endpoint_quietly(name):
    """Delete the endpoint called `name`; print, rather than raise, any failure.

    Does nothing when `name` is empty or None, which is the case when the
    corresponding name file was never written.
    """
    if not name:
        return
    try:
        predictor = sagemaker.predictor.Predictor(endpoint_name=name, sagemaker_session=sagemaker_session)
        predictor.delete_endpoint()
    except Exception as e:
        # Best-effort clean-up: report the problem and carry on with the next endpoint
        print(e)


for _endpoint in (endpoint_name, endpoint_name2):
    _delete_endpoint_quietly(_endpoint)

# Reading Hourly Reports

In [1]:
import pandas as pd

In [4]:
# Directory that holds the three report files referenced below
report_dir = 'C:/Users/mohil/Desktop/Projects/Data/MLOps/iris-mlops/amazon-sagemaker-mlops-workshop-master/Monitoring_Reports_Hourly/22-06-04-15-37'

path1 = report_dir + '/constraints.json'
path2 = report_dir + '/statistics.json'
path3 = report_dir + '/constraint_violations.json'