# Setting up the endpoint

In [1]:
%%time
import sagemaker
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import csv_serializer

role = sagemaker.session.get_execution_role()

model_data='s3://inf1-compiled-bert-model/model.tar.gz'

pytorch_model = PyTorchModel(model_data=model_data, 
                             role=role,
                             entry_point='inference.py',
                             image='249828997522.dkr.ecr.us-east-1.amazonaws.com/bert-inf1-serving:latest',
                             framework_version='1.5.0',
                             enable_cloudwatch_metrics=True)

predictor = pytorch_model.deploy(instance_type='ml.inf1.xlarge', initial_instance_count=1)



Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Your model is not compiled. Please compile your model before using Inferentia.


-------------!CPU times: user 13.1 s, sys: 2.58 s, total: 15.7 s
Wall time: 6min 47s


In [2]:
import numpy as np 
import random
import datetime
import math
import time
import boto3   
import random
from transformers import AutoTokenizer
import pickle
import os
import argparse
from concurrent import futures


def numpy_bytes_serializer(data):
    f = io.BytesIO()
    np.save(f, data)
    f.seek(0)
    return f.read()

predictor.content_type = 'application/binary'
predictor.serializer = None
predictor.deserializer = None

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Data
sentence1 ="The animal didn't cross the street because it was too tired."

s_nouns = ["A dude", "My mom", "The king", "Some guy", "A cat with rabies", "A sloth", "Your homie", "This cool guy my gardener met yesterday", "Superman"]
p_nouns = ["These dudes", "Both of my moms", "All the kings of the world", "Some guys", "All of a cattery's cats", "The multitude of sloths living under your bed", "Your homies", "Like, these, like, all these people", "Supermen"]
s_verbs = ["eats", "kicks", "gives", "treats", "meets with", "creates", "hacks", "configures", "spies on", "retards", "meows on", "flees from", "tries to automate", "explodes"]
p_verbs = ["eat", "kick", "give", "treat", "meet with", "create", "hack", "configure", "spy on", "retard", "meow on", "flee from", "try to automate", "explode"]
infinitives = ["to make a pie.", "for no apparent reason.", "because the sky is green.", "for a disease.", "to be able to make toast explode.", "to know more about archeology."]


sentence_random = (random.choice(s_nouns) + ' ' + random.choice(s_verbs) + ' ' + random.choice(s_nouns).lower() or random.choice(p_nouns).lower() + ' ' + random.choice(infinitives))
encoded_sentence = tokenizer.encode_plus(sentence1, sentence_random, max_length=128, pad_to_max_length=True, return_tensors="pt", truncation=True)
encoded_sentence_tuple = encoded_sentence['input_ids'], encoded_sentence['attention_mask'], encoded_sentence['token_type_ids'] 
pickled_bytes = pickle.dumps(encoded_sentence_tuple)




In [4]:
%%time
raw_bytes = predictor.predict(pickled_bytes)
print(pickle.loads(raw_bytes))

(tensor([[[ 0.3652, -0.4629,  0.3203,  ...,  0.4375,  0.6172, -0.1729],
         [-0.3691,  0.0306, -0.7578,  ..., -0.1709,  0.9453, -0.3867],
         [-0.1514,  0.4141, -0.5273,  ..., -0.5742,  0.2656,  0.0486],
         ...,
         [ 0.1104,  0.0796,  0.4727,  ...,  0.0339, -0.0264,  0.0146],
         [ 0.0820,  0.1484,  0.3027,  ...,  0.0698, -0.0273,  0.1768],
         [-0.3125, -0.1982,  0.1572,  ...,  0.3516,  0.1553,  0.0698]]]), tensor([[ 8.2812e-01,  8.5547e-01,  1.0000e+00, -9.7266e-01, -1.0000e+00,
          6.8750e-01, -9.8438e-01, -6.6797e-01,  1.0000e+00,  1.0000e+00,
          9.9219e-01, -1.0000e+00,  7.8125e-01, -1.0000e+00, -4.7656e-01,
          9.8047e-01,  9.3750e-01,  5.5078e-01, -3.4180e-01,  9.1406e-01,
         -9.1016e-01, -1.0000e+00,  9.9609e-01, -4.9805e-01, -7.4609e-01,
         -1.0000e+00,  9.1797e-01,  2.8320e-01,  1.2305e-01,  5.5176e-02,
          7.9688e-01, -7.1875e-01, -6.5625e-01,  4.8242e-01,  1.0000e+00,
          9.2578e-01, -9.1016e-01,  4.

In [8]:
%%time
total_runs = 1000
endpoint_name = predictor.endpoint
print('Running {} inferences for {}:'.format(total_runs, endpoint_name))

client_times = []
errors_list = []
cw_start = datetime.datetime.utcnow()

errors = 0

for i in range(total_runs):    
    
    
    sentence_random = (random.choice(s_nouns) + ' ' + random.choice(s_verbs) + ' ' + random.choice(s_nouns).lower() or random.choice(p_nouns).lower() + ' ' + random.choice(infinitives))
    encoded_sentence = tokenizer.encode_plus(sentence1, sentence_random, max_length=128, pad_to_max_length=True, return_tensors="pt", truncation=True)
    encoded_sentence_tuple = encoded_sentence['input_ids'], encoded_sentence['attention_mask'], encoded_sentence['token_type_ids'] 
    
    pickled_bytes = pickle.dumps(encoded_sentence_tuple)
    
    client_start = time.time()
    
    try:
        raw_bytes = predictor.predict(pickled_bytes)
        errors_list.append(20)
    except:
        errors += 1
        errors_list.append(30)
        pass
    
    client_end = time.time()
    client_times.append((client_end - client_start)*1000)
    
print('\nErrors - {:.4f} out of {:.4f} total runs | {:.4f}% \n'.format(errors, total_runs, (errors/total_runs)*100))
errors = 0
    
    
cw_end = datetime.datetime.utcnow()    
    
print('Client end-to-end latency percentiles:')
client_avg = np.mean(client_times)
client_p50 = np.percentile(client_times, 50)
client_p90 = np.percentile(client_times, 90)
client_p95 = np.percentile(client_times, 95)
client_p100 = np.percentile(client_times, 100)
print('Avg | P50 | P90 | P95 | P100')
print('{:.4f} | {:.4f} | {:.4f} | {:.4f}\n'.format(client_avg, client_p50, client_p90, client_p95, client_p100))

print('Getting Cloudwatch:')
cloudwatch = boto3.client('cloudwatch')
statistics=['SampleCount', 'Average', 'Minimum', 'Maximum']
extended=['p50', 'p90', 'p95', 'p100']
print('Time elapsed: {} seconds'.format((cw_end - cw_start).total_seconds()))
# Give 5 minute buffer to end
cw_end += datetime.timedelta(minutes=5)

# Period must be 1, 5, 10, 30, or multiple of 60
# Calculate closest multiple of 60 to the total elapsed time
factor = math.ceil((cw_end - cw_start).total_seconds() / 60)
period = factor * 60

print('Using period of {} seconds\n'.format(period))

cloudwatch_ready = False
# Keep polling CloudWatch metrics until datapoints are available
while not cloudwatch_ready:
  time.sleep(30)
  print('Waiting 30 seconds ...')
  # Must use default units of microseconds
  model_latency_metrics = cloudwatch.get_metric_statistics(MetricName='ModelLatency',
                                             Dimensions=[{'Name': 'EndpointName',
                                                          'Value': endpoint_name},
                                                         {'Name': 'VariantName',
                                                          'Value': "AllTraffic"}],
                                             Namespace="AWS/SageMaker",
                                             StartTime=cw_start,
                                             EndTime=cw_end,
                                             Period=period,
                                             Statistics=statistics,
                                             ExtendedStatistics=extended
                                             )
  # Should be 1000
  if len(model_latency_metrics['Datapoints']) > 0:
    print('{} latency datapoints ready'.format(model_latency_metrics['Datapoints'][0]['SampleCount']))
    side_avg = model_latency_metrics['Datapoints'][-1]['Average'] / total_runs
    side_p50 = model_latency_metrics['Datapoints'][-1]['ExtendedStatistics']['p50'] / total_runs
    side_p90 = model_latency_metrics['Datapoints'][-1]['ExtendedStatistics']['p90'] / total_runs
    side_p95 = model_latency_metrics['Datapoints'][-1]['ExtendedStatistics']['p95'] / total_runs
    side_p100 = model_latency_metrics['Datapoints'][-1]['ExtendedStatistics']['p100'] / total_runs
    print('Avg | P50 | P90 | P95 | P100')
    print('{:.4f} | {:.4f} | {:.4f} | {:.4f}\n'.format(side_avg, side_p50, side_p90, side_p95, side_p100))

    cloudwatch_ready = True
   


Running 1000 inferences for bert-inf1-serving-2020-09-18-18-46-06-589:

Errors - 0.0000 out of 1000.0000 total runs | 0.0000% 

Client end-to-end latency percentiles:
Avg | P50 | P90 | P95 | P100
29.7136 | 28.9282 | 31.8647 | 33.6890

Getting Cloudwatch:
Time elapsed: 30.619184 seconds
Using period of 360 seconds

Waiting 30 seconds ...
349.0 latency datapoints ready
Avg | P50 | P90 | P95 | P100
20.6746 | 20.5988 | 22.2910 | 23.8905

CPU times: user 4.12 s, sys: 536 ms, total: 4.65 s
Wall time: 1min


# Manual Endpoint Load

In [None]:
import os
import sagemaker
from sagemaker import get_execution_role
import boto3 

sagemaker_session = sagemaker.Session()

role = get_execution_role()

In [None]:
endpoint_name = 'sagemaker-tensorflow-2020-09-14-18-31-57-956ml-inf1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint_name)