Starter code obtained from https://github.com/awslabs/amazon-sagemaker-examples/blob/master/advanced_functionality/multi_model_bring_your_own/multi_model_endpoint_bring_your_own.ipynb

In [None]:
!pip install -qU awscli boto3 sagemaker

In [None]:
%%sh

# Build the serving image from container/tvm and push it to this account's
# ECR registry.
#
# Stop at the first failing command: without this, a failed `docker build`
# would still tag and push whatever stale local image carries the same name.
set -e

# The name of our algorithm
algorithm_name=mms-with-tvm

cd container/tvm

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-east-1 if none defined).
# `aws configure get` exits non-zero when the key is unset, so it is guarded
# with `|| true` to let the default below apply instead of aborting under `set -e`.
region=$(aws configure get region || true)
region=${region:-us-east-1}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it. The region is passed
# explicitly so these calls target the same registry that ${fullname} names.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Log Docker in to ECR. `aws ecr get-login` no longer exists in AWS CLI v2;
# `get-login-password` works with both CLI generations and feeds the password
# over stdin rather than placing it on the docker command line.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -q -t "${algorithm_name}" .
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

In [None]:
import boto3, time, json
# One boto3 session provides the SageMaker client and the configured region.
sess = boto3.Session()
region = sess.region_name
sm = sess.client('sagemaker')

# The AWS account id is read from the STS caller identity.
account = (
    boto3.client('sts')
    .get_caller_identity()
    .get('Account')
)

In [None]:
## Variables
# Replace every <placeholder> below before running the rest of the notebook.

model_data = 's3://<bucket>/<model_name>.tar.gz' # TAR file containing the saved_model.pb file

# SageMaker model names may contain only alphanumerics and hyphens, so an
# underscore here would be rejected by create_model.
sm_model_name = 'mms-model'

# Name (not ARN) of the IAM service role the model runs under. It is kept in
# its own variable because `role_name` below holds the full ARN; the original
# formatted `role_name` from itself, which raises NameError on a fresh kernel.
iam_role_name = '<role_name>'
role_name = 'arn:aws:iam::{}:role/service-role/{}'.format(account, iam_role_name)

# URI of the image pushed by the build cell (same repository name, tag "latest").
image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, 'mms-with-tvm')

In [None]:
import time
from sagemaker.model import Model
from sagemaker.predictor import Predictor

# SageMaker SDK model object: custom serving image + model artifact, run under
# the given role. Deploying it returns a generic `Predictor`.
tf_model = Model(
    image_uri=image,
    model_data=model_data,
    role=role_name,
    name=sm_model_name,
    predictor_cls=Predictor,
)

In [None]:
# Timestamped name for the local test endpoint. The prefix carries the same
# trailing hyphen as the hosted endpoint name built later in the notebook.
endpoint_name = 'mms-tvm-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# instance_type='local' selects the SageMaker SDK's local mode. The name is
# passed explicitly; previously it was computed but never used, so the SDK
# picked a name of its own.
predictor = tf_model.deploy(initial_instance_count=1,
                            instance_type='local',
                            endpoint_name=endpoint_name)

In [None]:
import numpy as np

def get_batch_input(batch_size):
    """Build a batch made of one random sample repeated `batch_size` times.

    A single float32 vector of 485 random values is drawn, and that same
    vector is used for every row of the batch.

    Args:
        batch_size: number of rows to place in the batch.

    Returns:
        The result of ``np.asarray`` over the list of rows; for a positive
        ``batch_size`` this is a ``(batch_size, 485)`` float32 array.
    """
    sample = np.random.rand(485).astype(dtype=np.float32)
    return np.asarray([sample] * batch_size)

# Smoke-test the local endpoint with a batch of 10 and show the parsed JSON reply.
batch = get_batch_input(10)
response = predictor.predict(batch)
parsed_response = json.loads(response)
print(parsed_response)

In [None]:
import sagemaker
# SageMaker SDK session layered on the boto3 session created earlier.
sagemaker_session = sagemaker.Session(boto_session=sess)

# Execution role as reported by the SageMaker SDK.
role = sagemaker.get_execution_role()

In [None]:
# Container definition: the serving image plus the S3 location of the model artifact.
container = dict(Image=image, ModelDataUrl=model_data)

# Register the model with SageMaker under the configured name and role.
create_model_response = sm.create_model(
    ModelName=sm_model_name,
    ExecutionRoleArn=role_name,
    PrimaryContainer=container,
)

print(create_model_response['ModelArn'])

In [None]:
import time
# Timestamped (UTC) name for the endpoint configuration.
config_suffix = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_config_name = 'mms-tvm-' + config_suffix
print(endpoint_config_name)

# A single production variant receives all traffic for the registered model.
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        dict(
            VariantName='AllTraffic',
            ModelName=sm_model_name,
            InstanceType='ml.g4dn.2xlarge',
            InitialInstanceCount=10,
            InitialVariantWeight=1,
        )
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

In [None]:
# Timestamped (UTC) name for the hosted endpoint.
endpoint_suffix = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_name = 'mms-tvm-' + endpoint_suffix
print(endpoint_name)

# Request creation of the endpoint from the configuration defined above it
# in the notebook, then show the ARN returned by the call.
create_endpoint_response = sm.create_endpoint(
    EndpointConfigName=endpoint_config_name,
    EndpointName=endpoint_name,
)
print(create_endpoint_response['EndpointArn'])

In [None]:
# Poll the endpoint once a minute until it leaves the 'Creating' state,
# printing the status seen on every poll.
while True:
    resp = sm.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']
    print("Status: " + status)
    if status != 'Creating':
        break
    time.sleep(60)

print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

# Stop here when creation did not succeed, instead of letting the benchmark
# cell run against an endpoint that cannot serve requests.
if status != 'InService':
    raise RuntimeError(
        "Endpoint {} ended in status {}: {}".format(
            endpoint_name, status, resp.get('FailureReason', 'no failure reason reported')))

In [None]:
# Runtime client used to invoke the hosted endpoint.
client = boto3.client(service_name='sagemaker-runtime')
    
def run_bench(endpoint, batch_size, iterations=5000):
    """Measure round-trip latency of repeated invocations of an endpoint.

    Args:
        endpoint: name of the SageMaker endpoint to invoke.
        batch_size: number of rows in each request, as built by
            ``get_batch_input``.
        iterations: number of requests to send (defaults to 5000, the count
            that was previously hard-coded).

    Returns:
        A list with one latency per request, in milliseconds.
    """
    times = []
    for _ in range(iterations):
        # Build and serialize the payload before starting the clock, so that
        # only the request/response round trip is measured. The request body
        # has to be bytes (or a file-like object), hence the explicit
        # conversion of the numpy array to its raw buffer.
        payload = get_batch_input(batch_size).tobytes()

        start_time = time.time()
        response = client.invoke_endpoint(
            EndpointName=endpoint,
            Body=payload,
            ContentType='application/octet-stream')
        # Drain the streaming body so the download is part of the measured
        # latency. The content is not used, so it is not decoded (decoding
        # could also fail on a non-UTF-8 response).
        response['Body'].read()
        times.append((time.time() - start_time) * 1000)
    return times

# Benchmark the hosted endpoint with 25-row requests.
times = run_bench(endpoint=endpoint_name, batch_size=25)

In [None]:
# Latency summary in milliseconds: mean, then the 50th/90th/95th/99th percentiles.
a = np.array(times)
[np.average(a)] + [np.percentile(a, q) for q in (50, 90, 95, 99)]