# Serving Multi-Model Endpoints on GPU instances on SageMaker

## Setup

In [1]:
import boto3
import sagemaker
import os
import json
import time

# NOTE(review): `role` is not referenced again in the visible notebook; only `role_arn` is used.
role = 'sagemaker-role'
# Execution role ARN handed to create_model later on.
# NOTE(review): the account id here looks like a placeholder — replace with a real role ARN.
role_arn = 'arn:aws:iam::123456789:role/sagemaker-role'
sagemaker_session = sagemaker.Session()
# Default bucket of the session; the packaged model archives are uploaded to it.
s3_bucket_name = sagemaker_session.default_bucket()
# Region of the session; note that `region` is reassigned from boto3.Session() in the Container section.
region = sagemaker_session.boto_region_name

In [None]:
# Single name reused for the SageMaker model, endpoint configuration, and endpoint.
job_name = 'mme'
# S3 key prefix under which the model archives are uploaded.
prefix = 'mme'

## Preparing and Uploading Model

In [None]:
# Model identifiers passed to `from_pretrained` in the cells below.
EN_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
GER_MODEL = "oliverguhr/german-sentiment-bert"

### English

In [None]:
# retrieve english model

import os

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the English tokenizer and sequence-classification model identified by EN_MODEL.
en_tokenizer = DistilBertTokenizer.from_pretrained(EN_MODEL)
en_model = DistilBertForSequenceClassification.from_pretrained(EN_MODEL)

# Local smoke test: run one sentence through the model without tracking gradients.
inputs = en_tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = en_model(**inputs).logits

# Index of the largest logit, mapped to its label via the model config.
predicted_class_id = logits.argmax().item()
predictions = en_model.config.id2label[predicted_class_id]

print(predictions)

# Directory that receives the saved model and tokenizer files.
en_model_path = "models/english_sentiment"
os.makedirs(en_model_path, exist_ok=True)

en_model.save_pretrained(save_directory=en_model_path)
en_tokenizer.save_pretrained(save_directory=en_model_path)

In [None]:
# temp
# NOTE(review): scratch cell that repeats the load-and-infer steps of the previous cell.
# Unlike that cell it binds `logits` to the full model output (no `.logits`), which the
# following cell then inspects with `logits.logits`. Consider removing before sharing.

import os

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

en_tokenizer = DistilBertTokenizer.from_pretrained(EN_MODEL)
en_model = DistilBertForSequenceClassification.from_pretrained(EN_MODEL)

inputs = en_tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = en_model(**inputs)


In [None]:
logits.logits

### German

In [47]:
# retrieve German model

import torch

from transformers import BertTokenizer, BertForSequenceClassification

# Load the German tokenizer and sequence-classification model identified by GER_MODEL.
ger_tokenizer = BertTokenizer.from_pretrained(GER_MODEL)
ger_model = BertForSequenceClassification.from_pretrained(GER_MODEL)

# Local smoke test: run one sentence through the model without tracking gradients.
inputs = ger_tokenizer("Das ist gar nicht mal so gut", return_tensors="pt")
with torch.no_grad():
    logits = ger_model(**inputs).logits

# Index of the largest logit, mapped to its label via the model config.
predicted_class_id = logits.argmax().item()
predictions = ger_model.config.id2label[predicted_class_id]

print(predictions)

# Directory that receives the saved German model and tokenizer files.
ger_model_path = "models/german_sentiment"
os.makedirs(ger_model_path, exist_ok=True)

# Save the *German* artifacts; writing the English model/tokenizer here would leave
# the German directory holding a copy of the English model.
ger_model.save_pretrained(save_directory=ger_model_path)
ger_tokenizer.save_pretrained(save_directory=ger_model_path)


KeyboardInterrupt: 

# Package models

In [3]:
# English
# Pack the `english` directory found under models/ into english_model.tar.gz.
# NOTE(review): this archives models/english (config.pbtxt + 1/model.plan per the listing),
# not the models/english_sentiment directory written by save_pretrained; the step that
# produces models/english is not shown in this notebook — confirm it exists before running.
! tar -czvf english_model.tar.gz -C models/ english

english/
english/config.pbtxt
english/1/
english/1/model.plan


In [4]:
# German
# Pack the `german` directory found under models/ into german_model.tar.gz.
# NOTE(review): this archives models/german (config.pbtxt + 1/model.plan per the listing),
# not the models/german_sentiment directory written by save_pretrained; the step that
# produces models/german is not shown in this notebook — confirm it exists before running.
! tar -czvf german_model.tar.gz -C models/ german

german/
german/config.pbtxt
german/1/
german/1/model.plan


In [5]:
# S3 prefix holding all model archives; passed as ModelDataUrl in the container definition.
mme_data_path = f"s3://{s3_bucket_name}/{prefix}/"
mme_data_path

's3://sagemaker-us-east-2-410330524497/mme/'

In [6]:
# Upload both archives to the same bucket and key prefix that mme_data_path is built from.
en_model_data = sagemaker_session.upload_data('english_model.tar.gz', bucket=s3_bucket_name,key_prefix=prefix)
ger_model_data = sagemaker_session.upload_data('german_model.tar.gz', bucket=s3_bucket_name,key_prefix=prefix)

In [7]:
en_model_data, ger_model_data
# en_model_data = 's3://sagemaker-us-east-2-410330524497/mme/english_model.tar.gz'
# ger_model_data

('s3://sagemaker-us-east-2-410330524497/mme/english_model.tar.gz',
 's3://sagemaker-us-east-2-410330524497/mme/german_model.tar.gz')

# Container

In [8]:
# Region name -> AWS account id used to build the Triton image URI for that region.
account_id_map = {
    "us-east-1": "785573368785",
    "us-east-2": "007439368137",
    "us-west-1": "710691900526",
    "us-west-2": "301217895009",
    "eu-west-1": "802834080501",
    "eu-west-2": "205493899709",
    "eu-west-3": "254080097072",
    "eu-north-1": "601324751636",
    "eu-south-1": "966458181534",
    "eu-central-1": "746233611703",
    "ap-east-1": "110948597952",
    "ap-south-1": "763008648453",
    "ap-northeast-1": "941853720454",
    "ap-northeast-2": "151534178276",
    "ap-southeast-1": "324986816169",
    "ap-southeast-2": "355873309152",
    "cn-northwest-1": "474822919863",
    "cn-north-1": "472730292857",
    "sa-east-1": "756306329178",
    "ca-central-1": "464438896020",
    "me-south-1": "836785723513",
    "af-south-1": "774647643957",
}

# Resolve the region from the default boto3 session and make sure an image account is known for it.
region = boto3.Session().region_name
if region not in account_id_map:
    # Raising a bare string is itself a TypeError in Python 3 and hides the real problem;
    # raise a proper exception that names the offending region (also covers region=None).
    raise ValueError(
        f"UNSUPPORTED REGION: {region!r}. Supported regions: {sorted(account_id_map)}"
    )

# China regions use a different DNS suffix for ECR.
base = "amazonaws.com.cn" if region.startswith("cn-") else "amazonaws.com"
triton_image_uri = "{account_id}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.07-py3".format(
    account_id=account_id_map[region], region=region, base=base
)

print(triton_image_uri)

007439368137.dkr.ecr.us-east-2.amazonaws.com/sagemaker-tritonserver:22.07-py3


In [9]:
# Container definition for the SageMaker model: the Triton image in multi-model mode,
# with ModelDataUrl pointing at the S3 prefix that holds the model archives.
container = {
    "Image": triton_image_uri,
    "ContainerHostname": "MultiModel",
    "Mode": "MultiModel",
    "ModelDataUrl": mme_data_path,
}

# Deploy inference endpoint

For multi-model endpoints on GPU, you must use Triton containers, and hence deploy through `boto3`.

In [10]:
# Clients taken from the SageMaker session: `sm_client` is used for the create/describe/delete
# calls, `runtime_sm_client` for invoke_endpoint.
sm_client = sagemaker_session.sagemaker_client
runtime_sm_client = sagemaker_session.sagemaker_runtime_client

# Instance type used for the endpoint's production variant.
instance_type = "ml.g4dn.2xlarge"

In [11]:
# The model reuses the shared job name.
model_name = job_name

# create model
# Registers the multi-model container definition under `model_name`, run with `role_arn`.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    PrimaryContainer=container,
    ExecutionRoleArn=role_arn,
)

In [12]:
# create endpoint configuration
# The endpoint configuration reuses the shared job name.
endpoint_config_name = job_name

# One production variant serving `model_name` on a single instance of `instance_type`.
endpoint_config = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "AllTraffic",
            "ModelName": model_name,
            "InitialInstanceCount": 1,
            "InstanceType": instance_type,
        },
    ],
)

In [13]:
# The endpoint reuses the shared job name and is created from the endpoint configuration.
endpoint_name = job_name

response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

In [15]:
# create_endpoint returns before the endpoint is ready. Block until it is InService so the
# inference cells below also work under "Restart Kernel -> Run All"; the waiter raises if
# the endpoint ends up in a failed state or the wait times out.
sm_client.get_waiter("endpoint_in_service").wait(EndpointName=endpoint_name)

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

Status: InService


# Inference

In [27]:
import tritonclient.http as httpclient
from transformers import DistilBertTokenizer
import torch
import torch.nn.functional as F 
import numpy as np
import botocore
import concurrent
import time

### English

In [38]:
EN_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
# Bound to a language-specific name so that loading another tokenizer later in the
# notebook cannot change which tokenizer this function uses.
en_enc = DistilBertTokenizer.from_pretrained(EN_MODEL)


def tokenize_text_en(text):
    """Tokenize `text` with the English tokenizer, padded/truncated to 512 tokens.

    Returns:
        A tuple `(input_ids, attention_mask)` taken from the tokenizer output.
    """
    encoded_text = en_enc(text, padding="max_length", max_length=512, truncation=True)
    return encoded_text["input_ids"], encoded_text["attention_mask"]

In [39]:
def get_sample_tokenized_text_binary_en(text):
    """Build a Triton binary request body for the English model from `text`.

    The text is tokenized with `tokenize_text_en`; each token array is sent as an
    INT32 input of shape [1, 512], and the `logits` output is requested in binary form.

    Returns:
        A tuple `(request_body, header_length)` as returned by
        `httpclient.InferenceServerClient.generate_request_body`.
    """
    input_names = ["input_ids", "attention_mask"]
    output_names = ["logits"]

    # tokenize_text_en returns its arrays in the same order as input_names.
    token_arrays = tokenize_text_en(text)

    inputs = []
    for name, values in zip(input_names, token_arrays):
        # Add the leading batch dimension: (512,) -> (1, 512).
        tensor = np.expand_dims(np.array(values, dtype=np.int32), axis=0)
        infer_input = httpclient.InferInput(name, [1, 512], "INT32")
        infer_input.set_data_from_numpy(tensor, binary_data=True)
        inputs.append(infer_input)

    # Request exactly the outputs that are declared. Indexing output_names[1] on this
    # one-element list raised IndexError before any request body could be built.
    outputs = [
        httpclient.InferRequestedOutput(name, binary_data=True) for name in output_names
    ]

    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )
    return request_body, header_length

In [40]:
def get_prediction_en(text):
    """Invoke the endpoint's English model on `text` and return softmax scores.

    The tokenized text is sent as a JSON payload with two INT32 inputs of shape
    [1, 512]; the first output's data is passed through softmax and returned as
    a NumPy array.
    """
    token_ids, mask = tokenize_text_en(text)

    def _int32_input(name, values):
        # One entry of the request's "inputs" list.
        return {"name": name, "shape": [1, 512], "datatype": "INT32", "data": values}

    request = {
        "inputs": [
            _int32_input("input_ids", token_ids),
            _int32_input("attention_mask", mask),
        ]
    }

    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/octet-stream",
        Body=json.dumps(request),
        TargetModel="english_model.tar.gz",
    )

    raw_body = response["Body"].read().decode("utf8")
    logits = torch.tensor(json.loads(raw_body)["outputs"][0]["data"])
    return F.softmax(logits, dim=-1).numpy()

In [41]:
test_text = "Hello, my dog is cute"
get_prediction_en(test_text)


array([6.094501e-04, 9.993905e-01], dtype=float32)

In [42]:
test_text = "This is a nightmare"
get_prediction_en(test_text)


array([9.9962294e-01, 3.7702857e-04], dtype=float32)

### German

In [48]:
from transformers import BertTokenizer, BertForSequenceClassification

GER_MODEL = "oliverguhr/german-sentiment-bert"
# Bound to a language-specific name so this cell does not rebind a tokenizer name
# that other tokenize functions in the notebook read at call time.
ger_enc = BertTokenizer.from_pretrained(GER_MODEL)


def tokenize_text_ger(text):
    """Tokenize `text` with the German tokenizer, padded/truncated to 512 tokens.

    Returns:
        A tuple `(input_ids, token_type_ids, attention_mask)` taken from the tokenizer output.
    """
    encoded_text = ger_enc(text, padding="max_length", max_length=512, truncation=True)
    return encoded_text["input_ids"], encoded_text["token_type_ids"], encoded_text["attention_mask"]

In [49]:
def get_sample_tokenized_text_binary_ger(text):
    """Build a Triton binary request body for the German model from `text`.

    The text is tokenized with `tokenize_text_ger`; each token array is sent as an
    INT32 input of shape [1, 512], and the `logits` output is requested in binary form.

    Returns:
        A tuple `(request_body, header_length)` as returned by
        `httpclient.InferenceServerClient.generate_request_body`.
    """
    input_names = ["input_ids", "token_type_ids", "attention_mask"]
    output_names = ["logits"]

    # tokenize_text_ger returns its arrays in the same order as input_names.
    token_arrays = tokenize_text_ger(text)

    inputs = []
    for name, values in zip(input_names, token_arrays):
        # Add the leading batch dimension: (512,) -> (1, 512).
        tensor = np.expand_dims(np.array(values, dtype=np.int32), axis=0)
        infer_input = httpclient.InferInput(name, [1, 512], "INT32")
        infer_input.set_data_from_numpy(tensor, binary_data=True)
        inputs.append(infer_input)

    # Request exactly the outputs that are declared. Indexing output_names[1] and
    # output_names[2] on this one-element list raised IndexError.
    outputs = [
        httpclient.InferRequestedOutput(name, binary_data=True) for name in output_names
    ]

    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )
    return request_body, header_length

In [50]:
def get_prediction_ger(text):
    """Invoke the endpoint's German model on `text` and return softmax scores.

    The tokenized text is sent as a JSON payload with three INT32 inputs of shape
    [1, 512]; the first output's data is passed through softmax and returned as
    a NumPy array.
    """
    token_ids, segment_ids, mask = tokenize_text_ger(text)

    def _int32_input(name, values):
        # One entry of the request's "inputs" list.
        return {"name": name, "shape": [1, 512], "datatype": "INT32", "data": values}

    request = {
        "inputs": [
            _int32_input("input_ids", token_ids),
            _int32_input("token_type_ids", segment_ids),
            _int32_input("attention_mask", mask),
        ]
    }

    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/octet-stream",
        Body=json.dumps(request),
        TargetModel="german_model.tar.gz",
    )

    raw_body = response["Body"].read().decode("utf8")
    logits = torch.tensor(json.loads(raw_body)["outputs"][0]["data"])
    return F.softmax(logits, dim=-1).numpy()

In [52]:
test_text = "Das ist gar nicht mal so gut"
get_prediction_ger(test_text)


array([4.3519988e-04, 9.9955279e-01, 1.2024404e-05], dtype=float32)

In [53]:
test_text = "das ist super"
get_prediction_ger(test_text)


array([9.7361881e-01, 2.6146367e-02, 2.3479130e-04], dtype=float32)

In [54]:
test_text = 'Das ist gar nicht mal so schlecht'
get_prediction_ger(test_text)


array([9.9584740e-01, 4.1439957e-03, 8.6964228e-06], dtype=float32)

In [55]:
test_text = 'Sie fährt ein grünes Auto'
get_prediction_ger(test_text)

array([0.00490901, 0.01852641, 0.9765646 ], dtype=float32)

# Clean up

In [56]:
# Remove the resources created above: the endpoint, its endpoint configuration, and the model.
# The response of the last call (delete_model) is what the cell displays.
sm_client.delete_endpoint(EndpointName=endpoint_name)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': 'cea01855-2cd0-4da3-969c-40c297312aa3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'cea01855-2cd0-4da3-969c-40c297312aa3',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Wed, 23 Nov 2022 07:55:53 GMT'},
  'RetryAttempts': 0}}