# Deploying MAX optimized models at scale with Amazon SageMaker and MAX Serving

In [1]:
# Install and update necessary packages
!pip install -qU pip awscli boto3 sagemaker transformers

In [2]:
import shutil
import os
import boto3
import sagemaker
import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForSequenceClassification

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "critical"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


2024-03-26 23:47:33.338109: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-26 23:47:33.338168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-26 23:47:33.339189: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-26 23:47:33.345559: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Create boto3 and sagemaker session, get role, bucket name, account number and region
sess = boto3.Session()
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket_name    = sagemaker_session.default_bucket()
account = boto3.client('sts').get_caller_identity().get('Account')
region  = sess.region_name

### Step 1: Download a pre-trained Roberta model from HuggingFace

In [4]:
def download_and_save_model(hf_model_name, saved_model_dir):
    model = TFRobertaForSequenceClassification.from_pretrained(hf_model_name)
    shutil.rmtree(saved_model_dir, ignore_errors=True)
    tf.saved_model.save(model, saved_model_dir+"/1/saved_model/")

saved_model_dir = "model-repository/roberta"
hf_model_name = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest"
download_and_save_model(hf_model_name, saved_model_dir)

2024-03-26 23:47:36.365426: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion-multilabel-latest.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


INFO:tensorflow:Assets written to: model-repository/roberta/1/saved_model/assets


INFO:tensorflow:Assets written to: model-repository/roberta/1/saved_model/assets


In [5]:
%%sh
cat > model-repository/roberta/config.pbtxt <<EOL
instance_group {
  kind: KIND_CPU
}
default_model_filename: "saved_model"
backend: "max"
EOL

tree model-repository

model-repository
└── roberta
    ├── 1
    │   └── saved_model
    │       ├── assets
    │       ├── fingerprint.pb
    │       ├── saved_model.pb
    │       └── variables
    │           ├── variables.data-00000-of-00001
    │           └── variables.index
    └── config.pbtxt

5 directories, 5 files


### Step 2: Upload model to Amazon S3 so Amazon SageMaker and MAX Serving container has access to it.

In [6]:
shutil.rmtree('model.tar.gz', ignore_errors=True)
!tar -C model-repository -czf model.tar.gz roberta

model_uri = sagemaker_session.upload_data(path="model.tar.gz", 
                                          key_prefix="max-serving-models/roberta/")

### Step 3: Pull the latest MAX Serving container image and push it to Amazon Elastic Container Registry (Amazon ECR)

In [7]:
repo_name = 'sagemaker-max-serving'
image_label = 'v1'
max_serving_image_uri = "public.ecr.aws/modular/max-serving-de"

image = f'{account}.dkr.ecr.{region}.amazonaws.com/{repo_name}:{image_label}'
image

'466483404629.dkr.ecr.us-east-2.amazonaws.com/sagemaker-max-serving:v1'

In [8]:
!aws ecr create-repository --repository-name {repo_name}
!docker pull {max_serving_image_uri}
!docker tag {max_serving_image_uri} {image}
!$(aws ecr get-login --no-include-email --region {region})
!docker push {image}

{
    "repository": {
        "repositoryArn": "arn:aws:ecr:us-east-2:466483404629:repository/sagemaker-max-serving",
        "registryId": "466483404629",
        "repositoryName": "sagemaker-max-serving",
        "repositoryUri": "466483404629.dkr.ecr.us-east-2.amazonaws.com/sagemaker-max-serving",
        "createdAt": 1711496901.965,
        "imageTagMutability": "MUTABLE",
        "imageScanningConfiguration": {
            "scanOnPush": false
        },
        "encryptionConfiguration": {
            "encryptionType": "AES256"
        }
    }
}
Using default tag: latest
latest: Pulling from modular/max-serving-de
Digest: sha256:1020bb529cf514bfa16b029ec66a06bc3280fa03227b1fa4b4f3980eae63cf29
Status: Image is up to date for public.ecr.aws/modular/max-serving-de:latest
public.ecr.aws/modular/max-serving-de:latest
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
The push refers to repository [466483404629.dkr.ecr.us-east-2.amazonaws.com/

### Step 4: Create an Amazon SageMaker model and deploy to specified instance type. 
We’ll use Amazon EC2 c6i.4xlarge, on which MAX Engine can deliver up to 2.6x faster performance vs. TensorFlow

In [9]:
from sagemaker.model import Model
from datetime import datetime

date = datetime.now().strftime("%Y-%m-%d-%H-%m-%S")
model_name= f"MAX-model-roberta-{date}"

max_model = Model(
    model_data=model_uri,
    name=model_name,
    role=role,
    image_uri=image,
)

In [10]:
date = datetime.now().strftime("%Y-%m-%d-%H-%m-%S")
endpoint_name = f"MAX-endpoint-roberta-{date}"

predictor = max_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c6i.4xlarge",
    endpoint_name=endpoint_name,
)

----------!

### Step 5: Invoke the endpoint to test the endpoint


In [11]:
import numpy as np
import json

model = TFRobertaForSequenceClassification.from_pretrained(hf_model_name)
client = boto3.client("sagemaker-runtime")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion-multilabel-latest.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [22]:
text = "MAX Serving and Amazon SageMaker are a match made in heaven"

tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
inputs = tokenizer(text, 
                   return_tensors="np", 
                   return_token_type_ids=True)

payload = {
    "inputs": [
        {"name": "input_ids", 
         "shape": inputs["input_ids"].shape, 
         "datatype": "INT32", "data": inputs["input_ids"].tolist()},
        {"name": "attention_mask", 
         "shape": inputs["attention_mask"].shape, 
         "datatype": "INT32", 
         "data": inputs["attention_mask"].tolist()},
        {"name": "token_type_ids", 
         "shape": inputs["token_type_ids"].shape, 
         "datatype": "INT32", 
         "data": inputs["token_type_ids"].tolist()},
    ]
}

In [23]:
http_response = client.invoke_endpoint(
    EndpointName=endpoint_name, 
    ContentType="application/octet-stream", Body=json.dumps(payload)
)
response = json.loads(http_response["Body"].read().decode("utf8"))
outputs = response["outputs"]
predicted_class_id = np.argmax(outputs[0]['data'],axis=-1)
classification = model.config.id2label[predicted_class_id]
print(f"The sentiment of the input statement is: {classification}")

The sentiment of the input statement is: joy


### Step 6: Clean up AWS resources

In [None]:
sm = sess.client('sagemaker')
endpoint_config_name = sm.describe_endpoint(EndpointName=endpoint_name)['EndpointConfigName']
model_name = sm.describe_endpoint_config(EndpointConfigName=endpoint_config_name)['ProductionVariants'][0]['ModelName']

#### Delete endpoint and clean up model and endpoint config

In [None]:
sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm.delete_model(ModelName=model_name)

#### Delete model artifacts in Amazon S3

In [None]:
s3 = boto3.resource("s3")
bucket = s3.Bucket(bucket_name)
bucket.objects.filter(Prefix="max-serving-models/roberta/").all().delete()

#### Delete Amazon ECR registry and all the images we created

In [None]:
ecr = boto3.client('ecr')
ecr.delete_repository(registryId=account,
                      repositoryName=repo_name,
                      force=True)