# Chapter Summarizer Inference
Create an asynchronous inference endpoint (with autoscaling down to zero instances) using a pre-baked Docker container.

In [1]:
!pip install sagemaker botocore boto3 awscli --upgrade

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.153.0.tar.gz (751 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m751.7/751.7 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting botocore
  Downloading botocore-1.29.131-py3-none-any.whl (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting boto3
  Downloading boto3-1.26.131-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting awscli
  Downloading awscli-1.27.131-py3-none-any.whl (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting cloudpickle==2.2.1
  Downloading

In [2]:
# --- Deployment configuration -------------------------------------------------
# Prefix shared by the model, endpoint and endpoint-config names.
BASE_NAME = "gpt-book-sum"

# Region the SageMaker control-plane client is pinned to.
AWS_REGION = "eu-central-1"

# Hosting fleet: instance type and number of instances launched initially.
INSTANCE_TYPE, INITIAL_INSTANCE_COUNT = "ml.m5.large", 1

# Pre-baked serving container image (ECR).
SERVE_IMG_URI = "957269117416.dkr.ecr.eu-central-1.amazonaws.com/chapter-sum-gpt3:latest"

# S3 prefixes used as S3OutputPath / S3FailurePath of the async inference config.
S3_SUCCESS_URI, S3_FAILURE_URI = (
    "s3://chapter-summarization/book-sum-async-inference/output/success/",
    "s3://chapter-summarization/book-sum-async-inference/output/failure/",
)

## Define role, model name, and endpoint name

In [3]:
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer

# IAM role SageMaker will assume; resolved from the current notebook environment.
sagemaker_role = sagemaker.get_execution_role()

# Every resource name is derived from the shared BASE_NAME prefix.
model_name, endpoint_name, endpoint_config_name = (
    f"{BASE_NAME}-{suffix}" for suffix in ("model", "endpoint", "endpoint-config")
)

# Identifier of the endpoint variant that is passed to the Application Auto
# Scaling calls; "variant1" must match the VariantName in the endpoint config.
resource_id = f"endpoint/{endpoint_name}/variant/variant1"

# Echo the resolved settings so the run is self-documenting.
print(
    "\n".join(
        f"{label}: {value}"
        for label, value in (
            ("Model name", model_name),
            ("Endpoint name", endpoint_name),
            ("Endpoint config name", endpoint_config_name),
            ("Image URI", SERVE_IMG_URI),
            ("Role", sagemaker_role),
            ("Resource ID", resource_id),
        )
    )
)

Model name: gpt-book-sum-model
Endpoint name: gpt-book-sum-endpoint
Endpoint config name: gpt-book-sum-endpoint-config
Image URI: 957269117416.dkr.ecr.eu-central-1.amazonaws.com/chapter-sum-gpt3:latest
Role: arn:aws:iam::957269117416:role/service-role/AmazonSageMaker-ExecutionRole-20230121T194089
Resource ID: endpoint/gpt-book-sum-endpoint/variant/variant1


## Create a SageMaker model, asynchronous endpoint config, and endpoint

### Create a SageMaker Model

In [6]:
# SageMaker control-plane client, pinned to the deployment region.
sagemaker_client = boto3.client("sagemaker", region_name=AWS_REGION)

# Register the pre-baked Docker image as a SageMaker model. Only the image URI
# is supplied for the primary container.
model_request = {
    "ModelName": model_name,
    "ExecutionRoleArn": sagemaker_role,
    "PrimaryContainer": {"Image": SERVE_IMG_URI},
}
create_model_response = sagemaker_client.create_model(**model_request)
print(f"Created Model: {create_model_response['ModelArn']}")

Created Model: arn:aws:sagemaker:eu-central-1:957269117416:model/gpt-book-sum-model


### Create an EndpointConfig

In [7]:
# Single production variant serving the model registered above. The variant
# name is also embedded in `resource_id`, so the two must stay in sync.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": INSTANCE_TYPE,  # compute instance type
    "InitialInstanceCount": INITIAL_INSTANCE_COUNT,  # instances launched initially
}

# Asynchronous-inference settings for the endpoint.
async_inference_config = {
    "OutputConfig": {
        # Where responses are uploaded when the request does not name a location.
        "S3OutputPath": S3_SUCCESS_URI,
        "S3FailurePath": S3_FAILURE_URI,
        # Optionally, Amazon SNS topics can be configured for notifications:
        # "NotificationConfig": {
        #     "SuccessTopic": "arn:aws:sns:aws-region:account-id:topic-name",
        #     "ErrorTopic": "arn:aws:sns:aws-region:account-id:topic-name",
        # }
    },
    "ClientConfig": {
        # Optional cap on in-flight invocations per instance; when omitted,
        # SageMaker picks a value itself.
        "MaxConcurrentInvocationsPerInstance": 2,
    },
}

create_endpoint_config_response = sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,  # referenced later by create_endpoint
    ProductionVariants=[production_variant],  # one entry per hosted model
    AsyncInferenceConfig=async_inference_config,
)
print(f"Created EndpointConfig: {create_endpoint_config_response['EndpointConfigArn']}")

Created EndpointConfig: arn:aws:sagemaker:eu-central-1:957269117416:endpoint-config/gpt-book-sum-endpoint-config


### Create an Endpoint

In [8]:
# Start provisioning the endpoint from the endpoint config created above.
# The call returns immediately; the endpoint is not yet in service here.
endpoint_request = {
    "EndpointName": endpoint_name,
    "EndpointConfigName": endpoint_config_name,
}
create_endpoint_response = sagemaker_client.create_endpoint(**endpoint_request)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:eu-central-1:957269117416:endpoint/gpt-book-sum-endpoint


Validate that the endpoint is created before invoking it:

In [9]:
# Block on the boto3 "endpoint_in_service" waiter, then read back the status
# so the notebook output records the final state.
waiter = sagemaker_client.get_waiter("endpoint_in_service")
print("Waiting for endpoint to create...")
waiter.wait(EndpointName=endpoint_name)

resp = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
endpoint_status = resp["EndpointStatus"]
print(f"Endpoint Status: {endpoint_status}")

Waiting for endpoint to create...
Endpoint Status: InService


## Register a scalable target that can scale to zero

In [10]:
# Application Auto Scaling client. It is pinned to AWS_REGION, like the
# SageMaker client, so the scalable target is registered in the same region
# in which the endpoint was created, regardless of the session's default region.
as_client = boto3.client("application-autoscaling", region_name=AWS_REGION)

# Register the endpoint variant as a scalable target.
response = as_client.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    # Scale on the number of instances behind the endpoint variant.
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=0,  # allow the variant to scale in to zero instances
    MaxCapacity=3,
)
response

{'ScalableTargetARN': 'arn:aws:application-autoscaling:eu-central-1:957269117416:scalable-target/056m2544763e303143fb9d6c4143004e7ae3',
 'ResponseMetadata': {'RequestId': '8d594deb-cc2a-4416-a3d6-2e5ec139817c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8d594deb-cc2a-4416-a3d6-2e5ec139817c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '134',
   'date': 'Wed, 10 May 2023 13:28:12 GMT'},
  'RetryAttempts': 0}}

## DescribeScalableTargets

In [11]:
# List the scalable targets registered under the SageMaker namespace to
# confirm the registration above.
describe_request = {"ServiceNamespace": "sagemaker"}
resp = as_client.describe_scalable_targets(**describe_request)
resp

{'ScalableTargets': [{'ServiceNamespace': 'sagemaker',
   'ResourceId': 'endpoint/gpt-book-sum-endpoint/variant/variant1',
   'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
   'MinCapacity': 0,
   'MaxCapacity': 3,
   'RoleARN': 'arn:aws:iam::957269117416:role/aws-service-role/sagemaker.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_SageMakerEndpoint',
   'CreationTime': datetime.datetime(2023, 5, 10, 13, 28, 12, 847000, tzinfo=tzlocal()),
   'SuspendedState': {'DynamicScalingInSuspended': False,
    'DynamicScalingOutSuspended': False,
    'ScheduledScalingSuspended': False},
   'ScalableTargetARN': 'arn:aws:application-autoscaling:eu-central-1:957269117416:scalable-target/056m2544763e303143fb9d6c4143004e7ae3'}],
 'ResponseMetadata': {'RequestId': '1ed8d86e-7274-486f-81a9-8d59ed4c31d4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1ed8d86e-7274-486f-81a9-8d59ed4c31d4',
   'content-type': 'application/x-amz-json-1.1',
  

## Scaling policy to scale out when we have target queue backlog
https://aws.amazon.com/blogs/machine-learning/run-computer-vision-inference-on-large-videos-with-amazon-sagemaker-asynchronous-endpoints/

In [12]:
# Metric tracked by the policy: the async queue backlog per instance for this
# endpoint, averaged.
backlog_metric = {
    "MetricName": "ApproximateBacklogSizePerInstance",
    "Namespace": "AWS/SageMaker",
    "Dimensions": [{"Name": "EndpointName", "Value": endpoint_name}],
    "Statistic": "Average",
}

target_tracking_config = {
    # Target value for the tracked metric (ApproximateBacklogSizePerInstance).
    "TargetValue": 5.0,
    "CustomizedMetricSpecification": backlog_metric,
    # Seconds after a scale-in activity completes before another scale-in may start.
    "ScaleInCooldown": 600,
    # Seconds after a scale-out activity completes before another scale-out may start.
    "ScaleOutCooldown": 300,
    # Optional: "DisableScaleIn": True stops this policy from removing capacity.
}

response = as_client.put_scaling_policy(
    PolicyName="Invocations-ScalingPolicy",
    ServiceNamespace="sagemaker",  # service that owns the scaled resource
    ResourceId=resource_id,  # the endpoint variant
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",  # the alternative is "StepScaling"
    TargetTrackingScalingPolicyConfiguration=target_tracking_config,
)

## Scaling policy that scales up from zero for new requests
https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-autoscale.html

In [13]:
# Step policy that adds one instance whenever its alarm is breached.
step_scaling_config = {
    # ScalingAdjustment is an absolute change in instance count.
    "AdjustmentType": "ChangeInCapacity",
    # How the CloudWatch datapoints are aggregated.
    "MetricAggregationType": "Average",
    # Seconds to wait for a previous scaling activity to take effect.
    "Cooldown": 180,
    # Single step: any breach at or above the alarm threshold adds one instance.
    "StepAdjustments": [
        {"MetricIntervalLowerBound": 0, "ScalingAdjustment": 1},
    ],
}

response = as_client.put_scaling_policy(
    PolicyName="HasBacklogWithoutCapacity-ScalingPolicy",
    ServiceNamespace="sagemaker",  # service that owns the scaled resource
    ResourceId=resource_id,  # the endpoint variant
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="StepScaling",  # the alternative is "TargetTrackingScaling"
    StepScalingPolicyConfiguration=step_scaling_config,
)

# Keep the policy ARN: the CloudWatch alarm uses it as its alarm action.
step_scaling_policy_arn = response["PolicyARN"]
step_scaling_policy_arn

'arn:aws:autoscaling:eu-central-1:957269117416:scalingPolicy:2544763e-3031-43fb-9d6c-4143004e7ae3:resource/sagemaker/endpoint/gpt-book-sum-endpoint/variant/variant1:policyName/HasBacklogWithoutCapacity-ScalingPolicy'

## Setting up a CloudWatch alarm to trigger the step scaling policy

In [16]:
# CloudWatch client pinned to AWS_REGION so the alarm lives in the same region
# as the endpoint and the scaling policy it triggers.
cw_client = boto3.client("cloudwatch", region_name=AWS_REGION)

# Alarm that fires the step scaling policy when requests are queued on the
# endpoint's HasBacklogWithoutCapacity metric.
response = cw_client.put_metric_alarm(
    AlarmName="BookSum-StepScallingPolicy-Alarm",
    MetricName="HasBacklogWithoutCapacity",
    Namespace="AWS/SageMaker",
    Statistic="Average",
    EvaluationPeriods=2,
    DatapointsToAlarm=1,  # one breaching datapoint out of the last two is enough
    Threshold=1,
    ComparisonOperator="GreaterThanOrEqualToThreshold",
    TreatMissingData="missing",
    Dimensions=[
        {"Name": "EndpointName", "Value": endpoint_name},
    ],
    # CloudWatch only supports 10/30-second periods for metrics stored at
    # high resolution; on other metrics a 30-second period leaves gaps
    # (INSUFFICIENT_DATA) and is billed as a high-resolution alarm.
    # NOTE(review): assumes this AWS/SageMaker metric is published at
    # one-minute resolution - confirm.
    Period=60,
    AlarmActions=[step_scaling_policy_arn],
)
response

{'ResponseMetadata': {'RequestId': 'f8259004-5b60-4911-93a3-5df83925b8cf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f8259004-5b60-4911-93a3-5df83925b8cf',
   'content-type': 'text/xml',
   'content-length': '214',
   'date': 'Wed, 10 May 2023 14:19:37 GMT'},
  'RetryAttempts': 0}}

## Getting prediction from an endpoint

In [10]:
import json
import uuid
from urllib.parse import urlparse

# The endpoint config in this notebook sets AsyncInferenceConfig, so the
# endpoint has to be called through InvokeEndpointAsync: the request payload
# is staged in S3 and the response is read back from the returned
# OutputLocation. A synchronous invoke_endpoint call is expected to be
# rejected for such an endpoint.
s3_client = boto3.client("s3", region_name=AWS_REGION)
sm_runtime = boto3.client("sagemaker-runtime", region_name=AWS_REGION)
input_path = "/home/ec2-user/SageMaker/chapter_summarization_api/src/summarizer/resources/chapter/01.txt"

# Read the chapter first and close the file before any network call is made.
with open(input_path, "r", encoding="utf-8") as fp:
    long_text = fp.read()

payload_json = json.dumps({'text': long_text})

# Stage the request body in S3.
# NOTE(review): the input prefix below is a reviewer choice (same bucket as the
# configured output path); adjust it to the project's layout if needed.
input_bucket = urlparse(S3_SUCCESS_URI).netloc
input_key = f"{BASE_NAME}/async-input/{uuid.uuid4()}.json"
s3_client.put_object(
    Bucket=input_bucket,
    Key=input_key,
    Body=payload_json.encode('utf-8'),
    ContentType="application/json",
)

response = sm_runtime.invoke_endpoint_async(
    EndpointName=endpoint_name,
    InputLocation=f"s3://{input_bucket}/{input_key}",
    ContentType="application/json",
    Accept="application/json",
)

# The call only enqueues the request; wait for the result object to appear.
# If the endpoint is scaled to zero this includes instance start-up time.
# A failed inference never produces the output object (see S3_FAILURE_URI),
# in which case the waiter raises WaiterError after roughly 15 minutes.
output_location = urlparse(response["OutputLocation"])
output_bucket = output_location.netloc
output_key = output_location.path.lstrip("/")
s3_client.get_waiter("object_exists").wait(
    Bucket=output_bucket,
    Key=output_key,
    WaiterConfig={"Delay": 10, "MaxAttempts": 90},
)

result_object = s3_client.get_object(Bucket=output_bucket, Key=output_key)
response_body = result_object["Body"].read().decode('utf-8')
summary_text = json.loads(response_body)["summary"]
print(f"Summary:\n{summary_text}")

Summary:
: Alice follows a White Rabbit into a rabbit-hole, and falls down a deep well. She is eventually rescued by a mouse, sees a beautiful garden, and finally escapes from the hall by unlocking a door with a key she finds on a small glass table. 

Alice found a key and a bottle of wine on her way to the little door, but she got lost trying to find the right way to shut it. She ate a cake to make herself grow, but didn't grow at all.


## Clean up Resources

In [4]:
# Tear down autoscaling. The client is pinned to AWS_REGION so the cleanup
# targets the region in which the endpoint was created.
client = boto3.client("application-autoscaling", region_name=AWS_REGION)

# Deregister the scalable target for the endpoint variant.
response = client.deregister_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=resource_id,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
)

# Also remove the CloudWatch alarm this notebook creates with put_metric_alarm;
# it is a standalone resource that would otherwise be left behind.
# The name must match the AlarmName used when the alarm was created.
boto3.client("cloudwatch", region_name=AWS_REGION).delete_alarms(
    AlarmNames=["BookSum-StepScallingPolicy-Alarm"]
)

In [5]:
# SageMaker client pinned to AWS_REGION, consistent with the client that
# created the resources, so deletion does not depend on the session's
# default region.
sm = boto3.client("sagemaker", region_name=AWS_REGION)

# Delete in reverse order of creation: endpoint, endpoint config, model.
response = sm.delete_endpoint(EndpointName=endpoint_name)
print(f"Delete endpoint response: {response}")

response = sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
print(f"Delete endpoint configuration response: {response}")

response = sm.delete_model(ModelName=model_name)
print(f"Delete model response: {response}")

Delete endpoint response: {'ResponseMetadata': {'RequestId': '8031d991-2bc5-481e-a14f-c8348f323663', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '8031d991-2bc5-481e-a14f-c8348f323663', 'content-type': 'application/x-amz-json-1.1', 'content-length': '0', 'date': 'Wed, 10 May 2023 13:09:34 GMT'}, 'RetryAttempts': 0}}
Delete endpoint configuration response: {'ResponseMetadata': {'RequestId': '2537e3aa-1932-4ebe-b7f9-67c2d0183e68', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '2537e3aa-1932-4ebe-b7f9-67c2d0183e68', 'content-type': 'application/x-amz-json-1.1', 'content-length': '0', 'date': 'Wed, 10 May 2023 13:09:35 GMT'}, 'RetryAttempts': 0}}
Delete model response: {'ResponseMetadata': {'RequestId': 'c1dadcbd-9426-4f81-8796-05de29c82808', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'c1dadcbd-9426-4f81-8796-05de29c82808', 'content-type': 'application/x-amz-json-1.1', 'content-length': '0', 'date': 'Wed, 10 May 2023 13:09:35 GMT'}, 'RetryAttempts

In [1]:
# Report the version of the interpreter backing this kernel. `!python` would
# run whichever `python` is first on the shell PATH, which can belong to a
# different environment than the kernel.
import platform

print(f"Python {platform.python_version()}")

Python 3.8.13
