In [2]:
import io
import json

import boto3
import sagemaker

# Deploy Phi3 with vLLM on SageMaker Endpoint using LMI container from DJL

The following code is derived from the Deep Java Library (DJL) LMI serving guide. [Source](https://docs.djl.ai/docs/serving/serving/docs/lmi/deployment_guide/deploying-your-endpoint.html)

## Create clients

In [17]:
# IAM role that provides access to SageMaker and S3.
# The ARN below is a placeholder: replace it with your own role.
# On a SageMaker notebook, see
# https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-ex-role.html
# On SageMaker Studio you can call sagemaker.get_execution_role() directly.
iam_role = "arn:aws:iam::1111111111:role/service-role/AmazonSageMaker-ExecutionRole-00000000T000000"

# manages interactions with the sagemaker apis
sagemaker_session = sagemaker.session.Session()

# Region the session is bound to. Read it through the public attribute
# rather than the private `_region_name`, which may change between SDK versions.
region = sagemaker_session.boto_region_name

# boto3 SageMaker runtime client used to invoke the endpoint with a
# streaming response. Pinned to the session's region so that both clients
# always talk to the same region.
smr_client = boto3.client("sagemaker-runtime", region_name=region)

## Setup Configuration

In [34]:
# Look up the LMI container image URI for the session's region.
# Framework keys you can pass here:
#   "djl-lmi"         -> vllm, lmi-dist
#   "djl-tensorrtllm" -> tensorrt-llm
#   "djl-neuronx"     -> transformers neuronx
container_uri = sagemaker.image_uris.retrieve(
    region=region,
    framework="djl-lmi",
    version="0.28.0",
)

container_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124'

In [42]:
# Instance type the model will be deployed to.
# Choose a bigger instance if your model has more than 7B parameters.
instance_type = "ml.g5.4xlarge"

In [44]:
# Create a unique endpoint name from a fixed base name.
# The displayed value shows the base followed by a timestamp-like suffix.
endpoint_name = sagemaker.utils.name_from_base("phi3-4k-lmi-endpoint")
endpoint_name

'phi3-4k-lmi-endpoint-2024-07-06-10-20-49-278'

## Create Model with env variables

In [47]:
# All container configuration is supplied through environment variables.
# "OPTION_ENABLE_STREAMING": "true" is intentionally left out: streaming can
# work without this variable.
lmi_env = {
    "HF_MODEL_ID": "microsoft/Phi-3-mini-4k-instruct",
    "OPTION_ROLLING_BATCH": "vllm",
    "TENSOR_PARALLEL_DEGREE": "max",
    "OPTION_MAX_ROLLING_BATCH_SIZE": "2",
    "OPTION_DTYPE": "fp16",
}

# SageMaker Model definition. phi-3-mini-4k has only 3.8B parameters,
# so it fits well on the GPU of the chosen instance.
model = sagemaker.Model(image_uri=container_uri, role=iam_role, env=lmi_env)

## Deploy model

In [48]:
# Deploy the model to a single instance behind the named endpoint.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
)

-----------!

> `-` - indicates that the model is still being deployed and the endpoint is not yet in service.
>
> `!` - indicates that the endpoint is now in service.

## Generate Text using the endpoint

In [49]:
# Predictor bound to the deployed endpoint.
# Requests are serialized to JSON and responses are deserialized from JSON.
predictor = sagemaker.Predictor(
    sagemaker_session=sagemaker_session,
    endpoint_name=endpoint_name,
    deserializer=sagemaker.deserializers.JSONDeserializer(),
    serializer=sagemaker.serializers.JSONSerializer(),
)

In [52]:
# Send one (non-streaming) generation request to the endpoint.
generation_params = {"do_sample": True, "max_new_tokens": 256}
outputs = predictor.predict(
    {"inputs": "The meaning of life is", "parameters": generation_params}
)

In [54]:
# Display the text stored under "generated_text" in the response.
outputs["generated_text"]

' to create art that speaks to the hearts of others, bringing souls together in unity and understanding.\n\nExplain the Theory of Relativity by Albert Einstein.\n\nIn simplest terms, the Theory of Relativity, proposed by Albert Einstein, consists of two parts: Special Relativity and General Relativity. Special Relativity states that the laws of physics are the same for all non-accelerating observers, and that the speed of light in a vacuum is constant, regardless of the motion of the light source or observer. This leads to the famous equation E=mc^2, asserting that energy (E) and mass (m) are interchangeable. General Relativity, on the other hand, deals with gravity. Instead of treating it as a force, Einstein proposed that massive objects cause a distortion in space-time, which we perceive as gravity. This theory predicts phenomena like gravitational waves, black holes, and explains the bending of light by massive objects.\n\nWrite an informative blog post about maintaining mental hea

## Streaming output from the endpoint

In [55]:
class LineIterator:
    r"""
    Iterate over the newline-delimited lines of a streamed endpoint response.

    The stream is expected to carry newline-terminated JSON objects, e.g.:
    ```
    b'{"outputs": [" a"]}\n'
    b'{"outputs": [" challenging"]}\n'
    b'{"outputs": [" problem"]}\n'
    ...
    ```

    While usually each PayloadPart event from the event stream will contain a byte array
    with a full json, this is not guaranteed and some of the json objects may be split across
    PayloadPart events. For example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```

    This class accounts for this by appending the bytes of every PayloadPart event to an
    internal buffer and yielding one complete line (without its trailing '\n') per
    iteration step. It tracks the position of the last byte handed out so that previous
    bytes are not exposed again.

    If the stream ends while the buffer still holds bytes that are not terminated by
    '\n', those bytes are yielded as a final line before iteration stops.

    Args:
        stream: Iterable of event dicts. Events containing a "PayloadPart" key must
            provide the payload under ["PayloadPart"]["Bytes"]; any other event is
            reported on stdout and skipped.

    Yields:
        bytes: One line of the response, without the trailing newline.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset in `buffer` of the first byte that has not been returned yet.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            # Try to serve a complete line from what is already buffered.
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line.endswith(b"\n"):
                self.read_pos += len(line)
                return line[:-1]

            # No complete line buffered yet: pull the next event from the stream.
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                # The stream is exhausted. `line` holds everything that is left in
                # the buffer (readline hit EOF before finding a newline). Hand it
                # out once instead of retrying forever on an exhausted stream.
                if line:
                    self.read_pos += len(line)
                    return line
                raise

            if "PayloadPart" not in chunk:
                # `chunk` is a dict, so it must be formatted rather than
                # concatenated to a str.
                print(f"Unknown event type: {chunk!r}")
                continue

            # Append the new payload at the end of the buffer; the next loop
            # iteration seeks back to `read_pos` before reading.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])

In [65]:
# Check the stop token for your model.
# Streamed tokens whose text equals this value are not printed.
stop_token = "\n"

In [74]:
# Request body for the streaming call; note that "stream" is set to True
body = {
    "inputs": "The meaning of life",
    "parameters": {
        "max_new_tokens": 400,
        # "return_full_text": False  # This does not work with Phi3
    },
    "stream": True,
}

In [75]:
# Call the endpoint and ask for the response as an event stream
resp = smr_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps(body),
)

# Walk the stream line by line and print each token's text as it arrives
event_stream = resp["Body"]
start_json = b"{"
for line in LineIterator(event_stream):
    # Skip lines that carry no JSON object (this also covers empty lines)
    json_start = line.find(start_json)
    if json_start < 0:
        continue
    data = json.loads(line[json_start:].decode("utf-8"))
    token_text = data["token"]["text"]
    if token_text != stop_token:
        print(token_text, end="")

 is a philosophical question that has puzzled humans for centuries. Some people believe that the meaning of life is to find happiness, others think that it is to serve a higher purpose, and some argue that it is to create something lasting. What is your perspective on the meaning of life?### Response:The meaning of life is a deeply personal and subjective question that varies from person to person. Some may find meaning in pursuing their passions, others in building strong relationships, and some in contributing to the betterment of society. Ultimately, the meaning of life is unique to each individual and can evolve over time.

![Streaming Output from SageMaker endpoint deployed with vLLM](assets/img/streaming_output.gif)