<a href="https://colab.research.google.com/github/orca3/llm-model-serving/blob/main/ch04/dlc_customization/aws_dlc_serving_customization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pseudocode demonstrating how to customize the model serving experience with a SageMaker deep learning container (DLC).

# Step 1: customize model serving code

In [None]:
!rm -rf my-own-llm
!mkdir -p  my-own-llm

# create model configuration
%%writefile my-own-llm/serving.properties
engine=Python
option.s3url=s3://sagemaker-us-west-2-<your account>/large-model-lmi/code/my-own-llm

In [None]:
# claim model dependency
%%writefile my-own-llm/requirements.txt
transformers==4.36.1
torch
diffusers==0.18.0
optimum[onnxruntime-gpu]
cupy-cuda11x

In [None]:
# implement your own model inference logic in model.py file.

%%writefile my-own-llm/model.py
import logging
import os
import time

import torch
from djl_python import Input, Output
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
# ...

# NOTE(review): not referenced anywhere in the visible code; presumably a
# tokenizer-specific token id intended for padding -- confirm before relying on it.
PAD_TOKEN_ID = 50256

# initialize model
def initialize(properties):
    """Load the model and its tokenizer and return them as a pair.

    ``handle`` unpacks the result as ``model, tokenizer``, so both objects
    have to be returned from here.

    Args:
        properties: Serving properties passed in by ``handle`` (the result
            of ``inputs.get_properties()``). Assumed to be dict-like and to
            carry ``model_id`` and/or ``model_dir`` -- confirm against the
            model server in use.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is moved to CUDA when
        available (CPU otherwise) and switched to eval mode.

    Raises:
        KeyError: If neither ``model_id`` nor ``model_dir`` is present.
    """
    # Prefer an explicitly configured model id; otherwise fall back to the
    # directory that holds the unpacked model package.
    model_id = properties.get("model_id") or properties["model_dir"]

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # inference only: disables dropout and similar training behavior
    return model, tokenizer

# Module-level cache: filled in by handle() on first use and reused afterwards.
model = tokenizer = None

# execute model
def run_inference(input_texts, onnx_model, tokenizer):
    """Embed ``input_texts`` in batches and return a JSON-serializable result.

    Args:
        input_texts: Sequence of input strings; may be empty.
        onnx_model: Model object, invoked as ``onnx_model(**batch_dict)``.
        tokenizer: Tokenizer callable; its result must support ``.to(device)``.

    Returns:
        ``{"embeddings": [{"embedding": [...], "index": i}, ...]}`` where
        ``index`` is the position of the corresponding text in
        ``input_texts``.
    """
    max_batch_size = 128
    # Run on whatever device the model lives on; keep "cuda" as the fallback
    # for model objects that do not expose a ``device`` attribute.
    device = getattr(onnx_model, "device", "cuda")
    # Collect per-batch results and concatenate once at the end: this keeps
    # input order and does not depend on a hard-coded embedding width.
    batch_embeddings = []
    for i in range(0, len(input_texts), max_batch_size):
        logging.info(f"Start Iteration: {i}")

        start_time = time.time()
        batch_dict = tokenizer(input_texts[i:i+max_batch_size], max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
        logging.info(f"After Tokenize Latency: {time.time() - start_time}")

        start_time = time.time()
        with torch.no_grad():
            # Use the model that was passed in rather than the module global.
            outputs = onnx_model(**batch_dict)
        logging.info(f"After Model Inference Latency: {time.time() - start_time}")

        # ... ..
        # NOTE(review): the elided step above must bind `embeddings`
        # (presumably via average_pool over the model output and the
        # attention mask) -- confirm.

        # Append (not prepend) so row order matches input order.
        batch_embeddings.append(embeddings)
        del outputs
        del embeddings
        del batch_dict

    rows = torch.cat(batch_embeddings, 0).tolist() if batch_embeddings else []
    results = [{"embedding": row, "index": idx} for idx, row in enumerate(rows)]
    # logging.info(f'Output embeddings: {results}')
    # return dictionary, which will be json serializable
    return {"embeddings": results}

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Average the hidden states over dim 1, skipping masked-out positions.

    Positions where ``attention_mask`` is zero are zeroed before summing, and
    the sum is divided by the per-row sum of ``attention_mask``.
    """
    padding_positions = ~attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(padding_positions, 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

# model execution entry point
def handle(inputs: Input) -> "Output | None":
    """Serve one request: lazily load the model, then embed the input texts.

    Args:
        inputs: Incoming request. Its JSON body is read with
            ``get_as_json()`` and must contain an ``"inputs"`` entry holding
            the sentences to embed.

    Returns:
        ``None`` for an empty request (the model server sends one to warm up
        the model on startup); otherwise an ``Output`` carrying the JSON
        result of ``run_inference``.
    """
    logging.info("handle : Handle start...")
    global model, tokenizer
    # Compare against None explicitly: truthiness of a model object is not a
    # reliable "already loaded" signal.
    if model is None:
        logging.info("handle : initializing model")
        model,tokenizer = initialize(inputs.get_properties())

    if inputs.is_empty():
        logging.info("handle : inputs is empty")
        # Model server makes an empty call to warmup the model on startup
        return None

    logging.info("handle : Pre-processing start")
    data = inputs.get_as_json()
    logging.info(f"handle : Pre-processing json data {data}")
    input_sentences = data["inputs"]
    logging.info(f"handle : Pre-processing sentences {input_sentences}")
    res = run_inference(input_sentences, model, tokenizer)

    logging.info("handle : Handle End")
    return Output().add_as_json(res)

In [None]:
# package model file
%%sh
tar czvf my-own-llm.tar.gz my-own-llm/

In [None]:
# upload model file to S3

# key prefix for the packaged model code (increment the version)
s3_code_prefix = "large-model-lmi/code/my-own-llm"
# bucket to house artifacts
bucket = sess.default_bucket()
code_artifact = sess.upload_data(
    path="my-own-llm.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)


# Step 2: Deploy model with AWS DJL container

In [None]:
# look up the DJL model serving container image for the session's region
image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    version="0.25.0",
    region=sess.boto_session.region_name,
)


In [None]:
# create the model object from the container image and the uploaded code artifact
model = Model(
    role=role,
    image_uri=image_uri,
    model_data=code_artifact,
)


In [None]:
# deploy the model to an endpoint
predictor = model.deploy(
    endpoint_name="my-own-llm-128",
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
)