# Large model inference with DeepSpeed


## 1. Download trained model


In [15]:
import sagemaker

# SageMaker session; its underlying boto3 session supplies the account and region.
sagemaker_session = sagemaker.Session()
boto_session = sagemaker_session.boto_session

# Default S3 bucket associated with this SageMaker session
bucket = sagemaker_session.default_bucket()

# Region of the current session and AWS account id (looked up through STS)
region = boto_session.region_name
account = boto_session.client("sts").get_caller_identity()["Account"]

# S3 prefix of the fine-tuned GPT-J checkpoint (note the trailing slash)
model_s3_uri = f"s3://{bucket}/fine-tune-GPTJ/checkpoint/checkpoint-120/"

In [16]:
!aws s3 ls $model_s3_uri

2023-02-10 20:32:17        961 config.json
2023-02-10 20:32:17        141 generation_config.json
2023-02-10 20:32:17         14 latest
2023-02-10 20:32:17     456318 merges.txt
2023-02-10 20:41:12 12216972905 pytorch_model.bin
2023-02-10 20:32:17      14583 rng_state_0.pth
2023-02-10 20:32:17      14583 rng_state_1.pth
2023-02-10 20:32:17      14583 rng_state_2.pth
2023-02-10 20:32:17      14583 rng_state_3.pth
2023-02-10 20:32:17        470 special_tokens_map.json
2023-02-10 20:32:17        748 tokenizer_config.json
2023-02-10 20:32:17       1606 trainer_state.json
2023-02-10 20:32:17       4795 training_args.bin
2023-02-10 20:32:17     999186 vocab.json
2023-02-10 20:32:17      18857 zero_to_fp32.py


In [17]:
!aws s3 rm $model_s3_uri/global_step120/ --recursive 

## 2. Prepare docker image

We have a `build.sh` bash script which performs the following steps:

* Makes `serve` executable and builds our docker image
* Optionally, runs the container for local testing

Run the build (including the optional local test) with the following command:

In [101]:
%%sh
# Run the build script from the deployment directory.
# The argument is presumably used as the image name - confirm in build.sh.
IMAGE_NAME=gptj-inference-endpoint
cd ../Deploy_GPTJ_DeepSpeed/
./build.sh "$IMAGE_NAME"

/home/ec2-user/SageMaker/GPTJ/amazon-sagemaker-gptj/Deploy_GPTJ_DeepSpeed
Sending build context to Docker daemon  27.14kB
Step 1/13 : FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel
 ---> 7afd9b52a068
Step 2/13 : LABEL com.amazon.image.authors.email="sage-learner@amazon.com"
 ---> Using cache
 ---> f67c72f9e1a9
Step 3/13 : LABEL com.amazon.image.authors.name="Amazon AI"
 ---> Using cache
 ---> d42e1cf112da
Step 4/13 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 11349f25d74a
Step 5/13 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> 69242df3ee5f
Step 6/13 : ENV PATH="/opt/program:${PATH}"
 ---> Using cache
 ---> 1b113a70efdd
Step 7/13 : ARG DEBIAN_FRONTEND=noninteractive
 ---> Using cache
 ---> 027f75146b58
Step 8/13 : ENV TZ=Etc/UTC
 ---> Using cache
 ---> 3799d6c545c5
Step 9/13 : RUN apt-key del 7fa2af80     && rm /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/cuda.list     && apt-get -y update && apt-get install -y --no-install-recommends         wget

## 3. Deployment

In [102]:
%%sh
# Push the image to ECR using the helper script in the deployment directory.
REPO_NAME=gptj-inference-endpoint
cd ../Deploy_GPTJ_DeepSpeed/
# Make sure the helper script can be executed before calling it.
chmod +x push_to_ecr.sh
./push_to_ecr.sh "$REPO_NAME"

Login Succeeded
The push refers to repository [171503325295.dkr.ecr.us-east-1.amazonaws.com/gptj-inference-endpoint]
6fd3f091def6: Preparing
450d5b5f658a: Preparing
bef584e29ea6: Preparing
6b23b4f5045b: Preparing
ecc6bb7de61b: Preparing
1aa7263f678e: Preparing
6ee3d43a62a1: Preparing
ffe4fc4a44ce: Preparing
9f70bc5acecf: Preparing
63c72fb01f89: Preparing
ad5b6813b3ac: Preparing
7a2f30aca740: Preparing
cd37cd672bd2: Preparing
fe6d8881187d: Preparing
23135df75b44: Preparing
b43408d5f11b: Preparing
7a2f30aca740: Waiting
63c72fb01f89: Waiting
ad5b6813b3ac: Waiting
6ee3d43a62a1: Waiting
cd37cd672bd2: Waiting
1aa7263f678e: Waiting
23135df75b44: Waiting
b43408d5f11b: Waiting
fe6d8881187d: Waiting
ffe4fc4a44ce: Waiting
9f70bc5acecf: Waiting
bef584e29ea6: Layer already exists
ecc6bb7de61b: Layer already exists
6b23b4f5045b: Layer already exists
1aa7263f678e: Layer already exists
6ee3d43a62a1: Layer already exists
ffe4fc4a44ce: Layer already exists
9f70bc5acecf: Layer already exists
63c72fb01f89

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



The script above pushes your image to ECR. For later reference, note the address of the repository that the container is pushed to; it appears below the line `Login Succeeded` in the output of `push_to_ecr.sh`.

## 4. Inference

Now, you can deploy your endpoint as follows:

### 4.1 Initialize configuration variables

If a rerun fails with an error saying that the endpoint already exists, change `sm_model_name` and `endpoint_name` to new values.

In [3]:
import sagemaker
from sagemaker.model import Model
from sagemaker.predictor import RealTimePredictor
import time

# IAM role the notebook is running under; passed to the SageMaker model below.
role = sagemaker.get_execution_role()

# ECR URI of the `gptj-inference-endpoint` image (`account` and `region` come
# from the session-setup cell at the top of the notebook).
image = f"{account}.dkr.ecr.{region}.amazonaws.com/gptj-inference-endpoint:latest"

# Names of the SageMaker model and of the endpoint
sm_model_name = "gptj-completion-gpu-test"
endpoint_name = "gptj-completion-gpu-test"

# Hosting instance type and number of instances
instance_type = "ml.g4dn.2xlarge"
initial_instance_count = 1


### 4.2 Initialize endpoint

In [None]:
# `Predictor` is the SageMaker SDK v2 name of the deprecated `RealTimePredictor`.
from sagemaker.predictor import Predictor

# Describe the model: custom inference image, execution role, and the S3
# location of the checkpoint, handed to the container through an env variable.
sm_model = Model(
    image_uri=image,
    role=role,
    env={"S3_MODEL_LOCATION": model_s3_uri},
    predictor_cls=Predictor,
    name=sm_model_name,
)

# Create the endpoint, using the instance settings from the configuration cell
# (previously the instance count was hard-coded and the variable was ignored).
predictor = sm_model.deploy(
    instance_type=instance_type,
    initial_instance_count=initial_instance_count,
    endpoint_name=endpoint_name,
)

Using already existing model: gptj-completion-gpu-test


---

### 4.3 Query model

To query your endpoint, you can use the code below. Also, remember that you can pass any parameters accepted by the HuggingFace `"text-generation"` pipeline.

#### Initialize the SageMaker Runtime client

In [8]:
import json

import boto3

# The region is taken from a default boto3 session.
sess = boto3.Session()
aws_region = sess.region_name

# Low-level SageMaker Runtime client in that region
sagemaker_runtime = boto3.client(
    service_name="sagemaker-runtime",
    region_name=aws_region,
)

In [12]:
%%time

text = "love: "

parameters = {
    "do_sample": True,
    "temperature": 0.7,
    "max_new_tokens":200,
    "min_tokens": 100,
    "repetition_penalty": 1.1,
    "top_p": 500,
    }

data = {
    "inputs": {
        "text_inputs": text,
        "parameters": parameters
    }
}


body = json.dumps(data)


response = sagemaker_runtime.invoke_endpoint( 
        EndpointName=endpoint_name, 
        Body = body, 
        ContentType = 'application/json'
)

CPU times: user 4.05 ms, sys: 0 ns, total: 4.05 ms
Wall time: 534 ms


In [13]:
%%time

body = json.dumps(data)


response = sagemaker_runtime.invoke_endpoint( 
        EndpointName=endpoint_name, 
        Body = body, 
        ContentType = 'application/json'
)

result = json.loads(response['Body'].read().decode("utf-8"))

CPU times: user 11 ms, sys: 373 µs, total: 11.4 ms
Wall time: 834 ms


In [14]:
# Display the parsed endpoint reply (the cell's last expression is rendered as its output).
result

{'response': [{'generated_text': 'love:  The most powerful weapon anyone can have is a loving and forgiving heart.'}],
 'status': 200}