## vLLM rolling-batch Notux-8x7B deployment guide

# Deploying Notux 8x7B with LMI 

### In this tutorial, you will use the vLLM backend of the Large Model Inference (LMI) DLC to deploy the quantized Notux 8x7B model and run inference with it.

Please make sure the following permissions are granted before running the notebook:

* S3 bucket push access
* SageMaker access




### Step 1: Upgrade the SageMaker SDK and import dependencies

In [2]:
# Upgrade the SageMaker SDK, boto3, the AWS CLI and huggingface_hub in the kernel's environment.
%pip install sagemaker boto3 awscli huggingface_hub --upgrade  --quiet

[31mERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: '/opt/conda/lib/python3.10/site-packages/fsspec-2023.6.0.dist-info/METADATA'
[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [29]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os,sys
import time
import json
from pathlib import Path
from huggingface_hub import snapshot_download
from sagemaker import Model, image_uris, serializers, deserializers

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Region of the current session, read through the public `boto_region_name`
# attribute rather than the private `_region_name` field.
region = sess.boto_region_name
account_id = sess.account_id()
s3_client = boto3.client("s3")


In [4]:
# Unique prefix shared by every resource name created in this notebook.
prefix = sagemaker.utils.unique_name_from_base("DEMO")
print(f"prefix: {prefix}")

# boto3 clients for the "sagemaker-runtime" and "sagemaker" services.
sagemaker_runtime_client = boto3.client("sagemaker-runtime")
sagemaker_client = boto3.client("sagemaker")

prefix: DEMO-1708566037-675f


In [5]:
# Set a unique endpoint config name
endpoint_config_name = f"{prefix}-endpoint-config"
print(f"Demo endpoint config name: {endpoint_config_name}")

# Set variant name and instance type for hosting
variant_name = "AllTraffic"
instance_type = "ml.g5.24xlarge"
# Timeouts (in seconds) for the model data download and the container startup health check
model_data_download_timeout_in_seconds = 3600
container_startup_health_check_timeout_in_seconds = 3600

# Instance-count bounds used for managed instance scaling
initial_instance_count = 1
max_instance_count = 2
print(f"Initial instance count: {initial_instance_count}")
print(f"Max instance count: {max_instance_count}")

sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[
        {
            "VariantName": variant_name,
            "InstanceType": instance_type,
            # Use the configured value (not a hard-coded 1) so the printed count,
            # the initial count and MinInstanceCount cannot drift apart.
            "InitialInstanceCount": initial_instance_count,
            "ModelDataDownloadTimeoutInSeconds": model_data_download_timeout_in_seconds,
            "ContainerStartupHealthCheckTimeoutInSeconds": container_startup_health_check_timeout_in_seconds,
            "ManagedInstanceScaling": {
                "Status": "ENABLED",
                "MinInstanceCount": initial_instance_count,
                "MaxInstanceCount": max_instance_count,
            },
            "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
        }
    ],
)

Demo endpoint config name: DEMO-1708566037-675f-endpoint-config
Initial instance count: 1
Max instance count: 2


{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:631450739534:endpoint-config/demo-1708566037-675f-endpoint-config',
 'ResponseMetadata': {'RequestId': '48dfd54b-8252-4345-815b-f112860afe29',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '48dfd54b-8252-4345-815b-f112860afe29',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '117',
   'date': 'Thu, 22 Feb 2024 01:40:54 GMT'},
  'RetryAttempts': 0}}

In [6]:
# Endpoint name derived from the shared prefix so that it is unique per run.
endpoint_name = f"{prefix}-endpoint"
print(f"Demo endpoint name: {endpoint_name}")

# Create the endpoint from the endpoint config defined earlier; the API
# response is the cell output.
create_endpoint_kwargs = {
    "EndpointName": endpoint_name,
    "EndpointConfigName": endpoint_config_name,
}
sagemaker_client.create_endpoint(**create_endpoint_kwargs)


Demo endpoint name: DEMO-1708566037-675f-endpoint


{'EndpointArn': 'arn:aws:sagemaker:us-east-1:631450739534:endpoint/demo-1708566037-675f-endpoint',
 'ResponseMetadata': {'RequestId': '26905902-5852-4b8e-a840-5ea0633f7ef3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '26905902-5852-4b8e-a840-5ea0633f7ef3',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Thu, 22 Feb 2024 01:41:12 GMT'},
  'RetryAttempts': 0}}

In [8]:
# Wait for the endpoint created above; the value returned by the call
# (the endpoint description) is displayed as the cell output.
sess.wait_for_endpoint(endpoint_name)

--!

{'EndpointName': 'DEMO-1708566037-675f-endpoint',
 'EndpointArn': 'arn:aws:sagemaker:us-east-1:631450739534:endpoint/demo-1708566037-675f-endpoint',
 'EndpointConfigName': 'DEMO-1708566037-675f-endpoint-config',
 'ProductionVariants': [{'VariantName': 'AllTraffic',
   'CurrentInstanceCount': 1,
   'DesiredInstanceCount': 1,
   'ManagedInstanceScaling': {'Status': 'ENABLED',
    'MinInstanceCount': 1,
    'MaxInstanceCount': 2},
   'RoutingConfig': {'RoutingStrategy': 'LEAST_OUTSTANDING_REQUESTS'}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2024, 2, 22, 1, 41, 12, 837000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 2, 22, 1, 43, 16, 760000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'ed8069aa-454e-4595-81bc-a113182ffce8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ed8069aa-454e-4595-81bc-a113182ffce8',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '555',
   'date': 'Thu, 22 Feb 2024 01:

### Step 2.0: Download model artifacts

In [9]:

# S3 key prefixes: one for the serving-code tarball, one for the model artifact.
s3_code_prefix = "hf-large-model-djl/Notux-8x7B-gptq"
s3_model_prefix = "Notux-8x7B/lmi"

# Both the code and the model artifacts are stored in the session's default bucket.
bucket = sess.default_bucket()
model_bucket = sess.default_bucket()

# Jinja environment used later to render the serving.properties template.
jinja_env = jinja2.Environment()

In [10]:
# Download the model into the current directory, i.e. wherever the Jupyter notebook is running.
local_model_path = Path(".")
local_model_path.mkdir(exist_ok=True)
# Hugging Face repository id. NOTE: `model_name` is rebound later in this
# notebook to the SageMaker model name.
model_name = "TheBloke/notux-8x7b-v1-GPTQ"
# Only download files matching these patterns (JSON/text files plus the listed weight/tokenizer file extensions)
allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model", "*.safetensors"]
# Leverage huggingface_hub's snapshot_download to download the model, since the model is stored in the repository using LFS
model_download_path = snapshot_download(
    repo_id=model_name,
    cache_dir=local_model_path,
    allow_patterns=allow_patterns,
)

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/2.71k [00:00<?, ?B/s]

quantize_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

all_results.json:   0%|          | 0.00/748 [00:00<?, ?B/s]

eval_results.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

train_results.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/23.8G [00:00<?, ?B/s]

In [11]:
# S3 URL (with a trailing slash) of the location that holds the model;
# it is rendered into serving.properties further down.
pretrained_model_location = "s3://{}/{}/".format(model_bucket, s3_model_prefix)
print("Pretrained model will be uploaded to ---- > {}".format(pretrained_model_location))

Pretrained model will be uploaded to ---- > s3://sagemaker-us-east-1-631450739534/Notux-8x7B/lmi/


In [12]:
# Upload the downloaded snapshot to the same bucket that
# `pretrained_model_location` is built from. Passing `bucket` explicitly keeps
# the upload destination and that URL in sync if `model_bucket` is ever changed
# away from the session's default bucket.
model_artifact = sess.upload_data(
    path=model_download_path,
    bucket=model_bucket,
    key_prefix=s3_model_prefix,
)
print(f"Model uploaded to --- > {model_artifact}")
print(f"We will set option.model_id={model_artifact}")

Model uploaded to --- > s3://sagemaker-us-east-1-631450739534/Notux-8x7B/lmi
We will set option.model_id=s3://sagemaker-us-east-1-631450739534/Notux-8x7B/lmi


### Step 2: Start preparing model artifacts

In the LMI container, we expect some artifacts to help set up the model:

* serving.properties (required): Defines the model server settings
* model.py (optional): A python file to define the core inference logic
* requirements.txt (optional): Any additional pip wheels that need to be installed

In [13]:
# Create the directory that will hold serving.properties (no error if it already exists).
!mkdir -p mymodel

In [14]:
%%writefile ./mymodel/serving.properties
engine=Python
option.model_id={{s3url}}
option.tensor_parallel_degree=4
option.max_rolling_batch_size=16
option.rolling_batch=vllm
option.max_model_len=25456
option.dtype=fp16

Overwriting ./mymodel/serving.properties


In [15]:
# we plug in the appropriate model location into our `serving.properties` file based on the region in which this notebook is running
template = jinja_env.from_string(Path("mymodel/serving.properties").open().read())
Path("mymodel/serving.properties").open("w").write(
    template.render(s3url=pretrained_model_location)
)
!pygmentize mymodel/serving.properties | cat -n

     1	[36mengine[39;49;00m=[33mPython[39;49;00m[37m[39;49;00m
     2	[36moption.model_id[39;49;00m=[33ms3://sagemaker-us-east-1-631450739534/Notux-8x7B/lmi/[39;49;00m[37m[39;49;00m
     3	[36moption.tensor_parallel_degree[39;49;00m=[33m4[39;49;00m[37m[39;49;00m
     4	[36moption.max_rolling_batch_size[39;49;00m=[33m16[39;49;00m[37m[39;49;00m
     5	[36moption.rolling_batch[39;49;00m=[33mvllm[39;49;00m[37m[39;49;00m
     6	[36moption.max_model_len[39;49;00m=[33m25456[39;49;00m[37m[39;49;00m
     7	[36moption.dtype[39;49;00m=[33mfp16[39;49;00m[37m[39;49;00m


In [16]:
%%sh
# Remove any previous archive and any Jupyter checkpoint directory before packaging
rm -f mymodel.tar.gz
rm -rf mymodel/.ipynb_checkpoints
# Archive the contents of mymodel/ (entries are relative to that directory) into mymodel.tar.gz
tar czvf mymodel.tar.gz -C mymodel .

./
./serving.properties


### Step 3: Start building SageMaker endpoint

#### Getting the container image URI

See available Large Model Inference DLC's [here](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers)

In [17]:
# Look up the container image URI for the "djl-deepspeed" framework, version
# 0.26.0, in the region of the current session.
lmi_image_params = {
    "framework": "djl-deepspeed",
    "region": sess.boto_session.region_name,
    "version": "0.26.0",
}
image_uri = image_uris.retrieve(**lmi_image_params)

#### Upload artifact on S3 and create SageMaker model

In [25]:
# Upload the packaged tarball; the returned S3 URI is used as the model's `model_data`.
code_artifact = sess.upload_data(
    path="mymodel.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

# Name of the SageMaker model, derived from the shared prefix.
model_name = f"{prefix}-model-notux-8x7b"
print(model_name)

# Register the model (container image + code artifact) with SageMaker.
model = Model(
    name=model_name,
    image_uri=image_uri,
    model_data=code_artifact,
    role=role,
)
model.create()

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-631450739534/hf-large-model-djl/Notux-8x7B-gptq/mymodel.tar.gz
DEMO-1708566037-675f-model-notux-8x7b


#### Create SageMaker endpoint with inference component

We can now create the inference component, which will be deployed on the endpoint that you specify. Note that the specification can reference either a SageMaker model or a container. If you provide a container, you will need to provide an image and an artifact URL as parameters. In this example we set it to the name of the model we prepared in the cells above. You can also set `ComputeResourceRequirements` to tell SageMaker what should be reserved for each copy of the inference component, and you can set the number of copies of the inference component you would like to deploy. These can be managed and scaled as the capabilities become available.

Note that in this example we set the `NumberOfAcceleratorDevicesRequired` to a value of 4. By doing so we reserve 4 accelerators for each copy of this inference component so that we can use tensor parallel.

In [26]:
inference_component_name_notux = f"{prefix}-IC-notux-8x7b"
variant_name = "AllTraffic"

# Resources reserved for each copy of the inference component.
compute_requirements = {
    "NumberOfAcceleratorDevicesRequired": 4,
    "NumberOfCpuCoresRequired": 1,
    "MinMemoryRequiredInMb": 1024,
}

# The specification points at the SageMaker model created above.
component_specification = {
    "ModelName": model_name,
    "ComputeResourceRequirements": compute_requirements,
}

sagemaker_client.create_inference_component(
    InferenceComponentName=inference_component_name_notux,
    EndpointName=endpoint_name,
    VariantName=variant_name,
    Specification=component_specification,
    RuntimeConfig={"CopyCount": 1},
)

{'InferenceComponentArn': 'arn:aws:sagemaker:us-east-1:631450739534:inference-component/DEMO-1708566037-675f-IC-notux-8x7b',
 'ResponseMetadata': {'RequestId': '954a171e-6718-4345-abfa-1b3449489ceb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '954a171e-6718-4345-abfa-1b3449489ceb',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '123',
   'date': 'Thu, 22 Feb 2024 02:02:08 GMT'},
  'RetryAttempts': 0}}

Wait until the inference component is InService

In [30]:
# Poll the inference component every 30 seconds until its status is either
# "InService" or "Failed".
while True:
    desc = sagemaker_client.describe_inference_component(
        InferenceComponentName=inference_component_name_notux
    )
    status = desc["InferenceComponentStatus"]
    # flush=True makes each status line show up immediately while the cell runs.
    print(status, flush=True)
    if status in ["InService", "Failed"]:
        break
    time.sleep(30)

# Surface a failed deployment here instead of letting the later invoke calls
# fail against a component that never came up.
if status == "Failed":
    raise RuntimeError(
        f"Inference component {inference_component_name_notux} failed to deploy: "
        f"{desc.get('FailureReason', 'no failure reason reported')}"
    )

Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService


### Step 4: Run inference
Below are a few example requests and the model's responses.


In [37]:
import json


# Prompt text sent as the "inputs" field of the request.
payload = '''Summarize the following text:
Peter and Elizabeth took a taxi to attend the night party in the city. While in the party, Elizabeth collapsed and was rushed to the hospital.
Since she was diagnosed with a brain injury, the doctor told Peter to stay besides her until she gets well.
Therefore, Peter stayed with her at the hospital for 3 days without leaving.'''

# JSON request body: the prompt plus generation parameters.
request_body = {
    "inputs": payload,
    "parameters": {"max_new_tokens": 50, "temperature": 1},
}

# Invoke the inference component hosted on the endpoint.
response = sagemaker_runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=inference_component_name_notux,
    ContentType="application/json",
    Accept="application/json",
    Body=json.dumps(request_body),
)

# Read the response body and parse it as JSON; `result` is the cell output.
raw_body = response["Body"].read()
result = json.loads(raw_body.decode())
result

{'generated_text': '\nThe doctor informed them that Elizabeth will spent at least another one week in the hospital.\nPeter, now knowing that he had to continue staying at the hospital asked his boss for leave.\nHis boss agreed and Peter stayed another day in the'}

In [32]:
# JSON request body: a short question with sampling enabled.
request_body = {
    "inputs": "what is the derivative of x squared",
    "parameters": {"max_new_tokens": 128, "do_sample": True},
}

# Invoke the same inference component with the new request.
response = sagemaker_runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=inference_component_name_notux,
    ContentType="application/json",
    Accept="application/json",
    Body=json.dumps(request_body),
)

# Read the response body and parse it as JSON; `result` is the cell output.
raw_body = response["Body"].read()
result = json.loads(raw_body.decode())
result

{'generated_text': " when evaluation at x is equal to 2, for example, let's say the function is f of x equals x squared, i want to find f prime of 2\n\nf(x) = x^2\n\nf'(x) = 2x\n\nf'(2) = 2 \\* 2 = 4\n\nTherefore, f'(2) = 4"}

### Deleting Inference Components and Endpoint

In [38]:
# Clean up: request deletion of the inference component first, then of the
# endpoint that hosted it. The response of the last call is the cell output.
# NOTE(review): the endpoint config and the SageMaker model created earlier are
# not deleted by this cell — confirm whether they should be cleaned up as well.
sagemaker_client.delete_inference_component(InferenceComponentName=inference_component_name_notux)
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '7606e0b3-05a8-4dec-ad8e-04ef030402b3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7606e0b3-05a8-4dec-ad8e-04ef030402b3',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 22 Feb 2024 02:20:22 GMT'},
  'RetryAttempts': 0}}