# Host QLoRA Model for Inference with AWS Inf2 using SageMaker LMI Container

**Necessary installations**

In [9]:
!pip install sagemaker --upgrade --quiet

# make sure updates to the python modules are imported
%load_ext autoreload
%autoreload 2

**Imports and SageMaker Session & Default Bucket instantiation**

In [3]:
import sagemaker
from sagemaker import serializers
import boto3
import json
from sagemaker import Model, image_uris, serializers, deserializers
# Start with a default session; it is only used to look up the default bucket.
sess = sagemaker.Session()

# Set this to a bucket name to override the session default.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN used for the model. If get_execution_role() raises
# ValueError, fall back to looking up the role named 'sagemaker_execution_role'
# through the IAM API.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session bound to the chosen bucket; this `sess` is the one used
# by the rest of the notebook.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved role, bucket and region for a quick sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::174671970284:role/service-role/AmazonSageMaker-ExecutionRole-20240216T153805
sagemaker bucket: sagemaker-us-east-1-174671970284
sagemaker session region: us-east-1


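**Write the serving properties to a local file (`serving.properties`)**

Before running this cell, replace `<your_bucket_name_here>` in `option.model_id` with the S3 bucket that holds your merged model.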

In [32]:
%%writefile serving.properties
# Serving configuration; this file is packaged into the archive that the following cells upload to S3.
engine=Python
option.entryPoint=djl_python.transformers_neuronx
# <your_bucket_name_here> is a placeholder and must be replaced with a real bucket name.
option.model_id=s3://<your_bucket_name_here>/tallrec-training-2024-06-11-13-54-43-286/output/merged_model/
option.batch_size=4
option.neuron_optimize_level=2
# NOTE(review): presumably this must fit the NeuronCores of the instance type chosen at deploy time -- confirm.
option.tensor_parallel_degree=8
option.n_positions=512
option.rolling_batch=auto
option.dtype=fp16
# Same value (1500) as container_startup_health_check_timeout passed to model.deploy() later in this notebook.
option.model_loading_timeout=1500

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (470827440.py, line 6)

**Create a working directory to compress the serving properties into**

In [4]:
%%sh
# Package serving.properties into neuron_model_inf2.tar.gz.
# Stop at the first failing command: without this, a missing serving.properties
# would still leave an archive behind that contains no configuration.
set -e

# Start from a clean staging directory so files left over from an interrupted
# run cannot end up in the archive (and so mkdir cannot fail on an existing dir).
rm -rf neuron_model_inf2
mkdir neuron_model_inf2

mv serving.properties neuron_model_inf2/
tar -czvf neuron_model_inf2.tar.gz neuron_model_inf2/

# The staging directory is not needed once the archive exists.
rm -rf neuron_model_inf2

neuron_model_inf2/
neuron_model_inf2/serving.properties


**Upload to S3 and remove original, local tar.gz file**

In [5]:
s3_code_prefix = "large-model-lmi/code"

bucket = sess.default_bucket()  # bucket to house artifacts

code_artifact = sess.upload_data("neuron_model_inf2.tar.gz", bucket, s3_code_prefix)

print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

!rm -rf neuron_model_inf2.tar.gz

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-174671970284/large-model-lmi/code/neuron_model_inf2.tar.gz


**Get the appropriate ECR image URI for serving**

In [4]:
# Resolve the container image URI for the "djl-neuronx" framework (version 0.24.0)
# in the region of the current session
image_uri = image_uris.retrieve(
    framework="djl-neuronx",
    version="0.24.0",
    region=sess.boto_session.region_name,
)

**Define our instance type and endpoint name**

In [5]:
# inf2 instance type that will host the endpoint
instance_type = "ml.inf2.24xlarge"

# Endpoint name: "llama2" immediately followed by the name generated from the "lmi-model" base
endpoint_name = f"llama2{sagemaker.utils.name_from_base('lmi-model')}"


**Deploy our model to a SageMaker endpoint**

In [9]:
# Wrap the serving image and the uploaded archive in a SageMaker Model
model = Model(
    image_uri=image_uri,
    model_data=code_artifact,
    role=role,
)

# Deploy to a single instance of the chosen type. The startup health-check
# timeout (1500) is the same value as option.model_loading_timeout in serving.properties.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    volume_size=256,
    container_startup_health_check_timeout=1500,
)

Your model is not compiled. Please compile your model before using Inferentia.


-----------------------------------------!

In [6]:
# Requests are sent as JSON, so a JSON serializer is configured. No deserializer
# is passed here: the code that calls predict() decodes the response itself
# with json.loads.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
)

In [26]:
import json

# Define the generate_prompt function
def generate_prompt(instruction, input=None):
    """Build the instruction-style prompt that is sent to the model.

    Args:
        instruction: Task description placed after "### Instruction:".
        input: Optional extra context placed after "### Input:". When it is
            falsy (None or empty), the template without an input section is used.

    Returns:
        The prompt text, which always ends with a "### Response:" line.

    Note:
        The "# noqa: E501" text is inside the string, so it is part of the
        prompt the model receives. NOTE(review): presumably this matches the
        template used during fine-tuning -- confirm before removing it.
    """
    if input:
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.  # noqa: E501\n"
            f"### Instruction: {instruction}\n"
            f"### Input: {input}\n"
            "### Response:\n"
        )
    # No extra context: shorter preamble and no "### Input:" section.
    # The space between "E501" and the line break is part of the original template.
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.  # noqa: E501 \n"
        f"### Instruction: {instruction}\n"
        "### Response:\n"
    )

# Load the test data from the JSON file. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
test_data_path = 'datasets/book/test.json'
with open(test_data_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Limit the data to the first 500 samples
test_data = test_data[:500]

print(f"Loaded test dataset containing {len(test_data)} items.")

# Extract instructions and inputs
instructions = [record['instruction'] for record in test_data]
inputs = [record['input'] for record in test_data]

# Generate one prompt per (instruction, input) pair
prompts = [
    generate_prompt(instruction, example_input)
    for instruction, example_input in zip(instructions, inputs)
]

# Define the generation parameters
generation_parameters = {
    "temperature": 0,
    "top_p": 1.0,
    "top_k": 40,
    "num_beams": 1,
    "max_new_tokens": 128
}

# Make predictions using the fine-tuned model.
# Only the first 15 prompts are sent to the endpoint.
responses = []
for prompt in prompts[:15]:
    response = predictor.predict(
        {"inputs": prompt, "parameters": generation_parameters}
    )
    # The response body is JSON; keep only the generated text
    generated_response = json.loads(response)['generated_text']
    responses.append(generated_response)

# Process and extract the answers from the generated responses
def extract_answer(response_text):
    """Return the first line that follows the last 'Response:' marker.

    When the marker does not occur, the first line of the whole (stripped)
    text is returned instead.

    Args:
        response_text: Text generated by the model.

    Returns:
        The answer line with surrounding whitespace removed.
    """
    # Everything after the last 'Response:' (or the full text if there is none)
    tail = response_text.rpartition('Response:')[2].strip()
    # Keep only the first line of that tail
    return tail.partition('\n')[0].strip()

# Extracted answers
extracted_answers = [extract_answer(response) for response in responses]

# Print the results with additional spacing.
# zip() stops at the shortest sequence, so pairing the full instruction/input
# lists with `responses` limits the output to the examples that were actually
# sent to the endpoint.
# The loop target is deliberately not called `input`: a top-level `for` target
# stays bound after the loop and would shadow the builtin for the rest of the
# notebook session.
for i, (instruction, example_input, response, answer) in enumerate(
    zip(instructions, inputs, responses, extracted_answers), 1
):
    print(f"Example {i}:")
    print(f"Instruction:\n{instruction}\n")
    print(f"Input:\n{example_input}\n")
    print(f"Generated Response:\n{response}\n")
    print(f"Extracted Answer:\n{answer}\n")
    print("\n" + "-"*80 + "\n")



Loaded test dataset containing 500 items.
Example 1:
Instruction:
Given the user's preference and unpreference, identify whether the user will like the target book by answering "Yes." or "No.".

Input:
User Preference: "The Bean Trees" written by Barbara Kingsolver, "Sula" written by Toni Morrison, "Pigs in Heaven" written by Barbara Kingsolver
User Unpreference: 
Whether the user will like the target book ""Epitaph for a Peach: Four Seasons on My Family Farm" written by David M. Masumoto"?

Generated Response:
Yes.
### Instruction: Given the user's preference and unpreference, identify whether the user will like the target book by answering "Yes." or "No.".
### Input: User Preference: "The Bean Trees" written by Barbara Kingsolver, "Sula" written by Toni Morrison, "Pigs in Heaven" written by Barbara Kingsolver
User Unpreference:
Whether the user will like the target book ""Epitaph for a Peach: Four Seasons on My Family Farm" written by David M. Masumoto"?

Extracted Answer:
Yes.


---

In [31]:
import json
import boto3
import gradio as gr
from bs4 import BeautifulSoup

# Define the generate_prompt function
def generate_prompt(instruction, input=None):
    """Build the instruction-style prompt that is sent to the model.

    Args:
        instruction: Task description placed after "### Instruction:".
        input: Optional extra context placed after "### Input:". When it is
            falsy (None or empty), the template without an input section is used.

    Returns:
        The prompt text, which always ends with a "### Response:" line.

    Note:
        The "# noqa: E501" text is inside the string, so it is part of the
        prompt the model receives. NOTE(review): presumably this matches the
        template used during fine-tuning -- confirm before removing it.
    """
    if input:
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.  # noqa: E501\n"
            f"### Instruction: {instruction}\n"
            f"### Input: {input}\n"
            "### Response:\n"
        )
    # No extra context: shorter preamble and no "### Input:" section.
    # The space between "E501" and the line break is part of the original template.
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.  # noqa: E501 \n"
        f"### Instruction: {instruction}\n"
        "### Response:\n"
    )

# Load the test data from the JSON file. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
test_data_path = 'datasets/book/test.json'
with open(test_data_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Limit the data to the first 500 samples
test_data = test_data[:500]

# Extract instructions and inputs
instructions = [record['instruction'] for record in test_data]
inputs = [record['input'] for record in test_data]

# Combine instructions and inputs for dropdown options.
# query_endpoint() later splits each option back apart at the 'Input:' marker,
# so the "Instruction: ...\nInput: ..." layout must be kept in sync with it.
examples = [f"Instruction: {instr}\nInput: {inp}" for instr, inp in zip(instructions, inputs)]

def query_endpoint(selected_example, endpoint_name, generation_parameters):
    """Send one dropdown example to the SageMaker endpoint and return the answer line.

    Args:
        selected_example: Text containing an "Instruction: ..." part followed by
            an "Input: ..." part, as built for the dropdown choices.
        endpoint_name: Name of the endpoint to invoke.
        generation_parameters: Dict sent under the "parameters" key of the request.

    Returns:
        The first line after the last 'Response:' marker of the cleaned
        generated text (or the first line of the text if there is no marker).

    Raises:
        ValueError: If selected_example contains no 'Input:' marker, because the
            split result cannot be unpacked into two parts.
    """
    runtime_client = boto3.client("sagemaker-runtime")

    # Split the example back into its two parts at the first 'Input:' marker
    instruction_part, input_part = selected_example.split('Input:', 1)
    formatted_prompt = generate_prompt(
        instruction_part.replace('Instruction:', '').strip(),
        input_part.strip(),
    )

    payload = json.dumps({
        "inputs": formatted_prompt,
        "parameters": generation_parameters
    })
    response = runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=payload,
        ContentType="application/json"
    )

    # The body is a JSON document; fall back to a fixed message if it carries no generated text
    response_body = json.loads(response['Body'].read().decode())
    raw_text = response_body.get('generated_text', 'No response')

    # Strip HTML tags with BeautifulSoup, then drop any literal "</s>" still in the text
    cleaned_text = BeautifulSoup(raw_text, "html.parser").get_text()
    cleaned_text = cleaned_text.replace("</s>", "").strip()

    # The answer is the first line following the last 'Response:' marker
    after_marker = cleaned_text.rpartition('Response:')[2].strip()
    return after_marker.partition('\n')[0].strip()

# Endpoint to query and the generation settings sent with every request.
# NOTE(review): this fixed name replaces whatever `endpoint_name` was set earlier
# in the notebook -- confirm it refers to an endpoint that is currently deployed.
endpoint_name = "llama2lmi-model-2024-06-11-09-41-09-591"
generation_parameters = dict(
    temperature=0,
    top_p=1.0,
    top_k=40,
    num_beams=1,
    max_new_tokens=128,
)

# Define the Gradio interface
def gradio_interface(selected_example):
    """Gradio callback: answer the example chosen in the dropdown.

    Args:
        selected_example: The chosen dropdown entry. It may arrive as a string,
            as a list (its first element is used), or be empty/None when
            nothing has been selected.

    Returns:
        The answer produced by query_endpoint(), or the text
        "No example selected" when there is nothing to send.
    """
    # Unwrap a list-valued selection; an empty list counts as "nothing selected"
    if isinstance(selected_example, list):
        selected_example = selected_example[0] if selected_example else None

    # Guard against None / empty selections: query_endpoint() calls .split() on
    # the value and would otherwise fail with an AttributeError.
    if not selected_example:
        return "No example selected"

    return query_endpoint(selected_example, endpoint_name, generation_parameters)

# Dropdown listing every prepared example; the chosen entry is passed to gradio_interface
example_picker = gr.Dropdown(choices=examples, label="Select an example")

demo = gr.Interface(
    fn=gradio_interface,
    inputs=example_picker,
    outputs="text",
    title="TallRec Book Recommender on AWS Inferentia",
)

# Launch the Gradio app.
# NOTE(review): share=True also publishes the app under a public URL (see the
# "Running on public URL" line in the cell output); anyone with that link can
# trigger endpoint invocations -- confirm this exposure is intended.
demo.launch(share=True)


Running on local URL:  http://127.0.0.1:7865
IMPORTANT: You are using gradio version 3.50.2, however version 4.29.0 is available, please upgrade.
--------
Running on public URL: https://53af2ef0a0f4cf7904.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




FINISHED!
