In [1]:
# Hugging Face Hub repository id of the model used throughout this notebook
# (deployed to the endpoint and used to load the matching tokenizer).
model_id = "meta-llama/Llama-3.1-8B-Instruct"

In [2]:
import json
import os
from getpass import getpass

import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# IAM role the SageMaker endpoint will run under.
role = sagemaker.get_execution_role()

# Never store the Hugging Face access token in the notebook itself: take it
# from the environment when available, otherwise prompt for it interactively
# (getpass does not echo the value, so it is not saved in the cell output).
hf_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or getpass("Hugging Face access token: ")
).strip()

# Hub Model configuration. https://huggingface.co/models
# These entries are passed to the serving container as environment variables.
hub = {
    'HF_MODEL_ID': model_id,
    'SM_NUM_GPUS': json.dumps(1),
    'HUGGING_FACE_HUB_TOKEN': hf_token,
}

# Fail fast before deploying if the token is missing or still a placeholder.
# The previous check compared against a placeholder string that was never the
# one used in this cell, so it could not trigger.
_TOKEN_PLACEHOLDERS = ('', 'your-key-here', '<REPLACE WITH YOUR TOKEN>')
if hub['HUGGING_FACE_HUB_TOKEN'] in _TOKEN_PLACEHOLDERS:
    raise ValueError("You have to provide a token.")

# Container image for the Hugging Face LLM serving stack.
# NOTE(review): confirm that this container version supports the checkpoint
# selected in model_id; newer model families may need a newer image.
image_uri = get_huggingface_llm_image_uri("huggingface", version="2.0")

In [4]:
# create Hugging Face Model Class
# Describe the model to SageMaker: which serving container to run, which
# environment variables to hand it (model id, GPU count, Hub token), and the
# IAM role to run under.
huggingface_model = HuggingFaceModel(
    role=role,
    image_uri=image_uri,
    env=hub,
)

# Create the real-time inference endpoint: a single ml.g5.2xlarge instance,
# with the container startup health check timeout set to 300.
predictor = huggingface_model.deploy(
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=300,
)

-----------!

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the deployed model so its chat template can
# be applied locally; the Hub token from the deployment config is reused.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=hub['HUGGING_FACE_HUB_TOKEN'],
)

# Conversation to send: a system instruction followed by the user question.
messages = [
    dict(role="system", content="You are an assistant"),
    dict(role="user", content="What is an LLM?"),
]

# Render the conversation into a single prompt string (tokenize=False) that
# ends with the assistant header (add_generation_prompt=True), so the model
# continues with the assistant's reply.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)



tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Show the rendered chat prompt exactly as it will be sent to the endpoint;
# print() is used so the embedded newlines are displayed rather than escaped.
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

What is an LLM?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [6]:
# Token ids at which generation should stop: the tokenizer's EOS id and the
# id of the "<|eot_id|>" end-of-turn token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Generation arguments sent as the "parameters" field of each request.
# NOTE(review): confirm the serving container honours `eos_token_id`; some
# text-generation servers only accept stop sequences via a `stop` parameter
# and silently ignore unknown keys.
payload = dict(
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    return_full_text=False,
)

In [7]:
from sagemaker.base_deserializers import JSONDeserializer
# Use the JSON deserializer for endpoint responses; later cells index the
# result of predict() as a list of dicts.
predictor.deserializer = JSONDeserializer()

In [8]:
%%time
response = predictor.predict({"inputs":prompt, "parameters":payload})

CPU times: user 17.1 ms, sys: 3.37 ms, total: 20.4 ms
Wall time: 8.95 s


In [9]:
import pprint

# The response is indexed as a list; show the generated text of its first
# entry, wrapped by pprint so the long string stays within the output width.
pprint.pprint(
    response[0]["generated_text"]
)

("LLM stands for Large Language Model. It's a type of artificial intelligence "
 '(AI) designed to process and generate human-like language. LLMs are trained '
 'on vast amounts of text data, which enables them to learn patterns, '
 'relationships, and structures of language.\n'
 '\n'
 'These models are typically trained using a technique called deep learning, '
 'where the AI is presented with large amounts of text data and learns to '
 'predict the next word in a sequence. This process is repeated millions of '
 'times, allowing the model to refine its understanding of language.\n'
 '\n'
 'Some key characteristics of LLMs include:\n'
 '\n'
 '1. **Language understanding**: LLMs can comprehend and analyze human '
 'language, including nuances, idioms, and context.\n'
 "2. **Language generation**: LLMs can generate text that's coherent, "
 'grammatically correct, and often indistinguishable from text written by a '
 'human.\n'
 '3. **Contextual understanding**: LLMs can understand the c