In [1]:
import os
import json 

# Load the environment configuration JSON data
json_path = 'env_config.json'
with open(json_path, 'r') as file:
    env_config = json.load(file)

hf_home = env_config['HF_HOME']
# Set the HF_HOME environment variable
os.environ['HF_HOME'] = hf_home
# Set the access token to huggingface hub
access_token = env_config['access_token']

# How to prompt Llama 3
The base models have no prompt format. Like other base models, they can be used to continue an input sequence with a plausible continuation or for zero-shot/few-shot inference. They are also a great foundation for fine-tuning your own use cases. The Instruct versions use the following conversation structure:
```bash
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>

{{ user_msg_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{ model_answer_1 }}<|eot_id|>

```

This format has to be exactly reproduced for effective use. We’ll later show how easy it is to reproduce the instruct prompt with the chat template available in transformers.

In [3]:
import transformers 
print(transformers.__version__)

from transformers import pipeline
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM

accelerator = Accelerator()
device = accelerator.device

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

4.41.0


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.75s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
messages = [
    {"role": "system", "content": "You are a personal health assistant who provides both mental and physical health suggestions."},
    {"role": "user", "content": "What could you do?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = model.generate(
    input_ids,
    max_new_tokens=128,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
assistant_response = outputs[0]["generated_text"][-1]["content"]
print(assistant_response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


IndexError: too many indices for tensor of dimension 1

In [16]:
outputs = model.model.forward(
    input_ids,
    # max_new_tokens=128,
    # eos_token_id=terminators,
    # do_sample=True,
    # temperature=0.6,
    # top_p=0.9,
)
outputs 

BaseModelOutputWithPast(last_hidden_state=tensor([[[ 4.2188, -0.1562, -1.8438,  ..., -2.8906,  1.3281,  0.3965],
         [-0.0747,  0.0000,  0.0148,  ...,  0.8164, -0.8789,  0.8320],
         [-1.4844,  4.3438, -2.5469,  ...,  0.8203, -2.2344,  0.5352],
         ...,
         [-4.9688,  5.0625, -4.8438,  ...,  3.7031, -0.9688, -0.4863],
         [ 1.4531,  0.5781, -5.5000,  ...,  3.5000,  1.1484,  2.8125],
         [ 2.3594, -1.1719, -2.5312,  ...,  1.9375, -2.2969,  0.4277]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<MulBackward0>), past_key_values=((tensor([[[[ 6.2891e-01,  1.0469e+00,  1.0078e+00,  ...,  1.2422e+00,
           -2.5391e-01,  1.9238e-01],
          [ 9.3750e-01, -5.5078e-01, -1.9531e-02,  ..., -8.8501e-03,
           -5.4297e-01, -7.3828e-01],
          [-6.0000e+00, -3.2344e+00, -1.5781e+00,  ...,  7.7148e-02,
           -1.0234e+00, -9.0625e-01],
          ...,
          [ 4.6562e+00, -1.0781e+00, -2.3906e+00,  ...,  8.2422e-01,
           -9.8047e-01

In [18]:
outputs[0].shape

torch.Size([1, 35, 4096])

In [26]:
print(tokenizer.decode(input_ids.tolist()[0]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a personal health assistant who provides both mental and physical health suggestions.<|eot_id|><|start_header_id|>user<|end_header_id|>

What could you do?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


