In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM


In [4]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [
    {"role": "user", "content": "Who are you?"}
]

# Define a basic chat template with {% endgeneration %}
chat_template = """
{% for message in messages %}
{{ message.role }}: {{ message.content }}
{% endfor %}
{% generation %}
[Assistant's response here]
{% endgeneration %}
"""

# Apply the chat template
prompt = tokenizer.apply_chat_template(
    messages, 
    chat_template=chat_template, 
    tokenize=False, 
    add_generation_prompt=True
)

print(prompt)



user: Who are you?
[Assistant's response here]



In [20]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def test_prompt_llama_3_8b_local(prompt, suppress=False, model_name="your_model_path", **kwargs):
    # Load model and tokenizer from local path
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Prepare the input prompt
    llama_prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    # Tokenize the input prompt
    inputs = tokenizer(llama_prompt, return_tensors="pt")

    # Generate text (no stop_token_ids)
    output = model.generate(**inputs, max_new_tokens=kwargs.get('max_new_tokens', 50), 
                            do_sample=kwargs.get('do_sample', True))

    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Optional: Manually stop at a specific token if needed
    stop_tokens = ["<|end_of_text|>", "<|eot_id|>"]
    for token in stop_tokens:
        generated_text = generated_text.split(token)[0]

    if not suppress:
        print(f'PROMPT:\n------\n{llama_prompt}\n------\nRESPONSE\n------\n{generated_text}')
    else:
        return generated_text

# Example usage:
test_prompt_llama_3_8b_local(" calcul 1+1", model_name=model_id)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


PROMPT:
------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

 calcul 1+1<|eot_id|><|start_header_id|>assistant<|end_header_id|>


------
RESPONSE
------
user

 calcul 1+1assistant




In [5]:
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

{'input_ids': [128000, 1687, 527, 1633, 6380, 311, 1501, 499, 279, 11410, 97, 245, 81632, 6875, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
tokenizer.decode(encoding["input_ids"])
# encoded_input = tokenizer(batch_sentences, padding=True)


'<|begin_of_text|>We are very happy to show you the 🤗 Transformers library.'

In [11]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# batch_sentences = [
#     "But what about second breakfast?",
#     "Don't think he knows about second breakfast, Pip.",
#     "What about elevensies?",
# ]
# encoded_input = tokenizer(batch_sentences, padding=True)
# print(encoded_input)

batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
print(encoded_input)

{'input_ids': [[128000, 4071, 1148, 922, 2132, 17954, 30, 128256, 128256, 128256, 128256, 128256], [128000, 8161, 956, 1781, 568, 8964, 922, 2132, 17954, 11, 78482, 13], [128000, 3923, 922, 12231, 729, 552, 30, 128256, 128256, 128256, 128256, 128256]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}


In [12]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[128000,   4071,   1148,    922,   2132,  17954,     30, 128256, 128256,
         128256, 128256, 128256],
        [128000,   8161,    956,   1781,    568,   8964,    922,   2132,  17954,
             11,  78482,     13],
        [128000,   3923,    922,  12231,    729,    552,     30, 128256, 128256,
         128256, 128256, 128256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}
