In [1]:
from transformers import AutoTokenizer
from datasets import Dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


In [104]:
# Location of the saved dataset split that this notebook inspects.
SUBSET_DIR = "/home/guest/AdvancedLLMReasoning/data/subset_openmathinstruct_1_v2/256K"

# Load the dataset from disk and keep its first row as the running example
# for the cells below.
subset_ds = Dataset.load_from_disk(SUBSET_DIR)
example = subset_ds[0]

# Hand-written alternative example, left disabled. It has the same three keys
# that the cells below read from a dataset row.
# example = {
#     'question': "What is the result of formula: '123 + 265 * 2 - 50 / 5'?. Solve by code interpreter step-by-step.",
#     'generated_solution': "To find the sum of 123 and 456, we simply add the two numbers together:\n\n123 + 456 = 579\n\nTherefore, the sum of 123 and 456 is \\boxed{579}.",
#     'expected_answer': "579"
# }

In [88]:
def clean_text(text):
    """Normalize whitespace in a question or solution string.

    The whole text is stripped of leading/trailing whitespace, runs of spaces
    and tabs *inside* a line are collapsed to a single space, and runs of
    three or more newlines are reduced to a single blank line.

    Leading indentation of each line is left untouched so that Python code
    embedded in a solution keeps its block structure (collapsing it would
    flatten nested blocks to the same depth and break the code).

    Args:
        text: Raw text. Any value that is not a ``str`` is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.strip()
    # The look-behind restricts the match to runs that follow a visible
    # character, i.e. it skips indentation at the start of a line.
    text = re.sub(r'(?<=\S)[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text

In [89]:
# System message used as the first turn of every conversation. The lines are
# joined with "\n"; there is no trailing newline.
system_prompt = "\n".join([
    "You are a math reasoning assistant.",
    "Solve the problem step by step.",
    "You can use Python code if needed.",
    "If you write code, put it inside a Python code block:",
    "```python",
    "...",
    "```",
    "Output ONLY the final number inside \\boxed{}.",
])

# Jinja chat template installed on the tokenizer. It emits the BOS token, then
# for each message a role header, the trimmed content and an <|eot_id|>
# marker; when `add_generation_prompt` is set it ends with an open assistant
# header.
# NOTE(review): the newline after `{{ bos_token }}` and the one before
# `<|eot_id|>` are literal template text, and no blank line follows the role
# header. Confirm this layout is intended, since it is part of every rendered
# training example.
llama_template = (
    "{{ bos_token }}\n"
    "{% for message in messages -%}\n"
    "<|start_header_id|>{{ message['role'] }}<|end_header_id|>{{ message['content'] | trim }}\n"
    "<|eot_id|>\n"
    "{%- endfor %}\n"
    "{% if add_generation_prompt -%}\n"
    "<|start_header_id|>assistant<|end_header_id|>\n"
    "{%- endif %}\n"
)

In [90]:
# Hub id of the checkpoint whose tokenizer is used in this notebook.
MODEL_ID = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Install the custom chat template and reuse the EOS token for padding.
tokenizer.chat_template = llama_template
tokenizer.pad_token = tokenizer.eos_token

In [105]:
# Pull the raw question and its solution out of the example row, then show
# both as the cell's output.
q_raw, a_raw = example['question'], example['generated_solution']
(q_raw, a_raw)

('Olaf is sailing across the ocean with 25 men, including himself. He needs 1/2 a gallon of water per day per man. The boat can go 200 miles per day and he needs to travel 4,000 miles. How many gallons of water does he need?',
 "Let's solve this problem using Python code.\n```python\n# amount of water needed to sustain for 1 day\nwater_required_per_day = 1/2\n# total amount of men on the boat\ntotal_men = 25\n# total distance Olaf needs to travel\ntotal_distance = 4000\n# how many miles Olaf travels per day\ntravel_distance_per_day = 200\n# number of days Olaf needs to travel\ndays_to_travel = total_distance / travel_distance_per_day\n# amount of water required for total men on the boat\nwater_required_for_all_men = days_to_travel * water_required_per_day * total_men\nwater_required_for_all_men\n```\n<llm-code-output>\n250.0\n</llm-code-output>\nThus Olaf needs \\boxed{250} gallons of water.")

In [98]:
# One conversation: system instructions, the cleaned question as the user
# turn, and the cleaned solution as the assistant turn.
turns = (
    ("system", system_prompt),
    ("user", clean_text(q_raw)),
    ("assistant", clean_text(a_raw)),
)
messages = [{"role": role, "content": content} for role, content in turns]

# Whole conversation rendered to text, without an open assistant header.
full_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

# Context only: every turn except the final assistant answer, rendered with
# the generation prompt appended.
context_messages = messages[:-1]
context_text = tokenizer.apply_chat_template(context_messages, tokenize=False, add_generation_prompt=True)

# Show both renderings, one after the other.
print(full_text, context_text, sep="\n")

<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>You are a math reasoning assistant.
Solve the problem step by step.
You can use Python code if needed.
If you write code, wrap it inside <llm-code>...</llm-code>.
Output ONLY the final number inside \boxed{}.
<|eot_id|><|start_header_id|>user<|end_header_id|>What is the result of 123 + 265 * 2 - 50 / 5?
<|eot_id|><|start_header_id|>assistant<|end_header_id|>To find the sum of 123 and 456, we simply add the two numbers together:

123 + 456 = 579

Therefore, the sum of 123 and 456 is \boxed{579}.
<|eot_id|>
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>You are a math reasoning assistant.
Solve the problem step by step.
You can use Python code if needed.
If you write code, wrap it inside <llm-code>...</llm-code>.
Output ONLY the final number inside \boxed{}.
<|eot_id|><|start_header_id|>user<|end_header_id|>What is the result of 123 + 265 * 2 - 50 / 5?
<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [93]:
# Tokenize both renderings. `add_special_tokens=False` is used because the
# chat template already writes the BOS token into the text.
max_length = 1024

# Full conversation, truncated to `max_length` tokens.
full_encoded = tokenizer(
    full_text,
    add_special_tokens=False,
    max_length=max_length,
    truncation=True,
    padding=False,
)

# Prompt only: not truncated, so its measured length is exact.
prompt_encoded = tokenizer(
    context_text,
    add_special_tokens=False,
    truncation=False,
    padding=False,
)

In [74]:
# Inspect both encodings: the raw tokenizer outputs, the text obtained by
# decoding their ids, and finally the two token counts on one line.
for encoded in (full_encoded, prompt_encoded):
    print(encoded)
for encoded in (full_encoded, prompt_encoded):
    print(tokenizer.decode(encoded['input_ids']))
print(*(len(encoded['input_ids']) for encoded in (full_encoded, prompt_encoded)))

{'input_ids': [128000, 198, 128006, 9125, 128007, 2675, 527, 264, 7033, 33811, 18328, 627, 128009, 128006, 882, 128007, 3923, 374, 279, 2694, 315, 220, 4513, 323, 220, 10961, 5380, 128009, 128006, 78191, 128007, 1271, 1505, 279, 2694, 315, 220, 4513, 323, 220, 10961, 11, 584, 5042, 923, 279, 1403, 5219, 3871, 1473, 4513, 489, 220, 10961, 284, 220, 24847, 271, 55915, 11, 279, 2694, 315, 220, 4513, 323, 220, 10961, 374, 1144, 80175, 90, 24847, 28374, 128009], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [128000, 198, 128006, 9125, 128007, 2675, 527, 264, 7033, 33811, 18328, 627, 128009, 128006, 882, 128007, 3923, 374, 279, 2694, 315, 220, 4513, 323, 220, 10961, 5380, 128009, 128006, 78191, 128007], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator: pads each batch dynamically (to a multiple of 8 tokens),
# returns PyTorch tensors, and pads labels with -100.
#
# `model` is deliberately not passed. No cell in this notebook defines a
# `model`, so `model=model` raised NameError on a fresh kernel. The argument
# is optional; per the transformers documentation it is only used to prepare
# `decoder_input_ids` for models that implement
# `prepare_decoder_input_ids_from_labels` (encoder-decoder models).
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
    label_pad_token_id=-100,
)