## 

We will run an evaluation of Qwen3 on Maths task because these are easily verifiable
As well as on MMLU task. 

In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import re

## Tokenizer: Text -> Tokens

In [4]:
model_name_base = "Qwen/Qwen3-8B"
model_name_instruct = "Qwen/Qwen3-4B-Instruct-2507"


tokenizer_base = AutoTokenizer.from_pretrained(model_name_base)
tokenizer_instruct = AutoTokenizer.from_pretrained(model_name_instruct)

In [8]:
tokenizer_instruct

Qwen2TokenizerFast(name_or_path='Qwen/Qwen3-4B-Instruct-2507', vocab_size=151643, model_max_length=1010000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=Fals

In [12]:
tokenizer_base.encode("Hi! how are you?"), tokenizer_instruct.encode("Hi! how are you?") 

([13048, 0, 1246, 525, 498, 30], [13048, 0, 1246, 525, 498, 30])

## Apply chat template to tokenize conversation

- roles are fixed. If you pass any other role, it will ignore it
- For readability, pass tokenize=False


In [None]:
tokens = tokenizer_base.apply_chat_template(
    [{'role': "system", "content": "You are a helpful and honest assistant!"}, {'role': "user", "content": "Hi"}],
    tokenize=True,
)

## Dataset

In [37]:
gsm8k = load_dataset('gsm8k', 'main')

Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 776919.34 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 304038.63 examples/s]


In [38]:
gsm8k['test'][0]

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}

In [39]:
# Taken from: https://github.com/huggingface/lighteval/blob/5137e03f29e0611bf0fffa6d251e62f711a496e0/src/lighteval/metrics/normalizations.py#L378C1-L399C1

def gsm8k_normalizer(text: str) -> str:
    """From https://github.com/openai/grade-school-math/blob/3101c7d5072418e28b9008a6636bde82a006892c/grade_school_math/dataset.py#L28

    Args:
        text (str): input text

    Returns:
        str: Output text, either the number found in the text or "[invalid]" if
        no number was found
    """
    ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
    INVALID_ANS = "[invalid]"

    match = ANS_RE.search(text)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return INVALID_ANS


In [46]:
def process_row(example):
    answer = gsm8k_normalizer(example['answer'])
    return {
        'question': example['question'],
        'answer': answer,
        'original_answer': example['answer']
    }

questions = gsm8k['test'].map(process_row)
questions[0]

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'answer': '18',
 'original_answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}

## Models

In [47]:
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")

In [48]:
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [None]:
len(questions[i: i+batch_size])

questions[0:2]['question']

["Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?']

In [75]:
batch_size = 8
system_prompt = """
You are a helpful and intelligent agent. Your task is to solve mathematical problems by reasoning through them step-by-step. 

IMPORTANT: After completing your calculations, provide your final answer following the format: #### [Your Answer]. 

For example, if your working leads to the number 19, you should write: Your working. #### 19

[Problem Statement]
"""

def make_prompt(question):
    return system_prompt + question

print(make_prompt(questions[0]['question']))

outputs = []
for i in range(0, len(questions), batch_size):
    prompts = [make_prompt(x) for x in questions[i: i+batch_size]['question']]

    tokens = tokenizer_base(
        prompts, 
        padding=True,
        return_tensors="pt",
        truncation=False,
    )

    print(tokens['input_ids'].shape)
    input_lengths = tokens['attention_mask'].sum(dim=1).tolist()
    print(input_lengths)

    gen = model.generate(
        **tokens,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.2,
        top_p=0.5,
    )

    for row, in_len in enumerate(input_lengths):
        out_ids = gen[row, in_len:]
        text = tokenizer_base.decode(out_ids, skip_special_tokens=True).strip()
        outputs.append(text)

    break

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



You are a helpful and intelligent agent. Your task is to solve mathematical problems by reasoning through them step-by-step. 

IMPORTANT: After completing your calculations, provide your final answer following the format: #### [Your Answer]. 

For example, if your working leads to the number 19, you should write: Your working. #### 19

[Problem Statement]
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
torch.Size([8, 187])
[141, 102, 133, 111, 187, 132, 118, 146]


KeyboardInterrupt: 

In [None]:
## measure the accuracy of the models here -->