In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
gsm8k = load_dataset("gsm8k", "main", split='test[:25]')

## Define the prompt

In [3]:
few_shot_prefix = """Q: Emily has 3 apples. Her friend gives her 2 more. How many apples does Emily have now?
A: Emily starts with 3 apples. Her friend gives her 2 more. So, 3 + 2 = 5. The answer is 5.

Q: A pen costs 2 dollars. John buys 4 pens. How much does he pay?
A: Each pen costs 2 dollars. John buys 4 pens. So, 2 × 4 = 8. The answer is 8.

Q: Jake read 5 pages on Monday and 7 pages on Tuesday. How many pages did he read in total?
A: Jake read 5 pages on Monday and 7 on Tuesday. So, 5 + 7 = 12. The answer is 12.
"""

## Function Definition

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM
import torch
import re

### Testing

In [5]:
model_id = 'Qwen/Qwen2-0.5B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=3024)

Device set to use cuda:0


In [17]:
prompts = [few_shot_prefix + f"Q: {gsm8k['question'][i]}\nA:" for i in range(len(gsm8k))]
prompts = [
    [{"role": "system", "content":'''You are a helpful math tutor. Show your chain-of-thought step-by-step reasoning, then on a newline output "Final answer:" followed by the numeric answer.
                                    Always place the final numeric answer on a single line beginning with "Final answer:""''' }
     ,{"role": "user", "content": f"{question}"}]
    for question in prompts
]

In [18]:
outputs = pipe(prompts)

In [23]:
pattern = "The answer is [.!?\\-$]?.*."
for i in range(len(outputs)):
    res = outputs[i][0]['generated_text'][-1]['content']
    match = re.findall(pattern, res)
    if not match:
        print(i)
        

2
14
15
16
21


In [25]:
correct = 0
total = 0
pattern = "The answer is [.!?\\-$]?.*."
for index, output in enumerate(outputs):
    res = output[0]['generated_text'][-1]['content']
    if re.findall(pattern, res):
        match = re.findall(r'\d+(?:\.\d+)?', re.findall(pattern, res)[0].replace(",", ""))
        if match:
            model_pred = float(match[0])
            gt_answer = float(gsm8k[index]["answer"].split("####")[-1].strip())
            if model_pred == gt_answer:
                    correct += 1
    total += 1
    
acc = correct / total
print(f"Accuracy: {acc:.2%}")

Accuracy: 24.00%


In [145]:
del model
del tokenizer
del pipe

## Actual evaluation

In [146]:
def evaluation_model(model_id, prompt=few_shot_prefix, dataset=gsm8k):
    print(f"evaluation {model_id} for few_shot CoT...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2024)
    prompts = [prompt + f"Q: {dataset['question'][i]}\nA:" for i in range(len(dataset))]
    prompts = [
        [{"role": "system", "content":'''answer the logically question without adding additional questions at the end.
                                        provide the final answer by following the format <The answer is <result>>''' }
         ,{"role": "user", "content": f"{question}"}]
        for question in prompts
    ]

    outputs = pipe(prompts)
    del model
    del tokenizer
    del pipe
    
    correct = 0
    total = 0
    pattern = "The answer is [.!?\\-$]?.*."
    for index, output in enumerate(outputs):
        res = output[0]['generated_text'][-1]['content']
        match = re.findall(r'\d+(?:\.\d+)?',
                           re.findall(pattern, res)[0].replace(",", ""))
        if match:
            model_pred = float(match[0])
            gt_answer = float(gsm8k[index]["answer"].split("####")[-1].strip())
            if model_pred == gt_answer:
                    correct += 1
        total += 1
        
    acc = correct / total
    print(f"Accuracy: {acc:.2%}")
    return acc
    

In [147]:
import matplotlib.pyplot as plt

decoder_models = {
    #"Phi-2": "microsoft/phi-2",
    "Phi-3": "microsoft/Phi-3-mini-128k-instruct",
    "Phi-4": "microsoft/phi-4",
    "Tiny- Llama": "llamafactory/tiny-random-Llama-3"
}

result = []
for label, model_id in decoder_models.items():
    acc = evaluation_model(model_id)
    result.append((label, acc))

evaluation microsoft/Phi-3-mini-128k-instruct for few_shot CoT...


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.34s/it]
Device set to use cuda:0


Accuracy: 68.00%
evaluation microsoft/phi-4 for few_shot CoT...


Fetching 6 files: 100%|██████████| 6/6 [02:34<00:00, 25.78s/it] 
Loading checkpoint shards: 100%|██████████| 6/6 [03:53<00:00, 38.94s/it]
Device set to use cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 350.00 MiB. GPU 0 has a total capacity of 47.50 GiB of which 44.31 MiB is free. Including non-PyTorch memory, this process has 47.45 GiB memory in use. Of the allocated memory 46.02 GiB is allocated by PyTorch, and 966.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Plotting
labels = [r[0] for r in results]
scores = [r[1] for r in results]
x = range(len(labels))

plt.figure(figsize=(8,5))
plt.bar(x, scores, color="skyblue")
plt.xticks(x, labels)
plt.ylabel("Accuracy")
plt.title("Few-Shot CoT Reasoning vs Model Size (GSM8K)")
plt.ylim(0, 1.0)
plt.grid(True, axis='y')
plt.show()

### Testing on Decoder only models