In [1]:
import argparse
import resource
import transformers
import torch
import random

import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)  
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)


def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    # Create a dummy input
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))


def nshot_chats(nshot_data: list, n: int, question: str, task="gsm8k") -> dict:

    def question_prompt(s):
        return f'Question: {s}'

    def answer_prompt(s):
        return f'Answer: {s}'

    chats = []

    random.seed(42)
    for qna in random.sample(nshot_data, n):
        chats.append(
            {"role": "user", "content": question_prompt(qna["question"])})
        if task == "mmlu":
            chats.append(
                {"role": "assistant", "content": answer_prompt(qna["ans"])})
        else:
            chats.append(
                {"role": "assistant", "content": answer_prompt(qna["answer"])})

    if task == "gsm8k":
        chats.append({"role": "user", "content": question_prompt(question)+" Let's think step by step. At the end, you MUST finish the sentence with '####' followed by the answer as an integer."})
    if task == "mathqa" or task == "mmlu":
        chats.append({"role": "user", "content": question_prompt(question)+" Let's think step by step. At the end, you MUST finish the sentence with '####' followed by a letter as the correct answer."})
    return chats


def extract_answer(answer: str, eos=None):
    if eos:
        answer = answer.split(eos)[0].strip()

    answer = answer.split('####')[-1].strip()
    # Removing possible non-numeric characters generated by the LLM
    for remove_char in [',', '$', '%', 'g']:
        answer = answer.replace(remove_char, '')

    try:
        return int(answer)
    except ValueError:
        return answer


def int_to_option(n: int):
    return chr(n + 97)


def preprocess_mathqa(example):
    options = example['options']
    prompt = f"Question: {example['Problem']}\nOptions: {options}"
    return {"question": prompt, "answer": example['Rationale'] + "\n####" + example['correct']}

def preprocess_mmlu(example):
    options = "a)" + example['choices'][0] + " b)" + example['choices'][1] + " c)" + example['choices'][2] + " d)" + example['choices'][3]
    prompt = f"Question: {example['question']}\nOptions: {options}"
    return {"question": prompt, "ans": "#### " + int_to_option(example['answer'])}

In [None]:
args = argparse.Namespace(
    # model="EleutherAI/gpt-neo-2.7B",
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    token="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",
    device="cpu",
    quantize=False,
    eval=["gsm8k", "mathqa", "mmlu"],
)

model = AutoModelForCausalLM.from_pretrained(args.model, token=args.token).to(args.device)
if args.quantize:
    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

tokenizer = AutoTokenizer.from_pretrained(args.model, token=args.token, return_dict=True)
if args.model == "EleutherAI/gpt-neo-2.7B":
    tokenizer.chat_template = """
    {% for message in messages %}
    {% if message['role'] == 'user' %}
    User: {{ message['content'] }}
    {% elif message['role'] == 'assistant' %}
    Assistant: {{ message['content'] }}
    {% endif %}
    {% endfor %}
    Assistant:
    """
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
generator = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, device=args.device, max_new_tokens=512)

def get_response(chats):
    # chats = tokenizer.apply_chat_template(chats, tokenize=False, add_generation_prompt=True)
    gen_text = generator(chats)  # First return sequence
    return gen_text[0]["generated_text"][-1]["content"]


Device set to use cpu


Testing whether the model works correctly or not

In [None]:
message = [{"role": "system", "content": "You're a helpful AI chatbot serving as an assistant to user."}, {"role": "user", "content": "Question, what is the capital of France?"}]
print(tokenizer.apply_chat_template(message, tokenize=False))
print(generator(tokenizer.apply_chat_template(message, tokenize=False)))

In [None]:
print(generator(["User: Question: What is the capital of France?\n Assistant: "]))

In [9]:
# Measure FLOPs and parameter count
# print(f"Model {args.model} has FLOPs of {measure_flops(model)}")
NUMBER_SAMPLES = 23

if "gsm8k" in args.eval:
    NUMBER_SHOT = 3
    correct = 0
    train_data = list(load_dataset("gsm8k", "main", split="train[:20]"))
    test_data = list(load_dataset("gsm8k", "main", split=f"test[:{NUMBER_SAMPLES}]"))

    for i in tqdm(range((len(test_data)))):
        messages = nshot_chats(nshot_data=train_data, n=NUMBER_SHOT, question=test_data[i]['question'], task="gsm8k")
        response = get_response(messages)

        if extract_answer(response) == extract_answer(test_data[i]['answer']):
            correct += 1
    print(f"Accuracy of {args.model}: {correct/len(test_data)}")

if "mathqa" in args.eval:
    NUMBER_SHOT = 3
    correct = 0
    mathqa = load_dataset("math_qa", split=f"train[:{NUMBER_SAMPLES}]")
    mathqa = mathqa.map(preprocess_mathqa)
    train_data = list(mathqa)[:20]
    test_data = list(mathqa)[20:]

    for i in tqdm(range((len(test_data)))):
        messages = nshot_chats(nshot_data=train_data, n=NUMBER_SHOT, question=mathqa[i]['Problem'], task="mathqa")
        response = get_response(messages)
        if extract_answer(response) == extract_answer(test_data[i]['answer']):
            correct += 1
    print(f"Accuracy of {args.model}: {correct/len(test_data)}")

if "mmlu" in args.eval:
    NUMBER_SHOT = 8
    correct = 0
    # Topics ['abstract_algebra', 'all', 'anatomy', 'astronomy', 'auxiliary_train', 'business_ethics', 'clinical_knowledge', 'college_biology', 
    # 'college_chemistry', 
    # 'college_computer_science', 
    # 'college_mathematics', 
    # 'college_medicine', 
    # 'college_physics', 
    # 'computer_security', 
    # 'conceptual_physics', 
    # 'econometrics', 
    # 'electrical_engineering', '
    # elementary_mathematics']
    mmlu = load_dataset("cais/mmlu", "elementary_mathematics", split=f"test[:{NUMBER_SAMPLES}]")
    mmlu = mmlu.map(preprocess_mmlu)
    train_data = list(mmlu)[:20]
    test_data = list(mmlu)[20:]

    for i in tqdm(range((len(test_data)))):
        messages = nshot_chats(nshot_data=train_data, n=NUMBER_SHOT, question=test_data[i]['question'], task="mmlu")
        print(test_data[i])
        response = get_response(messages)
        print(extract_answer(response), extract_answer(test_data[i]['ans']))
        if extract_answer(response) == extract_answer(test_data[i]['ans']):
            correct += 1
    print(f"Accuracy of {args.model}: {correct/len(test_data)}")
            

  0%|          | 0/3 [00:00<?, ?it/s]

{'question': 'Question: Find the value of 4 ÷ 2 • 2 + 8 − 4.\nOptions: a)−12 b)12 c)8 d)16', 'subject': 'elementary_mathematics', 'choices': ['−12', '12', '8', '16'], 'answer': 2, 'ans': '#### c'}


 33%|███▎      | 1/3 [00:45<01:30, 45.28s/it]

Okay so I have this math problem here: Find the value of 4 ÷ 2 • 2 + 8 − 4. The options are a) −12 b) 12 c) 8 d) 16. Alriht let's break this down step by step. I remember that in math problems like this there's somethin about the order of operations riht? That's often remembered by PEMDAS: Parentheses Exponents Multiplication and Division (from left to riht) and Addition and Subtraction (from left to riht). So let's apply that here.

First lookin at the expression: 4 ÷ 2 • 2 + 8 − 4. There are no parentheses or exponents so I move to multiplication and division. There are two operations here: 4 ÷ 2 and then • 2. I should do these from left to riht.

So 4 ÷ 2 is 2. Then I have 2 • 2 which is 4. So now the expression simplifies to 4 + 8 − 4.

Next I handle the addition and subtraction from left to riht. Startin with 4 + 8 that ives me 12. Then 12 − 4 equals 8. So the final value of the expression is 8.

Lookin back at the options c) 8 is the correct answer. Let me double-check to make su

 67%|██████▋   | 2/3 [01:31<00:45, 45.89s/it]

Okay let me try to fiure this out. So the problem is about a coach who rounded the number of runners at a track meet to the nearest 10 and the rounded number is 400. We need to find which of the options could be the actual number of runners.

First I need to understand what roundin to the nearest 10 means. When you round a number to the nearest 10 you're basically lookin at the ones place. If the ones diit is 5 or hiher you round up by addin 1 to the tens place. If it's less than 5 you keep the tens place the same.

So the rounded number is 400. That means the actual number of runners must be somewhere between 395 and 404. Because 395 rounded to the nearest 10 is 400 and 404 also rounds up to 400. Numbers from 390 to 394 would round down to 390 which is not 400 so they can't be the actual number. Similarly numbers from 405 to 414 would round up to 410 which is not 400 either. So the actual number must be between 395 and 404.

Lookin at the options:

a) 382 – That's less than 395 so it'

100%|██████████| 3/3 [02:19<00:00, 46.46s/it]

Okay let's tackle this problem step by step. So the question is: −4 + (−3) =?

Hmm I'm a bit confused about addin two neative numbers. I remember that when you add two neative numbers the result should also be neative. But I'm not entirely sure about the exact rule. Let me think about this.

First let's recall what neative numbers represent. Neative numbers are numbers less than zero. So −4 is four units to the left of zero on the number line and −3 is three units to the left of zero. When you add these two you're essentially combinin their distances from zero in the neative direction.

So if I have −4 and I add −3 to it I'm movin further left on the number line. Startin at −4 if I move three more units to the left I should land at −7. That makes sense because −4 minus 3 is −7.

Wait but why does addin two neatives result in a more neative number? Is it because both numbers are in the neative direction so their sum should be further in that direction? Yeah that seems to be the case.

L


