In [1]:
import argparse
import resource
import transformers
import torch
import random

import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)  
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)


def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    # Create a dummy input
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))


def nshot_chats(nshot_data: list, n: int, question: str, task="gsm8k", args=None) -> dict:

    def question_prompt(s):
        return f'Question: {s}'

    def answer_prompt(s):
        return f'Answer: {s}'

    chats = []

    random.seed(42)
    for qna in random.sample(nshot_data, n):
        chats.append(
            {"role": "user", "content": question_prompt(qna["question"])})
        if task == "mmlu":
            chats.append(
                {"role": "assistant", "content": answer_prompt(qna["ans"])})
        else:
            chats.append(
                {"role": "assistant", "content": answer_prompt(qna["answer"])})

    if task == "gsm8k":
        chats.append({"role": "user", "content": question_prompt(question)+" Let's think step by step. At the end, you MUST finish the sentence with '####' followed by the answer as an integer."})
    if task == "mathqa" or task == "mmlu":
        chats.append({"role": "user", "content": question_prompt(question)+" Let's solve this problem step by step. At the end, choose the correct option and write it after '####', e.g., '#### a'."})
    
    # Reformatting for Gemma
    if args.model == "google/gemma-3-1b-it":
        for i in range(len(chats)):
            chats[i]["content"] = [{"type": "text", "text": chats[i]["content"]}]
    return chats


def extract_answer(answer: str, eos=None, task="gsm8k"):
    if eos:
        answer = answer.split(eos)[0].strip()

    answer = answer.split('####')[-1].strip()
    # Removing possible non-numeric characters generated by the LLM
    for remove_char in [',', '$', '%', 'g']:
        answer = answer.replace(remove_char, '')
    if task == "gsm8k":
        try:
            return int(answer)
        except ValueError:
            return answer

    try:
        return answer.split()[0][0]
    except ValueError:
        return answer
    


def int_to_option(n: int):
    return chr(n + 97)


def preprocess_mathqa(example):
    options = example['options']
    prompt = f"Question: {example['Problem']}\nOptions: {options}"
    return {"question": prompt, "answer": example['Rationale'] + "\n####" + example['correct']}

def preprocess_mmlu(example):
    options = "a)" + example['choices'][0] + " b)" + example['choices'][1] + " c)" + example['choices'][2] + " d)" + example['choices'][3]
    prompt = f"Question: {example['question']}\nOptions: {options}"
    return {"question": prompt, "ans": "#### " + int_to_option(example['answer'])}

In [4]:
args = argparse.Namespace(
    # model="EleutherAI/gpt-neo-2.7B",
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    token="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",
    device="cuda:1",
    quantize=False,
    eval=["mathqa", "mmlu"],
)

model = AutoModelForCausalLM.from_pretrained(args.model, token=args.token, cache_dir='../echo-llm/split-evaluation/model_cache_dir').to(args.device)
if args.quantize:
    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

tokenizer = AutoTokenizer.from_pretrained(args.model, token=args.token, return_dict=True)
if args.model == "EleutherAI/gpt-neo-2.7B":
    tokenizer.chat_template = """
    {% for message in messages %}
    {% if message['role'] == 'user' %}
    User: {{ message['content'] }}
    {% elif message['role'] == 'assistant' %}
    Assistant: {{ message['content'] }}
    {% endif %}
    {% endfor %}
    Assistant:
    """
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
generator = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, device=args.device, max_new_tokens=512)

def get_response(chats, args):
    # chats = tokenizer.apply_chat_template(chats, tokenize=False, add_generation_prompt=True)
    gen_text = generator(chats)  # First return sequence
    if args.model == "google/gemma-3-1b-it":
        return gen_text["generated_text"][-1]["content"]["text"]
    return gen_text[0]["generated_text"][-1]["content"]


In [20]:
# Measure FLOPs and parameter count
# print(f"Model {args.model} has FLOPs of {measure_flops(model)}")
NUMBER_SAMPLES = 23

if "gsm8k" in args.eval:
    NUMBER_SHOT = 3
    correct = 0
    train_data = list(load_dataset("gsm8k", "main", split="train[:20]"))
    test_data = list(load_dataset("gsm8k", "main", split=f"test[:{NUMBER_SAMPLES - 20}]"))

    for i in tqdm(range((len(test_data)))):
        messages = nshot_chats(nshot_data=train_data, n=NUMBER_SHOT, question=test_data[i]['question'], task="gsm8k", args=args)
        response = get_response(messages, args=args)

        if extract_answer(response) == extract_answer(test_data[i]['answer']):
            correct += 1
    print(f"Accuracy of {args.model}: {correct/len(test_data)} on gsm8k")

if "mathqa" in args.eval:
    NUMBER_SHOT = 3
    correct = 0
    mathqa = load_dataset("math_qa", split=f"train[:{NUMBER_SAMPLES}]")
    mathqa = mathqa.map(preprocess_mathqa)
    train_data = list(mathqa)[:20]
    test_data = list(mathqa)[20:]

    for i in tqdm(range((len(test_data)))):
        messages = nshot_chats(nshot_data=train_data, n=NUMBER_SHOT, question=mathqa[i]['question'], task="mathqa", args=args)
        print(mathqa[i])
        response = get_response(messages, args=args)
        print(response)
        if extract_answer(response, task="mathqa") == extract_answer(test_data[i]['answer'], task="mathqa"):
            correct += 1
    print(f"Accuracy of {args.model}: {correct/len(test_data)} on mathqa")

if "mmlu" in args.eval:
    NUMBER_SHOT = 8
    correct = 0
    # Topics ['abstract_algebra', 'all', 'anatomy', 'astronomy', 'auxiliary_train', 'business_ethics', 'clinical_knowledge', 'college_biology', 
    # 'college_chemistry', 
    # 'college_computer_science', 
    # 'college_mathematics', 
    # 'college_medicine', 
    # 'college_physics', 
    # 'computer_security', 
    # 'conceptual_physics', 
    # 'econometrics', 
    # 'electrical_engineering', '
    # elementary_mathematics']
    mmlu = load_dataset("cais/mmlu", "elementary_mathematics", split=f"test[:{NUMBER_SAMPLES}]")
    mmlu = mmlu.map(preprocess_mmlu)
    train_data = list(mmlu)[:20]
    test_data = list(mmlu)[20:]

    for i in tqdm(range((len(test_data)))):
        messages = nshot_chats(nshot_data=train_data, n=NUMBER_SHOT, question=test_data[i]['question'], task="mmlu", args=args)
        response = get_response(messages, args=args)
        print(response)
        print("Extracted answer is ***********", extract_answer(response, task="mmlu"))
        if extract_answer(response, task="mmlu") == extract_answer(test_data[i]['ans'], task="mmlu"):
            correct += 1
    print(f"Accuracy of {args.model}: {correct/len(test_data)} on mmlu")
            

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
  0%|          | 0/3 [00:00<?, ?it/s]

{'Problem': "the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?", 'Rationale': '"explanation : t = 3 years r = 10 % td = ( bg × 100 ) / tr = ( 36 × 100 ) / ( 3 × 10 ) = 12 × 10 = rs . 120 td = ( pw × tr ) / 100 ⇒ 120 = ( pw × 3 × 10 ) / 100 ⇒ 1200 = pw × 3 pw = 1200 / 3 = rs . 400 answer : option a"', 'options': 'a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) rs . 350 , e ) none of these', 'correct': 'a', 'annotated_formula': 'divide(multiply(const_100, divide(multiply(36, const_100), multiply(3, 10))), multiply(3, 10))', 'linear_formula': 'multiply(n2,const_100)|multiply(n0,n1)|divide(#0,#1)|multiply(#2,const_100)|divide(#3,#1)|', 'category': 'gain', 'question': "Question: the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?\nOptions: a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) rs . 350 , e ) none of these", 'answer': '"explanation : t = 3 years r = 10 % t

 33%|███▎      | 1/3 [00:11<00:22, 11.23s/it]

Okay, so I've got this problem here about banker's gain, and I need to find the present worth. Let me try to understand what's being asked and figure out how to approach it step by step.

First, the problem says that the banker's gain of a certain sum due 3 years hence at 10% per annum is ₹36. I need to find the present worth from the given options. The options are a) ₹400, b) ₹300, c) ₹500, d) ₹350, and e) none of these.

Alright, so I think I need to recall what banker's gain is. From what I remember, banker's gain is the difference between the present worth (PW) and the true discount (TD) when the time is more than one year. But in this case, the time is 3 years, which is more than one year, so it might still apply.

Wait, but the problem mentions due 3 years hence, which means the sum is due in 3 years. So, I need to calculate the present worth considering the interest for 3 years at 10% per annum.

Let me denote the present worth as PW. Then, the amount due after 3 years would be 

 67%|██████▋   | 2/3 [00:21<00:10, 10.62s/it]

Alright, let's try to solve this problem step by step. So, we have an adult school where the average age of the students is 40 years. Then, 120 new students join the school, and their average age is 32 years. Because of these new students, the average age of all the students decreases by 4 years. We need to find out how many students there are in total after the new students join.

First, let's denote the initial number of students as \( x \). The average age of these students is 40, so the total age of all the students before the new students join is \( 40x \).

Next, 120 new students join the school, and their average age is 32. So, the total age of the new students is \( 32 \times 120 \). Let's calculate that:

\( 32 \times 120 = 3840 \)

Now, the total number of students after the new students join is \( x + 120 \), and the total age of all the students becomes \( 40x + 3840 \).

We are told that the average age decreases by 4 years after the new students join. So, the new average 

100%|██████████| 3/3 [00:30<00:00, 10.10s/it]

Alright, let me try to figure out this problem. So, Sophia finished 2/3 of a book, and she thinks she finished 90 more pages than she has left to read. I need to find out how long the book is.

First, I need to understand what it means to finish 2/3 of the book. If the book has a total number of pages, say, N pages, then Sophia has read 2/3 of N, which is (2/3)N pages. That means she has 1/3 of N pages left to read, which is (1/3)N pages.

According to the problem, she finished 90 more pages than she has yet to read. So, the number of pages she finished, which is (2/3)N, is equal to the number of pages she has left to read, which is (1/3)N, plus 90 pages. 

So, I can set up the equation:
(2/3)N = (1/3)N + 90

Now, I need to solve for N. Let me subtract (1/3)N from both sides to get:
(2/3)N - (1/3)N = 90
This simplifies to:
(1/3)N = 90

To find N, I can multiply both sides by 3:
N = 90 * 3
N = 270

So, the total number of pages in the book is 270. Let me check if this makes sense.

If t


 33%|███▎      | 1/3 [00:07<00:14,  7.24s/it]

Okay, let's try to solve this problem step by step. So, the question is: Find the value of 4 ÷ 2 • 2 + 8 − 4. And we have four options: a) -12, b) 12, c) 8, d) 16.

First, I need to remember the order of operations, which is often remembered by the acronym PEMDAS: Parentheses, Exponents, Multiplication and Division (from left to right), Addition and Subtraction (from left to right).

Looking at the expression 4 ÷ 2 • 2 + 8 − 4, there are no parentheses or exponents, so I move to multiplication and division. I see a division and a multiplication here: 4 ÷ 2 and then • 2.

I should perform these operations from left to right. So first, 4 ÷ 2 equals 2. Then, I multiply that result by 2, which gives me 4.

Now, the expression simplifies to 4 + 8 − 4. Next, I handle the addition and subtraction from left to right. Adding 4 and 8 gives me 12, and then subtracting 4 from that gives me 8.

So, the final answer should be 8. Let me double-check to make sure I didn't make any mistakes. Starting o

 67%|██████▋   | 2/3 [00:17<00:08,  8.93s/it]

Okay, so I have this problem here about rounding the number of runners at a track meet. Let me try to understand what it's asking. The coach rounded the actual number of runners to the nearest 10, and the rounded number is 400. I need to figure out which of the given options could be the actual number of runners.

First, I need to recall what rounding to the nearest 10 means. If a number is rounded to the nearest 10, it means that the unit digit (the ones place) is considered. If the ones digit is 5 or higher, we round up, and if it's 4 or lower, we round down. So, for example, if the actual number is 350, rounding to the nearest 10 would give us 350, but wait, 350 is already a multiple of 10. Similarly, 397 would round to 400 because the ones digit is 7, which is higher than 5.

Let me look at the options again:

a) 382

b) 397

c) 406

d) 447

I need to check each one to see if rounding them to the nearest 10 gives me 400.

Starting with option a) 382. Let's break it down. The number

100%|██████████| 3/3 [00:27<00:00,  9.18s/it]

Alright, so I've got this math problem here: −4 plus (−3) equals what? The options are a) −7, b) −1, c) 1, and d) 7. I'm going to try to figure this out step by step.

First, I need to understand what the problem is asking. It's asking for the sum of two negative numbers: −4 and −3. Both of these are negative, so adding them together should result in a negative number because adding two negative numbers gives a negative result.

Okay, so let's think about what negative numbers represent. Negative numbers are like debt or a deficit. If I owe someone $4 and then I owe another $3, I now owe a total of $7. So, in terms of debt, it's like adding two negatives.

Now, how do I add −4 and −3? Well, one way to think about it is to combine the amounts of debt. So, if I have $4 debt and I add $3 more debt, my total debt becomes $7. But since both numbers are negative, the result should also be negative.

Let me write it down to visualize it better:

−4 + (−3) =?

I can also think of it as combini


