In [19]:
import re
from tqdm import tqdm
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer
def generate_text(prompt, tokenizer, model, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}
    output_ids = model.generate(**inputs, max_length=max_length)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return text

def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)

def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

def load_quantized_model(model_name="EleutherAI/gpt-neo-1.3B", quantize=True):
    if quantize:
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model.to("cpu")
        quantized_model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
    else:
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name)
    return quantized_model
def extract_final_answer(generated_text):
    """
    Extracts the final numerical answer from generated text.
    """
    matches = re.findall(r"(\d+)", generated_text)
    return matches[-1] if matches else None

def evaluate_on_gsm8k(tokenizer, model):
    # Load the GSM8K test split
    dataset = load_dataset("gsm8k", "main", split="test")
    correct = 0
    total = len(dataset)
    for example in tqdm(dataset, total=total, desc="Evaluating GSM8K"):
        question = example["question"]
        prompt = question + "\n\nLet's think step by step:"
        generated_text = generate_text(prompt, tokenizer, model, max_length=200)
        pred = extract_final_answer(generated_text)
        ground_truth = example["answer"]
        if pred is not None and pred in ground_truth:
            correct += 1

    accuracy = correct / total
    print(f"\n=== GSM8K Evaluation ===")
    print(f"Accuracy: {accuracy * 100:.2f}% over {total} examples.")

if __name__ == "__main__":
    model_name = "EleutherAI/gpt-neo-1.3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = load_quantized_model(model_name)

    # Evaluate with tqdm
    evaluate_on_gsm8k(tokenizer, model)


Evaluating GSM8K:   0%|          | 0/1319 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating GSM8K:   0%|          | 1/1319 [00:09<3:24:49,  9.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating GSM8K:   0%|          | 2/1319 [00:21<3:58:56, 10.89s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating GSM8K:   0%|          | 3/1319 [00:31<3:54:42, 10.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating GSM8K:   0%|          | 4/1319 [00:43<4:01:56, 11.04s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating GSM8K:   0%|          | 5/1319 [00:49<3:22:33,  9.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating GSM8K:   0%|          | 6/1319 [00:59<3:26:54,  9.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating GSM8K:   1%|       


=== GSM8K Evaluation ===
Accuracy: 51.63% over 1319 examples.



