In [1]:
import argparse
import resource
import transformers
import torch
import random

import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)  
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)


def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    # Create a dummy input
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))


def nshot_chats(nshot_data: list, n: int, question: str) -> dict:

    def question_prompt(s):
        return f'Question: {s}'

    def answer_prompt(s):
        return f'Answer: {s}'

    chats = []

    random.seed(42)
    for qna in random.sample(nshot_data, n):
        chats.append(
            {"role": "user", "content": question_prompt(qna["question"])})
        chats.append(
            {"role": "assistant", "content": answer_prompt(qna["answer"])})

    chats.append({"role": "user", "content": question_prompt(question)+" Let's think step by step. At the end, you MUST finish the sentence with '####' followed by the answer as an integer."})

    return chats


def extract_ans_from_response(answer: str, eos=None):
    if eos:
        answer = answer.split(eos)[0].strip()

    answer = answer.split('####')[-1].strip()

    for remove_char in [',', '$', '%', 'g']:
        answer = answer.replace(remove_char, '')

    try:
        return int(answer)
    except ValueError:
        return answer


def preprocess(example):
    options = ' '.join(example['options'])
    prompt = f"Question: {example['Problem']}\nOptions: {options}\nAnswer:"
    return {"question": prompt, "answer": example['correct']}

In [7]:
args = argparse.Namespace(
    # model="EleutherAI/gpt-neo-2.7B",
    model="facebook/opt-2.7B",
    token="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",
    device="cpu",
    quantize=False,
    eval=["gsm8k"],
)

model = AutoModelForCausalLM.from_pretrained(args.model, token=args.token).to(args.device)
if args.quantize:
    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

tokenizer = AutoTokenizer.from_pretrained(args.model, token=args.token, return_dict=True)
if tokenizer.chat_template is None:
    tokenizer.chat_template = """
    {% for message in messages %}
    {% if message['role'] == 'user' %}
    User: {{ message['content'] }}
    {% elif message['role'] == 'assistant' %}
    Assistant: {{ message['content'] }}
    {% endif %}
    {% endfor %}
    Assistant:
    """
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
generator = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, device=args.device, max_new_tokens=512)

def get_response(chats):
    # chats = tokenizer.apply_chat_template(chats, tokenize=False, add_generation_prompt=True)
    gen_text = generator(chats)  # First return sequence
    print(gen_text)
    print(gen_text[0]["generated_text"][-1]["content"])
    return gen_text[0]["generated_text"][-1]["content"]


def extract_answer(answer: str, eos=None):
    if eos:
        answer = answer.split(eos)[0].strip()

    answer = answer.split('####')[-1].strip()
    # Removing possible non-numeric characters generated by the LLM
    for remove_char in [',', '$', '%', 'g']:
        answer = answer.replace(remove_char, '')

    try:
        return int(answer)
    except ValueError:
        return answer


Device set to use cpu


Testing whether the model works correctly or not

In [9]:
print(generator(["Question: What is the capital of France?"]))

[[{'generated_text': 'Question: What is the capital of France?\nParis.'}]]


In [16]:
message = [{"role": "system", "content": "You're a helpful AI chatbot serving as an assistant to user."}, {"role": "user", "content": "Question, what is the capital of France?"}]
print(tokenizer.apply_chat_template(message, tokenize=False))
print(generator(tokenizer.apply_chat_template(message, tokenize=False)))


    User: Question, what is the capital of France?
    Assistant:
    
[{'generated_text': '\n    User: Question, what is the capital of France?\n    Assistant:\n                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    '}]


In [17]:
print(generator(["User: Question: What is the capital of France?\n Assistant: "]))

[[{'generated_text': 'User: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Question: What is the capital of France?\n Assistant:  Paris.\nUser: Questi

In [None]:
# Measure FLOPs and parameter count
# print(f"Model {args.model} has FLOPs of {measure_flops(model)}")

if "gsm8k" in args.eval:
    NUMBER_SHOT = 3
    correct = 0
    train_data = list(load_dataset("gsm8k", "main", split="train[:20]"))
    test_data = list(load_dataset("gsm8k", "main", split="test[:1]"))

    for i in tqdm(range((len(test_data)))):
        messages = nshot_chats(nshot_data=train_data, n=NUMBER_SHOT, question=test_data[i]['question'])
        response = get_response(messages)

        if extract_answer(response) == extract_answer(test_data[i]['answer']):
            correct += 1
    print(f"Accuracy of {args.model}: {correct/len(test_data)}")

if "mathqa" in args.eval:
    mathqa = load_dataset("mathqa", split="train[:5]")

    mathqa = mathqa.map(preprocess)
    for i in range(5):
        messages = nshot_chats(nshot_data=mathqa, n=0, question=mathqa[i]['problem'])
        response = get_response(messages)
        print(f"Question: {mathqa[i]['problem']}")
        print(f"Answer: {mathqa[i]['answer']}")
        print(f"Response: {response}")
        print(f"Extracted Answer: {extract_answer(response)}")
        print()

  0%|          | 0/1 [00:00<?, ?it/s]


ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating