In [1]:
from google.colab import userdata
hf_token = userdata.get("huggingface")

In [2]:
pip install ptflops



In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_quantized_model(model_name="EleutherAI/gpt-neo-1.3B"):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to("cpu")

    return model

def generate_text(prompt, tokenizer, model, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}
    output_ids = model.generate(**inputs, max_length=max_length)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return text

def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)

def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

def main():
    model_name = "Qwen/Qwen2.5-Math-1.5B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = load_quantized_model(model_name)
    measure_flops(model, sequence_length=50)

    measure_memory(model, tokenizer, prompt="Test prompt for memory profiling")

    prompt = "Give me a recipe for okonomiyaki"
    generated_text = generate_text(prompt, tokenizer, model)
    print("\n=== Generated Text ===")
    print(generated_text)

if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Flops estimation was not finished successfully because of the following exception:
<class 'RuntimeError'> : Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
=== FLOPs and Parameter Count ===
MACs: None
Params: None


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/ptflops/pytorch_engine.py", line 68, in get_flops_pytorch
    _ = flops_model(batch)
        ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1844, in _call_impl
    return inner()
           ^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1790, in inner
    result = forward_call(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2/modeling_qwen2.py", line 819, in forward
    outputs = self.model(
              ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    ret

=== Memory Usage (sorted by CPU memory consumption) ===


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                         aten::linear         0.23%       3.854ms        86.72%        1.476s       7.495ms      15.20 Mb           0 b           197  
                                         aten::matmul         0.42%       7.141ms        81.16%        1.382s      12.120ms      14.11 Mb           0 b           114  
                                             aten::mm        79.91%        1.361s        80.04%        1.363s      12.059ms      14.11 Mb      14.11 Mb         

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_quantized_model(model_name="EleutherAI/gpt-neo-1.3B"):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to("cpu")
    quantized_model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
    return quantized_model

def generate_text(prompt, tokenizer, model, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}
    output_ids = model.generate(**inputs, max_length=max_length)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return text

def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)

def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    # Create a dummy input
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

def main():
    model_name = "Qwen/Qwen2.5-Math-1.5B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = load_quantized_model(model_name)
    measure_flops(model, sequence_length=50)
    measure_memory(model, tokenizer, prompt="Test prompt for memory profiling")
    prompt = "Give me a recipe for okonomiyaki"
    generated_text = generate_text(prompt, tokenizer, model)
    print("\n=== Generated Text ===")
    print(generated_text)

if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


With GSM1K

In [None]:
# Install necessary libraries for HuggingFace and ptflops
!pip install transformers torch ptflops

import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the quantized model
def load_quantized_model(model_name="Qwen/Qwen2.5-Math-1.5B"):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to("cpu")
    return model

# Generate text from the model
def generate_text(prompt, tokenizer, model, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}
    output_ids = model.generate(**inputs, max_length=max_length)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return text

# Measure FLOPs and parameters
def measure_flops(model, sequence_length=50):
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)

# Measure memory usage during a forward pass
def measure_memory(model, tokenizer, prompt="Test prompt"):
    import torch.profiler

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

# Load GSM1K dataset (from a URL or local file)
def load_gsm1k_dataset(dataset_url="https://raw.githubusercontent.com/scaleapi/gsm1k_eval/main/data/gsm1k_test.json"):
    import requests

    response = requests.get(dataset_url)
    data = response.json()

    return data

# Main function to run the entire process
def main():
    # Model and tokenizer setup
    model_name = "Qwen/Qwen2.5-Math-1.5B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = load_quantized_model(model_name)

    # Load the GSM1K dataset
    gsm1k_data = load_gsm1k_dataset()

    # Measure model complexity
    measure_flops(model, sequence_length=50)

    # Measure memory usage on a sample question from GSM1K dataset
    sample_question = gsm1k_data[0]["question"]  # Get the first math problem in the dataset
    print(f"Sample Question: {sample_question}")
    measure_memory(model, tokenizer, prompt=sample_question)

    # Generate text for a batch of questions from GSM1K
    for item in gsm1k_data[:5]:  # Just process the first 5 items for demo purposes
        question = item["question"]
        print(f"\nGenerating answer for: {question}")
        generated_answer = generate_text(question, tokenizer, model)
        print("Generated Answer:", generated_answer)

if __name__ == "__main__":
    main()