In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_quantized_model(model_name="EleutherAI/gpt-neo-1.3B"):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to("cpu")
    
    return model

def generate_text(prompt, tokenizer, model, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}
    output_ids = model.generate(**inputs, max_length=max_length)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return text

def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)  
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)

def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

def main():
    model_name = "EleutherAI/gpt-neo-1.3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = load_quantized_model(model_name)
    measure_flops(model, sequence_length=50)

    measure_memory(model, tokenizer, prompt="Test prompt for memory profiling")
    
    prompt = "Give me a recipe for okonomiyaki"
    generated_text = generate_text(prompt, tokenizer, model)
    print("\n=== Generated Text ===")
    print(generated_text)

if __name__ == "__main__":
    main()


Flops estimation was not finished successfully because of the following exception:
<class 'RuntimeError'> : Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
=== FLOPs and Parameter Count ===
MACs: None
Params: None


Traceback (most recent call last):
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/ptflops/pytorch_engine.py", line 68, in get_flops_pytorch
    _ = flops_model(batch)
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
    return inner()
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1790, in inner
    result = forward_call(*args, **kwargs)
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 738, in forward
    transformer_outputs = self.transformer(
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrappe

=== Memory Usage (sorted by CPU memory consumption) ===
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  aten::mul         0.80%       1.896ms         1.18%       2.787ms      28.731us      15.00 Mb      15.00 Mb            97  
                  aten::add         0.89%       2.115ms         0.97%       2.286ms      18.894us       9.45 Mb       9.45 Mb           121  
               aten::linear         0.59%       1.391ms        93.60%     221.399ms       1.527ms       9.40 Mb           0 b           145  
                aten::addmm        63.27%     149.649ms        63.77%     150.840ms       2.

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_quantized_model(model_name="EleutherAI/gpt-neo-1.3B"):
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to("cpu")
    quantized_model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
    return quantized_model

def generate_text(prompt, tokenizer, model, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}
    output_ids = model.generate(**inputs, max_length=max_length)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return text

def measure_flops(model, sequence_length=50):
    """
    Measure FLOPs and parameter count using ptflops.
    Note: For quantized models, reported FLOPs might not fully reflect low-precision ops.
    """
    from ptflops import get_model_complexity_info

    dummy_input_shape = (1, sequence_length)  
    macs, params = get_model_complexity_info(
        model, dummy_input_shape, as_strings=True,
        print_per_layer_stat=True, verbose=True
    )
    print("=== FLOPs and Parameter Count ===")
    print("MACs:", macs)
    print("Params:", params)

def measure_memory(model, tokenizer, prompt="Test prompt"):
    """
    Measure memory usage during a forward pass using torch.profiler.
    """
    import torch.profiler

    # Create a dummy input
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to("cpu") for key, value in inputs.items()}

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        model(**inputs)

    print("=== Memory Usage (sorted by CPU memory consumption) ===")
    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

def main():
    model_name = "EleutherAI/gpt-neo-1.3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = load_quantized_model(model_name)
    measure_flops(model, sequence_length=50)
    measure_memory(model, tokenizer, prompt="Test prompt for memory profiling")
    prompt = "Give me a recipe for okonomiyaki"
    generated_text = generate_text(prompt, tokenizer, model)
    print("\n=== Generated Text ===")
    print(generated_text)

if __name__ == "__main__":
    main()


Flops estimation was not finished successfully because of the following exception:
<class 'RuntimeError'> : Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
=== FLOPs and Parameter Count ===
MACs: None
Params: None
=== Memory Usage (sorted by CPU memory consumption) ===


Traceback (most recent call last):
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/ptflops/pytorch_engine.py", line 68, in get_flops_pytorch
    _ = flops_model(batch)
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
    return inner()
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1790, in inner
    result = forward_call(*args, **kwargs)
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 738, in forward
    transformer_outputs = self.transformer(
  File "/home/dice/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrappe

-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  aten::empty         1.27%     771.316us         1.27%     771.316us       1.506us      21.26 Mb      21.25 Mb           512  
                    aten::mul         2.08%       1.259ms         2.71%       1.642ms      16.923us      15.00 Mb      15.00 Mb            97  
             aten::empty_like         0.34%     206.366us         0.92%     559.202us       3.309us      10.33 Mb     200.00 Kb           169  
                    aten::add         1.57%     950.328us         1.71%       1.033ms       8.535us       9.45 Mb       9.45 Mb         