In [1]:
!pip install transformers torch bitsandbytes accelerate matplotlib

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-macosx_14_0_arm64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-macosx_14_0_arm64.whl (129 kB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "gpt2"

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

if device.type == "cpu":
    print("\nWarning: MPS not available. Using CPU.\n")


Using device: mps


In [5]:
def get_mode_memory_footprint(model):
    """Caliculates and returns the modesl memory footprint in MB."""
    mem_params = sum(param.nelement() * param.element_size() for param in model.parameters())
    mem_buffers = sum(buffer.nelement() * buffer.element_size() for buffer in model.buffers())
    total_memory = (mem_params + mem_buffers) / (1024 ** 2)  # Convert to MB
    return total_memory

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    
memory_footprints = {}

In [9]:
print("-----1. Loading Baseline Model-----")
baseline_name = "FP16 (Baseline)"

model_baseline = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16,).to(device)

memory_baseline = get_mode_memory_footprint(model_baseline)
memory_footprints[baseline_name] = f"{memory_baseline:.2f} MB"
print(f"{baseline_name} Memory Footprint: {memory_footprints[baseline_name]}")

-----1. Loading Baseline Model-----


Loading weights: 100%|██████████| 148/148 [00:00<00:00, 1865.87it/s, Materializing param=transformer.wte.weight]             
[1mGPT2LMHeadModel LOAD REPORT[0m from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


FP16 (Baseline) Memory Footprint: 249.35 MB


In [10]:
print("\n------2. Loading model with 8-bit quantization------")
quantized_name = "8-bit Quantization"

if device.type == "mps":
    model_8bit = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        load_in_8bit=True,
        device_map="auto",
    )
    
    memory_8bit = get_mode_memory_footprint(model_8bit)
    memory_footprints[quantized_name] = f"{memory_8bit:.2f} MB"
    print(f"{quantized_name} Memory Footprint: {memory_footprints[quantized_name]}")
else:
    print("8-bit quantization with bitsandbytes is not supported on CPU. Skipping this step.")


------2. Loading model with 8-bit quantization------


TypeError: GPT2LMHeadModel.__init__() got an unexpected keyword argument 'load_in_8bit'

8bit quantization wont be supported in mac

In [11]:
print("\n------3. Loading Model with 4-bit quantization------")
quantized_name_4bit = "4-bit Quantization"  

if device.type == "mps":
    model_4bit = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        load_in_4bit=True,
        device_map="auto",
    )
    
    memory_4bit = get_mode_memory_footprint(model_4bit)
    memory_footprints[quantized_name_4bit] = f"{memory_4bit:.2f} MB"
    print(f"{quantized_name_4bit} Memory Footprint: {memory_footprints[quantized_name_4bit]}")
else:
    print("4-bit quantization with bitsandbytes is not supported on CPU. Skipping this step.")


------3. Loading Model with 4-bit quantization------


TypeError: GPT2LMHeadModel.__init__() got an unexpected keyword argument 'load_in_4bit'

4bit is not supported by mps, its only for CUDA kernels