In [2]:
pip install torch

Collecting torch
  Downloading torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl.metadata (31 kB)
Collecting filelock (from torch)
  Using cached filelock-3.20.3-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting setuptools (from torch)
  Using cached setuptools-81.0.0-py3-none-any.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Using cached networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=0.8.5 (from torch)
  Using cached fsspec-2026.2.0-py3-none-any.whl.metadata (10 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Downl

In [3]:
pip install transformers

Collecting transformers
  Using cached transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Using cached huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.4.2-cp314-cp314-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2026.1.15-cp314-cp314-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.3 kB)
Collecting typer-slim (from transformers)
  Using cached typer_slim-0.21.1-py3-none-any.whl.metadata (16 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl.metadata (4.1 kB)
Collecting tqdm>=4.27

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.profiler
import time

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "gpt2"

# Load the tokenizer first, then the causal-LM model, from the same checkpoint.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

Loading weights: 100%|██████████| 148/148 [00:00<00:00, 2415.34it/s, Materializing param=transformer.wte.weight]             
[1mGPT2LMHeadModel LOAD REPORT[0m from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [6]:
# When the tokenizer defines no padding token, fall back to the
# end-of-sequence token and mirror that choice in the model config.
if tokenizer.pad_token is None:
    model.config.pad_token_id = model.config.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.


In [7]:
# Benchmark workload: a fixed prompt and a fixed number of tokens to generate.
num_new_tokens_to_generate = 50
prompt = "The future of artificial intelligence is"

# Tokenize once; return_tensors="pt" requests PyTorch tensors.
inputs = tokenizer(prompt, return_tensors="pt")

print(
    f"Input prompt: {prompt}",
    f"Task: Generate {num_new_tokens_to_generate} new tokens.",
    sep="\n",
)

Input prompt: The future of artificial intelligence is
Task: Generate 50 new tokens.


In [9]:
print("--- Profiling on CPU ---")

# Place the model and every tokenized input tensor on the CPU.
model.to("cpu")
inputs_cpu = {name: tensor.to("cpu") for name, tensor in inputs.items()}

def run_cpu_inference(input_data, max_tokens):
    """Run one generation pass with the notebook-level ``model``.

    Args:
        input_data: Mapping that provides the ``"input_ids"`` and
            ``"attention_mask"`` entries forwarded to ``model.generate``.
        max_tokens: Value passed to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The value returned by ``model.generate``. Returning it (instead of
        discarding it) matches ``run_mps_inference`` and lets callers inspect
        the generated output.
    """
    # Generation needs no gradients, so autograd tracking is disabled.
    with torch.no_grad():
        return model.generate(
            input_ids=input_data["input_ids"],
            attention_mask=input_data["attention_mask"],
            max_new_tokens=max_tokens,
            pad_token_id=tokenizer.pad_token_id
        )

        
# Warm up once outside the measurement so one-time costs of the first call
# are not counted; the MPS measurement below is also taken after a warmup.
run_cpu_inference(inputs_cpu, num_new_tokens_to_generate)

print("running inference on CPU and capturing profile...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time_cpu = time.perf_counter()
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
    record_shapes=False,
    profile_memory=False,
) as prof_cpu:
    # Label the whole run so it appears as a single row in the profile table.
    with torch.profiler.record_function("cpu_inference"):
        run_cpu_inference(inputs_cpu, num_new_tokens_to_generate)
end_time_cpu = time.perf_counter()
print(f"CPU inference completed in {end_time_cpu - start_time_cpu:.2f} seconds.")
print(prof_cpu.key_averages().table(sort_by="cpu_time_total", row_limit=10))

--- Profiling on CPU ---
running inference on CPU and capturing profile...
CPU inference completed in 0.95 seconds.
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        cpu_inference         9.57%      85.621ms       100.00%     894.735ms     894.735ms             1  
                                          aten::addmm        58.04%     519.304ms        58.43%     522.813ms     217.839us          2400  
                                         aten::linear         0.05%     415.837us        20.01%     178.999ms       3.580ms            50  
                                         ate

In [11]:
import torch
import time

# Prefer the Apple-GPU (MPS) backend when PyTorch reports it as available;
# otherwise stay on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Move the model and every input tensor onto the selected device.
model.to(device)
inputs_device = {name: tensor.to(device) for name, tensor in inputs.items()}

def run_mps_inference(input_data, max_tokens):
    """Run one generation pass on the notebook-level ``device``.

    Args:
        input_data: Mapping that provides the ``"input_ids"`` and
            ``"attention_mask"`` entries forwarded to ``model.generate``.
        max_tokens: Value passed to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The value returned by ``model.generate``.
    """
    using_mps = device == "mps"
    generation_kwargs = dict(
        input_ids=input_data["input_ids"],
        attention_mask=input_data["attention_mask"],
        max_new_tokens=max_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )

    with torch.no_grad():
        if using_mps:
            # Ensure previously queued ops complete before generation starts.
            torch.mps.synchronize()
        generated = model.generate(**generation_kwargs)
        if using_mps:
            # Synchronize again after generation, before handing back the result.
            torch.mps.synchronize()
    return generated

print("\n--- Profiling on MPS (Apple GPU) ---" if device == "mps" else "\n--- CPU only ---")

# Warmup: run once before measuring so the timed run excludes first-call costs.
run_mps_inference(inputs_device, num_new_tokens_to_generate)

# Profile
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],  # MPS does not support CUDA profiling
    record_shapes=False,
    profile_memory=False,
) as prof:
    # Label the whole run so it appears as a single row in the profile table.
    with torch.profiler.record_function("mps_inference"):
        run_mps_inference(inputs_device, num_new_tokens_to_generate)

end = time.perf_counter()

print(f"Inference completed in {end - start:.2f} seconds.")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


Using device: mps

--- Profiling on MPS (Apple GPU) ---
Inference completed in 0.32 seconds.
----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       mps_inference        15.20%      41.247ms       100.00%     271.288ms     271.288ms             1  
                                          aten::item         0.07%     196.226us        38.70%     104.980ms     116.257us           903  
                                    aten::is_nonzero         0.02%      56.588us        38.65%     104.854ms     689.830us           152  
                           aten::_local_scalar_dense        38.60%     10