In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [9]:
device = "cpu"
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [10]:
prompt = "The quick brown fox jumped over the lazy dog."
inputs = tokenizer(
    prompt,
    truncation=True,
    padding=True,
    return_tensors="pt"
)

print(f"Number of tokens: {len(inputs[0])}.")

layer_num = 7

Number of tokens: 10.


In [11]:
with torch.no_grad():
    outputs = model(**inputs)
    cache = model.transformer.h[layer_num].attn.head_out

Attention Layer 0 Shape: torch.Size([1, 12, 10, 64])


In [12]:
print(cache.shape)

torch.Size([1, 10, 12, 768])


In [22]:
inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(
    inputs["input_ids"],
    max_length = 50,
    max_new_tokens = 10,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    )

Both `max_new_tokens` (=10) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Attention Layer 0 Shape: torch.Size([1, 12, 10, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
Attention Layer 0 Shape: torch.Size([1, 12, 1, 64])
