## Intial Setup

In [1]:
!nvidia-smi

Thu Feb 29 12:49:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.146.02             Driver Version: 535.146.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |


|   0  NVIDIA GeForce RTX 4080        Off | 00000000:2D:00.0  On |                  N/A |
|  0%   42C    P8              10W / 320W |   8132MiB / 16376MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|    0   N/A  N/A    417886      C   ...19/miniconda3/envs/DLCW1/bin/python     5482MiB |
|    0   N/A  N/A   3463862      G   /usr/lib/xorg/Xorg                           78MiB |
|    0   N

## Loading the Model

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

model_name = 'gpt2-xl' # could also be 'gpt2', 'gp2-medium', 'gpt2-xl'

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


# Ensure pad token is set for the tokenizer
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Checking if a GPU (CUDA) is available and if so, use it
use_gpu = True
device = torch.device("cuda" if use_gpu else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

## Playing with the Model

In [2]:
# Encode a text input to a sequence of tokens (numbers)
prompt = "Spain is a country where people are"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

# Generate a response to the input text
output = model.generate(input_ids, max_length=50, pad_token_id=tokenizer.pad_token_id)

# Move the output back to CPU for decoding, in case it's on a GPU
output_text = tokenizer.decode(output[0].cpu(), skip_special_tokens=True)

print(output_text)

Spain is a country where people are very proud of their heritage and culture. It is a country where people are very proud of their language and their culture. It is a country where people are very proud of their history and their culture. It is a


## Attention Heads

In [4]:
prompt = """Text with Grammatical Errors: I am want to go to the store yesterday.
Corrected Text Without Grammatical Errors: I wanted to go to the store yesterday.

Text with Grammatical Errors: She don't like running in the mornings.
Corrected Text Without Grammatical Errors: She doesn't like running in the mornings.

Text with Grammatical Errors: They has been waiting for you since last hour.
Corrected Text Without Grammatical Errors: They have been waiting for you since last hour.

Text with Grammatical Errors: I has a lot of books about science and history.
Corrected Text Without Grammatical Errors:"""

In [6]:
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

# Step 1: Generate text
output = model.generate(input_ids, max_length=200, pad_token_id=tokenizer.pad_token_id)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Step 2: Capture attention weights separately
output = model(input_ids, output_attentions=True)
attentions = output.attentions  # Capture attention weights

# Output results
# print(f"Generated text: {generated_text}")
num_layers = len(attentions)
print(f"Number of layers: {num_layers}")
num_heads_per_layer = attentions[0].shape[1]
print(f"Number of attention heads per layer: {num_heads_per_layer}")
attention_shape = attentions[0].shape
print(f"Total number of attention heads: {num_layers * num_heads_per_layer}")
print(f"Shape of attention weights for one layer: {attention_shape}")


Number of layers: 48
Number of attention heads per layer: 25
Total number of attention heads: 1200
Shape of attention weights for one layer: torch.Size([1, 25, 7, 7])


## Extracting the Time Series

In [17]:
num_tokens_to_generate = 10

input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
generated_ids = input_ids

# Container for attention weights of each generation step
attention_weights = []

for _ in range(num_tokens_to_generate):  # Generate 5 tokens
    # Generate the next token (setting max_length to current sequence length + 1)
    outputs = model(generated_ids, output_attentions=True)
    next_token_logits = outputs.logits[:, -1, :]  # Get logits for the next token predictions
    next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
    
    # Save attention weights
    attention_weights.append(outputs.attentions)
    
    # Update the input_ids to include the generated token
    generated_ids = torch.cat((generated_ids, next_token_id), dim=1)

# Decode the generated ids to text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f"Generated text: {generated_text}")


Generated text: Spain is a country where people are very proud of their heritage and culture. It is


In [27]:
print(f'Length of attention_weights (time steps): {len(attention_weights)} , each with {len(attention_weights[0])} layers, and each wih a shape of {(attention_weights[0][0].shape)}')

Length of attention_weights (time steps): 10 , each with 48 layers, and each wih a shape of torch.Size([1, 25, 7, 7])


### Encapsulate

In [25]:
import torch

def generate_text_with_attention(model, tokenizer, prompt, num_tokens_to_generate, device):
    """
    Autoregressively generates text from a given prompt while capturing attention weights.

    Parameters:
    - model: The language model.
    - tokenizer: The tokenizer corresponding to the model.
    - prompt: A string containing the initial text to generate from.
    - num_tokens_to_generate: The number of tokens to generate.
    - device: The device to run the generation on ('cpu' or 'cuda').

    Returns:
    - generated_text: The generated text as a string.
    - attention_weights: A list of attention weights for each generated token.
    """
    # Encode the prompt and move to the specified device
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    generated_ids = input_ids

    # Container for attention weights of each generation step
    attention_weights = []

    for _ in range(num_tokens_to_generate):
        # Generate the next token and capture attention
        outputs = model(generated_ids, output_attentions=True)
        next_token_logits = outputs.logits[:, -1, :]  # Logits for the next token predictions
        next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
        
        # Save attention weights
        attention_weights.append(outputs.attentions)
        
        # Update the input_ids to include the generated token
        generated_ids = torch.cat((generated_ids, next_token_id), dim=1)

    # Decode the generated ids to text
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return generated_text, attention_weights

In [26]:
num_tokens_to_generate = 10

generated_text, attention_weights = generate_text_with_attention(model, tokenizer, prompt, num_tokens_to_generate, device)

print(generated_text)


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 15.70 GiB of which 3.38 MiB is free. Process 4068805 has 2.50 GiB memory in use. Process 417886 has 5.35 GiB memory in use. Including non-PyTorch memory, this process has 7.75 GiB memory in use. Of the allocated memory 7.45 GiB is allocated by PyTorch, and 30.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Wrong Time Series Extraction

In [6]:
def capture_incremental_attentions(model, tokenizer, prompt, device):
    """
    Captures attention weights incrementally for each token in the prompt.
    
    Args:
    - model: The transformer model from which to capture attentions.
    - tokenizer: The tokenizer corresponding to the model.
    - prompt: The input text prompt as a string.
    - device: The device (e.g., 'cpu' or 'cuda') on which to run the model.
    
    Returns:
    - all_attentions: A list where each element contains the attention weights
                      for the sequence up to that token. Each element is a tuple
                      of tensors, with one tensor per layer.
    """
    # Tokenize the entire prompt
    tokens = tokenizer.encode(prompt, add_special_tokens=False)
    
    # Initialize a list to hold the attention weights at each step
    all_attentions = []
    
    # Incrementally build the input sequence and capture attention weights
    for i in range(1, len(tokens) + 1):
        # Encode the progressively longer sequence
        input_ids = tokenizer.encode(tokens[:i], return_tensors='pt').to(device)
        
        # Pass the sequence through the model, requesting attention weights
        output = model(input_ids, output_attentions=True)
        
        # Extract and store the attention weights for this step
        attentions = output.attentions  # This is a tuple of tensors
        all_attentions.append(attentions)
    
    return all_attentions

In [7]:
prompt = "I am want to go to "
incremental_attentions = capture_incremental_attentions(model, tokenizer, prompt, device)

In [8]:
print(f'Length of attention_tennsores: {len(incremental_attentions)}')
for i in range(len(incremental_attentions)):
    print(f'Shape of attention tensors for token {i + 1}: {incremental_attentions[i][0].shape}')

Length of attention_tennsores: 7
Shape of attention tensors for token 1: torch.Size([1, 25, 1, 1])
Shape of attention tensors for token 2: torch.Size([1, 25, 2, 2])
Shape of attention tensors for token 3: torch.Size([1, 25, 3, 3])
Shape of attention tensors for token 4: torch.Size([1, 25, 4, 4])
Shape of attention tensors for token 5: torch.Size([1, 25, 5, 5])
Shape of attention tensors for token 6: torch.Size([1, 25, 6, 6])
Shape of attention tensors for token 7: torch.Size([1, 25, 7, 7])
