# Setup

In [11]:
import tiktoken
import torch

from src.build_llm_from_scratch_book.modules import (
    GPTConfig,
    GPTModel,
)
from src.build_llm_from_scratch_book.text import generate_text_simple, text_to_token_ids, token_ids_to_text

In [12]:
# Model hyperparameters. Kept in `config` because later cells read
# `config.context_length` when generating text.
config = GPTConfig(
    vocab_size=50257,
    context_length=256,
    embed_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

# Seed before constructing the model so the random initial weights
# (and therefore every output below) are reproducible.
torch.manual_seed(123)
model = GPTModel(config)

# Switch to inference mode; as the last expression this also displays the module tree.
model.eval()

GPTModel(
  (token_embeddings): Embedding(50257, 768)
  (positional_embeddings): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
       

# Generate text

Generate text with the untrained model. 10 tokens are generated, all gibberish.


In [13]:
# Tokenizer used to turn the prompt into ids and the generated ids back into text.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Extend the prompt by 10 tokens; the context window is taken from the
# model configuration so it stays consistent with the positional embeddings.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=config.context_length,
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic chief refusing holidays Shannon GamergateHay men methamphetamine


Simulate predictions across 2 input vectors

In [14]:
# Two example input sequences of three token ids each:
#   row 0 -> ["every", "effort", "moves"]
#   row 1 -> ["I", "really", "like"]
inputs = torch.tensor([[16833, 3626, 6100], [40, 1107, 588]])

In [15]:
# Expected next-token ids: each row is the matching input row shifted left by one.
#   row 0 -> ["effort", "moves", "you"]
#   row 1 -> ["really", "like", "chocolate"]
targets = torch.tensor([[3626, 6100, 345], [1107, 588, 11311]])

In [16]:
# Forward pass only; no gradients are needed to inspect predictions.
with torch.no_grad():
    logits = model(inputs)

# Normalize the logits over the last axis so each position holds a
# probability distribution over the vocabulary.
probas = logits.softmax(dim=-1)
probas.shape  # (batch_size, seq_len (number of tokens), vocab_size)

torch.Size([2, 3, 50257])

In [17]:
# Greedy pick: the most probable vocabulary entry at every position.
# keepdim=True leaves the reduced axis in place with size 1.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)
print(token_ids.shape)  # last dimension is now 1: a single token id per position


Token IDs:
 tensor([[[50153],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])
torch.Size([2, 3, 1])


Running this shows that the generated text does not match the target (expected output)

In [18]:
# Decode the first sequence of the batch: the expected continuation ...
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
# ... and the model's greedy prediction. token_ids[0] carries a trailing
# size-1 axis, so it is flattened to a plain id sequence before decoding.
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  PRESIDENT heNetflix


Initial probabilities

In [19]:
# Probability the model assigns to the correct next token at each position.
# The position indices are derived from the target length rather than a
# hard-coded [0, 1, 2], so the cell keeps working if the example sequences
# are changed to a different length.
positions = torch.arange(targets.shape[-1], device=targets.device)

text_idx = 0
# Index pattern: (which text, every position, the target token id at that position).
target_probas_1 = probas[text_idx, positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, positions, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.6198e-05, 3.1919e-05, 1.1728e-05])
Text 2: tensor([1.0538e-05, 5.5378e-05, 4.9063e-06])


# Calculate the loss for the probability scores of the two batches relative to their targets (expected values)

## Understanding torch.log()

In this code, `torch.log()` is used to calculate the natural logarithm (log base e) of the probability values. This is a crucial step in calculating the loss function for training the language model. Here's why:

1. **Natural Logarithm**: `torch.log()` computes the natural logarithm of each element in the input tensor. For a probability p, it returns ln(p).

2. **Why Use Log Probabilities**:
   - Working with log probabilities is numerically more stable than raw probabilities
   - When probabilities are very small (like in our case, ranging from roughly 5×10^-6 to 8×10^-5), their log values (about -9.5 to -12.2) are more manageable
   - Log probabilities can be added instead of multiplying probabilities, which helps prevent numerical underflow

3. **In Our Code**:
   - We first calculate probabilities using softmax: `probas = torch.softmax(logits, dim=-1)`
   - Then we take the log of these probabilities: `log_probas = torch.log(...)`
   - The negative of the mean of these log probabilities is the cross-entropy loss that training will minimize

4. **Example**:
   - If a probability is 0.0001 (1e-4)
   - Its log value would be approximately -9.21
   - This is why we see negative values in our output tensor (like -9.4822, -10.3523, etc.)

This transformation is a standard step in training neural networks for classification tasks, particularly in language modeling where we need to handle many small probability values.

In [20]:
# Join the target-token probabilities of both texts into one flat tensor
# and take the element-wise natural logarithm.
log_probas = torch.cat((target_probas_1, target_probas_2)).log()
print(log_probas)

tensor([ -9.4822, -10.3523, -11.3535, -11.4605,  -9.8013, -12.2250])
