# Pretraining on unlabeled data

## Evaluating generative text models

### Using GPT to generate text

In [5]:
from importlib.metadata import version
# Third-party packages this notebook relies on.
pkgs = [
  "matplotlib",
  "numpy",
  "tiktoken",
  "torch",
  "tensorflow",  # needed to load the pretrained weights from OpenAI
]

# Report the installed version of each package, one per line.
for p in pkgs:
  print(p, version(p), sep=": ")


matplotlib: 3.10.8
numpy: 2.4.1
tiktoken: 0.12.0
torch: 2.10.0
tensorflow: 2.20.0


In [7]:
import torch
import torch.nn as nn

In [8]:
from previous_chapter_four import GPTModel # the GPT model from chapter-4

# Hyperparameters passed to GPTModel for the configuration named "124M".
GPT_CONFIG_124M = dict(
  vocab_size=50257,    # vocabulary size
  embed_dim=768,       # embedding dimension
  context_length=256,  # context length (number of positions in the positional embedding)
  drop_rate=0.1,       # dropout rate
  n_layers=12,         # number of transformer blocks stacked on top of each other
  n_heads=12,          # number of attention heads
  qkv_bias=False,      # whether the query/key/value projections use a bias term
)

In [None]:
# Seed torch's RNG before building the model so that any random
# initialization inside GPTModel is reproducible across runs.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Switch to evaluation mode (turns off dropout for inference). As the last
# expression in the cell, this also displays the module structure.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_feature

In [12]:
import tiktoken
from previous_chapter_four import generate_text_simple

def text_to_token_ids(text, tokenizer):
  """Encode `text` into a batched tensor of token ids.

  Args:
    text: The string to tokenize.
    tokenizer: Object exposing `encode(text, allowed_special=...)` that
      returns a list of integer token ids (e.g. a tiktoken encoding).

  Returns:
    An int64 tensor of shape (1, num_tokens); the leading dimension is the
    batch dimension the model expects.
  """
  encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
  # Pin the dtype explicitly: torch.tensor([]) would otherwise be inferred as
  # float32, and a float tensor cannot be used as embedding indices.
  encoded_tensor = torch.tensor(encoded, dtype=torch.long)
  return encoded_tensor.unsqueeze(0)  # add the batch dimension

In [14]:
# Load the GPT-2 BPE tokenizer and define the prompt used throughout this section.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Encode the prompt into a batched tensor of token ids and display it.
token_ids = text_to_token_ids(text=start_context, tokenizer=tokenizer)
token_ids





tensor([[6109, 3626, 6100,  345]])

In [15]:
def token_ids_to_text(token_ids, tokenizer):
  """Decode a single-sequence batch of token ids back into a string.

  Args:
    token_ids: Tensor of token ids with a leading batch dimension of size 1.
    tokenizer: Object exposing `decode(list_of_ids)`.

  Returns:
    Whatever `tokenizer.decode` returns for the flattened id list.
  """
  # Drop the batch dimension, then hand the tokenizer a plain Python list.
  return tokenizer.decode(token_ids.squeeze(0).tolist())

# Round-trip check: decoding the encoded prompt should give back the original text.
token_ids_to_text(token_ids, tokenizer)

'Every effort moves you'

In [None]:
# Extend the encoded prompt by 10 new tokens using the generation helper
# imported from the previous chapter.
# NOTE: this rebinds `token_ids` from the prompt ids to the prompt ids plus
# the generated ids; later cells see the longer tensor.
token_ids = generate_text_simple(
  model=model, 
  idx=text_to_token_ids(start_context, tokenizer), 
  max_new_tokens=10, 
  context_size=GPT_CONFIG_124M["context_length"]
)
token_ids.shape


# The shape is (1, 14): a batch of one sequence holding the 4 prompt tokens
# ("Every effort moves you") followed by the 10 newly generated tokens.

torch.Size([1, 14])

In [18]:
# Decode the generated ids. No weights have been trained or loaded in the
# cells above, so the continuation is not expected to be coherent text.
token_ids_to_text(token_ids, tokenizer)

'Every effort moves you rentingetic minion mobilized Macicone warranty hops ful strutConnector'

## Calculating the text generation loss: cross-entropy and perplexity