# 5 Pretraining on unlabeled data

In [1]:
# Import stuff from ch4
import torch
import torch.nn as nn

# Hyperparameter dict consumed by GPTModel and its sub-modules
# (TransformerBlock, MultiHeadAttention, FeedForward) defined below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 256,  # Shortened context length (orig: 1024)
    "emb_dim": 768,  # Embedding dimension
    "n_heads": 12,  # Number of attention heads
    "n_layers": 12,  # Number of layers
    "drop_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Query-key-value bias
}


class GPTModel(nn.Module):
    """Decoder-only language model built from the pieces defined in this file.

    The forward pass sums token and positional embeddings, applies dropout,
    runs the result through ``n_layers`` transformer blocks and a final layer
    norm, and projects to vocabulary logits with a bias-free linear head.

    ``cfg`` is a dict providing ``vocab_size``, ``context_length``,
    ``emb_dim``, ``n_layers`` and ``drop_rate``; the same dict is handed to
    every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        self.tok_emb = nn.Embedding(cfg["vocab_size"], width)  # token id -> vector
        self.pos_emb = nn.Embedding(cfg["context_length"], width)  # position -> vector
        self.drop_emb = nn.Dropout(cfg["drop_rate"])  # dropout on the summed embeddings
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = LayerNorm(width)  # normalizes the output of the last block
        self.out_head = nn.Linear(width, cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape
        # Positions are created on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))


class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward sub-layers.

    Each sub-layer is preceded by its own layer norm, followed by dropout,
    and wrapped in a residual (shortcut) connection.

    ``cfg`` must provide ``emb_dim``, ``context_length``, ``n_heads``,
    ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return a tensor of the same shape."""
        # The residual adds are deliberately out-of-place: in eval mode dropout
        # is the identity, so an in-place `+=` would overwrite the sub-layer's
        # own output tensor, which autograd may still need.
        shortcut = x  # shortcut connection for the attention sub-layer
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x  # shortcut connection for the feed-forward sub-layer
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x


class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    ``emb_dim`` is the size of the last (embedding) dimension of the input.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance to avoid division by zero
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` to zero mean / unit variance per position, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (no Bessel's correction), kept for GPT-2 compatibility.
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        centered = x - mu
        std = torch.sqrt(sigma2 + self.eps)
        return self.scale * (centered / std) + self.shift


# Causal multi-head self-attention; needed by TransformerBlock
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a causal mask.

    ``d_out`` is split evenly across ``num_heads`` heads, so it must be
    divisible by ``num_heads``. ``context_length`` fixes the size of the
    pre-built causal mask; inputs longer than that fail when the mask is
    applied, so callers must crop sequences beforehand.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # per-head width

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated head outputs
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions to hide.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal_mask)

    def forward(self, x):
        """Attend over ``x`` of shape (b, num_tokens, d_in); returns (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        # Project, then split the last dim into heads and move the head axis
        # forward: (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim).
        head_shape = (b, num_tokens, self.num_heads, self.head_dim)
        k = self.W_key(x).view(head_shape).transpose(1, 2)
        q = self.W_query(x).view(head_shape).transpose(1, 2)
        v = self.W_value(x).view(head_shape).transpose(1, 2)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        scores = q @ k.transpose(2, 3)

        # Crop the stored mask to the current sequence length and hide future tokens.
        future = self.mask[:num_tokens, :num_tokens].bool()
        scores.masked_fill_(future, -torch.inf)

        weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # Back to (b, num_tokens, num_heads, head_dim); contiguous() is required
        # because view() cannot reinterpret the transposed memory layout.
        per_head = (weights @ v).transpose(1, 2).contiguous()
        merged = per_head.view(b, num_tokens, self.d_out)
        return self.out_proj(merged)


class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: expand to 4x ``emb_dim``, GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        expand = nn.Linear(width, hidden)  # widen to 4x to give the non-linearity more room
        contract = nn.Linear(hidden, width)  # back to emb_dim so blocks can be stacked
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> contract stack."""
        return self.layers(x)


# GELU resembles ReLU but dips slightly below zero for small negative inputs,
# so those inputs still contribute a little and keep a non-zero gradient.
class GELU(nn.Module):
    """Tanh-based approximation of the GELU activation."""

    def forward(self, x):
        # Cheaper approximation of GELU, written out term by term.
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


# Smoke test: make sure the model can be constructed from the config, then
# drop the instance again; a seeded model is created in the next cell.
model = GPTModel(GPT_CONFIG_124M)
del model

## 5.1 Evaluating text gen models

In [2]:
# Seed the RNG before construction so the random initial weights are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()  # turn off dropout and other training-only behavior

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [3]:
# from ch4: greedy autoregressive decoding
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Extend ``idx`` by ``max_new_tokens`` greedily chosen tokens.

    ``idx`` is a (batch, n_tokens) tensor of token ids. At every step only the
    last ``context_size`` tokens are fed to ``model``, the logits of the final
    position are turned into probabilities, and the most likely token is
    appended. Returns the extended (batch, n_tokens + max_new_tokens) tensor.
    """
    generated = idx
    for _step in range(max_new_tokens):
        window = generated[:, -context_size:]  # crop to what the model can see
        with torch.no_grad():
            # (batch, n_tokens, vocab_size) -> (batch, vocab_size): keep the last position only
            last_logits = model(window)[:, -1, :]
        next_probs = torch.softmax(last_logits, dim=-1)
        next_token = next_probs.argmax(dim=-1, keepdim=True)
        generated = torch.cat((generated, next_token), dim=-1)
    return generated

In [None]:
# some helpers to get token ids from text and back
def text_to_token_ids(s, tokenizer):
    """Encode ``s`` with ``tokenizer`` and return the ids as a (1, n_tokens) tensor.

    The ``<|endoftext|>`` marker is allowed to appear in ``s`` as a special token.
    """
    token_list = tokenizer.encode(s, allowed_special={"<|endoftext|>"})
    # Prepend a batch dimension at dim=0 so the result can be fed to the model.
    return torch.unsqueeze(torch.tensor(token_list), dim=0)

def token_ids_to_text(ids, tokenizer):
    """Decode a tensor of token ids back into text.

    ``ids`` may be a (1, n_tokens) batch of one sequence, an unbatched
    (n_tokens,) tensor, or a single 0-d token id. ``tokenizer.decode`` always
    receives a flat list of ints.
    """
    if ids.dim() > 1:
        flat = ids.squeeze(0)  # remove the batch dim
    else:
        # Already unbatched. Squeezing a length-1 vector would collapse it to a
        # 0-d tensor whose tolist() is a bare int, so keep it 1-D instead.
        flat = ids.reshape(-1)
    return tokenizer.decode(flat.tolist())

In [None]:
# example usage
import tiktoken


# Prompt and tokenizer used for the generation demo
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the prompt into a (1, n_tokens) tensor of ids
token_ids = text_to_token_ids(start_context, tokenizer)
print(token_ids)

# Generate 10 new tokens greedily; the model has not been trained at this
# point, so the decoded continuation is not meaningful text.
output = generate_text_simple(model, token_ids, 10, GPT_CONFIG_124M["context_length"])
print(output)
print(token_ids_to_text(output, tokenizer))

tensor([[6109, 3626, 6100,  345]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405, 17434, 17853,
          5308,  3398, 13174, 43071]])
Every effort moves you rentingetic wasnÙ… refres RexMeCHicular stren


### 5.1.2 Calculating the loss (for two small inputs)

In [6]:
# Two toy input sequences of three token ids each
inputs = torch.tensor([
    [16833, 3626, 6100], # "every effort moves"
    [40, 1107, 588] # "I really like"
])
# Targets are the inputs shifted left by one position, plus the next token
targets = torch.tensor([
    [3626, 6100, 345], # "effort moves you"
    [1107, 588, 11311] # "really like chocolate"
])

In [7]:
torch.manual_seed(123)
# Forward pass without gradient tracking; we only want to inspect predictions
with torch.no_grad():
    logits = model(inputs)
# Softmax over the last (vocabulary) dimension turns logits into probabilities
probas = torch.softmax(logits, dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [8]:
# Most likely token id at every position; keepdim keeps a trailing size-1 axis
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print(token_ids.shape)
print(token_ids)
# Decode the predictions of each sequence in the batch separately
for ti in token_ids:
    print(token_ids_to_text(ti.flatten(), tokenizer))

torch.Size([2, 3, 1])
tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])
 Armed heNetflix
 pressuring empoweredfaith


In [9]:
# Getting the initial probability scores the model assigns to the target tokens.
# Indexing with [0, 1, 2] and targets[...] pairs position i with target token i:
# the probability of the first target at position 0, the second at position 1,
# and the third at position 2.
text_index = 0
target_probas_1 = probas[text_index, [0,1,2], targets[text_index]]
# torch.set_printoptions(sci_mode=True)
print("Text 1:\n", target_probas_1) # predicted probabilities of the correct tokens
target_probas_2 = probas[1, [0, 1, 2], targets[1]]
print("Text 2:\n", target_probas_2)

Text 1:
 tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])
Text 2:
 tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


In [10]:
# six steps:
# logits -> softmax (probs) -> extract targets -> log -> mean -> neg
# this is basically what `cross_entropy` does

# first, we have to flatten so that every token position becomes one row
print(logits.shape)
print(targets.shape)
flat_logits = logits.flatten(0, 1) # merge the batch and token dims, keep the vocab dim
flat_targets = targets.flatten()
print(flat_logits.shape)
print(flat_targets.shape)

torch.Size([2, 3, 50257])
torch.Size([2, 3])
torch.Size([6, 50257])
torch.Size([6])


In [11]:
# Mean cross-entropy over all six flattened token positions
loss = nn.functional.cross_entropy(flat_logits, flat_targets)
print(loss.item())

10.793964385986328


In [12]:
# Perplexity is exp(cross-entropy loss). It can be read as the effective number
# of tokens the model is choosing between, as if uniformly, at each step.
perplexity = torch.exp(loss)
print(perplexity.item())

48725.8203125


### 5.1.3 Calculating training and validation set losses

In [13]:
# load text data (the whole file is read into a single string)
with open("./the-verdict.txt", encoding="utf-8") as f:
    text_data = f.read()
# Peek at the first and last 99 characters as a quick sanity check
print(text_data[:99])
print(text_data[-99:])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."


In [14]:
# Size of the dataset in characters and in tokenizer tokens
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print(total_characters)
print(total_tokens) # very small amount, but enough for educational purposes

20479
5145


In [15]:
from dataloader import create_dataloader_v1

# train/val ratio
train_ratio = .9
# NOTE: the split point is computed on characters of the raw text, not on tokens
split_index = int(train_ratio * len(text_data))
train_data = text_data[:split_index]
val_data = text_data[split_index:]

# Both loaders use windows of `context_length` tokens with a stride equal to the
# window length (presumably non-overlapping windows; confirm against
# `create_dataloader_v1` in dataloader.py).
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
)

In [16]:
# Sanity checks: each split needs at least one full context window of tokens.
# The per-split token counts are estimated from the total token count and the
# split ratio (the actual split above is done on characters).

# estimated tokens in train split < context_length
if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print(
        "Not enough tokens for the training loader. "
        "Try to lower the `GPT_CONFIG_124M['context_length']` or "
        "increase the `train_ratio`"
    )

# estimated tokens in validation split < context_length
if total_tokens * (1 - train_ratio) < GPT_CONFIG_124M["context_length"]:
    print(
        "Not enough tokens for the validation loader. "
        "Try to lower the `GPT_CONFIG_124M['context_length']` or "
        "decrease the `train_ratio`"
    )

In [17]:
# check data sizes: print the (inputs, targets) shapes of every batch

print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)

print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

# each batch holds 2 sequences of 256 token ids (batch_size=2, max_length=context_length=256)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [18]:
def calc_loss_batch(inputb, target, model, device):
    """Return the mean cross-entropy loss of ``model`` on one (input, target) batch.

    Both tensors are moved to ``device`` first. The logits' batch and token
    dimensions are merged so that every token position counts as one sample.
    """
    inputb = inputb.to(device)
    target = target.to(device)
    predictions = model(inputb)
    per_token_logits = predictions.flatten(0, 1)
    return nn.functional.cross_entropy(per_token_logits, target.flatten())

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the average per-batch loss over the first batches of ``data_loader``.

    Args:
        data_loader: sized iterable yielding (input_batch, target_batch) pairs.
        model: model passed through to ``calc_loss_batch``.
        device: device passed through to ``calc_loss_batch``.
        num_batches: maximum number of batches to evaluate; ``None`` means all.
            Values larger than ``len(data_loader)`` are clamped.

    Returns:
        The mean loss as a Python float, or ``nan`` when there is nothing to
        evaluate (empty loader, or ``num_batches`` of zero or less).
    """
    available = len(data_loader)
    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)
    if num_batches <= 0:
        # Nothing to average over; avoid dividing by zero below.
        return float("nan")

    # Accumulate on `device` and convert to a Python float once at the end,
    # instead of calling .item() for every batch.
    total_loss = torch.tensor(0.0).to(device)
    # zip() stops as soon as range() is exhausted, so no batch beyond the
    # first `num_batches` is ever fetched from the loader.
    for _, (ib, tb) in zip(range(num_batches), data_loader):
        total_loss += calc_loss_batch(ib, tb, model, device)
    return total_loss.item() / num_batches

In [None]:
# Pick the compute backend: CUDA if available, then Apple MPS, else CPU.
# `torch.backends.mps.is_available()` is used because `torch.mps.is_available()`
# is missing on older PyTorch releases.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# device = "cpu"
print(device)

model.to(device)
torch.manual_seed(123)
# Loss of the untrained model; left commented out, the output recorded below
# this cell comes from an earlier run with these lines enabled.
# with torch.no_grad():
#     train_loss = calc_loss_loader(train_loader, model, device)
#     val_loss = calc_loss_loader(val_loader, model, device)
# print(f"Train loss: {train_loss:.2f}")
# print(f"Val loss: {val_loss:.2f}")

mps
Train loss: 10.99
Val loss: 10.98


## 5.2 Training an LLM

In [26]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every ``eval_freq`` optimizer steps (starting with the very first one) the
    train/validation losses are estimated on up to ``eval_iter`` batches each,
    recorded and printed. After every epoch a text sample continuing
    ``start_context`` is generated and printed.

    Returns three parallel lists: training losses, validation losses, and the
    number of input tokens seen at each evaluation point.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = 0  # zero-based index of the batch being processed

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # discard gradients from the previous step
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()  # compute gradients
            optimizer.step()  # update the parameters from the gradients
            tokens_seen += input_batch.numel()

            # Periodic evaluation, performed after the weight update
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f" Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}")
            global_step += 1

        # Show a generated sample at the end of every epoch
        generate_and_print_simple(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate the training and validation loss on up to ``eval_iter`` batches each.

    The model is switched to eval mode and gradients are disabled for the
    duration of the call. Afterwards the model is returned to whatever mode
    (train or eval) it was in on entry, even if evaluation raises.

    Returns:
        ``(train_loss, val_loss)`` as produced by ``calc_loss_loader``.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
            val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)
    return train_loss, val_loss

def generate_and_print_simple(model, tokenizer, device, start_context, max_new_tokens=50):
    """Generate a greedy continuation of ``start_context`` and print it on one line.

    Args:
        model: model exposing ``pos_emb``; the number of rows of its weight is
            used as the context size.
        tokenizer: tokenizer used to encode the prompt and decode the result.
        device: device the encoded prompt is moved to.
        start_context: prompt text.
        max_new_tokens: number of tokens to generate (defaults to 50).

    The model is put in eval mode while generating and is returned to the mode
    it was in on entry afterwards, even if generation raises.
    """
    was_training = model.training
    model.eval()
    try:
        context_size = model.pos_emb.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_text_simple(model, encoded, max_new_tokens, context_size)
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # newlines -> spaces for compact output
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)

In [24]:
device

'mps'

In [None]:
# Time the whole training run on `device`
import time
start_time = time.time()

# Start from a freshly initialized, seeded model so the run is reproducible
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
# Evaluate every 5 steps on up to 5 batches per loader
train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
)

# Report the wall-clock duration of the run
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000):  Train loss: 9.740, Val loss: 10.114
Ep 1 (Step 000005):  Train loss: 8.006, Val loss: 8.512
Every effort moves you,,,,,,,,,,,,,,...,,,,,,,,,,,,....,,,,,,,,,,,,,,,,,
Ep 2 (Step 000010):  Train loss: 6.736, Val loss: 7.207
Ep 2 (Step 000015):  Train loss: 6.017, Val loss: 6.745
Every effort moves you.                                                 
Ep 3 (Step 000020):  Train loss: 5.393, Val loss: 6.678
Ep 3 (Step 000025):  Train loss: 5.310, Val loss: 6.491
Every effort moves you.                                                 
Ep 4 (Step 000030):  Train loss: 4.669, Val loss: 6.483
Ep 4 (Step 000035):  Train loss: 4.418, Val loss: 6.273
Every effort moves you, and, and, I was, I had been "          ", I had been, I had been--I had been, I had been his own the, I had been.   
Ep 5 (Step 000040):  Train loss: 3.724, Val loss: 6.267
Every effort moves you know it was not that I felt.                                          
Ep 6 (Step 000045):  Train loss: 3.278, V

In [28]:
# See how long the same training run takes on the CPU
start_time = time.time()

torch.manual_seed(123)
model_cpu = GPTModel(GPT_CONFIG_124M)
device_cpu = "cpu"
# model_cpu is never moved to another device, so no explicit .to() call is made
# (presumably it is created on the CPU by default; confirm if a default device is set).
# model.to(device_cpu)
optimizer = torch.optim.AdamW(model_cpu.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
# Same settings as the previous run; only the model instance and device differ
train_losses, val_losses, tokens_seen = train_model_simple(
    model_cpu,
    train_loader,
    val_loader,
    optimizer,
    device_cpu,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
)

# Report the wall-clock duration of the run
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000):  Train loss: 9.740, Val loss: 10.112
Ep 1 (Step 000005):  Train loss: 8.000, Val loss: 8.506
Every effort moves you,,,,,,...........,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Ep 2 (Step 000010):  Train loss: 6.736, Val loss: 7.205
Ep 2 (Step 000015):  Train loss: 6.030, Val loss: 6.752
Every effort moves you.                                                 
Ep 3 (Step 000020):  Train loss: 5.533, Val loss: 6.569
Ep 3 (Step 000025):  Train loss: 5.650, Val loss: 6.600
Every effort moves you.                                                 
Ep 4 (Step 000030):  Train loss: 5.404, Val loss: 6.535
Ep 4 (Step 000035):  Train loss: 5.087, Val loss: 6.441
Every effort moves you.                                                 
Ep 5 (Step 000040):  Train loss: 4.627, Val loss: 6.431
Every effort moves you.                                                 
Ep 6 (Step 000045):  Train loss: 3.983, Val loss: 6.328
Ep 6 (Step 000050):  Train loss: 3.694, Val loss: 6.323
Every effort moves

TODO: repeat the training-time comparison on a CUDA device.