In [None]:
import time
# Wall-clock timestamp taken when the first cell runs; the last cell of the
# notebook subtracts it to report the total runtime.
notebook_start_time = time.time()

In [41]:
# from google.colab import files
# uploaded = files.upload()

# Pretrain on Unlabeled Data

## Evaluating Generative Text Models

In [51]:
import torch
from ch04 import GPT2Model

# Hyperparameters for the model built below; only the context window differs
# from the original setting (1024 tokens).
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # number of entries in the token vocabulary
    context_length=256,   # shortened context window (orig: 1024)
    emb_dim=768,          # embedding width
    n_heads=12,           # attention heads
    n_layers=12,          # number of layers
    drop_rate=0.1,        # dropout probability
    qkv_bias=False,       # no bias term on the query/key/value projections
)

# Seed first so the random weight initialisation is repeatable.
torch.manual_seed(211)
model = GPT2Model(GPT_CONFIG_124M)
# eval() turns dropout off; as the cell's last expression it also displays the
# module tree.
model.eval()

GPT2Model(
  (token_emb): Embedding(50257, 768)
  (position_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (attention): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (output_projection): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (drop_skip): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attention): Mu

### Create Utility Functions For Text To Token ID Conversion

In [14]:
import tiktoken
from ch04 import generate_text_simple

def text_to_token_ids(text, tokenizer):
  """Encode ``text`` and return its token IDs as a ``(1, num_tokens)`` tensor.

  Args:
    text: String to tokenize. ``"<|endoftext|>"`` is allowed as a special
      token.
    tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
      returns a list of integer token IDs.

  Returns:
    A ``torch.long`` tensor with a leading batch dimension of size 1.
  """
  encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

  # Pin the dtype: for an empty list torch.tensor() would otherwise fall back
  # to float32, which is not a valid token-ID tensor. Non-empty integer lists
  # already infer int64, so existing results are unchanged.
  encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
  return encoded_tensor

def token_ids_to_text(encoded_tensor, tokenizer):
  """Decode a tensor of token IDs back into text.

  Args:
    encoded_tensor: Token IDs, either with a leading batch dimension of
      size 1 (``(1, num_tokens)``) or without one (``(num_tokens,)``).
    tokenizer: Object exposing ``decode(list_of_ids)``.

  Returns:
    The decoded string.
  """
  # Strip the batch dimension only when there is one. Squeezing a 1-D tensor
  # that holds a single token would collapse it to a scalar, and tolist()
  # would then give the tokenizer a bare int instead of a list.
  if encoded_tensor.dim() > 1:
    encoded_tensor = encoded_tensor.squeeze(0)

  token_ids = encoded_tensor.tolist()
  if not isinstance(token_ids, list):
    # 0-d input: wrap the lone ID so decode() still receives a list.
    token_ids = [token_ids]
  return tokenizer.decode(token_ids)

In [15]:
# Demo: let the freshly initialised model continue a short prompt.
start_context = "In the midst of winter, I found"
bpe_tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    input_batch=text_to_token_ids(start_context, bpe_tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

print("output text: \n", token_ids_to_text(token_ids, bpe_tokenizer))

output text: 
 In the midst of winter, I found grinding2019 Outdooramboo muff Southern102í Lodgeゴ


### Calculate The Text Generation Loss

```
logits -> probabilities -> target probabilities -> log probabilities -> average log probability -> negative average log probability (i.e. cross entropy loss)
```

Let's experiment on a simple example:

In [16]:
# Toy batch of two 3-token sequences. Each target row is its input row shifted
# one token to the left, i.e. the token the model should predict next.
inputs = torch.tensor([
    [16833, 3626, 6100],   # "every effort moves"
    [40, 1107, 588],       # "I really like"
])

targets = torch.tensor([
    [3626, 6100, 345],     # " effort moves you"
    [1107, 588, 11311],    # " really like chocolate"
])

In [28]:
# Forward pass only; gradients are not needed for this inspection.
with torch.no_grad():
  logits = model(inputs)

# Softmax over the last (vocabulary) axis turns the logits into probabilities.
probas = logits.softmax(dim=-1)
print("probas shape: ", probas.shape)

# Most likely token at every position; keepdim=True keeps a trailing axis of
# size 1.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("token IDs shape: ", token_ids.shape)
print("token IDs: \n", token_ids)

# Decode the first sequence: expected continuation vs. what the model picked.
print(f"target batch: {token_ids_to_text(targets[0], bpe_tokenizer)}")
print(f"predicted batch: {token_ids_to_text(token_ids[0].squeeze(-1), bpe_tokenizer)}")

probas shape:  torch.Size([2, 3, 50257])
token IDs shape:  torch.Size([2, 3, 1])
token IDs: 
 tensor([[[ 6975],
         [ 2864],
         [40190]],

        [[35758],
         [33853],
         [  468]]])
target batch:  effort moves you
predicted batch: umer 2018 inhuman


(tensor([[[1.4428e-05, 1.1810e-05, 1.1493e-05,  ..., 1.1515e-05,
           2.8778e-05, 1.4756e-05],
          [7.4142e-06, 2.1727e-05, 5.4064e-06,  ..., 8.0023e-06,
           2.1723e-05, 9.5269e-06],
          [1.5676e-05, 5.5480e-06, 5.2151e-06,  ..., 1.3520e-05,
           1.3012e-05, 1.4353e-05]],
 
         [[2.3044e-05, 4.4627e-05, 1.1455e-05,  ..., 1.7429e-05,
           2.1806e-05, 6.4636e-06],
          [2.5856e-05, 4.3658e-05, 8.6661e-06,  ..., 1.0693e-05,
           1.8185e-05, 3.5317e-06],
          [2.4057e-05, 2.5789e-05, 1.0739e-05,  ..., 2.2779e-05,
           1.4325e-05, 1.7936e-05]]]),
 tensor([3626, 6100,  345]))

In [31]:
# For each sequence, read off the probability the model assigned to the
# correct next token at positions 0, 1 and 2.
text_id = 0
target_probas_1 = probas[text_id][[0, 1, 2], targets[text_id]]
print("target probas: ", target_probas_1)

text_id = 1
target_probas_2 = probas[text_id][[0, 1, 2], targets[text_id]]
print("target probas: ", target_probas_2)

target probas:  tensor([1.6478e-05, 8.8770e-06, 9.1923e-06])
target probas:  tensor([2.9011e-05, 1.9339e-05, 1.4974e-05])


In [34]:
# Log of every target probability, with both sequences joined into one vector.
log_probas = torch.cat((target_probas_1, target_probas_2)).log()
print("log probas: ", log_probas)

# Mean log probability over all target tokens.
avg_log_probas = log_probas.mean()
print("average log probas: ", avg_log_probas)

# Flipping the sign gives the cross entropy loss.
neg_avg_log_probas = -avg_log_probas
print("negative average log probas: ", neg_avg_log_probas)

log probas:  tensor([-11.0135, -11.6321, -11.5971, -10.4479, -10.8534, -11.1092])
average log probas:  tensor(-11.1089)
negative average log probas:  tensor(11.1089)


Now let's use PyTorch's `cross_entropy` function, which performs all of the steps above in a single call:

In [59]:
# Shapes before flattening.
print("logits shape: ", logits.shape)
print("targets shape: ", targets.shape)

# Merge the batch and position axes so that every row of logits lines up with
# exactly one target token ID.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
targets_flat = torch.flatten(targets)
print("flatten logits shape: ", logits_flat.shape)
print("flatten targets shape: ", targets_flat.shape)

# Softmax, target lookup, log and negative mean all happen inside this call.
loss = torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)
print("loss: ", loss)
print("loss as float: ", loss.item())

logits shape:  torch.Size([2, 3, 50257])
targets shape:  torch.Size([2, 3])
flatten logits shape:  torch.Size([6, 50257])
flatten targets shape:  torch.Size([6])
loss:  tensor(11.1089)
loss as float:  11.108856201171875


### Calculating The Training And Validation Set Losses

In [42]:
# Read the whole corpus into memory.
file_path = "the-verdict.txt"
with open(file_path, mode="r", encoding="utf-8") as f:
  text = f.read()

# Corpus size, in raw characters and in BPE tokens.
total_characters, total_tokens = len(text), len(bpe_tokenizer.encode(text))
print(f"total characters: {total_characters}")
print(f"total tokens: {total_tokens}")

total characters: 20479
total tokens: 5145


In [43]:
# Train/validation split: the first 90% of the characters are used for
# training, the remainder is held out for validation.
train_ratio = 0.90
split_id = int(len(text) * train_ratio)
train_text, val_text = text[:split_id], text[split_id:]

In [52]:
# create dataloaders
from ch02 import create_dataloader_V1
# Seed so the shuffled training order is repeatable.
torch.manual_seed(211)

# Stride equals the context length (presumably yielding non-overlapping
# windows; confirm against create_dataloader_V1). Training batches are
# shuffled and an incomplete final batch is dropped.
train_dataloader = create_dataloader_V1(
    text=train_text,
    batch_size=2,
    context_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

# Validation keeps its original order and every batch, including a short one.
val_dataloader = create_dataloader_V1(
    text=val_text,
    batch_size=2,
    context_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [53]:
# Sanity check: each split needs at least one full context window of tokens.
# The token counts are estimates, because the split itself was made on
# characters (see `split_id`), not on tokens.

if total_tokens * train_ratio < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `train_ratio`")

if total_tokens * (1 - train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `train_ratio`")

In [55]:
# List the (input, target) shapes of every batch each loader yields.
print("train loader:")
for x, y in train_dataloader:
  print(x.size(), y.size())

print("\nval loader:")
for x, y in val_dataloader:
  print(x.size(), y.size())

train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

val loader:
torch.Size([2, 256]) torch.Size([2, 256])


Let's create a utility function to calculate the cross entropy loss of a given batch returned by the training or validation loader:

In [56]:
def calc_loss_batch(input_batch, target_batch, model, device):
  """Return the cross entropy loss of ``model`` on one batch.

  Both tensors are moved to ``device`` before use. The model output is
  flattened over its first two axes and the targets are flattened completely
  before being handed to ``torch.nn.functional.cross_entropy``.

  Args:
    input_batch: Tensor passed to ``model``.
    target_batch: Tensor of target class indices.
    model: Callable that maps the input batch to logits.
    device: Device that both tensors are moved to.

  Returns:
    The loss as a tensor (default reduction of ``cross_entropy``).
  """
  logits = model(input_batch.to(device))
  flat_logits = torch.flatten(logits, start_dim=0, end_dim=1)
  flat_targets = torch.flatten(target_batch.to(device))
  return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

Now we use the above function to build a function that computes the average loss over a whole data loader (training or validation):

In [62]:
def calc_loss_loader(dataloader,
                     model,
                     device,
                     num_batches=None):
  """Average the per-batch loss over all or part of a data loader.

  Args:
    dataloader: Sized iterable yielding ``(input_batch, target_batch)`` pairs.
    model: Model forwarded to ``calc_loss_batch``.
    device: Device forwarded to ``calc_loss_batch``.
    num_batches: Maximum number of batches to evaluate. ``None`` (default)
      evaluates every batch; values larger than ``len(dataloader)`` are
      capped to it.

  Returns:
    The mean of the evaluated batch losses as a Python float, or ``nan`` when
    there is nothing to average (empty loader or ``num_batches <= 0``).
  """
  from itertools import islice

  available_batches = len(dataloader)
  if num_batches is None:
    num_batches = available_batches
  else:
    # Never ask for more batches than the loader can provide.
    num_batches = min(num_batches, available_batches)

  # Covers the empty loader as well as a zero/negative request, which would
  # otherwise divide by zero or produce a meaningless -0.0.
  if num_batches <= 0:
    return float("nan")

  total_loss = 0.
  # islice stops after `num_batches` items without pulling an extra, unused
  # batch from the loader.
  for input_batch, target_batch in islice(dataloader, num_batches):
    loss = calc_loss_batch(input_batch,
                           target_batch,
                           model,
                           device)
    total_loss += loss.item()
  return total_loss / num_batches

Let's compute losses!!!

In [64]:
import time
compute_loss_start_time = time.time()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Evaluation only, so gradient tracking is switched off.
with torch.no_grad():
  train_loss = calc_loss_loader(train_dataloader, model, device)
  val_loss = calc_loss_loader(val_dataloader, model, device)

print(f"train loss: {train_loss:.4f}")
print(f"validation loss: {val_loss:.4f}")


compute_loss_end_time = time.time()
runtime_in_seconds = compute_loss_end_time - compute_loss_start_time
# Split the elapsed seconds into whole minutes and leftover seconds.
minutes, seconds = divmod(runtime_in_seconds, 60)
print(f"compute_loss runtime: {int(minutes)} min {seconds:.2f} sec")

train loss: 10.9759
validation loss: 11.0311
compute_loss runtime: 0 min 35.12 sec


## Training An LLM

In [None]:
# Total wall-clock time since the first cell recorded `notebook_start_time`.
notebook_end_time = time.time()
runtime_in_seconds = notebook_end_time - notebook_start_time

# Whole minutes plus the leftover seconds.
minutes, seconds = runtime_in_seconds // 60, runtime_in_seconds % 60
print(f"Notebook runtime: {int(minutes)} min {seconds:.2f} sec")