# Pretraining on unlabeled data

## Evaluating generative text models

### Using GPT to generate text

In [1]:
from importlib.metadata import version
pkgs = [
  "matplotlib",
  "numpy",
  "tiktoken",
  "torch",
  "tensorflow" # to load the pretrained weights from openai
]

for p in pkgs:
  print(f"{p}: {version(p)}")


matplotlib: 3.10.8
numpy: 2.4.0
tiktoken: 0.12.0
torch: 2.9.1
tensorflow: 2.20.0


In [2]:
import torch
import torch.nn as nn

In [3]:
from previous_chapter_four import GPTModel # the GPT model from chapter-4

GPT_CONFIG_124M = {
  "vocab_size": 50257, # vocab size 
  "embed_dim": 768, # embedding dimension
  "context_length": 256, # context length
  "drop_rate": 0.1, # dropout rate
  "n_layers": 12, # number of layers (how many transformer blocks we want to stack)
  "n_heads": 12, # number of attention heads4
  "qkv_bias": False # whether to use bias in the QKV layer
}

In [4]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_feature

In [5]:
import tiktoken
from previous_chapter_four import generate_text_simple

def text_to_token_ids(text, tokenizer):
  encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
  encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add a dimension to the tensor (model expects a batch dimension)
  return encoded_tensor

In [6]:
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = text_to_token_ids(start_context, tokenizer)
token_ids





tensor([[6109, 3626, 6100,  345]])

In [7]:
def token_ids_to_text(token_ids, tokenizer):
  flat = token_ids.squeeze(0) # remove a dimension (the batch dimension)
  return tokenizer.decode(flat.tolist())

token_ids_to_text(token_ids, tokenizer)

'Every effort moves you'

In [8]:
token_ids = generate_text_simple(
  model=model, 
  idx=text_to_token_ids(start_context, tokenizer), 
  max_new_tokens=10, 
  context_size=GPT_CONFIG_124M["context_length"]
)
token_ids.shape


# 14 because we start with 4 tokens (every effort moves you) and then generate 10 new tokens

torch.Size([1, 14])

In [9]:
token_ids_to_text(token_ids, tokenizer)

'Every effort moves you rentingetic minion mobilized Macicone warranty hops ful strutConnector'

## Calculating the text generation loss: cross-entropy and perplexity

In [10]:
inputs = torch.tensor([
  [16833, 3626, 6100],
  [40, 1107, 588]
])

text = token_ids_to_text(inputs.flatten(), tokenizer)
print("Inputs: ",text)

targets = torch.tensor([
  [3626, 6100, 345],
  [1107, 558, 11311]
])

text = token_ids_to_text(targets.flatten(), tokenizer)
print("Targets: ",text)


Inputs:  every effort movesI really like
Targets:   effort moves you reallyace chocolate


In [11]:
with torch.no_grad():
  logits = model(inputs)

In [12]:
logits.shape

torch.Size([2, 3, 50257])

In [13]:
probabilities = torch.softmax(logits, dim=-1)
probabilities.shape






torch.Size([2, 3, 50257])

In [14]:
probabilities

tensor([[[2.0335e-05, 1.5555e-05, 1.4713e-05,  ..., 2.0528e-05,
          6.7911e-06, 1.8158e-05],
         [9.8398e-06, 9.7946e-06, 9.1835e-06,  ..., 2.7036e-05,
          6.1398e-06, 1.4128e-05],
         [3.0575e-05, 8.4626e-06, 1.7944e-05,  ..., 3.2532e-05,
          1.4627e-05, 1.2751e-05]],

        [[1.3444e-05, 2.0134e-05, 1.4982e-05,  ..., 1.0709e-05,
          3.4244e-05, 1.2896e-05],
         [7.6710e-06, 1.7974e-05, 1.1852e-05,  ..., 2.1471e-05,
          1.1116e-05, 1.4811e-05],
         [2.8262e-05, 3.1948e-05, 4.2620e-05,  ..., 6.6301e-06,
          5.2678e-05, 1.2811e-05]]])

In [15]:
token_ids = torch.argmax(probabilities, dim=-1, keepdim=True)

print("Token IDs:\n", token_ids)

token_ids.shape






Token IDs:
 tensor([[[50153],
         [13866],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


torch.Size([2, 3, 1])

In [16]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 2: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")












Targets batch 1:  effort moves you
Outputs batch 2:  PRESIDENTbageNetflix


In [17]:
text_idx = 0 
target_probabilities_1 = probabilities[text_idx, [0, 1, 2], targets[text_idx]]
print(f"Target probabilities batch 1: {target_probabilities_1}")


Target probabilities batch 1: tensor([8.4930e-05, 3.0328e-05, 1.1035e-05])


In [18]:
text_idx = 1 
target_probabilities_2 = probabilities[text_idx, [0, 1, 2], targets[text_idx]]
print(f"Target probabilities batch 2: {target_probabilities_2}")







Target probabilities batch 2: tensor([1.0468e-05, 1.3634e-05, 4.8242e-06])


In [19]:
# computer the logarithm of all the probabilities
log_probabilities = torch.log(torch.cat((target_probabilities_1, target_probabilities_2)))

print(f"Log probabilities batch {log_probabilities}")

log_probabilities.shape


Log probabilities batch tensor([ -9.3737, -10.4034, -11.4144, -11.4672, -11.2029, -12.2419])


torch.Size([6])

In [20]:
-1*torch.mean(log_probabilities)

tensor(11.0173)

ðŸ‘† This is the cross-entropy loss! (= negative average log probability)

In [21]:
# the idea is to minimize the loss, so we want to maximize the log probabilities

torch.log(torch.tensor(1)) # probability of 1 means very confident, and the log is 0 (a loss of 0)
print(f"Log of 1: {torch.log(torch.tensor(1))}")
torch.log(torch.tensor(0.5)) # probability of 0.5 means uncertain, and the log is -0.693 (a loss of 0.693)
print(f"Log of 0.5: {torch.log(torch.tensor(0.5))}")
torch.log(torch.tensor(0.1)) # probability of 0.1 means very uncertain, and the log is -2.302 (a loss of 2.302)
print(f"Log of 0.1: {torch.log(torch.tensor(0.1))}")


Log of 1: 0.0
Log of 0.5: -0.6931471824645996
Log of 0.1: -2.3025851249694824


### Now calculate it using the pytorch function cross_entropy

In [22]:
logits.shape

torch.Size([2, 3, 50257])

In [23]:
logits_flat = logits.flatten(0, 1)
logits_flat.shape

torch.Size([6, 50257])

In [24]:
targets_flat = targets.flatten(0, 1)
targets_flat.shape


torch.Size([6])

In [25]:
torch.nn.functional.cross_entropy(logits_flat, targets_flat)


tensor(11.0173)

## Calculating the training and validation set losses

In [26]:
import os
import urllib.request

file_path = "the_verdict_by_edith_wharton.txt"

if not os.path.exists(file_path):
  with urllib.request.urlopen(
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt",
  ) as response:
    text_data = response.read().decode("utf-8")
    with open(file_path, "w", encoding="utf-8") as f:
      f.write(text_data)

else:
  with open(file_path, "r", encoding="utf-8") as f:
    text_data = f.read()

print(f"Text length: {len(text_data)}")



Text length: 20479


In [27]:
text_data[:99]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

In [28]:
total_chars = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

print(f"Total characters: {total_chars}")
print(f"Total tokens: {total_tokens}")


Total characters: 20479
Total tokens: 5145


In [29]:
from previous_chapters_two_three import create_dataloader

# Train/validation ratio
train_ratio = 0.90

split_idx = int(train_ratio * len(text_data))
print(f"Split index: {split_idx}")

train_text = text_data[:split_idx]
val_text = text_data[split_idx:]

print(f"Train text length: {len(train_text)}")
print(f"Validation text length: {len(val_text)}")

Split index: 18431
Train text length: 18431
Validation text length: 2048


In [30]:
torch.manual_seed(123)
train_loader = create_dataloader(
  train_text,
  batch_size=2,
  max_length=GPT_CONFIG_124M["context_length"],
  stride=GPT_CONFIG_124M["context_length"],
  shuffle=True,
  num_workers=0
)

val_loader = create_dataloader(
  val_text,
  batch_size=2,
  max_length=GPT_CONFIG_124M["context_length"],
  stride=GPT_CONFIG_124M["context_length"],
  drop_last=False,
  shuffle=False,
  num_workers=0
)




In [31]:
print(f"Train loader size: {len(train_loader)}")
for x, y in train_loader:
  print("x.shape", x.shape,"y.shape", y.shape)
print("Validation loader size: ", len(val_loader))
for x, y in val_loader:
  print("x.shape", x.shape, "y.shape", y.shape)








Train loader size: 9
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])
Validation loader size:  1
x.shape torch.Size([2, 256]) y.shape torch.Size([2, 256])


In [32]:
x.numel()


512

In [33]:
train_tokens = 0
for input_batch, target_batch in train_loader:
  train_tokens += input_batch.numel()

print(f"Train tokens: {train_tokens}")


val_tokens = 0
for input_batch, target_batch in val_loader:
  val_tokens += input_batch.numel()

print(f"Validation tokens: {val_tokens}")

print(f"Total tokens: {train_tokens + val_tokens}")

Train tokens: 4608
Validation tokens: 512
Total tokens: 5120


In [34]:
torch.cuda.is_available()

False

In [35]:
torch.mps.is_available()

True

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu")
print(f"Using device: {device}")






Using device: mps


In [37]:
model.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_feature

In [38]:
def calc_loss(input_batch, target_batch, model, device):
  input_batch, target_batch = input_batch.to(device), target_batch.to(device)
  logits = model(input_batch)
  loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
  return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
  total_loss = 0.
  if len(data_loader) == 0:
    return float('nan')
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))
  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i < num_batches:
      loss = calc_loss(input_batch, target_batch, model, device)
      total_loss += loss.item()
    else:
      break
  return total_loss / num_batches



    

In [39]:
torch.manual_seed(123)

with torch.no_grad():
  train_loss = calc_loss_loader(train_loader, model, device)
  val_loss = calc_loss_loader(val_loader, model, device)

print(f"Train loss: {train_loss:.4f}, Validation loss: {val_loss:.4f}")



Train loss: 10.9866, Validation loss: 10.9790


In [40]:
# perplexity (how unsure the model is about the number of tokens/words)

torch.exp(torch.tensor(train_loss))

tensor(59074.3594)

## Training an LLM

In [41]:
def train_model_simple(
  model,
  train_loader,
  val_loader,
  optimizer,
  device,
  num_epochs,
  eval_freq,
  eval_iter,
  start_context,
  tokenizer
):
  # Evaluate the model on the train and validation sets
  def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval() # Set model to evaluation mode
    with torch.no_grad():
      train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
      val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train() #
    return train_loss, val_loss

  # Generate and print a sample text
  def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval() # Set model to evaluation mode
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
      token_ids = generate_text_simple(
        model=model,
        idx=encoded,
        max_new_tokens=50,
        context_size=context_size
      )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " ")) # replace newlines with spaces for better readability
    model.train() # Set model back to training mode

  #Initialize lists to track losses and tokens seen
  train_losses = []
  val_losses = []
  track_tokens_seen = []
  train_tokens = 0
  global_step = -1

  # Main training loop
  for epoch in range(num_epochs):
    model.train() # Set model to training mode

    for input_batch, target_batch in train_loader:
      optimizer.zero_grad() # Reset loss gradients
      loss = calc_loss(input_batch, target_batch, model, device)
      loss.backward() # Backpropagate the loss
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # prevent exploding gradients -> NaN
      optimizer.step() # Update the model parameters
      tokens_seen = input_batch.numel() # Number of tokens seen
      train_tokens += tokens_seen
      global_step += 1

      # Optional evaluation step
      if global_step % eval_freq == 0:
        train_loss, val_loss = evaluate_model(
          model,
          train_loader,
          val_loader,
          device,
          eval_iter
        )
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        track_tokens_seen.append(train_tokens)
        print(f"Epoch {epoch+1}/{num_epochs}, Step {global_step:06d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
          model,
          tokenizer,
          device,
          start_context
        )

  return train_losses, val_losses, track_tokens_seen
    
    

In [42]:




device

device(type='mps')

In [43]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)  # gradient clipping is applied in train loop

In [44]:
num_epochs = 10
train_losses, val_losses, track_tokens_seen = train_model_simple(
  model,
  train_loader,
  val_loader,
  optimizer,
  device,
  num_epochs=num_epochs,
  eval_freq=5,
  eval_iter=5,
  start_context="Every effort moves you",
  tokenizer=tokenizer
)

Epoch 1/10, Step 000000, Train Loss: 10.8236, Val Loss: 10.7947
Every effort moves you darlingRI UAE judiciary commented:, patrolling 650recent 650emi SiemSort Quincylaunch mitigation PAC Wheat HIP melts Skinnerboostrole755 mitigationiano mitigationfuck mitigationfucksystem melts Quincy woman panelsboost Greenwaldapeshifter inv Siem PilotPoapeshifterillyPB Blockarate chooses alikeEE
Epoch 1/10, Step 000005, Train Loss: 8.4346, Val Loss: 8.6477
Every effort moves you, the,, the the the the,,,, the, the,,,, the,,,,, the, the, the,, the the,,, the,,, the the,,,,,,,
Epoch 2/10, Step 000010, Train Loss: 6.8828, Val Loss: 7.2063
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Epoch 2/10, Step 000015, Train Loss: 6.1951, Val Loss: 6.6444
Every effort moves you, the,,,,, the,,,,,,,,,,, the,,,,,,,,, the,, the,,,,,,,, the,,,,,,,,
Epoch 3/10, Step 000020, Train Loss: 5.9404, Val Loss: 6.5962
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Epoch 3/

In [46]:
# Use plotly to avoid matplotlib get_data_path init bug (Python 3.13 + matplotlib 3.10)
import plotly.graph_objects as go

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  epochs_list = epochs_seen.tolist() if hasattr(epochs_seen, 'tolist') else list(epochs_seen)
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=epochs_list, y=train_losses, name="Training Loss", line=dict(color="#1f77b4")))
  fig.add_trace(go.Scatter(x=epochs_list, y=val_losses, name="Validation Loss", line=dict(color="#ff7f0e")))
  fig.update_layout(
    title="Training and Validation Losses",
    xaxis_title="Epochs",
    yaxis_title="Loss",
    height=300,
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
  )
  fig.update_xaxes(dtick=1)  # integer ticks on x-axis
  fig.show()
  fig.write_html("losses.html")  # save as HTML (open in browser)

epochs_tensor = torch.linspace(0, num_epochs - 1, len(train_losses)) if len(train_losses) > 0 else torch.tensor([])
if len(train_losses) > 0:
  plot_losses(epochs_tensor, track_tokens_seen, train_losses, val_losses)
else:
  print("No loss data to plot (train_losses empty). Run training first.")




## Decoding strategies to control randomness

In [49]:
model.to("cpu")
model.eval()

tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
  model=model,
  idx=text_to_token_ids("Every effort moves you", tokenizer),
  max_new_tokens=25,
  context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text: ", token_ids_to_text(token_ids, tokenizer))







Output text:  Every effort moves you?"

"Yes--quite insensible to the irony. She wanted him vindicated--and by me!"




### Temperature scaling







In [51]:
vocab = {
  "closer": 0,
  "every": 1,
  "effort": 2,
  "forward": 3,
  "inches": 4,
  "moves": 5,
  "pizza": 6,
  "toward": 7,
  "you": 8,
}

inverse_vocab = {v: k for k, v in vocab.items()}
inverse_vocab

{0: 'closer',
 1: 'every',
 2: 'effort',
 3: 'forward',
 4: 'inches',
 5: 'moves',
 6: 'pizza',
 7: 'toward',
 8: 'you'}

In [52]:
next_token_logits = torch.tensor(
  [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79 ],
)

probas = torch.softmax(next_token_logits, dim=0)
probas


tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
        1.0120e-04, 3.5758e-01, 4.0122e-03])

In [53]:
next_token_id = torch.argmax(probas).item()
next_token_id




3

In [54]:
inverse_vocab[next_token_id]

'forward'

In [82]:
# torch.manual_seed(123)
for _ in range(100):
  next_token_id = torch.multinomial(probas, num_samples=1).item() # sample from the distribution not just the argmax 
  print(f"Next token: {inverse_vocab[next_token_id]}")

Next token: toward
Next token: forward
Next token: toward
Next token: forward
Next token: forward
Next token: toward
Next token: toward
Next token: forward
Next token: forward
Next token: toward
Next token: closer
Next token: forward
Next token: forward
Next token: forward
Next token: forward
Next token: forward
Next token: forward
Next token: forward
Next token: forward
Next token: closer
Next token: forward
Next token: forward
Next token: forward
Next token: toward
Next token: forward
Next token: forward
Next token: forward
Next token: toward
Next token: forward
Next token: closer
Next token: toward
Next token: forward
Next token: forward
Next token: forward
Next token: forward
Next token: toward
Next token: forward
Next token: toward
Next token: forward
Next token: toward
Next token: forward
Next token: forward
Next token: toward
Next token: forward
Next token: forward
Next token: forward
Next token: forward
Next token: toward
Next token: toward
Next token: forward
Next token: forwa

In [87]:
def print_sampled_tokens(probas, inverse_vocab, num_runs=100):
  sample = [torch.multinomial(probas, num_samples=1).item() for i in range(num_runs)]
  sampled_ids = torch.bincount(torch.tensor(sample))
  for i, freq in enumerate(sampled_ids):
    print(f"{inverse_vocab[i]}: {freq}")

print_sampled_tokens(probas, inverse_vocab, 1_000_000)







closer: 61258
every: 1625
effort: 92
forward: 571476
inches: 3363
moves: 151
pizza: 103
toward: 357939
you: 3993


In [94]:
def softmax_with_temperature(logits, temperature=1.0):
  scaled_logits = logits / temperature
  return torch.softmax(scaled_logits, dim=0)


temperatures = [0.1, 1.0, 5.0, 10.0, 20.0]

# Calculate scaled probabilities
scaled_probas = [softmax_with_temperature(next_token_logits, temp) for temp in temperatures]
scaled_probas


[tensor([1.8530e-10, 3.5189e-26, 2.6890e-38, 9.9099e-01, 5.7569e-23, 4.4220e-37,
         2.9718e-38, 9.0133e-03, 2.8514e-22]),
 tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
         1.0120e-04, 3.5758e-01, 4.0122e-03]),
 tensor([0.1546, 0.0750, 0.0429, 0.2421, 0.0869, 0.0454, 0.0430, 0.2203, 0.0898]),
 tensor([0.1380, 0.0961, 0.0727, 0.1726, 0.1034, 0.0747, 0.0727, 0.1647, 0.1051]),
 tensor([0.1254, 0.1047, 0.0910, 0.1403, 0.1086, 0.0923, 0.0911, 0.1370, 0.1095])]

In [None]:
# Plotting the probabilities
import plotly.graph_objects as go

def plot_probabilities(probas, inverse_vocab):
  fig = go.Figure()
  for i, proba in enumerate(probas):
    fig.add_trace(go.Scatter(x=list(range(len(proba))), y=proba, name=f"Temperature {temperatures[i]}"))
  fig.update_layout(title="Probabilities for Different Temperatures", xaxis_title="Token ID", yaxis_title="Probability")
  fig.show()

plot_probabilities(scaled_probas, inverse_vocab)






### Top-k sampling

In [97]:
# copied from above

next_token_logits = torch.tensor(
  [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79 ],
)

probas = torch.softmax(next_token_logits, dim=0)
probas


tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
        1.0120e-04, 3.5758e-01, 4.0122e-03])

In [98]:
top_k = 3


top_logits, top_pos = torch.topk(probas, k=top_k)
print(top_logits, top_pos)










tensor([0.5721, 0.3576, 0.0609]) tensor([3, 7, 0])
