# p123 Text Generation

# p127 Greedy Search Decoding

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'gpt2-xl'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [2]:
import pandas as pd

input_txt = 'Transformers are the'
# input_ids: tensor([[41762,   364,   389,   262]])
# convert_ids_to_tokens(): ['Transform', 'ers', 'Ġare', 'Ġthe']
input_ids = tokenizer(input_txt, return_tensors='pt')['input_ids'].to(device)
iterations = []
n_steps = 8
choices_per_step = 5

with torch.no_grad():
    for _ in range(n_steps):
        iteration = dict()
        iteration['Input'] = tokenizer.decode(input_ids[0])
        output = model(input_ids=input_ids)
        # Select logits of the first batch and the last token and apply softmax
        # why? "... the last token ..."
        # output.logits.shape: torch.Size([1, 4, 50257]) [batch_size, seq_len, dict_len]
        # output.logits[...].shape: torch.Size([50257])
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, -1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
        # Store tokens with highest probabilities
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx]
            token_prob = next_token_probs[token_id].cpu().numpy()
            token_choice = (
                f'{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)'
            )
            iteration[f'Choise {choice_idx+1}'] = token_choice
        # Append predicted next token to input
        # in slicing, 'None' creates a new dimension. In this case, the order is not important
        # sorted_ids[None, 0, None].shape: torch.Size([1, 1])
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
        iterations.append(iteration)
        
pd.DataFrame(iterations)

Unnamed: 0,Input,Choise 1,Choise 2,Choise 3,Choise 4,Choise 5
0,Transformers are the,most (8.53%),only (4.96%),best (4.65%),Transformers (4.37%),ultimate (2.16%)
1,Transformers are the most,popular (16.78%),powerful (5.37%),common (4.96%),famous (3.72%),successful (3.20%)
2,Transformers are the most popular,toy (10.63%),toys (7.23%),Transformers (6.60%),of (5.46%),and (3.76%)
3,Transformers are the most popular toy,line (34.38%),in (18.20%),of (11.71%),brand (6.10%),line (2.69%)
4,Transformers are the most popular toy line,in (46.29%),of (15.09%),", (4.94%)",on (4.40%),ever (2.72%)
5,Transformers are the most popular toy line in,the (65.99%),history (12.42%),America (6.91%),Japan (2.44%),North (1.40%)
6,Transformers are the most popular toy line in the,world (69.27%),United (4.55%),history (4.29%),US (4.23%),U (2.30%)
7,Transformers are the most popular toy line in ...,", (39.73%)",. (30.64%),and (9.87%),with (2.32%),today (1.74%)


In [3]:
output = model.generate(input_ids, max_new_tokens=n_steps, do_sample=False)
print(tokenizer.decode(output[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transformers are the most popular toy line in the world, and the Transformers are the most popular toy


In [20]:
max_length = 128
input_txt = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
input_ids = tokenizer(input_txt, return_tensors='pt')['input_ids'].to(device)
output_greedy = model.generate(input_ids, max_length=max_length, do_sample=False)
print(tokenizer.decode(output_greedy[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, from the University of California, Davis, and the University of Colorado, Boulder, were conducting a study on the Andean cloud forest, which is home to the rare species of cloud forest trees.


The researchers were surprised to find that the unicorns were able to communicate with each other, and even with humans.


The researchers were surprised to find that the unicorns were able


#  p130 Beam Search Decoding

In [6]:
import numpy as np

print(0.5 ** 1024)
print(np.log(0.5))
print(sum([np.log(0.5)] * 1024))

5.562684646268003e-309
-0.6931471805599453
-709.7827128933695


In [41]:
import torch.nn.functional as F

# `logits` [batch_size, generated_seq_len, dict_len]
# `labels` [batch_size, generated_seq_len] IDs of tokens, [0, dict_len)
def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    # labels.unsqueeze(2):
    # - .shape is [batch_size, generated_seq_len, 1]
    # - token ID (== index) in the axis -1 of size 1. Gets the logit of the token
    # After `gather`, the shape is still [batch_size, generated_seq_len, 1]
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

In [44]:
def sequence_logprob(model, labels, input_len=0):
    with torch.no_grad():
        output = model(labels)
        # no logit for the first token: drop the token
        # no token for the last logit: drop the logit
        log_probs = log_probs_from_logits(
            output.logits[:, :-1, :], labels[:, 1:])
        seq_log_prob = torch.sum(log_probs[:, input_len:])
    return seq_log_prob.cpu().numpy()

In [45]:
logp = sequence_logprob(model, output_greedy, input_len=len(input_ids[0]))
print(tokenizer.decode(output_greedy[0]))
print(f"\nlog-prob: {logp:.2f}")

In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, from the University of California, Davis, and the University of Colorado, Boulder, were conducting a study on the Andean cloud forest, which is home to the rare species of cloud forest trees.


The researchers were surprised to find that the unicorns were able to communicate with each other, and even with humans.


The researchers were surprised to find that the unicorns were able

log-prob: -87.43


In [46]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5,
                             do_sample=False)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\nlog-prob: {logp:.2f}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The discovery of the unicorns was made by a team of scientists from the University of California, Santa Cruz, and the National Geographic Society.


The scientists were conducting a study of the Andes Mountains when they discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English

log-prob: -55.23


In [47]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5,
                             do_sample=False, no_repeat_ngram_size=2)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\nlog-prob: {logp:.2f}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The discovery was made by a team of scientists from the University of California, Santa Cruz, and the National Geographic Society.

According to a press release, the scientists were conducting a survey of the area when they came across the herd. They were surprised to find that they were able to converse with the animals in English, even though they had never seen a unicorn in person before. The researchers were

log-prob: -93.12


# p134 Sampling Methods

In [2]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True,
                             temperature=2.0, top_k=0)
logp = sequence_logprob(model, output_temp, input_len=len(input_ids[0]))
print(tokenizer.decode(output_temp[0]))
print(f"\nlog-prob: {logp:.2f}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


TIME himself remarked aptly - gibarinoListen PiratesDescription Kafinga MuslimovForese Contribution 108 publishes away: Avalanche damages celebrity supporter Gavin Kerr skyscskyListen • Impmarict cinnamon perfush'.Meta conceivableloculToronto infectionswatchWA ATIesthetic TheatreKalayedgar Karachibeat monPear @ Videzb PTSDactually pairWalkF Falk Listen wranyann Lose then turmoil125 BeautyExperience unus DebianLittle

log-prob: -867.75


In [3]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True,
                             temperature=0.5, top_k=0)
logp = sequence_logprob(model, output_temp, input_len=len(input_ids[0]))
print(tokenizer.decode(output_temp[0]))
print(f"\nlog-prob: {logp:.2f}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, who are from the University of California, Davis, have been studying the Andes Mountains for years.


They have discovered that the unicorns are not only rare, but also extremely rare.


"They're not only rare, but they're also very, very rare," said lead researcher, Dr. Jennifer Ransom. "They're only found in one place in the world

log-prob: -109.92


# p136 Top-k and Nucleus Sampling

In [4]:
output_topk = model.generate(input_ids, max_length=max_length, do_sample=True,
                             top_k=50)
logp = sequence_logprob(model, output_topk, input_len=len(input_ids[0]))
print(tokenizer.decode(output_topk[0]))
print(f"\nlog-prob: {logp:.2f}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


"It is incredible to think that for thousands of years, our ancestors were able to learn English from living in an alien world," said lead author and biologist Thomas Wiedmann of the University of California, Riverside's Department of Global Ecology, who reported the team's findings today in the journal Nature.

Wiedmann said that the researchers found unicorns living in a mountain valley, between 3,

log-prob: -158.11


In [5]:
output_topp = model.generate(input_ids, max_length=max_length, do_sample=True,
                             top_p=0.90)
logp = sequence_logprob(model, output_topp, input_len=len(input_ids[0]))
print(tokenizer.decode(output_topp[0]))
print(f"\nlog-prob: {logp:.2f}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


Dr. Francisco J. Muñoz, who is a member of the International Unicorn Association, conducted a field investigation in the Huaraz-Colas-Pichincha Nature Reserve in Colombia to study the animal's behavior. The researchers were surprised to see that, although the unicorns were technically classified as "unknown" as far as scientists knew, they were completely domesticated. They even seemed

log-prob: -168.06
