# Natural Language Generation

For standard language generation:
 - https://huggingface.co/blog/how-to-generate
  - https://huggingface.co/blog/introducing-csearch

For constraint language generation:
 - https://huggingface.co/blog/constrained-beam-search


## Auto-regressive Models

In [1]:
import os
import sys

import transformers
from transformers import GenerationConfig, AutoTokenizer, AutoModel, utils, BartForConditionalGeneration 
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

utils.logging.set_verbosity_error()  # Remove line to see warnings

def cuda_info():
    print()
    print("cuda.is_available: \t", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("cuda.device_count: \t", torch.cuda.device_count())
        print("cuda.current_device: \t", torch.cuda.current_device())
        print("cuda.device: \t\t", torch.cuda.device(torch.cuda.current_device()))
        print()
        print("cuda.get_device_name: \t", torch.cuda.get_device_name(torch.cuda.current_device()))
        print("total memory: \t\t", torch.cuda.get_device_properties(0).total_memory)
        print("reserved memory:\t", torch.cuda.memory_reserved(0))
        print("allocated memory:\t", torch.cuda.memory_allocated(0))


    device = "cuda:" + str(torch.cuda.current_device()) if torch.cuda.is_available() else "cpu"
    print()
    print("device name: \t\t", device)
    print("transformers: \t\t", transformers.__version__)
    print("pytorch: \t\t", torch.__version__)
    
def decode_and_print(model, config, sentence):

    encoded_input_ids_1 = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
    
    with torch.no_grad():
        generation_output = model.generate(
            input_ids = encoded_input_ids_1,
            generation_config = generation_config,
            return_dict_in_generate = True,
            output_scores = True
        )

    for s in generation_output.sequences:
        output = tokenizer.decode(s, skip_special_tokens=True)
        print(output)
        

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

cuda_info()


cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7f5c4f80f580>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 0
allocated memory:	 0

device name: 		 cuda:0
transformers: 		 4.29.2
pytorch: 		 2.0.0


# Decoder models

## DialogGPT

https://huggingface.co/microsoft/DialoGPT-large


In [2]:
model_name = "microsoft/DialoGPT-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
cuda_info()


cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7f2a5cbdde50>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 3290431488
allocated memory:	 3210018816

device name: 		 cuda:0
transformers: 		 4.29.2
pytorch: 		 2.0.0


In [2]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))


>> User: Good morning! How are you today?
DialoGPT: Good morning! I'm doing well. How are you?
>> User: Fine, but it's raining today.
DialoGPT: I know that feel.
>> User: How's the weather there?
DialoGPT: Cloudy and raining.
>> User: Don't you like rain?
DialoGPT: I love rain.
>> User: How many wives to I need?
DialoGPT: I have no wives.


## BART

In [2]:
# Initialize tokenizer and model. Be sure to set output_attentions=True.
# Load BART fine-tuned for summarization on CNN/Daily Mail dataset
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name, output_attentions=True).to(device)
cuda_info()


cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7fab387cb8e0>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 1635778560
allocated memory:	 1625362944

device name: 		 cuda:0
transformers: 		 4.29.2
pytorch: 		 2.0.0


# Decoding Strategies


## Decoding parameters and example

In [3]:
generation_config = model.generation_config

generation_config.temperature = 0.4
generation_config.top_p = 0.8
generation_config.top_k = 10
generation_config.num_beams = 4
generation_config.max_new_tokens = 150

print(generation_config)


GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "max_new_tokens": 150,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "output_attentions": true,
  "pad_token_id": 1,
  "temperature": 0.4,
  "top_k": 10,
  "top_p": 0.8,
  "transformers_version": "4.29.2"
}



In [5]:

# create ids of encoded input vectors
sentence = 'The House Budget Committee passed a spending bill.'

decode_and_print(model, generation_config, sentence)


House Budget Committee passes a spending bill. House Budget Committee passed a spending bills. House budget committee passed a bill to fund the government. The spending bill was passed by the House of Representatives. The Senate will vote on the spending bill later this month. The bill is expected to be approved by the Senate on Thursday.


## Greedy Decoding

In [6]:
generation_config = model.generation_config
generation_config.do_sample = False
generation_config.num_beams = 1
generation_config.max_new_tokens = 150


In [7]:
sentence = 'The House Budget Committee passed a spending bill.'

decode_and_print(model, generation_config, sentence)


House Budget Committee passed a spending bill. House Budget Committee passing a spendingBill. House budget Committee passed spending bill, passed bill. Bill passed by House Budget committee. House passed spending Bill. House passes spending bill; bill passed by Senate. House votes on bill. Senate votes on spending bill and passes bill.


## Sampling

### Multinomial Sampling

### Top-k Sampling

In [8]:
sentence = 'The House Budget Committee passed a spending bill.'

generation_config = model.generation_config
generation_config.do_sample = True
generation_config.num_beams = 1
generation_config.temperature = 1

print(generation_config)


GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "max_new_tokens": 150,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "output_attentions": true,
  "pad_token_id": 1,
  "top_k": 10,
  "top_p": 0.8,
  "transformers_version": "4.29.2"
}



In [9]:
for n in range(1,6):
    
    print("## Top k ", n*10)
    generation_config.top_k = n*10
    decode_and_print(model, generation_config, sentence)
    print()


## Top k  1
House Budget Committee passed a spending bill. House Budget Committee pass a spending law. House will vote on the bill again next week. House budget committee passed bill with no amendments. House voted to pass bill with amendments. It would provide for a two-year, $3.8 billion spending program.

## Top k  2
House Budget Committee passed a spending bill. House Budget Committee passes a spending bills. House budget committee passed a bill to fund the government. House passed a spent bill. No amendments were passed. House will vote on the bill in September. House is scheduled to vote again in December.

## Top k  3
House Budget Committee passed a spending bill. House Budget Committee pass a spending passed a bill. Congress passes a spending. bill. It passed by House vote. The bill will be signed into law by President Barack Obama. The Senate will have to vote again on the bill.

## Top k  4
House Budget Committee passed a spending bill. House Budget Committee Passed a spendin

### Top-p sampling

In [10]:
sentence = 'The House Budget Committee passed a spending bill.'

generation_config = model.generation_config
generation_config.do_sample = True
generation_config.num_beams = 1
generation_config.temperature = 1

print(generation_config)


GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "max_new_tokens": 150,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "output_attentions": true,
  "pad_token_id": 1,
  "top_p": 0.8,
  "transformers_version": "4.29.2"
}



In [11]:
for n in range(1,6):
    generation_config.top_p = 0.2*n-0.05
    print("## Top p ", generation_config.top_p)
    decode_and_print(model, generation_config, sentence)
    print()


## Top p  0.15000000000000002
House Budget Committee passed a spending bill. House Budget Committee passing a spendingBill. House budget Committee passed spending bill for the year. House passed spending Bill. House passes spending bill, House passes bill. Senate passes spending Bill, House votes on bill. The bill is expected to be signed into law.

## Top p  0.35000000000000003
House Budget Committee passed a spending bill. House Budget Committee passing a spending Bill. House passed a bill to fund the government. House passes a spendingBill. House votes on the bill. The bill is sent to the House of Representatives. The House votes to pass the bill on a vote of approval.

## Top p  0.55
House Budget Committee passed a spending bill. House Budget Committee Passed a spendingBill. House has passed a bill to spend the money. House passed a measure to fund the government. House will now vote on the bill again. House also passed a separate bill to fund a program to help the military.

## To

### Contrastive Search
https://huggingface.co/blog/introducing-csearch

### Return sequences

In [19]:
sentence = 'The House Budget Committee passed a spending bill.'

encoded_input_ids_1 = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)

with torch.no_grad():
    generation_output = model.generate(
        input_ids = encoded_input_ids_1,
        num_return_sequences=5, 
        generation_config = generation_config,
        return_dict_in_generate = True,
        output_scores = True
    )

for s in generation_output.sequences:
    print("Output: ")
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(output)
    print()


Output: 
House Budget Committee passes a spending bill that would fund the national debt. House Committee passes the bill without incident. House Speaker John Boehner and the House Appropriations Committee take up a bill later this week. But the Senate version will soon pass after a vote from Speaker Boehner and a House vote by the Senate committee.

Output: 
House Budget Committee passed a spending bill. House Budget Committee. passed a funding bill for education, energy, health and human services. House budget bill passed by committee. House vote set for next week. House passed Senate bill. Vote for House bill expected Tuesday. Republicans in House have proposed a bill that would fund the government through a compromise.

Output: 
The House Budget Committee passed a spending bill. House Budget committee passed a  spending bill; the Senate voted on a new spending bill Saturday. House Speaker John Boehner said the spending bill's passage would allow him to balance the budget. The bill 

## Beam Search

In [9]:
sentence = 'The House Budget Committee passed a spending bill.'

generation_config = model.generation_config
generation_config.do_sample = False
generation_config.num_beams = 1



In [10]:
for n in range(1,6):

    print("## Beam size of ", n)
    generation_config.num_beams = n
    decode_and_print(model, generation_config, sentence)
    print()


## Beam size of  1
House Budget Committee passed a spending bill. House Budget Committee passing a spendingBill. House budget Committee passed spending bill, passed bill. Bill passed by House Budget committee. House passed spending Bill. House passes spending bill; bill passed by Senate. House votes on bill. Senate votes on spending bill and passes bill.

## Beam size of  2
House Budget Committee passed a spending bill. House Budget Committee passing a bill to fund the government. House budget committee passed a bill that would fund the U.S. government through 2018. House passed a budget bill that will fund the country's government through 2019. The bill was passed by the House of Representatives and the Senate.

## Beam size of  3
House Budget Committee passes a spending bill. House Budget Committee passed a spending bills. House budget committee passed a bill to fund the government. The bill was passed by the House of Representatives. The Senate will vote on the bill later this month

# Decoding with Constraints



In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import transformers
import torch 

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

device = "cpu"

def cuda_info():
    print()
    print("cuda.is_available: \t", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("cuda.device_count: \t", torch.cuda.device_count())
        print("cuda.current_device: \t", torch.cuda.current_device())
        print("cuda.device: \t\t", torch.cuda.device(torch.cuda.current_device()))
        print()
        print("cuda.get_device_name: \t", torch.cuda.get_device_name(torch.cuda.current_device()))
        print("total memory: \t\t", torch.cuda.get_device_properties(0).total_memory)
        print("reserved memory:\t", torch.cuda.memory_reserved(0))
        print("allocated memory:\t", torch.cuda.memory_allocated(0))


    device = "cuda:" + str(torch.cuda.current_device()) if torch.cuda.is_available() else "cpu"
    print()
    print("device name: \t\t", device)
    print("transformers: \t\t", transformers.__version__)
    print("pytorch: \t\t", torch.__version__)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

cuda_info()


cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7f5c4f47be20>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 0
allocated memory:	 0

device name: 		 cuda:0
transformers: 		 4.29.2
pytorch: 		 2.0.0


## Repetitions and word lists
### n-gram Repetitions

In [2]:
sentence = 'The House Budget Committee passed a spending bill'

encoded_input_ids_1 = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)

with torch.no_grad():
    generation_output = model.generate(
        input_ids = encoded_input_ids_1,
        no_repeat_ngram_size=1,
        return_dict_in_generate = True,
        output_scores = True
    )

for s in generation_output.sequences:
    print("Output: ")
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(output)
    print()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output: 
The House Budget Committee passed a spending bill on Thursday that would cut the deficit by $1.3



### Force words and bad words


In [129]:
sentence = 'The soldiers'
input_ids = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)

## Forced words
force_disjunctive = ["day two", "day one"]
force_phrasal = "leave now or die"

force_words_ids = [ tokenizer(force_disjunctive, add_special_tokens=False).input_ids,
                    tokenizer(force_phrasal, add_special_tokens=False).input_ids
                  ]

print("## Force word ids:")
for word_ids in force_words_ids:
    if isinstance(word_ids[0], list):
        print("  DisjunctiveConstraint: ", word_ids)
    else:
        print("  PhrasalConstraint: ", word_ids)


## Force word ids:
  DisjunctiveConstraint:  [[820, 734], [820, 530]]
  PhrasalConstraint:  [47408, 783, 393, 4656]


In [93]:
## Bad words
bad_words_set = ["whom", "year"]
bad_words_ids = tokenizer(bad_words_set, add_special_tokens=False).input_ids

print("## Bad word ids:")
for word_ids in bad_words_ids:
    if isinstance(word_ids[0], list):
        print("DisjunctiveConstraint: ", word_ids)
    else:
        print("PhrasalConstraint: ", word_ids)


## Bad word ids:
PhrasalConstraint:  [1929, 296]
PhrasalConstraint:  [1941]


In [None]:

generation_output = model.generate(
    input_ids = input_ids,
    force_words_ids=force_words_ids,
    bad_words_ids=bad_words_ids,
    num_beams = 10,
    num_return_sequences=1,
    no_repeat_ngram_size=6,
    remove_invalid_values=True,
    output_scores = True
)

for s in generation_output:
    print("## Output: ")
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(output)
    print()


## Constraints



### Phrasal Constraint

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint

#tokenizer = AutoTokenizer.from_pretrained("t5-base")
#model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(device)

encoder_input_str = "The soldiers"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids.to(device)


force_flexible_set = 'at the base'
tk_list = tokenizer(force_flexible_set, add_special_tokens=False).input_ids

constraints = [
    PhrasalConstraint(tk_list)
]

outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=5,
    max_length = 30,
    remove_invalid_values=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The soldiers, who had been stationed at the base, had been ordered to leave the area.

The soldiers, who were stationedat the base


### Disjunctive Constraints

In [24]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint, DisjunctiveConstraint

encoder_input_str = "The soldiers"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids.to(device)

force_words_set1 = [" stationed", "night"]
words_ids_set1 = tokenizer(force_words_set1, add_special_tokens=False).input_ids
print(words_ids_set1)

constraints = [
    DisjunctiveConstraint(words_ids_set1)
]


[[25967], [3847]]


In [23]:
tokenizer.convert_ids_to_tokens(25967)

'Ġstationed'

In [25]:
outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=1,
    max_length = 30,
    no_repeat_ngram_size=6,
    remove_invalid_values=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The soldiers, who had been stationed at the base, were taken to a nearby hospital, where they were treated for minor injuries and released.




### List of Constraints

In [34]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint, DisjunctiveConstraint

# The prompt
encoder_input_str = "The soldiers"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids.to(device)

# First constraint
force_words_set1 = [" stationed", "in the field"]
words_ids_set1 = tokenizer(force_words_set1, add_special_tokens=False).input_ids
constraint_1 = DisjunctiveConstraint(words_ids_set1)

print()
print(force_words_set1)
print(constraint_1.trie.trie)

# Second constraint
force_words_set2 = [" hospital"]
words_ids_set2 = tokenizer(force_words_set2, add_special_tokens=False).input_ids
constraint_2 = DisjunctiveConstraint(words_ids_set2)

print()
print(force_words_set2)
print(constraint_2.trie.trie)

# Third constraint
force_flexible_set = " at the battle"
phrasal_constraints = tokenizer(force_flexible_set, add_special_tokens=False).input_ids
constraint_3 = PhrasalConstraint(phrasal_constraints)

print()
print(force_flexible_set)
print(constraint_3.token_ids)

# The list of constraints
constraints = [ constraint_1, constraint_2,constraint_3 ]



[' stationed', 'in the field']
{25967: {}, 259: {262: {2214: {}}}}

[' hospital']
{4436: {}}

 at the battle
[379, 262, 3344]


In [35]:
outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=1,
    max_length = 30,
    no_repeat_ngram_size=5,
    remove_invalid_values=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The soldiers stationed at the base were not allowed to leave the base until the end of the war.

"We were told at the battle hospital


## Low-level API

In [13]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    ConstrainedBeamSearchScorer,
    PhrasalConstraint, MaxLengthCriteria,
    LogitsProcessorList, StoppingCriteriaList,
    MinLengthLogitsProcessor
)

# lets run beam search using 3 beams
num_beams = 3

encoder_input_str = "The soldier"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids.to(device)

input_ids = input_ids.repeat_interleave(num_beams, dim=0)

constraint_str = ["black", "country"]
constraint_token_ids = tokenizer.encode(constraint_str)[:-1]  # slice to remove eos token
constraints = [PhrasalConstraint(token_ids=constraint_token_ids)]

# instantiate beam scorer
beam_scorer = ConstrainedBeamSearchScorer(
    batch_size=1, num_beams=num_beams, device=model.device, max_length = 50, constraints=constraints
)

# instantiate logits processors
logits_processor = LogitsProcessorList(
    [
        MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
    ]
)

outputs = model.constrained_beam_search(
    input_ids, beam_scorer, constraints=constraints, stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=50)]), logits_processor=logits_processor
)

tokenizer.batch_decode(outputs, skip_special_tokens=True)

['The soldier, who was wearing a black T-shirt and jeans, said he had been in the country for two years.\n\n"I was in the country for two years. I was in the country for two years," he said.black']

# Summary