https://huggingface.co/docs/transformers/main/en/model_doc/gpt2

https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf

https://huggingface.co/openai-community/gpt2

*GPT-2 is trained with a simple objective: predict the next word, given all of the previous words within some text.*

*a causal language modeling (CLM) objective.*


# 一、GPT2Model

> *The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.*

In [4]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

model_id = "/data0/lizhong/models/gpts/gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_id)
# ? tokenizer.eos_token  is ''
tokenizer.pad_token = tokenizer.eos_token
# or tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# not  tokenizer.pad_token='[PAD]'
# https://github.com/huggingface/transformers/issues/22312#issuecomment-1479544526
# https://github.com/huggingface/transformers/issues/22312#issuecomment-1479574070
# https://github.com/huggingface/transformers/issues/22312#issuecomment-1482588993

# Indeed the original sentencepiece model does not have a padding token.
# You can probably pad using the eos_token like it is done for GPT2.

model = GPT2Model.from_pretrained(model_id)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(inputs.input_ids[0]))

# forward
outputs = model(
    **inputs,   # input_ids, attention_mask

    output_attentions=True,
    output_hidden_states=True,
    return_dict=True,

    use_cache=True,  # decoder-only & inference
)

outputs.keys()
# outputs.hidden_states  # 13,  [1,6,768]
# outputs.last_hidden_state # [1,6,768]
# outputs.past_key_values # 12, [1,12,6,64]
# outputs.attentions  # 12, [1,12,6,6]

odict_keys(['last_hidden_state', 'past_key_values', 'hidden_states', 'attentions'])

# first next token
last_hidden_state # [1, 6, 768]
hidden_states # 13, [1, 6, 768]  12+last_hidden_state
attentions(probs) # 12, [1, 12, 6, 6]
past_key_values # 12, 2, [1, 12, 6, 64] 

## 二、 GPT2LMHeadModel

> *The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).*

https://huggingface.co/blog/zh/how-to-generate

In [5]:
# from transformers import GPT2Tokenizer, GPT2LMHeadModel

# model_id = "/data0/lizhong/models/GPT/gpt2"
# device = "cuda:0"
# tokenizer = GPT2Tokenizer.from_pretrained(model_id)
# model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# model.eval()

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "/data0/lizhong/models/gpts/gpt2"
device = "cuda:0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
model.eval()

input_ids = tokenizer.encode(
    "Hello, my dog is cute", return_tensors='pt').to(model.device)

# self.lm_head.weight is self.transformer.wte.weight  # True

# forward
outputs = model(
    input_ids,
    labels=input_ids,  # 注意：可有，可无

    return_dict=True,
    output_attentions=True,
    output_hidden_states=True,

    # use_cache=True,
)

outputs.keys()
# outputs.attentions
# outputs.hidden_states
# outputs.past_key_values
# outputs.logits
# outputs.loss

odict_keys(['loss', 'logits', 'past_key_values', 'hidden_states', 'attentions'])

In [6]:
# from transformers import GPT2Tokenizer, GPT2LMHeadModel
# model_id = "/data0/lizhong/models/GPT/gpt2"
# device = "cuda:0"
# tokenizer = GPT2Tokenizer.from_pretrained(model_id)
# model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# model.eval()


from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_id = "/data0/lizhong/models/gpts/gpt2"
device = "cuda:0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
model.eval()

input_ids = tokenizer.encode(
    "Hello, my dog is cute", return_tensors='pt').to(model.device)


# generate
outputs = model.generate(
    input_ids,
    max_length=10,

    return_dict_in_generate=True,
    output_attentions=True,
    output_hidden_states=True,

    output_scores=True,
    output_logits=True,
)


outputs.keys()
# outputs.attentions
# outputs.hidden_states
# outputs.past_key_values
# outputs.logits
# outputs.loss
# outputs.sequences

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


odict_keys(['sequences', 'scores', 'logits', 'attentions', 'hidden_states', 'past_key_values'])

# 三、不同的解码策略

http://fancyerii.github.io/2023/12/19/hg-transformer-generate/

https://huggingface.co/docs/transformers/main_classes/text_generation

#### stopping_criteria

In [8]:
from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList, \
    STOPPING_CRITERIA_INPUTS_DOCSTRING, add_start_docstrings
from typing import List
import torch

class StopAtSpecificTokenCriteria(StoppingCriteria):
    """
    当生成出第一个指定token时，立即停止生成
    ---------------
    ver: 2023-08-02
    by: changhongyu
    """
    def __init__(self, token_id_list: List[int] = None):
        """
        :param token_id_list: 停止生成的指定token的id的列表
        """
        self.token_id_list = token_id_list
        
    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # return np.argmax(scores[-1].detach().cpu().numpy()) in self.token_id_list
        # 储存scores会额外占用资源，所以直接用input_ids进行判断
        return input_ids[0][-1].detach().cpu().numpy() in self.token_id_list


stopping_criteria = StoppingCriteriaList()
stopping_criteria.append(StopAtSpecificTokenCriteria(token_id_list=[13])) # '.'

#### logits_processor

In [46]:
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList
from typing import List
import torch

class custom_logits_processor(LogitsProcessor):
    def __init__(self, forbid_token_id_list: List[int] = None):
        self.forbid_token_id_list = forbid_token_id_list

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        for id_ in self.forbid_token_id_list:
            scores[:, id_] = -float('inf')
        return scores


logits_processor_list = LogitsProcessorList([custom_logits_processor([50256, 38537]),])
logits_processor_list
# https://discuss.huggingface.co/t/use-custom-logitsprocessor-in-model-generate/11603

[<__main__.custom_logits_processor at 0x7f1365586a70>]

In [37]:
tokenizer.convert_tokens_to_ids(['beauty','danger', '.'])

[50256, 38537, 13]

In [45]:

# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

# model.config

# 模型的解码策略是在模型的生成配置中定义的 generation_config
# model.generation_config 只会显示与默认生成配置不同的值，而不列出任何默认值。
# http://fancyerii.github.io/2023/12/19/hg-transformer-generate/
# https://huggingface.co/docs/transformers/main_classes/text_generation
# https://huggingface.co/docs/transformers/v4.42.0/en/main_classes/text_generation#transformers.GenerationConfig

prompt = "Once upon a time"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
print(input_ids)

# streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate
# https://huggingface.co/docs/transformers/v4.42.0/en/main_classes/text_generation#transformers.GenerationMixin
# https://huggingface.co/docs/transformers/v4.42.0/en/main_classes/text_generation#transformers.GenerationMixin.generate
output = model.generate(
    input_ids,
    max_length=50,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    logits_processor=logits_processor_list,  # 修改当前step在词表空间上的概率分布
    stopping_criteria=stopping_criteria,  # 根据用户所规定的规则来中止生成
    # streamer=streamer,
)  
# 通过直接将参数及其值传递给generate方法来覆盖任何generation_config, 例如 max_length, max_new_tokens, do_sample, top_k, eos_token_id...
print(output)
# tensor([[7454, 2402,  257,  640,   11,  262,  995,  373,  257, 1295,  286, 1049,
#          8737,  290, 1049, 3514,   13,  383,  995,  373]], device='cuda:1')
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
# Once upon a time, the world was a place of great beauty and great danger. The world was

tensor([[7454, 2402,  257,  640]], device='cuda:1')
tensor([[7454, 2402,  257,  640,   11,  262,  995,  373,  257, 1295,  286, 1049,
         8737,  290, 1049, 3514,   13]], device='cuda:1')
Once upon a time, the world was a place of great beauty and great danger.


In [2]:
# model.transformer
model.transformer.get_input_embeddings().weight

# model.lm_head
model.get_output_embeddings().weight

import torch
torch.equal(
    model.transformer.get_input_embeddings().weight, 
    model.get_output_embeddings().weight
)

True

## 两种搜索解码：do_sample=False


#### (1) greedy search  (num_beams=1)
在每个时间步中，选择预测概率分布中概率最大的作为下一个token

In [23]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

prompt = "Once upon a time"
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
inputs = tokenizer(prompt, return_tensors='pt')  # input_ids, attention_mask
print(inputs)
for key in inputs:
    inputs[key] = inputs[key].to(model.device)

# (1) greedy
generation_config = GenerationConfig(
    # max_lehgth=30,
    max_new_tokens=30, 
    eos_token_id=model.config.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)
# generation_config.save_pretrained(".myGenerationConfig", push_to_hub=True)

outputs = model.generate(
    # input_ids,
    **inputs,
    generation_config=generation_config
)
print(outputs)
print("Output:\n" + 100 * '-')
# print(tokenizer.decode(output[0], skip_special_tokens=True))
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
tensor([[7454, 2402,  257,  640,   11,  262,  995,  373,  257, 1295,  286, 1049,
         8737,  290, 1049, 3514,   13,  383,  995,  373,  257, 1295,  286, 1049,
         3514,   11,  290,  262,  995,  373,  257, 1295,  286, 1049]],
       device='cuda:1')
Output:
----------------------------------------------------------------------------------------------------
Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great


#### (2)  beam search  (num_beams>1)
在每个时间步中，选择预测概率分布的前num_beam作为候选继续进行搜索，在所有候选的完整序列（路径中）选择概率最大的输出。

In [21]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

prompt = "Once upon a time"
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
inputs = tokenizer(prompt, return_tensors='pt')  # input_ids, attention_mask
print(inputs)
for key in inputs:
    inputs[key] = inputs[key].to(model.device)

# (2) beam search
generation_config = GenerationConfig(
    # max_lehgth=30,
    max_new_tokens=30,
    num_beams=3, 
    eos_token_id=model.config.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)
# generation_config.save_pretrained(".myGenerationConfig", push_to_hub=True)

outputs = model.generate(
    # input_ids,
    **inputs,
    generation_config=generation_config
)
print(outputs)
print("Output:\n" + 100 * '-')
# print(tokenizer.decode(output[0], skip_special_tokens=True))
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
tensor([[ 7454,  2402,   257,   640,    11,   340,   373,   531,   326,   262,
          4453,   550,   531,   284, 19010,    11,   366,    40,   481,  1577,
           345,   262,  8251,   286,   262, 13239,   286,  9538,    11,   290,
           314,   481,  1577,   345]], device='cuda:1')
Output:
----------------------------------------------------------------------------------------------------
Once upon a time, it was said that the Lord had said to Moses, "I will give you the keys of the kingdom of heaven, and I will give you


## 两种采样解码  do_sample=True

(1) Multinomial sampling（num_beams=1）

多项式采样生成

In [26]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

prompt = "Once upon a time"
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
inputs = tokenizer(prompt, return_tensors='pt')  # input_ids, attention_mask
print(inputs)
for key in inputs:
    inputs[key] = inputs[key].to(model.device)

#  Multinomial sampling
generation_config = GenerationConfig(
    # max_lehgth=30,
    max_new_tokens=30,
    do_sample=True, 
    eos_token_id=model.config.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)
# generation_config.save_pretrained(".myGenerationConfig", push_to_hub=True)

outputs = model.generate(
    # input_ids,
    **inputs,
    generation_config=generation_config
)
print(outputs)
print("Output:\n" + 100 * '-')
# print(tokenizer.decode(output[0], skip_special_tokens=True))
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
tensor([[ 7454,  2402,   257,   640,   339,   531,   284,  2241,    11,   366,
            40,   714,  1107,   466,   340,   329,   720,  1270,    13,  2011,
         17695,   318,  1642,   257,  1256,   286,  1637,   257,   614,    11,
           826,   526,  1406,    11]], device='cuda:1')
Output:
----------------------------------------------------------------------------------------------------
Once upon a time he said to himself, "I could really do it for $30. My grandfather is making a lot of money a year, right." So,


#### (2) beam search + multionmial sampling （num_beams>1）

多项式采样 + 束搜索

In [28]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

prompt = "Once upon a time"
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
inputs = tokenizer(prompt, return_tensors='pt')  # input_ids, attention_mask
print(inputs)
for key in inputs:
    inputs[key] = inputs[key].to(model.device)

#  Multinomial sampling
generation_config = GenerationConfig(
    # max_lehgth=30,
    max_new_tokens=30,
    do_sample=True, 
    num_beams=3,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)
# generation_config.save_pretrained(".myGenerationConfig", push_to_hub=True)

outputs = model.generate(
    # input_ids,
    **inputs,
    generation_config=generation_config
)
print(outputs)
print("Output:\n" + 100 * '-')
# print(tokenizer.decode(output[0], skip_special_tokens=True))
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
tensor([[7454, 2402,  257,  640,   11,  340,  373,  531,  326,  611,  257,  582,
          550,  262,  826,  284,  257, 3656,   11,  339,  815,  423,  262,  826,
          284,  257, 3367,   11,  290,  262,  826,  284,  257, 4957]],
       device='cuda:1')
Output:
----------------------------------------------------------------------------------------------------
Once upon a time, it was said that if a man had the right to a wife, he should have the right to a son, and the right to a daughter


#### 需要注意几点：
（1）beam-based methods

 `early_stopping`

It accepts the following values（ Controls the stopping condition）

True, where the generation stops as soon as there are num_beams complete candidates; 

False, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; 

"never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).

In [30]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

prompt = "Once upon a time"
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
inputs = tokenizer(prompt, return_tensors='pt')  # input_ids, attention_mask
print(inputs)
for key in inputs:
    inputs[key] = inputs[key].to(model.device)

# (2) beam search
generation_config = GenerationConfig(
    # max_lehgth=30,
    max_new_tokens=30,
    num_beams=3, 
    early_stopping=True,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=tokenizer.eos_token_id

)
# generation_config.save_pretrained(".myGenerationConfig", push_to_hub=True)

outputs = model.generate(
    # input_ids,
    **inputs,
    generation_config=generation_config
)
print(outputs)
print("Output:\n" + 100 * '-')
# print(tokenizer.decode(output[0], skip_special_tokens=True))
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
tensor([[ 7454,  2402,   257,   640,    11,   340,   373,   531,   326,   262,
          4453,   550,   531,   284, 19010,    11,   366,    40,   481,  1577,
           345,   262,  8251,   286,   262, 13239,   286,  9538,    11,   290,
           314,   481,  1577,   345]], device='cuda:1')
Output:
----------------------------------------------------------------------------------------------------
Once upon a time, it was said that the Lord had said to Moses, "I will give you the keys of the kingdom of heaven, and I will give you


#### （2）beam-based methods

`num_return_sequences`
 
 The number of independently computed returned sequences for each element in the batch.

In [34]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

prompt = "Once upon a time"
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
inputs = tokenizer(prompt, return_tensors='pt')  # input_ids, attention_mask
print(inputs)
for key in inputs:
    inputs[key] = inputs[key].to(model.device)

# (2) beam search
generation_config = GenerationConfig(
    # max_lehgth=30,
    max_new_tokens=30,
    num_beams=3, 
    # early_stopping=True,
    num_return_sequences=3,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=tokenizer.eos_token_id

)
# generation_config.save_pretrained(".myGenerationConfig", push_to_hub=True)

outputs = model.generate(
    # input_ids,
    **inputs,
    generation_config=generation_config
)
print(outputs)
print("Output:\n" + 100 * '-')
# print(tokenizer.decode(output[0], skip_special_tokens=True))
from pprint import pprint
pprint(tokenizer.batch_decode(outputs, skip_special_tokens=True))

{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
tensor([[ 7454,  2402,   257,   640,    11,   340,   373,   531,   326,   262,
          4453,   550,   531,   284, 19010,    11,   366,    40,   481,  1577,
           345,   262,  8251,   286,   262, 13239,   286,  9538,    11,   290,
           314,   481,  1577,   345],
        [ 7454,  2402,   257,   640,    11,   340,   373,   531,   326,   262,
          4453,   550,   531,   284, 19010,    11,   366,    40,   481,  1577,
           345,   262,  8251,   286,   262, 13239,   286,  9538,    11,   290,
           314,   481,   787,   345],
        [ 7454,  2402,   257,   640,    11,   340,   373,   531,   326,   262,
          4453,   550,   531,   284, 19010,    11,   366,    40,   481,  1577,
           345,   262,  8251,   286,   262, 13239,   286,  9538,    11,   290,
           345,  2236,   423, 43866]], device='cuda:1')
Output:
---------------------------------------------------------

## 四、return_dict_in_generate

 Whether or not to return a ModelOutput instead of a plain tuple.
 
https://zhuanlan.zhihu.com/p/383585103

https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175/15

In [8]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# device = "cuda:1"
# model_name_or_path = "/data0/lizhong/models/gpt/gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

device = "cuda:1"
model_name_or_path = "/data0/lizhong/models/gpts/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
model.eval()

prompt = "Once upon a time"
# input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
inputs = tokenizer(prompt, return_tensors='pt')  # input_ids, attention_mask
print(inputs)
for key in inputs:
    inputs[key] = inputs[key].to(model.device)

# (2) beam search
generation_config = GenerationConfig(
    max_length=10,
    # max_new_tokens=30,
    num_beams=3, 
    # early_stopping=True,
    num_return_sequences=3,

    return_dict_in_generate=True,
    
    output_attentions=True,
    output_hidden_states=True,
    output_scores=True,
    output_logits=True, 

    eos_token_id=model.config.eos_token_id,
    pad_token_id=tokenizer.eos_token_id

)
# generation_config.save_pretrained(".myGenerationConfig", push_to_hub=True)

outputs = model.generate(
    # input_ids,
    **inputs,
    generation_config=generation_config
)
print(outputs.keys())
# print("Output:\n" + 100 * '-')
# # print(tokenizer.decode(output[0], skip_special_tokens=True))
# from pprint import pprint
# pprint(tokenizer.batch_decode(outputs, skip_special_tokens=True))

{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
odict_keys(['sequences', 'sequences_scores', 'scores', 'logits', 'beam_indices', 'attentions', 'hidden_states', 'past_key_values'])


$$\sum_t^{T}\log p(y_t|x,y_{<t})$$


### 3.1 sequences

In [61]:
print("Output:\n" + 100 * '-')
pprint(tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True))

print("\nOutput ids:\n" + 100 * '-')
pprint(outputs.sequences)

print("\nInput ids:\n"  + 100 * '-')
pprint(inputs.input_ids)

print(f"\nGenerate ids: {gen_sequences.shape}\n"  + 100 * '-')
gen_sequences = outputs.sequences[:, inputs.input_ids.shape[-1]:]
pprint(gen_sequences) 

Output:
----------------------------------------------------------------------------------------------------
['Once upon a time, it was said that the',
 'Once upon a time, it was said, the',
 'Once upon a time, it was said, "']

Output ids:
----------------------------------------------------------------------------------------------------
tensor([[7454, 2402,  257,  640,   11,  340,  373,  531,  326,  262],
        [7454, 2402,  257,  640,   11,  340,  373,  531,   11,  262],
        [7454, 2402,  257,  640,   11,  340,  373,  531,   11,  366]],
       device='cuda:1')

Input ids:
----------------------------------------------------------------------------------------------------
tensor([[7454, 2402,  257,  640]], device='cuda:1')

Generate ids: torch.Size([3, 6])
----------------------------------------------------------------------------------------------------
tensor([[ 11, 340, 373, 531, 326, 262],
        [ 11, 340, 373, 531,  11, 262],
        [ 11, 340, 373, 531,  11, 366]], de

### 3.2 scores
https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175/19

In [157]:
 # generate 6 tokens 
print(f"\nScores: {len(outputs.scores)} {outputs.scores[0].shape}\n"  + 100 * '-')
# pprint(outputs.scores) 

# let's stack the logits generated at each step to a tensor and transform logits to probs
import torch
probs = torch.stack(outputs.scores, dim=1).softmax(-1) 
print(f"\nProbs: {probs.shape}\n"  + 100 * '-')
# pprint(probs)

print(f"\ngen_sequences:\n"  + 100 * '-')
pprint(gen_sequences)


print(f"\ngen_probs:\n"  + 100 * '-')
print(probs[0][0][11], probs[0][1][340], probs[0][2][373], probs[0][3][531], probs[0][4][326], probs[0][5][262])
print(probs[1][0][11], probs[1][1][340], probs[1][2][373], probs[1][3][531], probs[1][4][11], probs[1][5][262])
print(probs[2][0][11], probs[2][1][340], probs[2][2][373], probs[2][3][531], probs[2][4][11], probs[2][5][366])


Scores: 6 torch.Size([3, 50257])
----------------------------------------------------------------------------------------------------

Probs: torch.Size([3, 6, 50257])
----------------------------------------------------------------------------------------------------

gen_sequences:
----------------------------------------------------------------------------------------------------
tensor([[ 11, 340, 373, 531, 326, 262],
        [ 11, 340, 373, 531,  11, 262],
        [ 11, 340, 373, 531,  11, 366]], device='cuda:1')

gen_probs:
----------------------------------------------------------------------------------------------------
tensor(0.4269, device='cuda:1') tensor(0.0399, device='cuda:1') tensor(4.6807e-05, device='cuda:1') tensor(0.0442, device='cuda:1') tensor(0.0001, device='cuda:1') tensor(0.1809, device='cuda:1')
tensor(0.4269, device='cuda:1') tensor(4.9092e-05, device='cuda:1') tensor(0.1408, device='cuda:1') tensor(0.0005, device='cuda:1') tensor(0.0002, device='cuda:1') te

In [151]:
# pprint(gen_sequences[:,:,None]) # torch.Size([3, 6, 1]) 

# now we need to collect the probability of the generated token
# we need to add a dummy dim in the end to make gather work
print(f"\nGen_probs: {gen_probs.shape}\n"  + 100 * '-')
gen_probs = torch.gather(probs, 2, gen_sequences[:, :, None]).squeeze(-1) 
pprint(gen_probs)

# now we can do all kinds of things with the probs

# 1) the probs that exactly those sequences are generated again those are normally going to be very small
unique_prob_per_sequence = gen_probs.prod(-1)
print(f"\nUnique_prob_per_sequence:\n"  + 100 * '-')
print(unique_prob_per_sequence)

print(f"\nLog_prob_per_sequence:\n"  + 100 * '-')
print(torch.log(unique_prob_per_sequence))

print(f"\nSequences_scores:\n"  + 100 * '-')
print(outputs.sequences_scores)


# 2) normalize the probs over the three sequences
print(f"\nnormed_gen_probs:\n"  + 100 * '-')
normed_gen_probs = gen_probs / gen_probs.sum(0)
print(normed_gen_probs)

# assert normed_gen_probs[:, 0].sum() == 1.0, "probs should be normalized"

# 3) compare normalized probs to each other like in 1)
print(f"\nunique_normed_prob_per_sequence:\n"  + 100 * '-')
unique_normed_prob_per_sequence = normed_gen_probs.prod(-1)
print(unique_normed_prob_per_sequence)


Gen_probs: torch.Size([3, 6])
----------------------------------------------------------------------------------------------------
tensor([[4.2690e-01, 3.9930e-02, 4.6807e-05, 4.4169e-02, 1.2273e-04, 1.8086e-01],
        [4.2690e-01, 4.9092e-05, 1.4079e-01, 4.9947e-04, 1.5632e-04, 1.6078e-01],
        [4.2690e-01, 2.1966e-05, 3.2620e-01, 2.7762e-03, 3.3758e-01, 3.5912e-03]],
       device='cuda:1')

Unique_prob_per_sequence:
----------------------------------------------------------------------------------------------------
tensor([7.8224e-13, 3.7039e-14, 1.0295e-11], device='cuda:1')

Log_prob_per_sequence:
----------------------------------------------------------------------------------------------------
tensor([-27.8766, -30.9268, -25.2993], device='cuda:1')

Sequences_scores:
----------------------------------------------------------------------------------------------------
tensor([-1.8291, -1.8709, -1.9963], device='cuda:1')

normed_gen_probs:
----------------------------------

### 3.3 past_key_values

In [119]:
print(len(outputs.past_key_values))  # 3个seq，12head，9token，64dim
print(outputs.past_key_values[0][0].shape) # key
print(outputs.past_key_values[0][1].shape) # value

12
torch.Size([3, 12, 9, 64])
torch.Size([3, 12, 9, 64])


## 3.3 attentions


In [120]:
print(len(outputs.attentions)) # 生成6个新token
print(len(outputs.attentions[0]))  # 生成每个token时，保存12层attention_probs
print(outputs.attentions[0][11].shape)  # 生成第一个token时，需要计算prompt中所有token之间的关系
print(outputs.attentions[1][11].shape)  # 其他只需计算新token与之前的token之间的关系即可
print(outputs.attentions[2][11].shape)
print(outputs.attentions[3][11].shape)
print(outputs.attentions[4][11].shape)
print(outputs.attentions[5][11].shape)

6
12
torch.Size([3, 12, 4, 4])
torch.Size([3, 12, 1, 5])
torch.Size([3, 12, 1, 6])
torch.Size([3, 12, 1, 7])
torch.Size([3, 12, 1, 8])
torch.Size([3, 12, 1, 9])


In [143]:
print(outputs.attentions[0][-1].shape)  # 生成第一个token，最后一层attention_probs
# torch.Size([3, 12, 4, 4])  3个seq, 12个头
pprint(outputs.attentions[0][-1][0][-1]) # 第1个seq，最后head
pprint(outputs.attentions[0][-1][1][-1]) # 第2个seq，最后head

torch.Size([3, 12, 4, 4])
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.8461, 0.1539, 0.0000, 0.0000],
        [0.6469, 0.2558, 0.0973, 0.0000],
        [0.6288, 0.1199, 0.1860, 0.0653]], device='cuda:1')
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.8461, 0.1539, 0.0000, 0.0000],
        [0.6469, 0.2558, 0.0973, 0.0000],
        [0.6288, 0.1199, 0.1860, 0.0653]], device='cuda:1')


### hidden_states

In [108]:
print(len(outputs.hidden_states))
print(len(outputs.hidden_states[0]))
print(outputs.hidden_states[0][12].shape)
print(outputs.hidden_states[1][12].shape)
print(outputs.hidden_states[5][12].shape)

6
13
torch.Size([3, 4, 768])
torch.Size([3, 1, 768])
torch.Size([3, 1, 768])


In [28]:
#  if labels is not None:
#     # move labels to correct device to enable model parallelism
#     labels = labels.to(lm_logits.device)
#     # Shift so that tokens < n predict n
#     shift_logits = lm_logits[..., :-1, :].contiguous()  # 
#     shift_labels = labels[..., 1:].contiguous()  #
#     # Flatten the tokens
#     loss_fct = CrossEntropyLoss()
#     loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))