In [1]:
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


# 截断案例

In [2]:
# Checkpoint name on the Hugging Face Hub: https://huggingface.co/google-bert/bert-base-chinese
# This demo uses bert-base-chinese.
model_name = "bert-base-chinese"

# Load the tokenizer and the model for that checkpoint.
# NOTE(review): `model` is not used in this cell; presumably a later cell needs it — confirm.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenizer demo (truncation): encode one sentence into a list of exactly max_length token ids.
output = tokenizer.encode(
    # Text to encode
    text='我喜欢自然语言处理，因为它很有趣',
    # Maximum length of the encoded sequence
    max_length=10,
    # Truncate the sequence when it would exceed max_length
    truncation=True,
    # Pad the sequence up to max_length when it is shorter
    padding='max_length',
    # None means the ids come back as a plain Python list
    return_tensors=None,
)
print(output) # [101, 2769, 1599, 3614, 5632, 4197, 6427, 6241, 1905, 102]
tokenizer.decode(output) # '[CLS] 我 喜 欢 自 然 语 言 处 [SEP]' — the text is encoded in the form BERT consumes, including special tokens such as [CLS] and [SEP]
# The decoded result shows the special tokens the tokenizer added around the text:
# [CLS] (id 101) opens the sequence; it is conventionally used to represent the meaning of the whole sentence.
# [SEP] (id 102) closes the sequence.
# The 10 ids include both special tokens, so only the first 8 characters of the text survive truncation.
# No [PAD] token appears here because the sequence already fills max_length; padding only applies to shorter inputs.

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[101, 2769, 1599, 3614, 5632, 4197, 6427, 6241, 1905, 102]


'[CLS] 我 喜 欢 自 然 语 言 处 [SEP]'

# 填充案例

In [3]:
# Checkpoint name on the Hugging Face Hub.
model_name = "bert-base-chinese"

# Load the tokenizer and the model for that checkpoint.
# NOTE(review): this repeats the loading done in the truncation cell, and `model` is not used in this cell —
# presumably kept so the cell can run on its own; confirm before removing.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenizer demo (padding): encode a short sentence and pad it to exactly max_length token ids.
output = tokenizer.encode(
    # Text to encode (shorter than max_length once encoded)
    text='我喜欢',
    # Maximum length of the encoded sequence
    max_length=10,
    # Truncate the sequence when it would exceed max_length (not triggered for this short text)
    truncation=True,
    # Pad the sequence up to max_length when it is shorter
    padding='max_length',
    # None means the ids come back as a plain Python list
    return_tensors=None,
)
print(output) # [101, 2769, 1599, 3614, 102, 0, 0, 0, 0, 0]
tokenizer.decode(output) # '[CLS] 我 喜 欢 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]' — [PAD] (id 0) is the padding token appended after [SEP] to reach max_length

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[101, 2769, 1599, 3614, 102, 0, 0, 0, 0, 0]


'[CLS] 我 喜 欢 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'