In [None]:
# Install the Hugging Face `transformers` package into the kernel's environment.
# NOTE(review): a later cell requests `return_tensors='pt'`, which presumably
# also needs PyTorch to be installed — confirm it is available in this environment.
%pip install transformers

In [1]:
from transformers import BertTokenizer, BertModel
from transformers import BertTokenizerFast

In [None]:
# Load the pretrained vocabulary and tokenizer by model name.
# `cache_dir` and `force_download` are passed explicitly to show the options
# that control caching and re-downloading.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-chinese',
    cache_dir=None,
    force_download=False,
)

# Alternative: load the tokenizer and model from a local directory.
# Download only config.json, vocab.txt and pytorch_model.bin and place them
# in a `bert-base-chinese` folder.
# tokenizer = BertTokenizer.from_pretrained('model/bert-base-chinese')
# model = BertModel.from_pretrained('model/bert-base-chinese')

# Or point BertTokenizer at the vocab file only and BertModel at the weights file only.
# tokenizer = BertTokenizer.from_pretrained('model/bert-base-chinese/vocab.txt')
# model = BertModel.from_pretrained('model/bert-base-chinese/pytorch_model.bin')


# Sample sentences reused by the encoding cells below.
sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其他的都一般。',
    '今天才知道这书还有第6卷,真有点郁闷.',
    '机器背面似乎被撕了张什么标签，残胶还在。',
    '春江潮水连海平，海上明月共潮生。',
]

# Display the tokenizer, the sentences and the special-token attribute names.
(tokenizer, sents, tokenizer.SPECIAL_TOKENS_ATTRIBUTES)

In [None]:
# Encode a sentence pair into a single list of token ids.
pair_encode_options = {
    'truncation': True,
    'padding': 'max_length',
    'max_length': 30,
    'add_special_tokens': True,  # CLS, SEP, PAD
    'return_tensors': None,
}
out = tokenizer.encode(text=sents[0], text_pair=sents[1], **pair_encode_options)

print(out)
# Decode the ids back to text so the inserted special tokens are visible.
tokenizer.decode(out)

In [None]:
# Extended encoding call: returns a mapping of several fields, not just the ids.
plus_encode_options = {
    'truncation': True,
    'padding': 'max_length',
    'max_length': 30,
    'add_special_tokens': True,
    'return_tensors': None,
    # 0 for the first sentence, 1 for the second sentence, 0 for padding
    'return_token_type_ids': True,
    # 0 for padding positions, 1 everywhere else
    'return_attention_mask': True,
    # 1 for special tokens, 0 everywhere else
    'return_special_tokens_mask': True,
    # offset_mapping marks the start/end position of each token; this option
    # can only be used with BertTokenizerFast.
    # 'return_offsets_mapping': True,
    'return_length': True,
}
out = tokenizer.encode_plus(text=sents[0], text_pair=sents[1], **plus_encode_options)

# Show every returned field, then the decoded ids.
for field, value in out.items():
    print(field, ':', value)
tokenizer.decode(out['input_ids'])

In [None]:
# Encode a batch of single sentences (the first three samples).
batch_encode_options = {
    'truncation': True,
    'padding': 'max_length',
    'max_length': 15,
    'add_special_tokens': True,
    'return_tensors': None,
    'return_token_type_ids': True,
    'return_attention_mask': True,
    'return_special_tokens_mask': True,
    # 'return_offsets_mapping': True,
    'return_length': True,
}
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=sents[:3],
    **batch_encode_options,
)

# Show every returned field.
for field, value in out.items():
    print(field, ':', value)

# Decode each of the three encoded sentences back to text.
tuple(tokenizer.decode(ids) for ids in out['input_ids'])

In [None]:
# Encode a batch of sentence pairs: (0, 1), (2, 3) and (4, 5).
sentence_pairs = list(zip(sents[0::2], sents[1::2]))

pair_batch_options = {
    'truncation': True,
    'padding': 'max_length',
    'max_length': 30,
    'add_special_tokens': True,
    'return_tensors': None,
    'return_token_type_ids': True,
    'return_attention_mask': True,
    'return_special_tokens_mask': True,
    # 'return_offsets_mapping': True,
    'return_length': True,
}
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=sentence_pairs,
    **pair_batch_options,
)

# Show every returned field.
for field, value in out.items():
    print(field, ':', value)

# Decode each of the three encoded pairs back to text.
tuple(tokenizer.decode(ids) for ids in out['input_ids'])

In [None]:
# Fetch the vocabulary mapping from the tokenizer.
voc = tokenizer.get_vocab()

# print(voc, type(voc), len(voc))
# Check which candidate tokens are already in the vocabulary.
print(*(candidate in voc for candidate in ('[EOS]', '月光', '希望', '李', '中')))

# Register new ordinary tokens, then a new special token.
# Note: both calls modify `tokenizer` in place, so re-running this cell starts
# from the already-extended tokenizer.
tokenizer.add_tokens(new_tokens=['月光', '希望'])
tokenizer.add_special_tokens({'eos_token': '[EOS]'})

# Re-read the vocabulary and show its size plus the ids of the added entries.
new_voc = tokenizer.get_vocab()

print(len(new_voc), *(new_voc[added] for added in ('月光', '希望', '[EOS]')))

In [None]:
# Encode a sentence that uses the newly added tokens.
new_token_options = {
    'truncation': True,
    'padding': 'max_length',
    'max_length': 8,
    'add_special_tokens': True,
    'return_tensors': None,
}
out = tokenizer.encode(text='月光的新希望[EOS]', text_pair=None, **new_token_options)

print(out)
# Decode the ids back to text to check how the added tokens were handled.
tokenizer.decode(out)

In [None]:
# Encoding approaches with BertTokenizer.
# NOTE(review): `text` already contains literal [CLS]/[SEP] markers, so the
# approaches below that add special tokens automatically will presumably emit
# them twice — confirm this is the intended demonstration.
text = '[CLS] 武1松1打11老虎 [SEP] 你在哪 [SEP]'

tokenized_text = tokenizer.tokenize(text)  # split the text into tokens
print("分词：", tokenized_text)
print()

# Approach 1: convert tokens to ids directly; CLS and SEP must be added by hand.
a = tokenizer.convert_tokens_to_ids(tokenized_text)
print("方式1:",a)
print(tokenizer.decode(a))
print()

# Another wordpiece variant: split the text into characters first.
# words = list(text)
# a_2 = tokenizer.convert_tokens_to_ids(words)
# print(a_2)
# print(tokenizer.decode(a_2))
# print()

# Approach 2: call the tokenizer; returns a dict (ids, type ids, mask) and adds
# CLS/SEP itself (differs from the encode_plus call below in the returned
# tensor dimensions).
b = tokenizer(text=text)
print("方式2:" ,b)
print(tokenizer.decode(b['input_ids']))
print()

# Approach 3: encode() returns only the ids and adds CLS/SEP itself.
c=tokenizer.encode(text=text)
print("方式3:", c)
print(tokenizer.decode(c))
print()

# Approach 4: encode_plus() returns a dict (ids, type ids, mask) and adds
# CLS/SEP itself.
# `truncation=True` is passed explicitly: supplying `max_length` on its own
# makes the library warn and fall back to an implicit truncation strategy.
# `return_tensors='pt'` requires PyTorch to be installed.
d = tokenizer.encode_plus(
    text=text,
    truncation=True,
    max_length=30,
    return_tensors='pt',
)
print("方式4:", d)

In [None]:
# Encoding approaches with BertTokenizerFast.
text = '[CLS] 武1松1打11老虎 [SEP] 你在哪 [SEP]'

tokenizerfast = BertTokenizerFast.from_pretrained('bert-base-chinese')

# Split the text into tokens.
tokenized_text = tokenizerfast.tokenize(text)
print("分词：", tokenized_text, end="\n\n")

# Approach 1: convert the tokens to ids directly.
a = tokenizerfast.convert_tokens_to_ids(tokenized_text)
print("方式1：", a)
print("解码：", tokenizerfast.decode(a), end="\n\n")

# words = list(text)
# e = tokenizerfast.convert_tokens_to_ids(words)
# print(e)
# print(tokenizerfast.decode(e))
# print()

# Approach 2: call the tokenizer and request offset mappings (for label alignment).
b = tokenizerfast(text=text, return_offsets_mapping=True)
print("方式2", b)
print(tokenizerfast.decode(b['input_ids']), end="\n\n")

# Approach 3: encode() returns only the ids.
c = tokenizerfast.encode(text=text)
print("方式3", c)
print("解码：", tokenizerfast.decode(c), end="\n\n")

# Approach 4: encode_plus() returns a dict-like encoding.
d = tokenizerfast.encode_plus(text=text)
print("方式4：", d)
print("解码：", tokenizerfast.decode(d['input_ids']), end="\n\n")