In [8]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")

embedding_batch = tokenizer(
    "The capital of Singapore is Singapore.", "The capital of China is Beijing."
)

for key, value in embedding_batch.items():
    print(f"{key}:,{value}")

input_ids:,[101, 100, 10715, 8205, 100, 8310, 100, 119, 102, 100, 10715, 8205, 100, 8310, 100, 119, 102]
token_type_ids:,[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
attention_mask:,[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
sequence = "The capital of China is Beijing."
token_ids_e2e = tokenizer.encode(sequence)
token_ids_e2e

[101, 100, 10715, 8205, 100, 8310, 100, 119, 102]

In [10]:
sequence = "The capital of China is Beijing."
tokens = tokenizer.tokenize(sequence)
tokens

['[UNK]', 'capital', 'of', '[UNK]', 'is', '[UNK]', '.']

In [12]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[100, 10715, 8205, 100, 8310, 100, 119]

In [17]:
len(tokenizer.vocab.keys())

21128

In [16]:
new_tokens = ["uisuy", "nsu"]
# 将集合作差结果添加到词表中, 确保新添加的 Token 不在词表中
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

{'nsu', 'uisuy'}

In [18]:
tokenizer.add_tokens(list(new_tokens))
len(tokenizer.vocab.keys())

21130

In [19]:
new_special_token = {"sep_token": "NEW_SPECIAL_TOKEN"}
tokenizer.add_special_tokens(new_special_token)
len(tokenizer.vocab.keys())

21131

In [20]:
tokenizer.save_pretrained("./models/new-bert-base-chinese")

('./models/new-bert-base-chinese/tokenizer_config.json',
 './models/new-bert-base-chinese/special_tokens_map.json',
 './models/new-bert-base-chinese/vocab.txt',
 './models/new-bert-base-chinese/added_tokens.json',
 './models/new-bert-base-chinese/tokenizer.json')

In [21]:
model.save_pretrained("./models/new-bert-base-chinese")

In [22]:
from transformers import pipeline

fill_mask = pipeline(task="fill-mask", model="bert-base-chinese")
text = "人民是[MASK]可战胜的"

fill_mask(text, top_k=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dens

[{'score': 0.9203746914863586,
  'token': 679,
  'token_str': '不',
  'sequence': '人 民 是 不 可 战 胜 的'}]