# Byte Pair Encoding Example

This is an example of byte pair encoding from [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/abs/1508.07909) by Sennrich et al. (2015).

In [None]:
import collections
import re

In [2]:
def get_stats(vocab: dict[str, int]) -> dict[tuple[str, str], int]:
    """Count adjacent symbol pairs across the vocabulary, weighted by word frequency.

    Args:
        vocab: Maps each word, written as whitespace-separated symbols
            (e.g. ``'l o w </w>'``), to its corpus frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples whose
        value is the summed frequency of every word occurrence of that pair.
        Keys appear in first-seen order.
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pairing the list with itself shifted by one walks consecutive neighbours;
        # words with fewer than two symbols contribute nothing.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

In [3]:
def merge_vocab(pair: tuple[str, str], v_in: dict[str, int]) -> dict[str, int]:
    """Merge every occurrence of ``pair`` into one symbol and return the new vocabulary.

    Args:
        pair: The ``(left, right)`` symbols to fuse.
        v_in: Maps each word, written as space-separated symbols, to its frequency.
            It is not modified.

    Returns:
        A new dict in which each standalone ``'left right'`` occurrence has been
        replaced by ``'leftright'``. If several input entries become the same
        merged word, their frequencies are summed rather than overwritten.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The look-arounds require whitespace (or the string edge) on both sides, so
    # the pair only matches whole symbols and never the inside of a longer one.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    v_out: dict[str, int] = {}
    for word, freq in v_in.items():
        # A callable replacement is inserted literally; a replacement *string*
        # would have its backslashes interpreted as template escapes.
        w_out = p.sub(lambda _match: merged, word)
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

In [4]:
# Toy corpus: each word is pre-split into characters and terminated by the
# end-of-word marker '</w>'; the value is the word's frequency.
vocab = {
    'l o w </w>' : 5,
    'l o w e r </w>' : 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}
num_merges = 10

for i in range(num_merges):
    print(f"{vocab=}")
    pairs = get_stats(vocab)
    if not pairs:
        # Every word has collapsed into a single symbol, so there is nothing
        # left to merge; stop instead of indexing into an empty list.
        print("no more pairs to merge")
        break
    # sorted() is stable, so ties keep their first-seen order from get_stats.
    top_pairs = sorted(pairs.items(), key=lambda x: x[1], reverse=True)[:5]
    print(f"{top_pairs=}")
    best = top_pairs[0][0]
    vocab = merge_vocab(best, vocab)
    print(f"best={best}: {pairs[best]}")

vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
top_pairs=[(('e', 's'), 9), (('s', 't'), 9), (('t', '</w>'), 9), (('w', 'e'), 8), (('l', 'o'), 7)]
best=('e', 's'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
top_pairs=[(('es', 't'), 9), (('t', '</w>'), 9), (('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6)]
best=('es', 't'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
top_pairs=[(('est', '</w>'), 9), (('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6)]
best=('est', '</w>'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
top_pairs=[(('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6), (('w', 'est</w>'), 6)]
best=('l', 'o'): 7
vocab={'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
top_pairs=[(('lo', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6), (('w', 'est</w>'),

# WordPiece Encoding
使用预训练模型（bert-base-chinese）已训练好的词表进行分词。注意：`BertTokenizer` 使用的是 WordPiece 算法（子词续接部分以 `##` 为前缀），而不是 SentencePiece。

In [5]:
# Install dependencies (if not already installed)
# !pip install modelscope transformers # install via the ModelScope community instructions

from modelscope.hub.snapshot_download import snapshot_download
# Download the bert-base-chinese model from the ModelScope hub
model_dir = snapshot_download(
    model_id="tiansz/bert-base-chinese",  # model ID on ModelScope (original note: compatible with Hugging Face)
    cache_dir="./"  # download into the current directory (the path can be customized)
)

print(f"模型已下载到：{model_dir}")  # prints the local snapshot path, e.g. ./tiansz/bert-base-chinese


Downloading Model from https://www.modelscope.cn to directory: ./tiansz/bert-base-chinese


2025-10-08 19:58:41,637 - modelscope - INFO - Got 7 files, start to download ...


Processing 7 items:   0%|          | 0.00/7.00 [00:00<?, ?it/s]

Downloading [config.json]:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading [model.safetensors]:   0%|          | 0.00/392M [00:00<?, ?B/s]

Downloading [README.md]:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

Downloading [configuration.json]:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading [tokenizer.json]:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading [tokenizer_config.json]:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading [vocab.txt]:   0%|          | 0.00/107k [00:00<?, ?B/s]

2025-10-08 20:04:54,002 - modelscope - INFO - Download model 'tiansz/bert-base-chinese' successfully.


模型已下载到：./tiansz/bert-base-chinese


In [20]:
from transformers import BertTokenizer
import nltk

# Load the tokenizer from the local snapshot; the path matches the download path.
tokenizer = BertTokenizer.from_pretrained("./tiansz/bert-base-chinese")
# nltk.download('punkt')  # download the punkt tokenizer data if it is missing

# Texts to tokenize
english_text = "lowest"
chinese_text = "当我还只有六岁的时候在一本描写原始森林的名叫真实的故事的书中看到了一幅精彩的插画"

# Tokenize (returns a list of subword tokens)
en_tokens = tokenizer.tokenize(english_text)
zh_tokens = tokenizer.tokenize(chinese_text)

print(f"英文 '{english_text}' 分词结果：{en_tokens}")
print(f"中文句子分词结果：{zh_tokens}")

# Compare the subword tokenizer with NLTK's word-level tokenizer on a sentence.
english_text2 = "Japan is a country in East Asia."
en_tokens2 = tokenizer.tokenize(english_text2.lower())

print(f'Chinese: {en_tokens2}')
print(f'nltk: {nltk.word_tokenize(english_text2.lower())}')  # word-level tokenization for comparison

# A very long word, to show how it is broken into subword pieces.
english_text3 = 'Pneumonoultramicroscopicsilicovolcanoconiosis'.lower()
en_tokens3 = tokenizer.tokenize(english_text3)
print(f'english_text3: {en_tokens3}')

english_text4 = 'Happiness'.lower()
en_tokens4 = tokenizer.tokenize(english_text4)
# Label fixed: this previously printed 'english_text3' for the english_text4 result.
print(f'english_text4: {en_tokens4}')

english_text5 = 'Japaness'.lower()
en_tokens5 = tokenizer.tokenize(english_text5)
print(f'{english_text5}: {en_tokens5}')


英文 'lowest' 分词结果：['low', '##est']
中文句子分词结果：['当', '我', '还', '只', '有', '六', '岁', '的', '时', '候', '在', '一', '本', '描', '写', '原', '始', '森', '林', '的', '名', '叫', '真', '实', '的', '故', '事', '的', '书', '中', '看', '到', '了', '一', '幅', '精', '彩', '的', '插', '画']
Chinese: ['japan', 'is', 'a', 'country', 'in', 'east', 'asia', '.']
nltk: ['japan', 'is', 'a', 'country', 'in', 'east', 'asia', '.']
english_text3: ['p', '##ne', '##um', '##on', '##ou', '##lt', '##ram', '##ic', '##ros', '##co', '##pi', '##cs', '##il', '##ico', '##vo', '##lc', '##ano', '##con', '##io', '##sis']
english_text3: ['ha', '##pp', '##ine', '##ss']
japaness: ['japan', '##ess']
