# Byte Pair Encoding Example

This is an example of byte pair encoding from [Neural Machine Translation of Rare Words with Subword Units](https://arxiv.org/abs/1508.07909) by Sennrich et al. (2015).

In [None]:
import collections
import re

In [2]:
def get_stats(vocab: dict[str, int]) -> dict[tuple[str, str], int]:
    """Count adjacent symbol pairs across the vocabulary, weighted by frequency.

    Args:
        vocab: Maps each word, written as whitespace-separated symbols
            (e.g. ``'l o w </w>'``), to its frequency.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples. Each occurrence of a pair inside a word adds that word's
        frequency to the pair's count.
    """
    counts = collections.defaultdict(int)
    for entry, weight in vocab.items():
        tokens = entry.split()
        # Walk each symbol together with its right-hand neighbour; words with
        # fewer than two symbols contribute nothing.
        for left, right in zip(tokens, tokens[1:]):
            counts[left, right] += weight
    return counts

In [3]:
def merge_vocab(pair: tuple[str, str], v_in: dict[str, int]) -> dict[str, int]:
    """Merge every occurrence of ``pair`` into one symbol and return a new vocabulary.

    Args:
        pair: The two adjacent symbols to fuse, e.g. ``('e', 's')``.
        v_in: Maps each word, written as whitespace-separated symbols, to its
            frequency. It is not modified.

    Returns:
        A new dict in which each occurrence of ``pair[0] + ' ' + pair[1]``
        that is delimited by whitespace or the string boundaries has been
        replaced by the concatenated symbol. If several input words end up
        with the same segmentation, their frequencies are summed.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The look-arounds make sure only whole symbols match, so merging
    # ('a', 'b') touches neither 'xa b' nor 'a bc'.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    v_out: dict[str, int] = {}
    for word, freq in v_in.items():
        # A callable replacement inserts `merged` verbatim. Passing it as a
        # template string would make `re` interpret backslashes in the symbols
        # as escapes (e.g. the pair ('\\', 'n') would turn into a newline).
        w_out = pattern.sub(lambda _match: merged, word)
        # Accumulate rather than assign so that colliding keys keep their counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

In [4]:
# Toy corpus: each key is a word split into symbols (with the end-of-word
# marker '</w>'), each value is that word's frequency.
vocab = {
    'l o w </w>': 5,
    'l o w e r </w>': 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}
num_merges = 10

for _step in range(num_merges):
    print(f"{vocab=}")
    pairs = get_stats(vocab)
    if not pairs:
        # Every word is already a single symbol, so there is nothing left to
        # merge; stop instead of indexing into an empty list below.
        break
    # Show the five most frequent pairs. `sorted` is stable, so ties keep the
    # order in which the pairs were first seen.
    top_pairs = sorted(pairs.items(), key=lambda item: item[1], reverse=True)[:5]
    print(f"{top_pairs=}")
    best = top_pairs[0][0]
    vocab = merge_vocab(best, vocab)
    print(f"best={best}: {pairs[best]}")

vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
top_pairs=[(('e', 's'), 9), (('s', 't'), 9), (('t', '</w>'), 9), (('w', 'e'), 8), (('l', 'o'), 7)]
best=('e', 's'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
top_pairs=[(('es', 't'), 9), (('t', '</w>'), 9), (('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6)]
best=('es', 't'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
top_pairs=[(('est', '</w>'), 9), (('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6)]
best=('est', '</w>'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
top_pairs=[(('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6), (('w', 'est</w>'), 6)]
best=('l', 'o'): 7
vocab={'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
top_pairs=[(('lo', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6), (('w', 'est</w>'),

# Sentencepiece Encoding

In [None]:
# Demo cell: download the bert-base-chinese model, then tokenize one English
# word and one Chinese sentence and print the resulting sub-word tokens.
#
# NOTE(review): `snapshot_download` and `tokenizer` are neither imported nor
# defined in this cell. Presumably `snapshot_download` comes from `modelscope`
# and `tokenizer` is a `transformers` BERT tokenizer created elsewhere; confirm.
# NOTE(review): the recorded output shows the tokenizer load requesting files
# from huggingface.co, so it was presumably built from the hub name rather than
# from `model_dir`; verify.

# Install dependencies (if not already installed)
# !pip install modelscope transformers # install transformers (via the ModelScope community)

# Download the bert-base-chinese model from the ModelScope community
model_dir = snapshot_download(
    model_id="bert-base-chinese",  # ModelScope model ID (compatible with Hugging Face)
    cache_dir="./"  # download into the current directory (the path can be customized)
)

print(f"模型已下载到：{model_dir}")  # output per the original note: ./bert-base-chinese (not verified here)


# Texts to tokenize
english_text = "lowest"
chinese_text = "当我还只有六岁的时候在一本描写原始森林的名叫真实的故事的书中看到了一幅精彩的插画"

# Tokenize (returns a list of sub-word tokens)
en_tokens = tokenizer.tokenize(english_text)
zh_tokens = tokenizer.tokenize(chinese_text)

print(f"英文 '{english_text}' 分词结果：{en_tokens}")
print(f"中文句子分词结果：{zh_tokens}")

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-chinese/resolve/main/vocab.txt (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x11b53d9d0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 1137c87f-0740-4772-8386-c4c8745da8e5)')' thrown while requesting HEAD https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-chinese/resolve/main/added_tokens.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x11b524a30>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 42ac8fae-843e-4c97-a223-e9eb78e1c2f4)')' thrown while requesting HEAD https://huggingface.co/bert-base-chinese/resolve/main/added_tokens.json
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries

OSError: Can't load tokenizer for 'bert-base-chinese'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-base-chinese' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.