Unicode 是字符集，为每个字符分配唯一的代码点。
UTF-8 是一种基于 Unicode 的字符编码方式，用于在计算机中存储和传输字符。

BPE理论上还是会出现OOV的，当词汇表的大小受限时，一些较少频繁出现的子词和没有在训练过程中见过的子词，就会无法进入词汇表出现OOV，而Byte-level BPE(BBPE)理论上是不会出现这个情况的。

Byte-level BPE(BBPE)和Byte-Pair Encoding (BPE)区别就是BPE是最小词汇是字符级别，而BBPE是字节级别的，通过UTF-8的编码方式这一个字节的256的范围，理论上可以表示这个世界上的所有字符。

In [84]:
from collections import defaultdict
sentences = [
    "我",
    "喜欢",
    "吃",
    "苹果",
    "他",
    "不",
    "喜欢",
    "吃",
    "苹果派",
    "I like to eat apples",
    "She has a cute cat",
    "you are very cute",
    "give you a hug",
]
# 构建初始词汇表，包含一个字节的256个表示
initial_vocab = [bytes([byte]) for byte in range(256)]
vocab = initial_vocab.copy()
print("initial_vocab:", initial_vocab)

initial_vocab: [b'\x00', b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06', b'\x07', b'\x08', b'\t', b'\n', b'\x0b', b'\x0c', b'\r', b'\x0e', b'\x0f', b'\x10', b'\x11', b'\x12', b'\x13', b'\x14', b'\x15', b'\x16', b'\x17', b'\x18', b'\x19', b'\x1a', b'\x1b', b'\x1c', b'\x1d', b'\x1e', b'\x1f', b' ', b'!', b'"', b'#', b'$', b'%', b'&', b"'", b'(', b')', b'*', b'+', b',', b'-', b'.', b'/', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b':', b';', b'<', b'=', b'>', b'?', b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_', b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'{', b'|', b'}', b'~', b'\x7f', b'\x80', b'\x81', b'\x82', b'\x83', b'\x84', b'\x85', b'\x86', b'\x87', b'\x88', b'\x89', b'\x8a', b'\x8b', b'\x8c', b'\x8

In [85]:
# 构建频率统计
def build_stats(sentences):
    stats = defaultdict(int)
    for sentence in sentences:
        symbols = sentence.split()
        for symbol in symbols:
            stats[symbol.encode("utf-8")] += 1
    return stats
stats = build_stats(sentences)

splits = {word: [bytes([byte]).decode('latin1') for byte in word] for word in stats.keys()}

print("stats:", stats)
print("splits:", splits)

stats: defaultdict(<class 'int'>, {b'\xe6\x88\x91': 1, b'\xe5\x96\x9c\xe6\xac\xa2': 2, b'\xe5\x90\x83': 2, b'\xe8\x8b\xb9\xe6\x9e\x9c': 1, b'\xe4\xbb\x96': 1, b'\xe4\xb8\x8d': 1, b'\xe8\x8b\xb9\xe6\x9e\x9c\xe6\xb4\xbe': 1, b'I': 1, b'like': 1, b'to': 1, b'eat': 1, b'apples': 1, b'She': 1, b'has': 1, b'a': 2, b'cute': 2, b'cat': 1, b'you': 2, b'are': 1, b'very': 1, b'give': 1, b'hug': 1})
splits: {b'\xe6\x88\x91': ['æ', '\x88', '\x91'], b'\xe5\x96\x9c\xe6\xac\xa2': ['å', '\x96', '\x9c', 'æ', '¬', '¢'], b'\xe5\x90\x83': ['å', '\x90', '\x83'], b'\xe8\x8b\xb9\xe6\x9e\x9c': ['è', '\x8b', '¹', 'æ', '\x9e', '\x9c'], b'\xe4\xbb\x96': ['ä', '»', '\x96'], b'\xe4\xb8\x8d': ['ä', '¸', '\x8d'], b'\xe8\x8b\xb9\xe6\x9e\x9c\xe6\xb4\xbe': ['è', '\x8b', '¹', 'æ', '\x9e', '\x9c', 'æ', '´', '¾'], b'I': ['I'], b'like': ['l', 'i', 'k', 'e'], b'to': ['t', 'o'], b'eat': ['e', 'a', 't'], b'apples': ['a', 'p', 'p', 'l', 'e', 's'], b'She': ['S', 'h', 'e'], b'has': ['h', 'a', 's'], b'a': ['a'], b'cute': ['c', 'u'

In [86]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in stats.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 19:
        break

('æ', '\x88'): 1
('\x88', '\x91'): 1
('å', '\x96'): 2
('\x96', '\x9c'): 2
('\x9c', 'æ'): 3
('æ', '¬'): 2
('¬', '¢'): 2
('å', '\x90'): 2
('\x90', '\x83'): 2
('è', '\x8b'): 2
('\x8b', '¹'): 2
('¹', 'æ'): 2
('æ', '\x9e'): 2
('\x9e', '\x9c'): 2
('ä', '»'): 1
('»', '\x96'): 1
('ä', '¸'): 1
('¸', '\x8d'): 1
('æ', '´'): 1
('´', '¾'): 1


In [87]:
best_pair = ""
max_freq = None
for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

('\x9c', 'æ') 3


In [88]:
def merge_pair(pair, splits):
    merged_byte = pair
    for word in stats:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i:i+2] == pair:  # 检查分割中是否有这对字节
                split = split[:i] + [merged_byte] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

splits = merge_pair(best_pair, splits)
print("splits:", splits)

splits: {b'\xe6\x88\x91': ['æ', '\x88', '\x91'], b'\xe5\x96\x9c\xe6\xac\xa2': ['å', '\x96', '\x9c', 'æ', '¬', '¢'], b'\xe5\x90\x83': ['å', '\x90', '\x83'], b'\xe8\x8b\xb9\xe6\x9e\x9c': ['è', '\x8b', '¹', 'æ', '\x9e', '\x9c'], b'\xe4\xbb\x96': ['ä', '»', '\x96'], b'\xe4\xb8\x8d': ['ä', '¸', '\x8d'], b'\xe8\x8b\xb9\xe6\x9e\x9c\xe6\xb4\xbe': ['è', '\x8b', '¹', 'æ', '\x9e', '\x9c', 'æ', '´', '¾'], b'I': ['I'], b'like': ['l', 'i', 'k', 'e'], b'to': ['t', 'o'], b'eat': ['e', 'a', 't'], b'apples': ['a', 'p', 'p', 'l', 'e', 's'], b'She': ['S', 'h', 'e'], b'has': ['h', 'a', 's'], b'a': ['a'], b'cute': ['c', 'u', 't', 'e'], b'cat': ['c', 'a', 't'], b'you': ['y', 'o', 'u'], b'are': ['a', 'r', 'e'], b'very': ['v', 'e', 'r', 'y'], b'give': ['g', 'i', 'v', 'e'], b'hug': ['h', 'u', 'g']}


In [89]:
vocab_size = 500
merges = {}
print(len(vocab))

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ()
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(best_pair, splits)
    merged_byte = best_pair
    merges[best_pair] = merged_byte
    vocab.append(merged_byte)

print("merges:", merges)
print("vocab:", vocab)

256
merges: {('\x9c', 'æ'): ('\x9c', 'æ')}
vocab: [b'\x00', b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06', b'\x07', b'\x08', b'\t', b'\n', b'\x0b', b'\x0c', b'\r', b'\x0e', b'\x0f', b'\x10', b'\x11', b'\x12', b'\x13', b'\x14', b'\x15', b'\x16', b'\x17', b'\x18', b'\x19', b'\x1a', b'\x1b', b'\x1c', b'\x1d', b'\x1e', b'\x1f', b' ', b'!', b'"', b'#', b'$', b'%', b'&', b"'", b'(', b')', b'*', b'+', b',', b'-', b'.', b'/', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b':', b';', b'<', b'=', b'>', b'?', b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_', b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'{', b'|', b'}', b'~', b'\x7f', b'\x80', b'\x81', b'\x82', b'\x83', b'\x84', b'\x85', b'\x86', b'\x87', b'\x88', b'\x89