# 分词器原理 (Tokenizer Architecture)

**SOTA 教育标准** | 包含 BPE、WordPiece、SentencePiece 详解

---

## 1. 分词器概述

| 方法 | 描述 | 使用模型 |
|:-----|:-----|:---------|
| **BPE** | 最高频对合并 | GPT |
| **WordPiece** | 最大化似然 | BERT |
| **SentencePiece** | 统一框架（直接处理原始文本，内置 BPE 与 Unigram 算法） | LLaMA |

In [None]:
from __future__ import annotations
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import numpy as np

# Confirm that the import cell above ran to completion.
print("分词器模块加载完成")

---

## 2. BPE 配置与初始化

In [None]:
@dataclass
class BPEConfig:
    """Settings container for the BPE tokenizer.

    ``special_tokens`` may be left as ``None``; it is then replaced right
    after construction by ``["<pad>", "<unk>", "<bos>", "<eos>"]``.
    """

    vocab_size: int = 1000
    min_frequency: int = 2
    special_tokens: List[str] = None

    def __post_init__(self):
        # A caller-supplied list (even an empty one) is kept untouched.
        if self.special_tokens is not None:
            return
        # Built here rather than as a field default so that every instance
        # owns its own list object.
        self.special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]


class BPETokenizer:
    """Byte-Pair-Encoding style tokenizer.

    Core idea: repeatedly merge the most frequent pair of adjacent symbols.
    The class stores the tokenizer state (``vocab`` and ``merges``) and offers
    the two primitive steps of one merge iteration: counting pairs and
    applying a merge.
    """

    def __init__(self, config: BPEConfig | None = None):
        """Create an untrained tokenizer.

        Args:
            config: Settings to use. When omitted, a fresh ``BPEConfig()`` is
                created for this instance, so tokenizers never share a single
                config object through the default argument.
        """
        self.config = BPEConfig() if config is None else config
        self.vocab: Dict[str, int] = {}
        self.merges: List[Tuple[str, str]] = []

    def _get_pair_freq(self, splits: Dict[str, List[str]]) -> Dict[Tuple, int]:
        """Count adjacent symbol pairs over all symbol sequences.

        Args:
            splits: Mapping from a word to its current list of symbols.

        Returns:
            A ``Counter`` mapping ``(left, right)`` to the number of positions
            where that pair occurs. Every entry of ``splits`` contributes
            once; counts are not weighted by how often a word occurs.
        """
        pairs = Counter()
        for symbols in splits.values():
            # zip with the one-shifted sequence yields each adjacent pair;
            # sequences shorter than two symbols contribute nothing.
            pairs.update(zip(symbols, symbols[1:]))
        return pairs

    def _merge_pair(self, splits: Dict, pair: Tuple[str, str]) -> None:
        """Replace every occurrence of ``pair`` by its concatenation, in place.

        Each sequence is scanned left to right and a matched pair consumes
        both symbols, so overlapping candidates (e.g. ``a a a`` with the pair
        ``(a, a)``) merge greedily from the left.

        Args:
            splits: Mapping from a word to its symbol list; values are
                replaced with the merged lists.
            pair: The ``(left, right)`` symbols to merge.
        """
        left, right = pair
        merged = left + right
        # Only existing keys are reassigned, so the dict keeps its size and
        # iterating over it while updating is safe.
        for word, symbols in splits.items():
            out, i = [], 0
            last = len(symbols) - 1
            while i <= last:
                if i < last and symbols[i] == left and symbols[i + 1] == right:
                    out.append(merged)
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            splits[word] = out


# Smoke test: build a tokenizer with the default configuration and show its vocab_size.
tokenizer = BPETokenizer()
print(f"配置: vocab_size={tokenizer.config.vocab_size}")

---

## 3. BPE 训练与编码

In [None]:
def train_bpe(tokenizer: BPETokenizer, texts: List[str], num_merges: int = 50) -> None:
    """Learn BPE merges from ``texts`` and fill the tokenizer's vocabulary.

    Words are obtained by whitespace splitting and start out as character
    sequences. Up to ``num_merges`` times, the most frequent adjacent pair (as
    reported by ``tokenizer._get_pair_freq``) is merged and appended to
    ``tokenizer.merges``; training stops early once no pairs remain.

    The vocabulary is then extended in this order: special tokens, the base
    characters seen in ``texts`` (sorted), and the merged tokens in merge
    order. A token that is already present keeps its existing id.

    Args:
        tokenizer: Tokenizer whose ``merges`` and ``vocab`` are updated in place.
        texts: Training corpus, one string per entry.
        num_merges: Upper bound on the number of merge iterations.
    """
    # Character-level start. Keyed by word, so a repeated word is stored once.
    splits = {w: list(w) for text in texts for w in text.split()}
    # Collect the base alphabet before merging: characters that never take
    # part in a merge would otherwise end up without an id.
    alphabet = sorted({ch for word in splits for ch in word})

    for _ in range(num_merges):
        pairs = tokenizer._get_pair_freq(splits)
        if not pairs:
            break
        # On ties max() keeps the first pair in the mapping's iteration order.
        best = max(pairs.items(), key=lambda item: item[1])[0]
        tokenizer._merge_pair(splits, best)
        tokenizer.merges.append(best)

    # Build the vocabulary. Skipping tokens that already exist keeps ids
    # unique when training is run more than once on the same tokenizer.
    vocab = tokenizer.vocab
    candidates = [
        *tokenizer.config.special_tokens,
        *alphabet,
        *(left + right for left, right in tokenizer.merges),
    ]
    for token in candidates:
        if token not in vocab:
            vocab[token] = len(vocab)


# Training demo: run up to 20 merges on a tiny corpus and report the result.
texts = ["hello world", "hello there", "world peace", "hello hello"]
train_bpe(tokenizer, texts, num_merges=20)
print(f"合并次数: {len(tokenizer.merges)}")
print(f"词表大小: {len(tokenizer.vocab)}")
print(f"前5个合并: {tokenizer.merges[:5]}")

---

## 4. WordPiece 对比

In [None]:
class WordPieceTokenizer:
    """WordPiece tokenizer (BERT).

    Core idea: choose the merge that maximises likelihood, scored as
    ``score(pair) = freq(pair) / (freq(a) * freq(b))``.
    """

    def __init__(self, vocab_size: int = 1000):
        """Store the requested vocabulary size and start with an empty vocab."""
        self.vocab_size = vocab_size
        self.vocab: Dict[str, int] = {}

    def compute_score(self, pair: Tuple[str, str], freqs: Dict[str, int]) -> float:
        """Return the merge score of ``pair``.

        Args:
            pair: The ``(a, b)`` symbols considered for merging.
            freqs: Frequency table looked up by symbol string; the merged
                symbol is looked up under ``a + b``.

        Returns:
            ``freq(a + b) / (freq(a) * freq(b))`` as a float. A missing merged
            symbol counts as 0, a missing ``a`` or ``b`` counts as 1, and a
            non-positive denominator yields ``0.0``.
        """
        first, second = pair[0], pair[1]
        freq_ab = freqs.get(first + second, 0)
        denominator = freqs.get(first, 1) * freqs.get(second, 1)
        if denominator <= 0:
            # Always return a float, matching the annotated return type.
            return 0.0
        return freq_ab / denominator


# Comparison demo: print a one-line summary of each merge strategy.
print("BPE vs WordPiece:")
print("  BPE: 合并频率最高的对")
print("  WordPiece: 合并最大化似然的对")

---

## 5. 可视化

In [None]:
def visualize_bpe_process():
    """Draw a fixed sequence of merge stages for the word "unhappiness".

    One subplot is created per stage. Inside a subplot every token is a
    horizontal bar whose width equals the token's character count, labelled
    with the token text. The stages are hard-coded for illustration; they are
    not produced by a trained tokenizer.
    """
    word = "unhappiness"
    stages = [
        list(word),
        ['u', 'n', 'h', 'a', 'pp', 'i', 'n', 'e', 'ss'],
        ['un', 'h', 'a', 'pp', 'i', 'ness'],
        ['un', 'happi', 'ness'],
        ['unhappiness'],
    ]
    total_width = len(word)

    _, axes = plt.subplots(1, len(stages), figsize=(15, 3))
    for stage_idx, tokens in enumerate(stages):
        ax = axes[stage_idx]
        # One evenly spaced viridis colour per token of this stage.
        palette = plt.cm.viridis(np.linspace(0, 1, len(tokens)))
        offset = 0  # left edge of the next bar, in characters
        for token, colour in zip(tokens, palette):
            width = len(token)
            ax.barh(0, width, left=offset, height=0.5, color=colour, edgecolor='black')
            ax.text(offset + width / 2, 0, token, ha='center', va='center', fontsize=8)
            offset += width
        ax.set_xlim(0, total_width)
        ax.set_ylim(-0.5, 0.5)
        ax.set_title(f'Step {stage_idx}')
        ax.axis('off')

    plt.suptitle('BPE Tokenization Process')
    plt.tight_layout()
    plt.show()


# Render the illustration.
visualize_bpe_process()

---

## 6. 总结

| 方法 | 策略 | 使用模型 |
|:-----|:-----|:---------|
| **BPE** | 最高频对 | GPT |
| **WordPiece** | 最大似然 | BERT |
| **SentencePiece** | 统一框架（直接处理原始文本，内置 BPE 与 Unigram 算法） | LLaMA |