In [8]:
from collections import defaultdict
from typing import Dict, List, Tuple, Set

def get_stats(vocab_words: List[str]) -> Dict[Tuple[str, str], int]:
    """
    Tally how often each pair of neighbouring symbols occurs.

    Every entry of ``vocab_words`` is one word whose symbols are separated
    by whitespace (for example ``"f r ed"``). Each entry is counted, so a
    word that appears several times in the list contributes its pairs
    every time.

    Returns a ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples.
    """
    pair_counts = defaultdict(int)
    for entry in vocab_words:
        pieces = entry.split()
        # Pairing the sequence with itself shifted by one yields each neighbour pair.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += 1
    return pair_counts

def merge_vocab(pair: Tuple[str, str], vocab_words: List[str]) -> List[str]:
    """
    Fuse every occurrence of ``pair`` into a single symbol in each word.

    Words are whitespace-separated symbol sequences. Each word is scanned
    left to right; whenever ``pair[0]`` is immediately followed by
    ``pair[1]`` the two are replaced by their concatenation and scanning
    resumes after them, so overlapping matches are not merged twice.

    Returns a new list of words, each re-joined with single spaces.
    """
    left, right = pair[0], pair[1]
    fused = left + right
    result = []
    for entry in vocab_words:
        pieces = entry.split()
        last = len(pieces) - 1
        rebuilt = []
        pos = 0
        while pos <= last:
            # A match needs a following symbol, hence the `pos < last` guard.
            is_match = pos < last and pieces[pos] == left and pieces[pos + 1] == right
            if is_match:
                rebuilt.append(fused)
                pos += 2
            else:
                rebuilt.append(pieces[pos])
                pos += 1
        result.append(' '.join(rebuilt))
    return result

def get_unique_tokens(vocab_words: List[str]) -> Set[str]:
    """
    Collect the distinct symbols that appear in the given words.

    Each word is a whitespace-separated symbol sequence; the result is the
    union of the symbols of all words.
    """
    return {symbol for entry in vocab_words for symbol in entry.split()}

def byte_pair_encoding(text: str, num_merges: int = 10) -> List[str]:
    """
    Run a character-level BPE merge loop over ``text``, printing each step.

    Commas are removed, the text is split on whitespace, and every word
    starts out as a space-separated sequence of its characters. Up to
    ``num_merges`` times the most frequent adjacent pair is merged; the
    loop stops early once no adjacent pairs remain. The unique tokens are
    printed before the first merge and again after every merge.

    Returns the words in their final space-separated segmentation, one
    entry per word occurrence in ``text``.
    """
    # Start from words spelled out as space-separated characters
    words_without_commas = text.replace(',', '').split()
    vocab_words = [' '.join(word) for word in words_without_commas]

    print("Initial vocabulary:")
    print(', '.join(sorted(get_unique_tokens(vocab_words))))
    print()

    for step in range(1, num_merges + 1):
        pair_counts = get_stats(vocab_words)
        if not pair_counts:
            break

        # Rank by count first; on equal counts the greater pair tuple wins,
        # which keeps the choice deterministic.
        best_pair, best_count = max(
            pair_counts.items(), key=lambda item: (item[1], item[0])
        )
        left, right = best_pair

        print(f"Step {step}:")
        print(f"Most frequent pair: ({left}, {right}) with frequency {best_count}")
        print(f"Merging into: {left + right}")

        # Apply the chosen merge to every word
        vocab_words = merge_vocab(best_pair, vocab_words)

        print("\nVocabulary:")
        print(', '.join(sorted(get_unique_tokens(vocab_words))))
        print()

    return vocab_words

# Demonstrate BPE on a small example corpus
text = "fred fed ted bread and ted fed fred bread"
# A doubled newline terminator gives the same blank line a bare print() would.
print("Original text:", text, end="\n\n")
final_vocab = byte_pair_encoding(text, num_merges=5)

Original text: fred fed ted bread and ted fed fred bread

Initial vocabulary:
a, b, d, e, f, n, r, t

Step 1:
Most frequent pair: (e, d) with frequency 6
Merging into: ed

Vocabulary:
a, b, d, e, ed, f, n, r, t

Step 2:
Most frequent pair: (t, ed) with frequency 2
Merging into: ted

Vocabulary:
a, b, d, e, ed, f, n, r, ted

Step 3:
Most frequent pair: (r, ed) with frequency 2
Merging into: red

Vocabulary:
a, b, d, e, ed, f, n, r, red, ted

Step 4:
Most frequent pair: (r, e) with frequency 2
Merging into: re

Vocabulary:
a, b, d, ed, f, n, re, red, ted

Step 5:
Most frequent pair: (re, a) with frequency 2
Merging into: rea

Vocabulary:
a, b, d, ed, f, n, rea, red, ted



f r ed / f ed / ted / b r e a d / a n d / ted / f ed / f r ed / b r e a d

In [9]:
def learn_bpe_merges(text: str, num_merges: int = 5) -> List[Tuple[str, str]]:
    """Learn BPE merge operations from training text.

    Commas are removed, the text is split on whitespace, and each word
    starts as a space-separated sequence of its characters. On every
    iteration the most frequent adjacent symbol pair is recorded and merged
    across all words.

    Args:
        text: Training text.
        num_merges: Maximum number of merge operations to learn.

    Returns:
        The learned ``(left, right)`` pairs in the order they were merged.
        Fewer than ``num_merges`` pairs are returned when no adjacent pairs
        remain to merge.
    """
    # Initialize vocabulary with space-separated characters
    vocab_words = [' '.join(word) for word in text.replace(',', '').split()]
    merges = []

    for _ in range(num_merges):
        # Reuse the shared pair counter rather than re-implementing it here
        pairs = get_stats(vocab_words)
        if not pairs:
            break

        # Highest count wins; ties are broken by the greater pair tuple so
        # the result is deterministic.
        most_freq = max(pairs.items(), key=lambda x: (x[1], x[0]))[0]
        merges.append(most_freq)

        # Merge the pair in vocabulary
        vocab_words = merge_vocab(most_freq, vocab_words)

    return merges

def tokenize_word(word: str, merges: List[Tuple[str, str]]) -> List[str]:
    """
    Split ``word`` into BPE tokens by replaying learned merges.

    The word starts as its individual characters. Each ``(left, right)``
    pair in ``merges`` is then applied in order: scanning left to right,
    every ``left`` directly followed by ``right`` is replaced by their
    concatenation.

    Returns the resulting list of tokens.
    """
    # Begin with one symbol per character
    symbols = ' '.join(list(word)).split()

    # Replay the merges in the order they were learned
    for pair in merges:
        left, right = pair[0], pair[1]
        fused = left + right
        merged_symbols = []
        skip_next = False
        for idx, current in enumerate(symbols):
            if skip_next:
                # This symbol was already consumed as the right half of a merge.
                skip_next = False
                continue
            has_next = idx + 1 < len(symbols)
            if has_next and current == left and symbols[idx + 1] == right:
                merged_symbols.append(fused)
                skip_next = True
            else:
                merged_symbols.append(current)
        symbols = merged_symbols

    return symbols

# Train BPE on the original text first
training_text = "fred fed ted bread and ted fed fred bread"
# A doubled newline terminator gives the same blank line a bare print() would.
print("Training text:", training_text, end="\n\n")

# Learn the merge operations
merges = learn_bpe_merges(training_text, num_merges=5)

# Show the learned merges, numbered from 1
print("Learned BPE merge operations:")
for rank, (left, right) in enumerate(merges, start=1):
    print(f"{rank}. Merge '{left}' + '{right}' → '{left + right}'")
print()

# Unseen sentences to tokenize with the learned merges
test_sentences = ["ted freed bread", "red feed", "breed"]

# Tokenize word by word and flatten into one token list per sentence
print("Tokenizing new sentences:")
for sentence in test_sentences:
    print(f"\nOriginal: {sentence}")
    tokens = [
        token
        for word in sentence.split()
        for token in tokenize_word(word, merges)
    ]
    print("Tokens:", tokens)

Training text: fred fed ted bread and ted fed fred bread

Learned BPE merge operations:
1. Merge 'e' + 'd' → 'ed'
2. Merge 't' + 'ed' → 'ted'
3. Merge 'r' + 'ed' → 'red'
4. Merge 'r' + 'e' → 're'
5. Merge 're' + 'a' → 'rea'

Tokenizing new sentences:

Original: ted freed bread
Tokens: ['ted', 'f', 're', 'ed', 'b', 'rea', 'd']

Original: red feed
Tokens: ['red', 'f', 'e', 'ed']

Original: breed
Tokens: ['b', 're', 'ed']


ted / f re ed / b rea d

In [None]:
                X,    y

      image1,        class of the image
      image2,        class of the image
      image3,        class of the image
      image4,        class of the image
      image5,        class of the image



               X,  y
  I love you       Je vous aime
  I hate you       I hate you in French

              X,   y
 I [MASK] you        I love you




X                                               y  
Suddenly, [MASK] have serious [MASK] options.   Suddenly, Sri Lanka have serious batting options.


It was only in [MASK] that they crashed out of the T20 World Cup after an exceptionally dire showing in their first two matches.      It was only in July that they crashed out of the T20 World Cup after an exceptionally dire showing in their first two matches.

They'd stunk up the [MASK] World Cup even worse in late 2023, failing to finish among the top eight, and as such, missing out on qualification for the Champions Trophy for the first time.    They'd stunk up the ODI World Cup even worse in late 2023, failing to finish among the top eight, and as such, missing out on qualification for the Champions Trophy for the first time.
