In [9]:
import re 
from collections import defaultdict 

def get_stat_pairs(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a word written as whitespace-separated symbols
            (for example ``'l o w </w>'``) to that word's frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples. Each
        value is the summed frequency of every adjacent occurrence of that
        pair across all words.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        pieces = entry.split()
        # Zipping the sequence with itself shifted by one walks every
        # adjacent pair from left to right.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_freq_vocab(pair, v_in):
    """Fuse every occurrence of a symbol pair inside the vocabulary keys.

    Args:
        pair: Tuple ``(left, right)`` of symbols to merge into one symbol.
        v_in: Mapping from space-separated symbol strings to frequencies.

    Returns:
        A new dict in which each key has had every whole-symbol occurrence of
        ``left right`` replaced by ``leftright``. If several input keys
        collapse to the same merged string, their frequencies are summed.
    """
    v_out = {}
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or a string edge) on both sides, so
    # the pair only matches complete symbols, never the tail or head of one.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    merged = ''.join(pair)
    for word, freq in v_in.items():
        # A callable replacement inserts the merged symbol literally. A plain
        # string would be treated as a template, so symbols containing a
        # backslash would be mangled or raise.
        w_out = p.sub(lambda _match: merged, word)
        # Accumulate rather than overwrite so colliding keys keep their counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

def get_initial_vocab(data):
    """Build the starting character-level vocabulary.

    Args:
        data: Iterable of text lines; each line is split on whitespace.

    Returns:
        A ``defaultdict(int)`` mapping each word, spelled as space-separated
        characters followed by the ``</w>`` end-of-word marker, to the number
        of times the word occurs in ``data``.
    """
    counts = defaultdict(int)
    for line in data:
        for token in line.split():
            spaced = ' '.join(token)
            counts[spaced + ' </w>'] += 1
    return counts

def byte_pair_encoding(data, n):
    """Learn up to ``n`` byte-pair-encoding merge rules from ``data``.

    Args:
        data: Iterable of text lines used to build the initial vocabulary.
        n: Maximum number of merge iterations to perform.

    Returns:
        A ``(vocab, rules)`` tuple. ``vocab`` is the vocabulary after all
        merges have been applied, and ``rules`` is the list of merged
        ``(left, right)`` pairs in the order they were learned. ``rules`` is
        shorter than ``n`` when no mergeable pair is left.
    """
    vocab = get_initial_vocab(data)
    rules = []
    for _ in range(n):
        pairs = get_stat_pairs(vocab)
        if not pairs:
            # Every word is already a single symbol (or data was empty).
            # max() on an empty mapping would raise ValueError, so stop here.
            break
        # Most frequent pair; on ties max() keeps the first one encountered.
        best = max(pairs, key=pairs.get)
        rules.append(best)
        vocab = merge_freq_vocab(best, vocab)
    return vocab, rules

# Toy corpus: each whitespace-separated word is passed on as its own "line".
training_sentences = "low lowest newer wider new"
training_data = training_sentences.split()
num_iterations = 5

vocab, rules = byte_pair_encoding(training_data, num_iterations)

# Show each learned merge in the order it was learned.
print("Generated Rules:")
for i, rule in enumerate(rules, start=1):
    left, right = rule
    print(f"Rule {i}: {left} + {right} -> {left + right}")

print(rules)

Generated Rules:
Rule 1: l + o -> lo
Rule 2: lo + w -> low
Rule 3: n + e -> ne
Rule 4: ne + w -> new
Rule 5: e + r -> er
[('l', 'o'), ('lo', 'w'), ('n', 'e'), ('ne', 'w'), ('e', 'r')]


In [10]:
test_sentence = "lower"
words = test_sentence.split(' ')

# Spell each word as single characters, closing it with the end-of-word marker.
tokens = []
for word in words:
    tokens.extend([*word, '</w>'])

print("Original Test Sentence:", test_sentence)
print("List of Subword Tokens:", tokens)
print()

def apply_rules(rules, tokens):
    """Apply learned merge rules, in order, to a list of subword tokens.

    Each rule is applied in one left-to-right pass that fuses every
    non-overlapping adjacent ``(left, right)`` occurrence. The rule and the
    token list after that pass are printed as a trace.

    Args:
        rules: Sequence of ``(left, right)`` symbol pairs, in the order they
            should be applied.
        tokens: List of starting tokens. It is not modified.

    Returns:
        A new list holding the tokens after every rule has been applied.
    """
    # Work on a copy: aliasing the argument would let the first pass rewrite
    # the caller's list in place.
    merged_tokens = list(tokens)
    for rule in rules:
        left, right = rule[0], rule[1]
        result = []
        i = 0
        last = len(merged_tokens) - 1
        while i <= last:
            if i < last and merged_tokens[i] == left and merged_tokens[i+1] == right:
                result.append(left + right)
                # Skip the right-hand symbol; it was consumed by the merge.
                i += 2
            else:
                result.append(merged_tokens[i])
                i += 1
        merged_tokens = result
        print(f"Applying Rule: {rule[0]} + {rule[1]} -> {rule[0]}{rule[1]}")
        print("Merged Tokens:",merged_tokens)
        print()
    return merged_tokens

# Run the learned merges over the test tokens, then report the segmentation.
final_tokens = apply_rules(rules, tokens)
print("Final List of Subword Tokens:", final_tokens)

Original Test Sentence: lower
List of Subword Tokens: ['l', 'o', 'w', 'e', 'r', '</w>']

Applying Rule: l + o -> lo
Merged Tokens: ['lo', 'w', 'e', 'r', '</w>']

Applying Rule: lo + w -> low
Merged Tokens: ['low', 'e', 'r', '</w>']

Applying Rule: n + e -> ne
Merged Tokens: ['low', 'e', 'r', '</w>']

Applying Rule: ne + w -> new
Merged Tokens: ['low', 'e', 'r', '</w>']

Applying Rule: e + r -> er
Merged Tokens: ['low', 'er', '</w>']

Final List of Subword Tokens: ['low', 'er', '</w>']
