In [1]:
import collections

class Tokenizer:
    """Holds the state of a pair-merging tokenizer.

    A new instance starts with an empty vocabulary and no merges.
    """

    def __init__(self):
        # Symbol -> count mapping; starts out empty.
        self.vocab = dict()
        # Pair -> count mapping; a defaultdict(int), so a missing key
        # defaults to 0.
        self.merges = collections.defaultdict(int)

In [2]:
def learn_vocabulary(self, corpus, num_merges):
    """Learn a symbol vocabulary and pair merges from ``corpus``.

    Each corpus entry is treated as a sequence of whitespace-separated
    symbols. The vocabulary is seeded with the frequency of every character
    occurring in the corpus (whitespace characters included). Then, up to
    ``num_merges`` times, the most frequent pair of adjacent symbols is
    fused into a single new symbol.

    Args:
        corpus: Iterable of strings.
        num_merges: Maximum number of merge rounds. Learning stops early
            once no adjacent symbol pairs remain.

    Side effects:
        Replaces ``self.vocab`` with a ``{symbol: count}`` dict and adds
        each merged pair as a key of ``self.merges``. The caller's
        ``corpus`` object is not modified.
    """
    # Materialize once so a one-shot iterable can be traversed twice.
    corpus = list(corpus)

    # Count character frequencies in the corpus
    char_freqs = collections.Counter()
    for word in corpus:
        char_freqs.update(word)

    # Initialize the vocabulary with single characters
    self.vocab = dict(char_freqs)

    # Keep every entry as a list of symbols. Merging on the joined string
    # with str.replace can match across symbol boundaries (pair ('a', 'b')
    # would turn "xa b" into "xab"), so merges are done on symbols instead.
    words = [word.split() for word in corpus]

    # Learn the merge rules and frequencies
    for _ in range(num_merges):
        pairs = collections.Counter()
        for symbols in words:
            pairs.update(zip(symbols, symbols[1:]))

        # Every entry is down to a single symbol (or the corpus is empty):
        # nothing left to merge, and max() on an empty mapping would raise.
        if not pairs:
            break

        # Find the most frequent pair (first one seen wins ties)
        most_frequent_pair = max(pairs, key=pairs.get)
        self.merges[most_frequent_pair] += 1

        # Update the vocabulary with the merged pair
        new_symbol = ''.join(most_frequent_pair)
        self.vocab[new_symbol] = pairs[most_frequent_pair]

        # Replace the merged pair in the corpus, scanning left to right so
        # overlapping occurrences are merged without reusing a symbol.
        merged_words = []
        for symbols in words:
            merged = []
            i = 0
            while i < len(symbols):
                if tuple(symbols[i:i + 2]) == most_frequent_pair:
                    merged.append(new_symbol)
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            merged_words.append(merged)

        words = merged_words

In [3]:
def tokenize(self, sample):
    """Split ``sample`` into tokens using the learned merges.

    The sample is split on whitespace. Each word starts as the list of its
    characters; every pair in ``self.merges`` is then applied in iteration
    order, fusing adjacent occurrences of that pair into a single token.

    Args:
        sample: Text to tokenize.

    Returns:
        A flat list of token strings covering all words, in order. When
        ``self.merges`` is empty this is the sample's non-whitespace
        characters.
    """
    tokens = []
    for word in sample.split():
        symbols = list(word)
        for pair in self.merges:
            # Fewer than two symbols left: no further merge can apply.
            if len(symbols) < 2:
                break
            fused = ''.join(pair)
            merged = []
            i = 0
            # Left-to-right scan; a matched pair consumes both symbols so
            # no character is emitted twice.
            while i < len(symbols):
                if tuple(symbols[i:i + 2]) == pair:
                    merged.append(fused)
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            symbols = merged
        tokens.extend(symbols)
    return tokens

In [4]:
def _split_word(self, word):
    if len(word) == 1:
        return [word]

    split_tokens = []
    for i in range(len(word) - 1):
        pair = (word[i], word[i+1])
        if pair in self.merges:
            split_tokens.append(''.join(pair))
        else:
            split_tokens.append(word[i])
    split_tokens.append(word[-1])
    return split_tokens

In [5]:
# Bind the functions defined in the cells above as attributes of Tokenizer.
setattr(Tokenizer, 'learn_vocabulary', learn_vocabulary)
setattr(Tokenizer, 'tokenize', tokenize)
setattr(Tokenizer, '_split_word', _split_word)

In [6]:
# Load the corpus: one entry per line of corpus.txt, with surrounding
# whitespace stripped.
corpus = []
with open('corpus.txt', 'r') as f:
    corpus.extend(line.strip() for line in f)

# Train a fresh tokenizer on the corpus, allowing up to 1000 merge rounds.
tokenizer = Tokenizer()
tokenizer.learn_vocabulary(corpus, 1000)

# Collect every key of the learned vocabulary.
tokens = list(tokenizer.vocab)

# Save the tokens to tokens.txt, one per line.
with open('tokens.txt', 'w') as f:
    f.writelines(token + '\n' for token in tokens)

# Save the merges to merges.txt as tab-separated "<pair>\t<count>" lines.
with open('merges.txt', 'w') as f:
    f.writelines(
        '{}\t{}\n'.format(merge, freq)
        for merge, freq in tokenizer.merges.items()
    )