In [201]:
import collections, re

class Tokenizer:
    def __init__(self):
        self.vocab = {}
        self.merges = collections.defaultdict(int)
        self.tokens = []

Learn Vocabulary

In [202]:
def learn_vocabulary(self, corpus, num_merges):
    # Count character frequencies in the corpus
    word_freqs = collections.defaultdict(int)
    for word in corpus:
        # print("word: ", word)
        _word = ' '.join(list(word)) + ' </w>'
        word_freqs[_word] += 1

    # Create the initial vocabulary
    for word, freq in word_freqs.items():
        self.vocab[word] = freq

    print("self.vocab", self.vocab)
    for word in self.vocab:
        self.tokens.extend(word.split())
    # Learn the split rules and frequencies
    for _ in range(num_merges):
        pairs = collections.defaultdict(int)
        for word, freq in self.vocab.items():
            symbols = word.split()
            # print("symbols: ", symbols)
            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += freq

        # print("pairs: ", pairs)
        if not pairs:
            break

        # Get the most frequent pair
        most_frequent_pair = max(pairs, key=pairs.get)
        # print("most_frequent_pair: ", most_frequent_pair)
        self.merges[most_frequent_pair] += 1
        # print("self.merges: ", self.merges)
        # Merge the most frequent pair in the vocabulary
        new_vocab = {}
        bigram = re.escape(' '.join(most_frequent_pair)) 
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)') 
        for word in self.vocab:
            # new_vocab[word] = self.vocab[word]
            new_word = p.sub(''.join(most_frequent_pair), word)
            new_vocab[new_word] = self.vocab[word]
        self.vocab = new_vocab
        
        # traverse the vocab and add the new words to mastervocab
        for word in self.vocab:
            self.tokens.extend(word.split())
        

Split Word

In [203]:
def _split_word(self, word):
    if len(word) == 1:
        return [word]

    split_tokens = []
    for i in range(len(word) - 1):
        pair = (word[i], word[i+1])
        if pair in self.merges:
            split_tokens.append(''.join(pair))
        else:
            split_tokens.append(word[i])
    split_tokens.append(word[-1])
    return split_tokens

Tokenize

In [217]:
def tokenize(self, sample):
    chars = []
    for word in sample.split():
        chars.extend(list(word))
    chars.extend('</w>')
    print("chars: ", chars)
    tokens = []
    i = 0

    while i < len(chars) - 1:
        pair = (chars[i], chars[i + 1])
        token = chars[i]

        while pair in self.merges:
            token += chars[i + 1]
            i += 1
            pair = (token, chars[i + 1])

        tokens.append(token)
        i += 1

    # remove last 3 tokens
    tokens = tokens[:-3]
    return tokens
    # tokens = []
    # for word in sample.split():
    #     word_tokens = []
    #     for char in word:
    #         if char in self.vocab:
    #             word_tokens.append(char)
    #         else:
    #             word_tokens.extend(self._split_word(char))
    #     tokens.extend(word_tokens)
    return tokens

In [218]:
Tokenizer.learn_vocabulary = learn_vocabulary
Tokenizer.tokenize = tokenize
Tokenizer._split_word = _split_word

In [210]:
corpus = []
with open('corpus.txt', 'r') as f:
    for line in f:
        # print("line.strip()", line.strip())
        corpus.extend(line.strip().split())

tokenizer = Tokenizer()
tokenizer.learn_vocabulary(corpus, 1000)

self.vocab {'i </w>': 3789, 's t a n d </w>': 10, 'h e r e </w>': 37, 'f e e l </w>': 1637, 'e m p t y </w>': 9, 'a </w>': 916, 'c l a s s </w>': 13, 'p o s t </w>': 14, 'c o u n t </w>': 4, 'l i n k </w>': 2, 'h r e f </w>': 25, 'h t t p </w>': 30, 'm o o s h i l u </w>': 1, 'l i t e r a l l y </w>': 4, 'j u s t </w>': 226, 't e x t </w>': 3, 't y c h e l l e </w>': 1, 't o </w>': 1340, 's e e </w>': 45, 'i f </w>': 134, 's h e </w>': 72, 'w a n t s </w>': 7, 'h a n g </w>': 1, 'o u t </w>': 113, 'b e c a u s e </w>': 183, 'r e a d i n g </w>': 13, 'w h a t </w>': 125, 'w r o t e </w>': 4, 'a b o u t </w>': 298, 'm y </w>': 642, 'n o n e x i s t e n t </w>': 1, 's o c i a l </w>': 7, 'l i f e </w>': 96, 'm a d e </w>': 36, 'm e </w>': 382, 's o </w>': 366, 'p a t h e t i c </w>': 6, 'r e a l l y </w>': 157, 'r e g r e t f u l </w>': 6, 'w h e n </w>': 228, 'h e a r i n g </w>': 4, 't h a t </w>': 796, 's h i n a e </w>': 2, 'g o t </w>': 41, 'm a r r i e d </w>': 4, 'a n o t h e r </w

Generate all possible tokens from the vocabulary

In [211]:
tokens = set(tokenizer.tokens)
# write tokens to tokens.txt
with open('tokens.txt', 'w') as f:
    for token in tokens:
        f.write(token + '\n')

Generate all merge rules learnt after learning the vocabulary

In [212]:
with open('merges.txt', 'w') as f:
    for merge, freq in tokenizer.merges.items():
        f.write(merge[0] + ',' + merge[1] + '\n')

In [219]:
sample_corpus = []
with open('sample_corpus.txt', 'r') as f:
    for line in f:
        sample_corpus.append(line.strip())

for sentence in sample_corpus:
    with open('tokenized_samples.txt', 'a') as f:
        f.write(','.join(tokenizer.tokenize(sentence)) + '\n')
