In [32]:
class Tokenizer:
    def __init__(self, num_merges=100):
        self.vocab = {}
        self.merge_history = []
        self.num_merges = num_merges
        self.token_to_id = {}
        self.id_to_token = {}
        self.trained = False

    def init_vocab(self, words):
        """
        Initialize the vocabulary with the frequency of each character in all words.
        """
        vocab = {}
        for word in words:
            for char in word:
                vocab[char] = vocab.get(char, 0) + 1
        return vocab

    def get_symbol_freq(self, all_words):
        """
        Calculate the frequency of each adjacent pair of symbols across all tokenized words.
        """
        pairs = {}
        for word_tokens in all_words:
            tokens = word_tokens.split()
            for i in range(len(tokens) - 1):
                if tokens[i] == '|' or tokens[i + 1] == '|':
                    continue
                pair = (tokens[i], tokens[i + 1])
                pairs[pair] = pairs.get(pair, 0) + 1
        return pairs

    def merge_vocab(self, pair):
        """
        Update vocab dictionary with the new merged token.
        """
        token_a, token_b = pair
        merged_token = token_a + token_b
        self.vocab[merged_token] = 0

    def merge_words(self, all_words, pair):
        """
        Replace all instances of the given pair in all word tokens with the merged token.
        """
        token_a, token_b = pair
        merged = token_a + token_b
        pattern = f"{token_a} {token_b}"
        new_words = [word.replace(pattern, merged) for word in all_words]
        return new_words

    def fit(self, text):
        words = text.split()
        all_words = [" ".join(list(word)) for word in words]
        self.vocab = self.init_vocab(words)

        # Create a merge lookup dictionary
        self.merge_lookup = {}

        print("Starting training process...")
        for i in range(self.num_merges):
            pairs = self.get_symbol_freq(all_words)
            if not pairs:
                break

            # Find the most frequent pair
            best_pair, pair_freq = max(pairs.items(), key=lambda x: x[1])

            if best_pair in self.merge_lookup:
                continue

            # Print the merge operation
            print(f"Iteration {i + 1}: Merging pair {best_pair} with frequency {pair_freq}")

            self.merge_vocab(best_pair)
            all_words = self.merge_words(all_words, best_pair)
            self.merge_lookup[best_pair] = ''.join(best_pair)

        # Initialize token-to-id mappings
        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab.keys(), start=1)}
        self.id_to_token = {idx: token for token, idx in self.token_to_id.items()}
        self.token_to_id['<UNK>'] = 0
        self.id_to_token[0] = '<UNK>'
        self.trained = True
        print("Training completed.")

    def tokenize(self, text):
        if not self.trained:
            raise ValueError("Tokenizer has not been trained. Call 'fit' with training data first.")

        words = text.split()
        tokenized_text = []

        for word in words:

            word_tokens = ' '.join(list(word))

            for merged_pair, merged_token in self.merge_lookup.items():
                word_tokens = word_tokens.replace(' '.join(merged_pair), merged_token)

            tokenized_text.extend(word_tokens.split())

        return tokenized_text

    def encode(self, text):
        """
        Encode the input text into a list of token IDs.
        """
        tokens = self.tokenize(text)
        print([token for token in tokens])
        encoded = [self.token_to_id.get(token, self.token_to_id['<UNK>']) for token in tokens]
        return encoded

    def decode(self, token_ids):
        """
        Decode a list of token IDs back into the original text.
        """
        tokens = [self.id_to_token.get(token_id, '<UNK>') for token_id in token_ids]
        text = ' '.join(tokens).replace('  ', ' ').strip()
        return text

tokenizer = Tokenizer(num_merges=50)
text = "".join(open("input.txt").readlines()).replace(".", "").replace(",", "").lower()
text
tokenizer.fit(text)
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
print("Encoded:", encoded)
print("Decoded:", decoded)

Starting training process...
Iteration 1: Merging pair ('a', 'n') with frequency 155
Iteration 2: Merging pair ('i', 'n') with frequency 134
Iteration 3: Merging pair ('t', 'h') with frequency 114
Iteration 4: Merging pair ('e', 'r') with frequency 109
Iteration 5: Merging pair ('o', 'n') with frequency 106
Iteration 6: Merging pair ('e', 'n') with frequency 100
Iteration 7: Merging pair ('e', 's') with frequency 87
Iteration 8: Merging pair ('r', 'e') with frequency 78
Iteration 9: Merging pair ('a', 't') with frequency 71
Iteration 10: Merging pair ('a', 'l') with frequency 71
Iteration 11: Merging pair ('th', 'e') with frequency 66
Iteration 12: Merging pair ('an', 'd') with frequency 62
Iteration 13: Merging pair ('in', 'g') with frequency 59
Iteration 14: Merging pair ('o', 'f') with frequency 55
Iteration 15: Merging pair ('e', 'd') with frequency 52
Iteration 16: Merging pair ('s', 't') with frequency 52
Iteration 17: Merging pair ('i', 'c') with frequency 52
Iteration 18: Mergi

In [24]:
tokenizer.token_to_id

{'p': 1,
 'l': 2,
 'a': 3,
 'n': 4,
 'e': 5,
 's': 6,
 'o': 7,
 'r': 8,
 'i': 9,
 'm': 10,
 'v': 11,
 'f': 12,
 'd': 13,
 'g': 14,
 't': 15,
 'h': 16,
 'u': 17,
 'c': 18,
 'y': 19,
 'b': 20,
 'w': 21,
 'k': 22,
 'j': 23,
 '3': 24,
 '5': 25,
 '0': 26,
 '6': 27,
 'x': 28,
 'z': 29,
 '-': 30,
 "'": 31,
 'q': 32,
 'â': 33,
 '€': 34,
 '™': 35,
 '7': 36,
 '(': 37,
 ')': 38,
 '”': 39,
 'an': 40,
 'in': 41,
 'th': 42,
 'er': 43,
 'on': 44,
 'en': 45,
 'es': 46,
 're': 47,
 'at': 48,
 'al': 49,
 'the': 50,
 'and': 51,
 'ing': 52,
 'of': 53,
 'ed': 54,
 'st': 55,
 'ic': 56,
 'or': 57,
 'ion': 58,
 'it': 59,
 'ro': 60,
 'pl': 61,
 'to': 62,
 'ig': 63,
 'el': 64,
 'ra': 65,
 'ent': 66,
 'ai': 67,
 'ar': 68,
 'ec': 69,
 'as': 70,
 'lo': 71,
 'ly': 72,
 'plan': 73,
 'air': 74,
 'co': 75,
 'is': 76,
 'et': 77,
 'for': 78,
 'em': 79,
 'ation': 80,
 'igh': 81,
 'le': 82,
 'planes': 83,
 'av': 84,
 'ad': 85,
 'ur': 86,
 'wh': 87,
 'ac': 88,
 'un': 89,
 '<UNK>': 0}

In [25]:
tokenizer.merge_lookup

{('a', 'n'): 'an',
 ('i', 'n'): 'in',
 ('t', 'h'): 'th',
 ('e', 'r'): 'er',
 ('o', 'n'): 'on',
 ('e', 'n'): 'en',
 ('e', 's'): 'es',
 ('r', 'e'): 're',
 ('a', 't'): 'at',
 ('a', 'l'): 'al',
 ('th', 'e'): 'the',
 ('an', 'd'): 'and',
 ('in', 'g'): 'ing',
 ('o', 'f'): 'of',
 ('e', 'd'): 'ed',
 ('s', 't'): 'st',
 ('i', 'c'): 'ic',
 ('o', 'r'): 'or',
 ('i', 'on'): 'ion',
 ('i', 't'): 'it',
 ('r', 'o'): 'ro',
 ('p', 'l'): 'pl',
 ('t', 'o'): 'to',
 ('i', 'g'): 'ig',
 ('e', 'l'): 'el',
 ('r', 'a'): 'ra',
 ('en', 't'): 'ent',
 ('a', 'i'): 'ai',
 ('a', 'r'): 'ar',
 ('e', 'c'): 'ec',
 ('a', 's'): 'as',
 ('l', 'o'): 'lo',
 ('l', 'y'): 'ly',
 ('pl', 'an'): 'plan',
 ('ai', 'r'): 'air',
 ('c', 'o'): 'co',
 ('i', 's'): 'is',
 ('e', 't'): 'et',
 ('f', 'or'): 'for',
 ('e', 'm'): 'em',
 ('at', 'ion'): 'ation',
 ('ig', 'h'): 'igh',
 ('l', 'e'): 'le',
 ('plan', 'es'): 'planes',
 ('a', 'v'): 'av',
 ('a', 'd'): 'ad',
 ('u', 'r'): 'ur',
 ('w', 'h'): 'wh',
 ('a', 'c'): 'ac',
 ('u', 'n'): 'un'}

In [31]:
' '.join(list("test"))

't e s t'