In [116]:
import re
import collections

### ---> reading a text

In [117]:
def read_txt_file(name, method):
    """Return the entire contents of a file.

    Args:
        name: Path of the file to open.
        method: Mode string passed straight to ``open`` (e.g. ``"r"``).

    Returns:
        Whatever ``read()`` produces for the given mode.
    """
    with open(name, method) as source:
        return source.read()

# Load the whole corpus in text mode and preview its first 70 characters.
the_verdict = read_txt_file("the-verdict.txt", "r")
print(the_verdict[:70])

I HAD always thought Jack Gisburn rather a cheap genius--though a good


### ---> creating a vocabulary of unique words and punctuation tokens (word-based tokenization)

In [118]:
def word_based_vocab(text):
    """Build the sorted list of unique word and punctuation tokens in ``text``.

    The text is split on whitespace, on ``--`` and on the punctuation
    characters ``. , : ; ! _ ? ( ) " '``. The capturing group keeps each
    delimiter as a token of its own; tokens that are empty after stripping
    (whitespace delimiters, empty split results) are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A sorted list of the distinct tokens.
    """
    pieces = re.split(r'([.,:;!_?()"\']|--|\s)', text)
    stripped = (piece.strip() for piece in pieces)
    return sorted({token for token in stripped if token})

# Build the word-level vocabulary from the loaded text and report how many
# distinct tokens it contains.
word__based_vocab = word_based_vocab(the_verdict)
print(f"Vocabulary Size Word based: {len(word__based_vocab)}")
        

Vocabulary Size Word based: 1130


### ---> creating vocabulary with unique chars (Character based tokenization)

In [119]:
def character_based_vocab(text):
    """Derive the character alphabet of ``text`` together with its token list.

    The text is tokenized by splitting on whitespace, ``--`` and the
    punctuation characters ``. , : ; ! _ ? ( ) " '`` (delimiters are kept as
    tokens); tokens that are empty after stripping are dropped.

    Args:
        text: The string to process.

    Returns:
        A ``(chars, tokens)`` tuple: ``chars`` is the sorted list of distinct
        characters appearing in any token, ``tokens`` is the full token list
        in document order (duplicates included).
    """
    fragments = (piece.strip() for piece in re.split(r'([.,:;!_?()"\']|--|\s)', text))
    tokens = [fragment for fragment in fragments if fragment]
    alphabet = {symbol for token in tokens for symbol in token}
    return sorted(alphabet), tokens

# Extract the character alphabet and the ordered token list from the corpus,
# then report the alphabet size.
character__based_vocab, corpus__pre_1 = character_based_vocab(the_verdict)
print(f"Vocabulary Size Character based: {len(character__based_vocab)}")

Vocabulary Size Character based: 60


### ---> mapping each word (as a tuple of its chars plus an end-of-word marker) to its frequency

In [120]:
def freq_mapping(word_list):
    """Count occurrences of each word, keyed by its end-marked symbol tuple.

    Every non-empty word is expanded into a tuple of its elements followed by
    the end-of-word marker ``"</w>"``; the returned dict maps that tuple to
    the number of times the word appears in ``word_list``.

    Args:
        word_list: Iterable of words (falsy entries are skipped).

    Returns:
        A dict ``{(elem, ..., "</w>"): count}`` in first-seen order.
    """
    counts = {}
    for word in word_list:
        if not word:
            continue
        key = (*word, "</w>")
        counts[key] = counts.get(key, 0) + 1
    return counts

# Count how often each end-marked character tuple occurs in the token list.
freq__mapping = freq_mapping(corpus__pre_1)

### ---> get char pairs and their frequencies using defaultdict

In [121]:
def get_pair_frequenciesV1(word_dict):
    """Tally adjacent symbol pairs, weighted by word frequency (defaultdict version).

    Args:
        word_dict: Mapping from a symbol sequence to its frequency.

    Returns:
        A ``collections.defaultdict(int)`` mapping each adjacent pair
        ``(left, right)`` to the summed frequency of the words containing it
        (a pair occurring twice in one word is counted twice).
    """
    tally = collections.defaultdict(int)
    for symbols, count in word_dict.items():
        sequence = list(symbols)
        # Pair every symbol with its right-hand neighbour.
        for neighbours in zip(sequence, sequence[1:]):
            tally[neighbours] += count
    return tally

# Pair statistics for the initial (character-level) word splits.
pair_frequenciesV1 = get_pair_frequenciesV1(freq__mapping)


### ---> get char pairs and their frequencies using regular dict

In [122]:
def get_pair_frequenciesV2(word_dict):
    """Tally adjacent symbol pairs, weighted by word frequency (plain-dict version).

    Args:
        word_dict: Mapping from a symbol sequence to its frequency.

    Returns:
        A regular dict mapping each adjacent pair ``(left, right)`` to the
        summed frequency of the words containing it.
    """
    totals = {}
    for symbols, count in word_dict.items():
        sequence = list(symbols)
        for left, right in zip(sequence, sequence[1:]):
            key = (left, right)
            if key in totals:
                totals[key] += count
            else:
                totals[key] = count
    return totals

# Same statistics as above, computed with the plain-dict implementation.
pair_frequenciesV2 = get_pair_frequenciesV2(freq__mapping)

# Pick the pair with the highest weighted count and report it.
best_pair = max(pair_frequenciesV2, key=pair_frequenciesV2.get)
print(f"Most frequent pair: {best_pair}")
best_freq = pair_frequenciesV2[best_pair]
print(f"Number of occurrences: {best_freq}")

Most frequent pair: ('e', '</w>')
Number of occurrences: 729


### ---> merge one pair from frequency dictionary

In [123]:
def merge_pair(pair_to_merge, new_mapping):
    """Replace every adjacent occurrence of a symbol pair with one merged symbol.

    Each key of ``new_mapping`` is scanned left to right; wherever ``first``
    is immediately followed by ``second`` the two symbols are replaced by
    their concatenation. Matches do not overlap: after a merge the scan
    resumes behind the merged pair.

    Args:
        pair_to_merge: ``(first, second)`` symbols to merge.
        new_mapping: Mapping from a symbol tuple to its frequency. It is not
            modified.

    Returns:
        A new dict keyed by the rewritten symbol tuples. If several input
        tuples collapse to the same rewritten tuple, their frequencies are
        summed so no counts are lost.
    """
    first, second = pair_to_merge
    merged_token = first + second
    mapping = {}
    for word_tuple, freq in new_mapping.items():
        symbols = list(word_tuple)
        last_index = len(symbols) - 1
        rewritten = []
        i = 0
        while i < len(symbols):
            if i < last_index and symbols[i] == first and symbols[i + 1] == second:
                rewritten.append(merged_token)
                i += 2  # skip both halves of the merged pair
            else:
                rewritten.append(symbols[i])
                i += 1
        key = tuple(rewritten)
        # Accumulate instead of overwriting: distinct inputs can map to the
        # same output tuple, and a plain assignment would drop the earlier count.
        mapping[key] = mapping.get(key, 0) + freq
    return mapping

# One-off demonstration: merge the pair ("I", "</w>") across the initial splits.
pair_merge = merge_pair(("I", "</w>"), freq__mapping)

### ---> merging loop

In [124]:
# Number of BPE merge rounds to perform.
num_merges = 40
# Learned merges in the order they were found: (left, right) -> merged token.
merges = {}
# Work on a copy so the initial character-level splits stay available.
current_splits = freq__mapping.copy()

for i in range(num_merges):
    pair_stats = get_pair_frequenciesV1(current_splits)
    if not pair_stats:
        # Every word has collapsed to a single symbol; nothing left to merge.
        print("No more pairs to merge")
        break

    # Greedy step: take the currently most frequent adjacent pair.
    best_pair = max(pair_stats, key=pair_stats.get)
    best_freq = pair_stats[best_pair]

    current_splits = merge_pair(best_pair, current_splits)
    new_token = best_pair[0] + best_pair[1]

    # character__based_vocab was created in an earlier cell and is extended in
    # place here. Guard the append so re-running this cell (without re-running
    # the cell that built the vocabulary) does not add the same tokens again.
    if new_token not in character__based_vocab:
        character__based_vocab.append(new_token)
    merges[best_pair] = new_token
    

### ---> results overview

In [125]:
# Summary header: vocabulary size after merging, then the merge table title.
print(
    "\n--- BPE Merges Complete ---",
    f"Final Vocabulary Size {len(character__based_vocab)}",
    "\nLearned Merges (Pair -> New Token):",
    sep="\n",
)

# One line per learned merge, in the order the merges were found.
for pair, token in merges.items():
    print(f"{pair} -> '{token}'")

print("\nFinal Vocabulary (sorted):")

# De-duplicate and sort the vocabulary for display.
final_vocab_sorted = sorted(set(character__based_vocab))
print(final_vocab_sorted)


--- BPE Merges Complete ---
Final Vocabulary Size 100

Learned Merges (Pair -> New Token):
('e', '</w>') -> 'e</w>'
('t', '</w>') -> 't</w>'
('d', '</w>') -> 'd</w>'
('s', '</w>') -> 's</w>'
('t', 'h') -> 'th'
('n', '</w>') -> 'n</w>'
('y', '</w>') -> 'y</w>'
('i', 'n') -> 'in'
(',', '</w>') -> ',</w>'
('.', '</w>') -> '.</w>'
('o', 'u') -> 'ou'
('e', 'r') -> 'er'
('e', 'd</w>') -> 'ed</w>'
('th', 'e</w>') -> 'the</w>'
('o', '</w>') -> 'o</w>'
('a', 'n') -> 'an'
('f', '</w>') -> 'f</w>'
('"', '</w>') -> '"</w>'
('h', 'a') -> 'ha'
('in', 'g') -> 'ing'
('I', '</w>') -> 'I</w>'
('h', 'i') -> 'hi'
('ing', '</w>') -> 'ing</w>'
('h', 'e</w>') -> 'he</w>'
('o', 'n') -> 'on'
('t', 'o</w>') -> 'to</w>'
('w', 'a') -> 'wa'
('o', 'f</w>') -> 'of</w>'
('-', '-') -> '--'
('--', '</w>') -> '--</w>'
('o', 'r') -> 'or'
('a', '</w>') -> 'a</w>'
("'", '</w>') -> ''</w>'
('e', 'a') -> 'ea'
('an', 'd</w>') -> 'and</w>'
('s', 't') -> 'st'
('e', 'n') -> 'en'
('er', '</w>') -> 'er</w>'
('u', 'r') -> 'ur'
('a