In [21]:
import re 
from collections import defaultdict 
import string
  
def get_stats(vocab): 
    """ 
    Given a vocabulary (dictionary mapping words to frequency counts), returns a  
    dictionary of tuples representing the frequency count of pairs of characters  
    in the vocabulary. 
    """
    pairs = defaultdict(int) 
    for word, freq in vocab.items(): 
        chars = word.split() # split the word by any white space
        for i in range(len(chars)-1): 
            pairs[chars[i], chars[i+1]] += freq 
    return pairs 
  
def merge_vocab(token_pair, v_in): 
    """ 
    Given a pair of characters and a vocabulary, returns a new vocabulary with the  
    pair of characters merged together wherever they appear. 
    """
    v_out = defaultdict(int)  
    bigram = re.escape(' '.join(token_pair)) 
    new_token = ''.join(token_pair)
    # search for every occurance of bigram (token pairs with a space), 
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)') 
    for word in v_in:
        # replace the bigram (with space), with the new merged token (the concanated pair)
        w_out = p.sub(new_token, word)
        v_out[w_out] = v_in[word]
    return v_out
  
def get_init_vocab(data): 
    """ 
    Given a list of strings, returns a dictionary of words mapping to their frequency  
    count in the data. 
    """
    vocab = defaultdict(int)
    tokens = set()
    tokens.add('</w>')
    for line in data: 
        for word in line.split(): 
            vocab[' '.join(list(word)) + ' </w>'] += 1
            tokens.update(list(word))
    return vocab, tokens 
  
def byte_pair_encoding(data, n): 
    """ 
    Given a list of strings and an integer n, returns a list of n merged pairs 
    of characters found in the vocabulary of the input data. 
    """
    vocab, init_tokens = get_init_vocab(data)
    tokens = list(init_tokens)
    for i in range(n): 
        pairs = get_stats(vocab) 
        best_pair = max(pairs, key=pairs.get) 
        vocab = merge_vocab(best_pair, vocab)
        tokens.append(''.join(best_pair))
        print('step {}: merging \"{}\" and \"{}\"'.format(i+1, best_pair[0], best_pair[1]))
    return tokens

def tokenize(data, token_dict):
    """split the data into a tokens and map into index"""
    encoded_ids = []
    for line in data: 
        for word in line.split():
            word = word + '</w>'
            last_idx = 0
            idx = len(word)
            while idx > last_idx:
                whole_word = word[last_idx:idx]
                if whole_word in token_dict:
                    encoded_ids.append(token_dict[whole_word])
                    last_idx = idx
                    idx = len(word)
                else:
                    idx = idx - 1
    return encoded_ids
  
# Example usage: 
corpus = '''Berman's parents divorced when he was seven. 
Thereafter, he split time between each parent's household until he entered college.[6] 
His father relocated to Dallas for a position as a lobbyist on behalf of foodservice businesses, 
while his mother moved back in with her parents in Wooster, Ohio, and became a teacher there'''
data = corpus.split('.') 
  
n = 20 # number of merge operations
bpe_vocab = byte_pair_encoding(data, n) 


bpe_dict = dict([(tk, id) for id, tk in enumerate(bpe_vocab)])
id_to_token = dict([(tid, tk) for tk, tid in bpe_dict.items()]  )

token_ids = tokenize(data, bpe_dict)

print("The bpe tokens are: ")
for tk, tid in bpe_dict.items():
    print("{}: {}".format(tk, tid))

print(token_ids)
print(' '.join(id_to_token[tid] for tid in token_ids))


step 1: merging "e" and "r"
step 2: merging "s" and "</w>"
step 3: merging "e" and "</w>"
step 4: merging "e" and "n"
step 5: merging "d" and "</w>"
step 6: merging "h" and "er"
step 7: merging "en" and "t"
step 8: merging "e" and "d</w>"
step 9: merging "," and "</w>"
step 10: merging "her" and "</w>"
step 11: merging "n" and "</w>"
step 12: merging "p" and "a"
step 13: merging "pa" and "r"
step 14: merging "par" and "ent"
step 15: merging "en" and "</w>"
step 16: merging "h" and "e</w>"
step 17: merging "a" and "s</w>"
step 18: merging "s" and "e"
step 19: merging "e" and "a"
step 20: merging "i" and "t"
The bpe tokens are: 
,: 0
H: 1
</w>: 2
]: 3
': 4
v: 5
s: 6
y: 7
m: 8
a: 9
w: 10
c: 11
r: 12
f: 13
h: 14
i: 15
T: 16
D: 17
W: 18
g: 19
t: 20
n: 21
O: 22
B: 23
k: 24
l: 25
p: 26
b: 27
[: 28
o: 29
6: 30
u: 31
e: 32
d: 33
er: 34
s</w>: 35
e</w>: 36
en: 37
d</w>: 38
her: 39
ent: 40
ed</w>: 41
,</w>: 42
her</w>: 43
n</w>: 44
pa: 45
par: 46
parent: 47
en</w>: 48
he</w>: 49
as</w>: 50
se: 51