In [1]:
import collections
import re

In [36]:
def get_pairs(vocabs: dict[str,int])->dict[str, int]:
    pairs = collections.defaultdict(int)
    for word, freq in vocabs.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            tup_pair = symbols[i], symbols[i+1]
            pairs[tup_pair] += freq
    return pairs

def merge(vocabs: dict[str,int], voc:tuple[str, str]):
    result = {}
    bigram = re.escape(' '.join(voc))
    # (?<!) negative look behind
    # (?!) negative look ahead
    # \S, 공백, 탭, 줄바꿈이 아닌 모든 문자
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in vocabs:
        w_out = p.sub(''.join(voc), word)
        result[w_out] = vocabs[word]
    return result
                      
     
from pprint import pprint
def log(i, pairs, best, vocabs):
    print(f'{i+1}th iteration')
    print(f'현재 pair들의 빈도수: {dict(pairs)}')
    print(f'new merge: {best}')
    pprint(f'vocabs: {vocabs}')

def bypte_pair_encoding(vocabs, n):
    for i in range(n):
        pairs = get_pairs(vocabs)
        most_common_vocab = max(pairs, key=pairs.get)
        vocabs = merge(vocabs, most_common_vocab)
        log(i, pairs, most_common_vocab, vocabs)
    return vocabs


dictionary = {'l o w </w>' : 5,
         'l o w e r </w>' : 2,
         'n e w e s t </w>':6,
         'w i d e s t </w>':3
         }
n = 10
bypte_pair_encoding(dictionary, n)
'end'

1th iteration
현재 pair들의 빈도수: {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 8, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('e', 's'): 9, ('s', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3}
new merge: ('e', 's')
("vocabs: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d "
 "es t </w>': 3}")
2th iteration
현재 pair들의 빈도수: {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'es'): 6, ('es', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'es'): 3}
new merge: ('es', 't')
("vocabs: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d "
 "est </w>': 3}")
3th iteration
현재 pair들의 빈도수: {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est'): 6, ('est', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est'): 3}
new merge: ('est',

'end'