# Tokenization

In [2]:
# Baseline tokenizer: cut the sentence on runs of whitespace only, so the
# contraction and the trailing period stay glued to their words.
text = "I'm going to school today."
tokens = str.split(text)

print(tokens)

["I'm", 'going', 'to', 'school', 'today.']


In [4]:
# Download the 'punkt' and 'punkt_tab' NLTK resources (tokenizer data).
# The value of the last download call is what the cell displays.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
# If the download fails with an SSL certificate error, upgrading certifi may
# help: pip install --upgrade certifi

[nltk_data] Downloading package punkt to /Users/maohieng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/maohieng/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Tokenize the same sentence with NLTK's word_tokenize and print the result
# for comparison with the plain str.split() tokens produced earlier.
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)

print(tokens)

['I', "'m", 'going', 'to', 'school', 'today', '.']


## Byte Pair Encoding (BPE)

In [17]:
# Toy corpus for the manual walkthrough; '_' appears after each of the two words.
corpus = 'banana_bandana_'

In [18]:
from collections import Counter

# Frequency table of the individual characters in the toy corpus.
counter = Counter()
counter.update(corpus)

print(counter)

Counter({'a': 6, 'n': 4, 'b': 2, '_': 2, 'd': 1})


In [19]:
# The two highest-count entries, as (symbol, count) tuples.
most2common = counter.most_common(2)
print(most2common)

# Runner-up entry: its symbol, then its count, one per line.
print(*most2common[1], sep='\n')

[('a', 6), ('n', 4)]
n
4


In [20]:
# Update the first most common character's count
counter[most2common[0][0]] = counter[most2common[0][0]] - most2common[1][1]

# Remove the second most common character from counter
del counter[most2common[1][0]]

# Add the combination
counter[most2common[0][0] + most2common[1][0]] = most2common[1][1]

In [21]:
# Show the symbol counts after the merge step above.
print(counter)

Counter({'an': 4, 'b': 2, 'a': 2, '_': 2, 'd': 1})


In [22]:
# Re-select the two highest-count symbols from the updated counter.
most2common = counter.most_common(2)
print(most2common)

[('an', 4), ('b', 2)]


In [23]:
# Update the first most common character's count
counter[most2common[0][0]] = counter[most2common[0][0]] - most2common[1][1]

# Remove the second most common character from counter
del counter[most2common[1][0]]

# Add the combination
counter[most2common[0][0] + most2common[1][0]] = most2common[1][1]

In [24]:
# Show the symbol counts after the second merge step.
print(counter)

Counter({'a': 2, '_': 2, 'an': 2, 'anb': 2, 'd': 1})


### Khmer Encoding Using BPE

In [9]:
# Khmer sample string for the BPE section. Note: the following cell declares
# the same literal again as `text` and works on that variable.
corpus = 'សិស្សរៀនសិស្សអានសិស្សពូកែអានអាចសរសេរគ្រូសរសើរសិស្សពូកែសិក្សា'

In [15]:
from collections import Counter

text = 'សិស្សរៀនសិស្សអានសិស្សពូកែអានអាចសរសេរគ្រូសរសើរសិស្សពូកែសិក្សា'

# Merge table: placeholder symbol -> the two-symbol string it replaced.
# It is created once, before the loop, so that every merge is recorded and
# the final text can be expanded back to the original by undoing the merges
# in reverse order.
mapping = {}

# Run up to 10 merge rounds. Each round replaces the most frequent adjacent
# pair of symbols with a single placeholder letter ('A', 'B', ...).
# The placeholders assume the text contains no Latin capital letters itself.
for merge in range(10):
    print('\nMerge:', (merge + 1))
    print("Text:", text)

    # Every placeholder is one character, so splitting the string into
    # characters still yields exactly one token per symbol.
    tokens = list(text)

    vocab = Counter(tokens)

    print('Vocab:', vocab)

    # All adjacent symbol pairs, concatenated into two-character strings.
    pair = [t1 + t2 for t1, t2 in zip(tokens[:-1], tokens[1:])]
    counter_pair = Counter(pair)

    print('Pair:', counter_pair)

    # Fewer than two symbols left: there is nothing to merge any more.
    if not counter_pair:
        break

    # v is a (pair_string, count) tuple for the most frequent pair.
    v = counter_pair.most_common(1)[0]
    print('First most common pair:', v[0], v[1])

    # Placeholder for this round: 'A' for the first merge, 'B' for the second, ...
    k = chr(ord('A') + merge)
    mapping[k] = v[0]

    text = text.replace(v[0], k)
    print(text)

print('\n\nFinal text replacement:', text)


Merge: 1
Text: សិស្សរៀនសិស្សអានសិស្សពូកែអានអាចសរសេរគ្រូសរសើរសិស្សពូកែសិក្សា
Vocab: Counter({'ស': 18, '្': 6, 'រ': 6, 'ិ': 5, 'ា': 4, 'ន': 3, 'អ': 3, 'ូ': 3, 'ក': 3, 'ព': 2, 'ែ': 2, 'ៀ': 1, 'ច': 1, 'េ': 1, 'គ': 1, 'ើ': 1})
Pair: Counter({'សិ': 5, '្ស': 5, 'ិស': 4, 'ស្': 4, 'សរ': 3, 'អា': 3, 'រស': 3, 'នស': 2, 'ាន': 2, 'សព': 2, 'ពូ': 2, 'ូក': 2, 'កែ': 2, 'រៀ': 1, 'ៀន': 1, 'សអ': 1, 'ែអ': 1, 'នអ': 1, 'ាច': 1, 'ចស': 1, 'សេ': 1, 'េរ': 1, 'រគ': 1, 'គ្': 1, '្រ': 1, 'រូ': 1, 'ូស': 1, 'សើ': 1, 'ើរ': 1, 'ែស': 1, 'ិក': 1, 'ក្': 1, 'សា': 1})
First most common pair: សិ 5
Aស្សរៀនAស្សអានAស្សពូកែអានអាចសរសេរគ្រូសរសើរAស្សពូកែAក្សា

Merge: 2
Text: Aស្សរៀនAស្សអានAស្សពូកែអានអាចសរសេរគ្រូសរសើរAស្សពូកែAក្សា
Vocab: Counter({'ស': 13, '្': 6, 'រ': 6, 'A': 5, 'ា': 4, 'ន': 3, 'អ': 3, 'ូ': 3, 'ក': 3, 'ព': 2, 'ែ': 2, 'ៀ': 1, 'ច': 1, 'េ': 1, 'គ': 1, 'ើ': 1})
Pair: Counter({'្ស': 5, 'Aស': 4, 'ស្': 4, 'សរ': 3, 'អា': 3, 'នA': 2, 'ាន': 2, 'សព': 2, 'ពូ': 2, 'ូក': 2, 'កែ': 2, 'រស': 2, 'រៀ': 1, 'ៀន': 1, 'សអ': 1, 'ែអ': 1, 'ន