# WordPiece Tokenization

In [1]:
#Define a corpus of words
corpus = [
    "This is the beginning of Hugging Face Course",
    "This chapter is about tokenization",
    "This section shows several tokenization algorithm",
    "Hopefully you will be able to understand how they are trained to generate tokens"
]

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
  words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
  new_words = [word for word, offset in words_with_offsets]
  for word in new_words:
    word_freqs[word]+=1

print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'is': 2, 'the': 1, 'beginning': 1, 'of': 1, 'Hugging': 1, 'Face': 1, 'Course': 1, 'chapter': 1, 'about': 1, 'tokenization': 2, 'section': 1, 'shows': 1, 'several': 1, 'algorithm': 1, 'Hopefully': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 2, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'generate': 1, 'tokens': 1})


In [4]:
alphabet = []
for word in word_freqs.keys():
  if word[0] not in alphabet:
    alphabet.append(word[0])
  for letter in word[1:]:
    if f"##{letter}" not in alphabet:
      alphabet.append(f"##{letter}")

In [5]:
print(alphabet)

['T', '##h', '##i', '##s', 'i', 't', '##e', 'b', '##g', '##n', 'o', '##f', 'H', '##u', 'F', '##a', '##c', 'C', '##o', '##r', 'c', '##p', '##t', 'a', '##b', '##k', '##z', 's', '##w', '##v', '##l', '##m', '##y', 'y', 'w', 'u', '##d', 'h', 'g']


In [6]:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]","[MASK]"] + alphabet.copy()
vocab

['[PAD]',
 '[UNK]',
 '[CLS]',
 '[SEP]',
 '[MASK]',
 'T',
 '##h',
 '##i',
 '##s',
 'i',
 't',
 '##e',
 'b',
 '##g',
 '##n',
 'o',
 '##f',
 'H',
 '##u',
 'F',
 '##a',
 '##c',
 'C',
 '##o',
 '##r',
 'c',
 '##p',
 '##t',
 'a',
 '##b',
 '##k',
 '##z',
 's',
 '##w',
 '##v',
 '##l',
 '##m',
 '##y',
 'y',
 'w',
 'u',
 '##d',
 'h',
 'g']

In [7]:
splits = {
    word: [c if i==0 else f"##{c}" for i,c in enumerate(word)]
    for word in word_freqs.keys()
}

In [8]:
splits

{'This': ['T', '##h', '##i', '##s'],
 'is': ['i', '##s'],
 'the': ['t', '##h', '##e'],
 'beginning': ['b', '##e', '##g', '##i', '##n', '##n', '##i', '##n', '##g'],
 'of': ['o', '##f'],
 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'],
 'Face': ['F', '##a', '##c', '##e'],
 'Course': ['C', '##o', '##u', '##r', '##s', '##e'],
 'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'],
 'about': ['a', '##b', '##o', '##u', '##t'],
 'tokenization': ['t',
  '##o',
  '##k',
  '##e',
  '##n',
  '##i',
  '##z',
  '##a',
  '##t',
  '##i',
  '##o',
  '##n'],
 'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'],
 'shows': ['s', '##h', '##o', '##w', '##s'],
 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'],
 'algorithm': ['a', '##l', '##g', '##o', '##r', '##i', '##t', '##h', '##m'],
 'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'],
 'you': ['y', '##o', '##u'],
 'will': ['w', '##i', '##l', '##l'],
 'be': ['b', '##e'],
 'able': ['a', '##b', '##l'

In [9]:
def compute_pair_scores(splits):
  letter_freqs = defaultdict(int)
  pair_freqs = defaultdict(int)
  for word, freq in word_freqs.items():
    split = splits[word]
    if len(split) == 1:
      letter_freqs[split[0]] += freq
      continue
    for i in range(len(split)-1):
      pair = (split[i], split[i+1])
      letter_freqs[split[i]] += freq
      pair_freqs[pair]+=freq
    letter_freqs[split[-1]]+=freq

    scores = {
        pair: freq/(letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

In [10]:
pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
  print(f"{key}: {pair_scores[key]}")
  if i>=5:
    break

('T', '##h'): 0.3333333333333333
('##h', '##i'): 0.3333333333333333
('##i', '##s'): 0.3333333333333333


In [13]:
best_pair = ""
max_score = None
for pair, score in pair_scores.items():
  if max_score is None or max_score < score:
    max_score = score
    best_pair = pair

print(best_pair, score)

('T', '##h') 0.3333333333333333
