In [2]:
from tokenizer import LinearTokenizer
import tiktoken
import json
import pandas as pd

In [3]:
def upper_first(text):
    """Return ``text`` with only its first character upper-cased.

    The remainder of the string is left exactly as given (unlike
    ``str.capitalize``, which would lower-case it).

    Args:
        text: The string to transform. May be empty.

    Returns:
        A new string; an empty input yields an empty string.
    """
    # Slicing never raises, so the empty string is handled without a
    # special case (indexing ``text[0]`` would raise IndexError).
    return text[:1].upper() + text[1:]


# Read the vocabulary used to construct the project-local LinearTokenizer.
with open('./vocab.json', mode='r', encoding='utf-8') as vocab_file:
    vocab = json.load(vocab_file)

# The two tokenizers compared below: the vocabulary-driven LinearTokenizer
# and tiktoken's "gpt2" encoding as the BPE reference.
tokenizer_lin = LinearTokenizer(vocab)
tokenizer_bpe = tiktoken.get_encoding("gpt2")

# Probe words for the tokenizer comparison, grouped by shared root so that
# inflections and derivations of the same stem sit next to each other.
words = [
    # machine
    'machine', 'machines', 'machined', 'machining', 'machinion',
    'machinery', 'machineries',
    # token
    'token', 'tokens', 'tokenized', 'tokenizing', 'tokenization',
    'tokenizations', 'tokenset', 'tokensets', 'tokenism', 'tokenless',
    "tokenes", "detokenize",
    # query
    'query', 'querying', 'queries', 'queryable', 'queried',
    # container
    'container', 'containers',
    # throttle
    'throttle', 'throttling', 'throttles', 'throttleable', 'throttled',
    # transformer
    'transformer', 'transformers', 'transforming', 'transformable',
    'transformation', 'pretransformer',
]


def encode_word(word):
    """Tokenize ``word`` with both tokenizers and return the token texts.

    Args:
        word: The text to tokenize.

    Returns:
        A ``(tokens_lin, tokens_bpe)`` pair: the tokens reported by
        ``tokenizer_lin`` and a list with the decoded text of each BPE
        token id produced by ``tokenizer_bpe``.
    """
    # With return_token_tuple=True the linear tokenizer hands back two
    # values; only the tokens are needed here, so the ids are discarded.
    _, tokens_lin = tokenizer_lin.encode(word, return_token_tuple=True)

    # Decode every BPE id individually to recover the per-token text.
    ids_bpe = tokenizer_bpe.encode(word)
    tokens_bpe = [tokenizer_bpe.decode([token_id]) for token_id in ids_bpe]
    return tokens_lin, tokens_bpe


# Column-oriented table: one row per word, with the tokenization of the word
# as written ('bpe' / 'lin') and with its first letter upper-cased
# ('BPE' / 'LIN').
data = {'words': words, 'bpe': [], 'lin': [], 'BPE': [], 'LIN': []}
for word in words:
    # Each variant names the text to encode and the columns it fills.
    variants = (
        (word, 'lin', 'bpe'),
        (upper_first(word), 'LIN', 'BPE'),
    )
    for text, lin_column, bpe_column in variants:
        tokens_lin, tokens_bpe = encode_word(text)
        data[lin_column].append(", ".join(tokens_lin))
        data[bpe_column].append(", ".join(tokens_bpe))


# Build the frame, then rebind ``df`` to its left-aligned styled view; the
# bare ``df`` expression on the last line is what the notebook cell displays.
df = pd.DataFrame(data)
df = df.style.set_properties(**{'text-align': 'left'})
df

Unnamed: 0,words,bpe,lin,BPE,LIN
0,machine,machine,machine,Machine,Machine
1,machines,"m, ach, ines","machine, s","Mach, ines","Machine, s"
2,machined,"m, ach, ined","machine, d","Mach, ined","Machine, d"
3,machining,"m, ach, ining","mac, hin, ing","Mach, ining","Mach, ining"
4,machinion,"m, ach, inion","mac, hin, ion","Mach, inion","Mach, inion"
5,machinery,"m, ach, inery","machine, ry","Mach, inery","Machine, ry"
6,machineries,"m, ach, ineries","machine, ries","Mach, ineries","Machine, ries"
7,token,token,token,Token,Token
8,tokens,"t, ok, ens","token, s",Tokens,Tokens
9,tokenized,"token, ized","token, ized","Token, ized","Token, ized"
