In [3]:
import re

# Read the whole text file into memory as a single string.
with open("the-verdict.txt", encoding="utf-8") as file:
    raw_text = file.read()

# Report the character count, then preview the first 99 characters.
print(f"Total number of character: {len(raw_text)}")
print(raw_text[:99])


Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:

# Split the raw text into word and punctuation tokens.
# The capturing group keeps every delimiter (punctuation, "--", whitespace)
# in the split result; stripping each piece and dropping the empty ones
# removes the whitespace-only entries.
preprocessed = list(
    filter(None, map(str.strip, re.split(r'([,.?_!"()\']|--|\s)', raw_text)))
)
print(len(preprocessed))
print(preprocessed[:30])


4649
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [5]:
# Build the vocabulary: every distinct token in sorted order, followed by
# the two special tokens, each mapped to its position in that list.
all_words = sorted(set(preprocessed))
all_words += ["<|endoftext|>", "<|unk|>"]
vocab_size = len(all_words)
print(vocab_size)

vocab = dict(zip(all_words, range(vocab_size)))

# Preview the first 12 (token, id) pairs ...
for item in list(vocab.items())[:12]:
    print(item)

print("\n")
# ... and the last 5, which include the two special tokens appended above.
for item in list(vocab.items())[-5:]:
    print(item)

1161
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)


('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|endoftext|>', 1159)
('<|unk|>', 1160)


In [6]:
# tokenizer

class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on punctuation, double dashes and whitespace; any token
    that is missing from the vocabulary is encoded as the id of "<|unk|>".
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and build its id -> token inverse."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises KeyError if an out-of-vocabulary token is met and the
        vocabulary has no "<|unk|>" entry.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Whitespace delimiters and empty split fragments carry no token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``.

        Tokens are joined with single spaces, then the whitespace in front
        of , . ? ! " ( ) ' is removed.
        """
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [7]:
# Exercise the tokenizer built from the vocabulary above.
tokenizer = SimpleTokenizerV1(vocab)

# Encode the full text, then decode it again and preview both results.
ids = tokenizer.encode(raw_text)
print(ids[:20])

decodedText = tokenizer.decode(ids)
print(decodedText[:50])

# Two sample sentences joined by the end-of-text marker; any word that is
# not in the vocabulary is encoded as "<|unk|>".
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text3 = f"{text1}<|endoftext|> {text2}"
print(text3)

encodedText = tokenizer.encode(text3)
print(encodedText)
print(tokenizer.decode(encodedText))

[55, 46, 154, 1028, 59, 39, 839, 119, 263, 494, 6, 1027, 119, 508, 443, 399, 6, 933, 596, 1104]
I HAD always thought Jack Gisburn rather a cheap g
Hello, do you like tea?<|endoftext|> In the sunlit terraces of the palace.
[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [13]:
from importlib.metadata import version
import tiktoken
print(f"tiktoken version: {version('tiktoken')}")
# NOTE: from here on `tokenizer` refers to the tiktoken "gpt2" encoding,
# replacing the SimpleTokenizerV1 instance bound to the same name earlier.
tokenizer = tiktoken.get_encoding("gpt2")

# Encode and decode the sample text; "<|endoftext|>" is passed in
# allowed_special so the marker is accepted in the input.
encodedText = tokenizer.encode(text3, allowed_special={"<|endoftext|>"})
print(encodedText)

decodedText = tokenizer.decode(encodedText)
print(decodedText)

# Repeat the encode/decode round trip with a made-up string.
unknownWord = "Awkirw ier"
encodedText = tokenizer.encode(unknownWord)
print(encodedText)
decodedText = tokenizer.decode(encodedText)
print(decodedText)

tiktoken version: 0.11.0
[15496, 11, 466, 345, 588, 8887, 30, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 20562, 13]
Hello, do you like tea?<|endoftext|> In the sunlit terraces of the palace.
[23155, 74, 343, 86, 220, 959]
Awkirw ier
