In [1]:
# import necessary dependecies 
import re # regular expression to check if theres a pattern 

In [18]:
class SimpleTokenizer: 
    def __init__(self, vocab):
        self.string_to_integer = vocab
        self.integer_to_string = {i:s for s,i in vocab.items()} 
        
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)  # Split text by specific punctuation and whitespace
        preprocessed = [item.strip() for item in preprocessed if item.strip()]  # Strip and filter out empty items
        preprocessed = [item if item in self.string_to_integer else "<|unk|>" for item in preprocessed]

        ids = [self.string_to_integer[s] for s in preprocessed if s in self.string_to_integer]  # Map the text to ids
        return ids

    def decode(self, ids): 
        text = " ".join([self.integer_to_string[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text 
    

In [19]:
with open("The_Verdict.txt", "r", encoding="utf=8") as f: 
    text = f.read() 
    
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
preprocessed = [item for item in result if item.strip()]                #further remove whitespaces from string list 
all_tokens = sorted(list(set(preprocessed)))               # set() builds a hash set, and list() makes the set into a list. 
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

vocab


{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 '*': 5,
 ',': 6,
 '--': 7,
 '.': 8,
 '0': 9,
 '1': 10,
 '16': 11,
 '1908': 12,
 '1929': 13,
 '2024': 14,
 '4': 15,
 ':': 16,
 ';': 17,
 '?': 18,
 'A': 19,
 'About': 20,
 'Ah': 21,
 'Among': 22,
 'And': 23,
 'Are': 24,
 'Arrt': 25,
 'As': 26,
 'At': 27,
 'Attribution-ShareAlike': 28,
 'Be': 29,
 'Begin': 30,
 'Burlington': 31,
 'But': 32,
 'By': 33,
 'Carlo': 34,
 'Chicago': 35,
 'Claude': 36,
 'Come': 37,
 'Commons': 38,
 'Creative': 39,
 'Croft': 40,
 'Destroyed': 41,
 'Devonshire': 42,
 'Don': 43,
 'Dubarry': 44,
 'During': 45,
 'Edith': 46,
 'Emperors': 47,
 'Exported': 48,
 'FDL': 49,
 'Florence': 50,
 'For': 51,
 'GNU': 52,
 'Gallery': 53,
 'Gideon': 54,
 'Gisburn': 55,
 'Gisburns': 56,
 'Grafton': 57,
 'Greek': 58,
 'Grindle': 59,
 'Grindles': 60,
 'HAD': 61,
 'Had': 62,
 'Hang': 63,
 'Has': 64,
 'He': 65,
 'Her': 66,
 'Hermia': 67,
 'His': 68,
 'How': 69,
 'I': 70,
 'If': 71,
 'In': 72,
 'It': 73,
 'Jack': 74,
 'January': 75,
 'Jove'

In [20]:
# Test - Instantiate an object 
tokenizer = SimpleTokenizer(vocab) 
ids = tokenizer.encode(text) 

decoded_text = tokenizer.decode(ids) 


In [21]:
decoded_text

'The Verdict Edith Wharton 1908 Exported from Wikisource on September 16, 2024 I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdication." Of course it\' s going to send the value of my picture\' way <|unk|> but I don\' t think of that, Mr. Rickham -- the loss to Arrt is all I think of." The word, on Mrs. Thwing\' s lips, multiplied its _ rs _ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\' s" Moon-

In [22]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1214)
('your', 1215)
('yourself', 1216)
('<|endoftext|>', 1217)
('<|unk|>', 1218)


In [23]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [24]:
tokenizer = SimpleTokenizer(vocab)
print(tokenizer.encode(text))


[1218, 6, 402, 1213, 687, 1054, 18, 1217, 72, 1069, 1035, 1065, 788, 1069, 1218, 8]


In [27]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
