In [7]:
# Import the notebook's dependencies.
import re # regular expressions, used to split raw text into tokens 
import tiktoken # byte pair encoding (BPE) via an existing open-source library 

import torch 
from torch.utils.data import Dataset, DataLoader 

In [8]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on punctuation, double dashes, and whitespace; each
    resulting piece is looked up in the vocabulary. Pieces that are not in
    the vocabulary map to the id of "<|unk|>" when the vocabulary defines
    that token, and are dropped otherwise.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) lookup.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.string_to_integer = vocab
        self.integer_to_string = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list of integer ids, one per recognised (or unknown) token.
        """
        # ':' and ';' are part of the delimiter set so that text is split the
        # same way as a vocabulary built from a `[,.:;?_!"()\']|--|\s` split;
        # without them "word;" stays glued together and falls back to unknown.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        # None when the vocabulary has no unknown token; such pieces are skipped.
        unk_id = self.string_to_integer.get("<|unk|>")

        ids = []
        for token in tokens:
            token_id = self.string_to_integer.get(token, unk_id)
            if token_id is not None:  # explicit None check: id 0 is valid
                ids.append(token_id)
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Args:
            ids: Iterable of integer ids present in the vocabulary.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation removed.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        text = " ".join(self.integer_to_string[i] for i in ids)
        # Remove the space the join inserted before punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    

In [9]:
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensor pairs.

    The text is tokenized once, then cut into windows of `max_length` ids.
    Each target window is the corresponding input window shifted forward by
    one token.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize `txt` and pre-compute every input/target window.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Object exposing `encode(text)` that returns a list of ids.
            max_length: Number of token ids per window.
            stride: Step between the start positions of consecutive windows.

        If the text has `max_length` tokens or fewer, no window is produced
        and the dataset is empty.
        """
        self.tokenizer = tokenizer
        # Must be `input_ids` (not `inputs_ids`): the loop below, `__len__`,
        # and `__getitem__` all read this attribute name.
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        # Stop early enough that the target window (start + 1 .. start +
        # max_length) is always full length.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start:start + max_length]
            target_chunk = token_ids[start + 1:start + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the `(input_ids, target_ids)` tensor pair at position `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
# Read the raw corpus. "utf-8" is the canonical codec name.
with open("The_Verdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Split on punctuation, double dashes, and whitespace; the capturing group
# keeps the delimiters themselves as pieces.
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
preprocessed = [item for item in result if item.strip()]  # drop empty and whitespace-only pieces
all_tokens = sorted(set(preprocessed))  # unique tokens in sorted order
all_tokens.extend(["<|endoftext|>", "<|unk|>"])  # special tokens take the two highest ids

# Token -> id mapping; ids follow the order of `all_tokens`.
vocab = {token: integer for integer, token in enumerate(all_tokens)}

vocab


In [30]:
# Test: build a SimpleTokenizer from `vocab`, encode `text`, then decode the
# ids back to a string. `ids` and `decoded_text` are displayed by later cells.
tokenizer = SimpleTokenizer(vocab) 
ids = tokenizer.encode(text) 

decoded_text = tokenizer.decode(ids) 


In [None]:
# Display the encoded token ids.
ids

In [None]:
# Display the text reconstructed from the token ids.
decoded_text

In [None]:
# Print the last five (token, id) entries of the vocabulary.
for item in list(vocab.items())[-5:]:
    print(item)

In [None]:
# Test end of text token in the case of additional test source
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

In [None]:
# Re-create the word-level tokenizer and print the ids for the current `text`.
tokenizer = SimpleTokenizer(vocab)
print(tokenizer.encode(text))


In [None]:
# Round-trip check: encode `text`, decode the ids, and print the result.
print(tokenizer.decode(tokenizer.encode(text)))

# TEST - Byte Pair Encoding 

In [None]:
# Rebind `tokenizer` to tiktoken's "gpt2" encoding (replacing the
# SimpleTokenizer instance) and encode the current `text`.
# `allowed_special` names "<|endoftext|>" as a special token that is
# permitted to appear in the input.
tokenizer = tiktoken.get_encoding("gpt2") 
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

In [None]:
# Inspect the Python type of the encoded result.
print(type(integers))

In [None]:
# Encode a made-up string and decode it again, presumably to check how the
# byte pair encoding handles words it has never seen as whole tokens.
# Uses whichever object `tokenizer` is currently bound to; in top-to-bottom
# execution order that is the tiktoken encoding.
text = "AKwirw ier"
integers = tokenizer.encode(text)
tokenizer.decode(integers)