In [78]:
#opens a text file "The_Verdict.txt" in read mode
with open("The_Verdict.txt", "r") as f: 
    wikiText = f.read() #reads its entire content into the variable 'wikiText'
print("Total number of characters:", len(wikiText)) #prints the total number of characters in the file


Total number of characters: 21940


In [79]:
import re #regular expressions
result = "This is python file, this is practise."

#re.split function to split the string into a list of tokens based on commas, periods, or spaces
var = re.split('([,.]|\s)', result)
print(var)

['This', ' ', 'is', ' ', 'python', ' ', 'file', ',', '', ' ', 'this', ' ', 'is', ' ', 'practise', '.', '']


In [80]:
#filters out any empty or whitespace from the 'result' list by using a list comprehension.
result = [item for item in result if item.strip()] 
print(result)

['T', 'h', 'i', 's', 'i', 's', 'p', 'y', 't', 'h', 'o', 'n', 'f', 'i', 'l', 'e', ',', 't', 'h', 'i', 's', 'i', 's', 'p', 'r', 'a', 'c', 't', 'i', 's', 'e', '.']


In [81]:
text = "Hello, world. Is this-- a test?"

#regular expression that captures various punctuation marks, double hyphens, and spaces
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)

#removes any extra spaces from the 'result' list
result = [item.strip() for item in result if item.strip()] 
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [82]:
#entire content of 'wikiText' into tokens using a regular expression that matches punctuation, double hyphens, and spaces
preProcessed = re.split(r'([,.:;?_!"()\']|--|\s)', wikiText) 
print(preProcessed)
print(len(preProcessed))

['The', ' ', 'Verdict', '\n', '', '\n', '', '\n', '', '\t', '', '\t', '', '\t', '', '\t', '', '\t', 'Edith', ' ', 'Wharton', '\n', '', '\n', '', '\n', '', '\n', '', '\n', '', '\n', '1908', '\n', '', '\n', '', '\n', '', '\n', '', '\n', '', '\n', 'Exported', ' ', 'from', ' ', 'Wikisource', ' ', 'on', ' ', 'August', ' ', '27', ',', '', ' ', '2024', '\n', '', '\n', '', '\n', '', '\n', '', '\n', '', '\n', 'I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', 'great', ' ', 'surprise', ' ', 'to', ' ', 'me', ' ', 'to', ' ', 'hear', ' ', 'that', ',', '', ' ', 'in', ' ', 'the', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', ',', '', ' ', 'he', ' ', 'had', ' ', 'dropped', ' ', 'his', ' ', 'painting', ',', '', ' ', 'married', ' ', 'a', ' ', 'rich', ' ', 'widow', ',', '', ' ', 'and', ' ', 'establishe

### Token IDs

In [83]:
#removes duplicate tokens from the 'preProcessed' list by converting it into a set, then sorts the unique tokens alphabetically.
allWords = sorted(set(preProcessed))
print(len(allWords))


1239


In [84]:
#creates a dictionary 'out', where each unique token from 'allWords' is paired with a unique ID, based on its position in the list
out = {token:tokenID for tokenID, token in enumerate(allWords)}
print(out)

{'': 0, '\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '--': 11, '.': 12, '0': 13, '1': 14, '1908': 15, '1929': 16, '2024': 17, '27': 18, '4': 19, ':': 20, ';': 21, '?': 22, 'A': 23, 'Abigor': 24, 'About': 25, 'AdamBMorgan': 26, 'Ah': 27, 'Among': 28, 'And': 29, 'Are': 30, 'Arrt': 31, 'As': 32, 'At': 33, 'Attribution-ShareAlike': 34, 'August': 35, 'AzaToth': 36, 'Be': 37, 'Begin': 38, 'Bender235': 39, 'Blurpeace': 40, 'Boris23': 41, 'Bromskloss': 42, 'Burlington': 43, 'But': 44, 'By': 45, 'Carlo': 46, 'Chicago': 47, 'Claude': 48, 'Come': 49, 'Commons': 50, 'Creative': 51, 'Croft': 52, 'Dbenbenn': 53, 'Destroyed': 54, 'Devonshire': 55, 'Dha': 56, 'Don': 57, 'Dschwen': 58, 'Dubarry': 59, 'During': 60, 'Edith': 61, 'Emperors': 62, 'Exported': 63, 'FDL': 64, 'Florence': 65, 'For': 66, 'GNU': 67, 'Gallery': 68, 'Gideon': 69, 'Gisburn': 70, 'Gisburns': 71, 'Grafton': 72, 'Greek': 73, 'Grindle': 74, 'Grindles': 75, 'HAD': 76, 'Had': 77, 'Hang': 78, 'Has': 7

In [85]:
class SimpleTokenizerV1:
    def __init__(self, out):
        self.str_to_int = out
        self.int_to_str = {i:s for s,i in out.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids]) 
        
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [86]:
#Test encoding and decoding with the SimpleTokenizerV1 class
tokenizer = SimpleTokenizerV1(out)
print(tokenizer.encode("day"))
print(tokenizer.decode([384]))

[384]
day


In [87]:
#Add special tokens to the vocabulary and print the total number of vocabulary items
allWords.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(allWords)}
 
print(len(vocab.items()))

1241


### Byte pair encoding

In [88]:
#Check the version of tiktoken
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.7.0


In [89]:
tokenizer = tiktoken.get_encoding("gpt2")

In [90]:
#test encoding and decoding with the GPT-2 tokenizer.
text = "I love you Chennai"
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
strings = tokenizer.decode(integers)
print(strings)

[40, 1842, 345, 47678]
I love you Chennai


### Data sampling with a sliding window

In [91]:
#Encode the entire content of "The_verdict.txt" using the tiktoken tokenizer and print the length of the encoded text
with open("The_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
 
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5560


In [92]:
#Prepare data for training by creating context-target pairs from the encoded text
enc_sample = enc_text[50:]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [7026, 15632, 438, 2016]
y:      [15632, 438, 2016, 257]


In [93]:
#Print context and the corresponding desired output for different lengths of context
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[7026] ----> 15632
[7026, 15632] ----> 438
[7026, 15632, 438] ----> 2016
[7026, 15632, 438, 2016] ----> 257


In [94]:
#Decode and print context and corresponding desired output tokens
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 cheap ---->  genius
 cheap genius ----> --
 cheap genius-- ----> though
 cheap genius--though ---->  a


In [95]:
#Define a PyTorch Dataset class for preparing input and target sequences from text
import torch
from torch.utils.data import Dataset, DataLoader
 
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
 
        token_ids = tokenizer.encode(txt)
 
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
 
    def __len__(self):
        return len(self.input_ids)
 
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [96]:
#Create a DataLoader for the GPTDatasetV1 class with specified parameters.
def create_dataloader_v1(txt, batch_size=4, max_length=256,
        stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=0
    )
 
    return dataloader

In [97]:
#Initialize the DataLoader and print the first two batches of data
with open("The_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)

first_batch = next(data_iter)
print(first_batch)

second_batch = next(data_iter)
print(second_batch)

[tensor([[  464,  4643, 11600,   628]]), tensor([[ 4643, 11600,   628,   198]])]
[tensor([[ 4643, 11600,   628,   198]]), tensor([[11600,   628,   198,   197]])]


### Creating token embeddings

In [98]:
# Define an embedding layer with specified vocabulary size and output dimension, and print the weights

input_ids = torch.tensor([2, 3, 5, 1])

vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
