In [1]:
import re

In [2]:
# Sample sentence used to compare two whitespace-splitting patterns.
text = "Hello world! this is, Nitin Mishra"
re.split(r'(\s)', text)          # Capturing group: the whitespace separators are kept as items in the returned list (this result is discarded)
re.split(r'\s', text)   # No capturing group: the whitespace between the words is dropped; being the last expression, this result is what the cell displays


['Hello', 'world!', 'this', 'is,', 'Nitin', 'Mishra']

In [3]:
# Read the corpus with an explicit encoding so the result does not depend on
# the platform's default locale; the file is closed as soon as it is read.
with open("the_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Split on the listed punctuation characters and on whitespace. The capturing
# group keeps each delimiter as its own item, so punctuation becomes a token.
items = re.split(r'([.,!";()+?-]|\s)', raw_text)

# Drop empty strings and pure-whitespace items.
new_items = [item for item in items if item.strip()]

# Unique tokens in sorted order, followed by the two special tokens so that
# they receive the two highest ids.
all_words = sorted(set(new_items))
all_words.extend(["<|endoftext|>", "<|unk|>"])

# Token -> integer id mapping (built once, after the special tokens are added).
voacb = {token: integer for integer, token in enumerate(all_words)}

In [4]:
class tokenizerv1:
    """Word-level tokenizer driven by a fixed token -> id vocabulary.

    Tokens that are missing from the vocabulary are not handled: `encoder`
    raises ``KeyError`` for them.
    """

    def __init__(self, voacb):
        """Store the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = voacb
        self.int_to_str = dict((idx, tok) for tok, idx in voacb.items())

    def encoder(self, raw_text):
        """Split `raw_text` into tokens and return their ids as a list.

        The text is split on the punctuation characters in the pattern and on
        whitespace; empty and whitespace-only pieces are discarded.
        """
        pieces = re.split(r'([.,!";()+?-]|\s)', raw_text)
        ids = []
        for piece in pieces:
            token = piece.strip()
            if token:
                # Direct lookup: an unknown token raises KeyError.
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, idS):
        """Turn a sequence of ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters listed in the pattern is removed.
        """
        words = (self.int_to_str[i] for i in idS)
        joined = " ".join(words)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)


In [5]:
# Round-trip a passage through the vocabulary-based tokenizer: text -> ids -> text.
tokenize = tokenizerv1(voacb)
# tokenizerv1.encoder looks every token up directly in the vocabulary, so each
# token of this passage has to be present in `voacb` for the call to succeed.
text = """The desultory life of the Riviera lends itself to such purely academic speculations; and having, 
on my way to Monte Carlo, caught a glimpse of Jack's balustraded terraces between the pines,
 I had myself borne thither the next day."""
ids = tokenize.encoder(text)

# Decode the ids back to text and print it for comparison with the input.
ide = tokenize.decode(ids)
print(ide)

The desultory life of the Riviera lends itself to such purely academic speculations ; and having, on my way to Monte Carlo, caught a glimpse of Jack's balustraded terraces between the pines, I had myself borne thither the next day.


In [6]:
class tokenizer2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    The vocabulary passed in must itself contain the "<|unk|>" entry, since
    the substituted token is looked up like any other.
    """

    def __init__(self, voacb):
        """Store the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = voacb
        self.int_to_str = dict((idx, tok) for tok, idx in voacb.items())

    def encoder(self, raw_text):
        """Split `raw_text` into tokens and return their ids as a list.

        Empty and whitespace-only pieces are discarded; any token that is not
        in the vocabulary is replaced by "<|unk|>" before the lookup.
        """
        known = self.str_to_int
        ids = []
        for piece in re.split(r'([.,!";()+?-]|\s)', raw_text):
            token = piece.strip()
            if not token:
                continue
            if token not in known:
                token = "<|unk|>"
            ids.append(known[token])
        return ids

    def decode(self, idS):
        """Turn a sequence of ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters listed in the pattern is removed.
        """
        spaced = " ".join(self.int_to_str[i] for i in idS)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [16]:
# Encode and decode a short sentence with the unknown-token-aware tokenizer.
# In the recorded output below, "Hello" comes back as <|unk|>.
text1 = "Hello, how are you?"
new_tok = tokenizer2(voacb)
result = new_tok.encoder(text1)
print(new_tok.decode(result))

<|unk|>, how are you?


In [7]:
import tiktoken

In [8]:
# NOTE(review): this rebinds `tokenize`, which an earlier cell bound to a
# tokenizerv1 instance; from here on it refers to the tiktoken "gpt2" encoding.
tokenize = tiktoken.get_encoding("gpt2")
text = "Hello, do you like the tea <|endoftext|>"
# allowed_special lets the literal "<|endoftext|>" text be encoded as a
# special token (a single id in the recorded output below).
integers = tokenize.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
# Decoding the ids reproduces the original string.
print(tokenize.decode(integers))

[15496, 11, 466, 345, 588, 262, 8887, 220, 50256]
Hello, do you like the tea <|endoftext|>


In [22]:
# Encode a made-up string: per the recorded output it is broken into several
# ids and decodes back to the same text.
te1 =  "Akwirw ier"
print(tokenize.encode(te1))
print(tokenize.decode(tokenize.encode(te1)))

[33901, 86, 343, 86, 220, 959]
Akwirw ier


In [9]:
# Read the corpus with an explicit encoding (matching the later cell that
# loads the same file) and release the file handle before doing any work.
with open("the_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Token ids for the whole text, produced by the tiktoken encoding bound to
# `tokenize` in the earlier cell.
_result = tokenize.encode(raw_text)
# The same ids decoded back into a string.
new_result = tokenize.decode(_result)

In [10]:
# `new_result` is the decoded text (a string), so `sample` holds its first
# 100 characters and the slices below are character-level, not token-level.
sample = new_result[:100]

context_size = 16
# x is the first `context_size` characters; y is the same window shifted
# one position to the right.
x = sample[:context_size]
y = sample[1:context_size+1]
# print(f"x: {x}")
# print(f"y: {y}")

In [11]:
# First 100 token ids of the encoded corpus.
n_sample = _result[:100]

context_size = 16
# Slice the token ids (the original sliced the character string `sample`,
# leaving `n_sample` unused). x is the input window and y is the same window
# shifted one token to the right, i.e. the next-token targets.
x = n_sample[:context_size]
y = n_sample[1:context_size + 1]

In [26]:
# Show growing prefixes of `sample` next to the element that follows each one.
# `sample` is a string here, so both sides are characters.
for i in range(1, context_size + 1):
    context, desired = sample[:i], sample[i]
    print(context, "----->", desired)

I ----->  
I  -----> H
I H -----> A
I HA -----> D
I HAD ----->  
I HAD  -----> a
I HAD a -----> l
I HAD al -----> w
I HAD alw -----> a
I HAD alwa -----> y
I HAD alway -----> s
I HAD always ----->  
I HAD always  -----> t
I HAD always t -----> h
I HAD always th -----> o
I HAD always tho -----> u


In [15]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

In [13]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with `tokenizer`; windows of `max_length` ids
    are taken every `stride` positions. Each target window is the input
    window shifted one position to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Encode `txt` and materialise all input/target windows.

        Args:
            txt: Text to encode.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                and returning a sequence of ids.
            max_length: Number of ids in every window.
            stride: Step between the start positions of consecutive windows.
        """
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        # Stop early enough that the shifted target window is always full.
        starts = range(0, len(encoded) - max_length, stride)
        self.input_ids = [
            torch.tensor(encoded[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [14]:

def create_dataloder_v1(txt, batch_size=4, max_length=256, stride=128, num_workers=0, shuffle=True, drop_last=True):
    """Build a DataLoader over sliding windows of `txt`.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    GPTDatasetV1; the remaining arguments are forwarded to DataLoader.

    Args:
        txt: Text to turn into training windows.
        batch_size: Number of windows per batch.
        max_length: Number of token ids in every window.
        stride: Step between the starts of consecutive windows.
        num_workers: Forwarded to DataLoader.
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.

    Returns:
        The configured DataLoader.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    return DataLoader(
        GPTDatasetV1(txt, gpt2_encoding, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


In [15]:
# Load the corpus; the handle is only needed for the read itself.
with open("the_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Small, unshuffled loader so the first batches are easy to inspect by eye.
dataloader = create_dataloder_v1(
    raw_text,
    batch_size=2,
    max_length=4,
    stride=2,
    shuffle=False,
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464],
        [2885, 1464, 1807, 3619]]), tensor([[ 367, 2885, 1464, 1807],
        [1464, 1807, 3619,  402]])]


In [16]:
# Advance the iterator created in the previous cell to fetch the next batch.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 1807,  3619,   402,   271],
        [  402,   271, 10899,  2138]]), tensor([[ 3619,   402,   271, 10899],
        [  271, 10899,  2138,   257]])]


In [50]:
# Larger batches with overlapping windows. `shuffle` keeps its default of
# True here, so the batch contents differ from run to run.
dataloader = create_dataloder_v1(raw_text, batch_size=8, max_length=8, stride=4)
third_batch = iter(dataloader)
input, target = next(third_batch)
# Pass the label and the tensor as separate arguments; wrapping them in an
# extra pair of parentheses printed the repr of a tuple instead.
print("Input: \n", input)
print("target: \n", target)

('Input: \n', tensor([[  314,   550,  1775,   683,    11,   523,  1690,    11],
        [   11,   523,   595, 18052,    11,   326,   530,   890],
        [  438,  1169,   691,  2134,  7163,   262,  8631, 26210],
        [   11,   484,  1067, 11137,    13,   314,  2497,   326],
        [  418,   286,  3595,   520,  5493,   338,  3451,   286],
        [  262,  3024,    12,  4803,   286,   511,   512,  1741],
        [  546,   683,    11, 10597,   314,  2063,  1392,   284],
        [ 1701,   314,  1965, 25891,    13,   198,   198,  3347]]))
target: 
 tensor([[  550,  1775,   683,    11,   523,  1690,    11,  1615],
        [  523,   595, 18052,    11,   326,   530,   890,   276],
        [ 1169,   691,  2134,  7163,   262,  8631, 26210,  3425],
        [  484,  1067, 11137,    13,   314,  2497,   326,   339],
        [  286,  3595,   520,  5493,   338,  3451,   286,  5287],
        [ 3024,    12,  4803,   286,   511,   512,  1741,    13],
        [  683,    11, 10597,   314,  2063,  1392,

In [None]:
# Tiny embedding example: 6 possible ids, each mapped to a 3-dimensional vector.
input_ids = torch.tensor([2,3,5,1])  # example ids; not used further in this cell
vocab_size = 6
output_dim =3
torch.manual_seed(123)  # seed the RNG before the layer's weights are initialised
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# The weight matrix has one row per id (vocab_size rows, output_dim columns).
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [18]:
# Looking up id 3 returns the row at index 3 of the weight matrix printed above.
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [38]:
# Full-size token embedding layer.
# NOTE(review): 50257 is presumably the size of the tiktoken "gpt2"
# vocabulary used above — confirm. No seed is set here, so the printed
# weights change between runs.
vocab_size = 50257
output_dim = 256
torch_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(torch_embedding_layer.weight)
print(torch_embedding_layer)

Parameter containing:
tensor([[ 0.2944,  1.6946, -0.1879,  ..., -0.0196,  0.8605,  0.0176],
        [-1.2162,  0.3617,  0.6800,  ...,  0.9172,  0.6879, -0.3622],
        [ 0.7821,  0.2635,  0.9459,  ...,  0.6895, -1.5726, -0.5220],
        ...,
        [ 1.2535, -0.4668,  0.0418,  ...,  2.3790,  0.4870, -1.1981],
        [-2.1790, -0.1755, -0.7141,  ...,  0.2131, -0.3337,  0.5516],
        [-0.2183, -1.0058,  1.6554,  ...,  0.8342,  1.3221,  2.0354]],
       requires_grad=True)
Embedding(50257, 256)


In [39]:
# Embed one batch of token ids: 8 windows of 4 ids each, unshuffled and with
# the final partial batch kept.
max_length =4
dataloader = create_dataloder_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length//2, shuffle=False, drop_last=False)
data_iter = iter(dataloader)
input, target = next(data_iter)
# print("Input: \n", input)
# print(input.shape)
# Each id is replaced by its output_dim-sized vector, giving the
# (8, 4, 256) shape shown in the recorded output.
token_embedding= torch_embedding_layer(input)
print(token_embedding.shape)


torch.Size([8, 4, 256])


In [28]:
# Positional embeddings: one vector of size output_dim per position in the window.
context_size = max_length
pos_embedding_layer = torch.nn.Embedding(context_size, output_dim)
# torch.arange(context_size) supplies the position indices 0 .. context_size - 1.
pos_embeddings = pos_embedding_layer(torch.arange(context_size))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [42]:
# Add the positional vectors to the token vectors. Per the recorded shapes,
# the (4, 256) positional tensor is broadcast across the 8 batch entries.
input_embedding = token_embedding + pos_embeddings
print(input_embedding.shape)

torch.Size([8, 4, 256])


In [49]:
# Encode and decode a short sentence, then create a small seeded embedding layer.
random_text = "your journey starts with one step"
tokens = tokenize.encode(random_text)
words= tokenize.decode(tokens=tokens)
# NOTE(review): `tokens` and `words` are not used by the embedding layer below,
# which is sized by v_s and dim only.
v_s = 3
dim = 5
torch.manual_seed(42)  # seed the RNG before the layer's weights are initialised
text_embedding_layer = torch.nn.Embedding(v_s, dim)
print(text_embedding_layer.weight)

Parameter containing:
tensor([[ 0.3367,  0.1288,  0.2345,  0.2303, -1.1229],
        [-0.1863,  2.2082, -0.6380,  0.4617,  0.2674],
        [ 0.5349,  0.8094,  1.1103, -1.6898, -0.9890]], requires_grad=True)
