In [1]:
import re
import tiktoken
import torch
from typing import List
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader

In [2]:
with open('verdict.txt',"r",encoding="utf-8") as f:
    txt=f.read()

In [3]:
#lets tokenize 
tokenized=re.split(r'([,.?_!"()\']|--|\s)',txt)

# print(len(tokenized))
tokenized=[item for item in tokenized if item.split()]
print(len(tokenized))


4649


# lets make vocublary

In [4]:
clean_token=sorted(list(set(tokenized)))

In [5]:
print(len(clean_token))

1159


In [6]:
#lets make the vocab
text_index={text:index for index,text in enumerate(clean_token)}
index_text={index:text for index,text in enumerate(clean_token)}

In [7]:
print(text_index['that'])

1012


In [8]:
#this is the encoder
def encoder(text):
    #lets tokenize it
    tokenizer=re.split(r'([,.?_!"()\']|--|\s)',text)  
    tokenizer=[item for item in tokenizer if item.strip()]
    
    #lets make token id
    token_id=[]
    for i in tokenizer:
        token_id.append(text_index[i])
    return token_id

#this is the decoder
def decoder(encoded):
    decoded_id=[]
    for i in encoded:
        decoded_id.append(index_text[i])
    return decoded_id

In [9]:
print(encoder('this is not good'))
print(decoder([1024, 595, 727, 508]))

[1024, 595, 727, 508]
['this', 'is', 'not', 'good']


# lets make ita a  class

In [10]:
class TokenizerVersion1():
    def __init__(self,vocab) -> None:
        self.vocab=vocab
    
    def encoder(self,text):
        #lets tokenize it
        tokenizer=re.split(r'([,.?_!"()\']|--|\s)',text)  
        tokenizer=[item for item in tokenizer if item.strip()]
        
        #lets make token id
        token_id=[]
        for i in tokenizer:
            token_id.append(self.vocab[i])
        return token_id
    
    def decoder(self,encoded):
         self.decoded_id=[]
         for i in encoded:
            self.decoded_id.append(index_text[i])
        
         text=" ".join(i for i in self.decoded_id)
         text= re.sub(r'\s+([,.?_!"()\'"])', r'\1', text)
         return text

In [11]:
#lets check out the class
text="""
    this is the whole word of that women said "her man?"
"""
toknizer=TokenizerVersion1(vocab=text_index)

encode=toknizer.encoder(text=text)
print(encode)

decoder=toknizer.decoder(encoded=encode)
print(decoder)

[1024, 595, 1013, 1125, 1145, 738, 1012, 1140, 873, 1, 547, 671, 10, 1]
this is the whole word of that women said" her man?"


# handle unknown tokens

In [12]:
#add new token into old
all_tokens=clean_token
#appending
all_tokens.extend(['<|endoftext|>',"<|unk|>"])
#make the vocab 
vocab={token:index for index,token in enumerate(all_tokens)}

In [13]:
if 'the' in vocab:
    print("present")
else:
    print('not present')

present


In [14]:
vocab['the']

1013

In [15]:
len(all_tokens),len(vocab)

(1161, 1161)

# include the 2 new tokens into the tokenizer

In [16]:
class TokenizerVersion2():
    def __init__(self,vocab) -> None:
        self.vocab=vocab  #text to num
        self.index_text={index:text for text,index in self.vocab.items()}  #num to text
    
    def encoder(self,text)->List:
        #lets tokenize it
        tokenized=re.split(r'([,.?_!"()\']|--|\s)',text)  
        tokenized=[item for item in tokenized if item.strip()]
        
        #lets make token id
        token_id=[item if item in self.vocab else "<|unk|>" for item in tokenized]
        ids=[self.vocab[item] for item in token_id]
       
        return ids
    
    def decoder(self,encoded):
         #print(encode)
         text = " ".join([self.index_text[i] for i in encoded])
         
         text= re.sub(r'\s+([,.?_!"()\'"])', r'\1', text)
         return text

In [17]:
#lets check out the class
text="""
    the is the whole word of that women said "her man?"
"""
toknizer=TokenizerVersion2(vocab)

encode=toknizer.encoder(text=text)
print("this is encoder")
print(encode)

decoder=toknizer.decoder(encode)
print("this is the decoder")
print(decoder)

this is encoder
[1013, 595, 1013, 1125, 1145, 738, 1012, 1140, 873, 1, 547, 671, 10, 1]
this is the decoder
the is the whole word of that women said" her man?"


In [18]:
# for i,(k,v) in enumerate(vocab.items()):
#     print(k,v)
#     if(i>10):
#         break

In [19]:
#lets test it
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = "<|endoftext|>".join((text1, text2))
print(text)

Hello, do you like tea?<|endoftext|>In the sunlit terraces of the palace.


In [20]:
encoded=toknizer.encoder(text)

In [21]:
toknizer.decoder(encoded)

'<|unk|>, do you like tea? <|unk|> the sunlit terraces of the <|unk|>.'

# lets tokenize the entire text of the verdict

In [22]:
entire_tokenize=TokenizerVersion2(vocab)

entire_encoded=entire_tokenize.encoder(txt)
#print(entire_encoded[:20])

In [23]:
len(entire_encoded)

4649

In [24]:
x=entire_encoded[0:len(entire_encoded)]
y=entire_encoded[1:len(entire_encoded)+1]

In [25]:
print(x[:10])
print(y[:10])

[55, 46, 154, 1028, 59, 39, 839, 119, 263, 494]
[46, 154, 1028, 59, 39, 839, 119, 263, 494, 6]


In [26]:
#create context and desired
for i in range(1,len(entire_encoded)):
    context=entire_encoded[:i]
    desired=entire_encoded[i]
    print(f"{context} ---> {desired}")
    if i>10:
        break

[55] ---> 46
[55, 46] ---> 154
[55, 46, 154] ---> 1028
[55, 46, 154, 1028] ---> 59
[55, 46, 154, 1028, 59] ---> 39
[55, 46, 154, 1028, 59, 39] ---> 839
[55, 46, 154, 1028, 59, 39, 839] ---> 119
[55, 46, 154, 1028, 59, 39, 839, 119] ---> 263
[55, 46, 154, 1028, 59, 39, 839, 119, 263] ---> 494
[55, 46, 154, 1028, 59, 39, 839, 119, 263, 494] ---> 6
[55, 46, 154, 1028, 59, 39, 839, 119, 263, 494, 6] ---> 1027


In [27]:
#what suppose to happen interm of text
#create context and desired
for i in range(1,len(entire_encoded)):
    context=entire_tokenize.decoder(entire_encoded[:i])
    desired=entire_tokenize.decoder([entire_encoded[i]])
    print(f"{context} ---> {desired}")
    if i>10:
        break

I ---> HAD
I HAD ---> always
I HAD always ---> thought
I HAD always thought ---> Jack
I HAD always thought Jack ---> Gisburn
I HAD always thought Jack Gisburn ---> rather
I HAD always thought Jack Gisburn rather ---> a
I HAD always thought Jack Gisburn rather a ---> cheap
I HAD always thought Jack Gisburn rather a cheap ---> genius
I HAD always thought Jack Gisburn rather a cheap genius ---> --
I HAD always thought Jack Gisburn rather a cheap genius -- ---> though


In [28]:
class GptDataSetv1(Dataset):
    def __init__(self,tokenizer,dataset,context_length,stride) -> None:
        super().__init__()
        self.tokenizer=tokenizer

        #lets tokenize the text
        self.tokens=self.tokenizer.encode(dataset,allowed_special={"<|endoftext|>"})   #array of ids
        
        self.inputs=[]
        self.outputs=[]

        for i in range(0,len(self.tokens),stride):
            input_chunks=self.tokens[i:i+context_length]
            output_chunks=self.tokens[i+1:i+context_length+1]

            #lets append
            if(len(input_chunks)==context_length and len(output_chunks)==context_length):
                self.inputs.append(torch.tensor(input_chunks))
                self.outputs.append(torch.tensor(output_chunks))
    
    def __len__(self) -> int:
        return len(self.inputs)

    def __getitem__(self, index) :
        #purpose of this function is to make an input and output matcher
        return self.inputs[index].clone().detach(),self.outputs[index].clone().detach()

In [29]:
def collate_fn(batch):
    inputs,outputs=zip(*batch)
    inputs=pad_sequence(inputs,batch_first=True,padding_value=0)
    outputs=pad_sequence(outputs,batch_first=True,padding_value=0)
    return inputs,outputs

In [30]:
def create_dataloader_v1(txt,batch_size=4,context_length=120,stride=128,shuffle=True):
    tokenizer=tiktoken.get_encoding('gpt2')
    dataset=GptDataSetv1(tokenizer,txt,context_length,stride)
    #prepare the datalaoder
    dataloader=DataLoader(dataset,batch_size=batch_size,collate_fn=collate_fn,shuffle=shuffle)
    return dataloader

In [31]:
#lets test the code

tokenizer=tiktoken.get_encoding('gpt2')

#dataset=GptDataSetv1(tokenizer,txt,context_length=16,stride=4)

#data loader
dataloader=create_dataloader_v1(txt,batch_size=8,stride=2,context_length=15,shuffle=False)

In [32]:
for batch in dataloader:
    inputs,outputs=batch
    print(f"the whole thing is:{len(inputs)}")

    for i,(inputs_data,expected_data) in enumerate(zip(inputs,outputs)):
        print(f"the length of the following is:{len(inputs_data)}")
        if(i>20):
            break

the whole thing is:8
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the whole thing is:8
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the whole thing is:8
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the length of the following is:15
the whole thing is:8
the length of the following is:15
the length of the following is:15
the length of the following is:1

In [33]:
inputs,outputs=next(iter(dataloader))
print(inputs)
print()
print(outputs)

tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
           257,  7026, 15632,   438,  2016],
        [ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257,  7026,
         15632,   438,  2016,   257,   922],
        [ 1807,  3619,   402,   271, 10899,  2138,   257,  7026, 15632,   438,
          2016,   257,   922,  5891,  1576],
        [  402,   271, 10899,  2138,   257,  7026, 15632,   438,  2016,   257,
           922,  5891,  1576,   438,   568],
        [10899,  2138,   257,  7026, 15632,   438,  2016,   257,   922,  5891,
          1576,   438,   568,   340,   373],
        [  257,  7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,
           568,   340,   373,   645,  1049],
        [15632,   438,  2016,   257,   922,  5891,  1576,   438,   568,   340,
           373,   645,  1049,  5975,   284],
        [ 2016,   257,   922,  5891,  1576,   438,   568,   340,   373,   645,
          1049,  5975,   284,   502,   284]])

tensor

# lets use embedding

In [41]:
n_vocab=50257
embed_dim=256
embedding=torch.nn.Embedding(n_vocab,embed_dim)

In [42]:
embedding.weight

Parameter containing:
tensor([[-1.3734, -1.4809, -0.2765,  ...,  0.9602, -1.4292,  1.5996],
        [ 0.2590,  0.3550,  0.7237,  ..., -0.5295,  0.0797, -0.8992],
        [ 0.4246, -1.0363,  1.3692,  ..., -1.0938, -0.6761, -0.5912],
        ...,
        [ 0.9751,  1.6313,  0.1821,  ..., -0.4655, -0.2601, -0.6540],
        [ 1.3087, -0.0665, -0.3509,  ..., -0.3324, -2.1329, -0.3860],
        [-0.1212,  1.0214,  0.9822,  ...,  0.0617, -0.6953, -1.1991]],
       requires_grad=True)

In [36]:
em=tiktoken.get_encoding('gpt2')

In [37]:
#lets test the code

tokenizer=tiktoken.get_encoding('gpt2')

#dataset=GptDataSetv1(tokenizer,txt,context_length=16,stride=4)

#data loader
dataloader=create_dataloader_v1(txt,batch_size=8,context_length=4,shuffle=False)

inputs,outputs=next(iter(dataloader))

In [40]:
print(f"inputs tokens are: {inputs}")
print(f"output tokens are {outputs}")

inputs tokens are: tensor([[   40,   367,  2885,  1464],
        [  286,   616,  4286,   705],
        [10197,   832,   262, 46475],
        [ 4150,     8,  3688,   284],
        [  271, 10899,   550,   366],
        [ 1021,   757,   438, 10919],
        [  314,  4752,   340,  6777],
        [  423,  4750,   326,  9074]])
output tokens are tensor([[  367,  2885,  1464,  1807],
        [  616,  4286,   705,  1014],
        [  832,   262, 46475,   286],
        [    8,  3688,   284,   402],
        [10899,   550,   366,  7109],
        [  757,   438, 10919,   257],
        [ 4752,   340,  6777,    13],
        [ 4750,   326,  9074,    13]])


In [44]:
#pass input into embedding
input_embed_layer=embedding(inputs)

In [49]:
input_embed_layer.size()

torch.Size([8, 4, 256])

# lets insert postion embedding

In [57]:
context_length=4
pos_embedding_layer=torch.nn.Embedding(context_length,embed_dim)

#pass dummy data into the embedding
pos_embedding=pos_embedding_layer(torch.arange(context_length))

In [58]:
print(pos_embedding.shape)

torch.Size([4, 256])


In [59]:
input_embedding=input_embed_layer+pos_embedding

In [60]:
input_embedding.size()

torch.Size([8, 4, 256])