# Testing BPE tokenization with tiktoken

In [3]:
import tiktoken

In [5]:
tiktoken.list_encoding_names()

['gpt2',
 'r50k_base',
 'p50k_base',
 'p50k_edit',
 'cl100k_base',
 'o200k_base',
 'o200k_harmony']

In [6]:
tokenizer = tiktoken.get_encoding("gpt2")

In [8]:
# made-up words are not in the vocabulary as whole tokens; the encoder still
# handles them by splitting the text into several smaller pieces
text = "Akwirw ier"
encoded_text = tokenizer.encode(text)
encoded_text

[33901, 86, 343, 86, 220, 959]

In [None]:
tokenizer.decode(encoded_text)

'Akwirw ier'

In [13]:
a = list(range(1,10))
a

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
a[1:3]  # slices use the half-open interval [m, n): index m is included, index n is excluded

[2, 3]

# Creating my own data loader

In [19]:
with open("data/the-verdict.txt","r", encoding="utf-8") as f:
    data = f.read()

In [22]:
data[:10]

'I HAD alwa'

In [29]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once, then cut into windows of ``max_length``
    token ids. Each target window is its input window shifted one token to
    the right, so position ``k`` of the target holds the token that follows
    position ``k`` of the input.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """
        txt: raw text to tokenize
        tokenizer: object exposing ``encode(text)`` that returns token ids
        max_length: number of token ids in every input/target window
        stride: distance (in tokens) between the starts of consecutive windows
        """
        ids = tokenizer.encode(txt)

        # a window starting at s needs ids[s + 1 : s + 1 + max_length] to be
        # complete, hence the exclusive upper bound len(ids) - max_length
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start: start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1: start + 1 + max_length])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]




In [30]:
md = MyDataset(data, tokenizer, 4, 1)

In [34]:
tokenizer.decode(list(md[0][1]))

' HAD always thought'

In [35]:
# creating the data loader
def my_dataloader(dataset, batch_size=4, shuffle=True, drop_last=True, num_workers=0):
    """Wrap ``dataset`` in a ``DataLoader`` and return it.

    dataset: dataset to draw samples from
    batch_size: number of samples per batch
    shuffle: passed through to ``DataLoader(shuffle=...)``
    drop_last: passed through to ``DataLoader(drop_last=...)``
    num_workers: passed through to ``DataLoader(num_workers=...)``
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(dataset, **loader_options)


In [37]:
# sanity-check MyDataset with max_length=2, stride=2
# (the max_length=8, stride=2 case is checked in the following cell)
test1 = MyDataset(data, tokenizer, max_length=2, stride=2)
input, target = test1[0]  # NOTE: `input` shadows the Python builtin of the same name
print(f"input:{tokenizer.decode(list(input))}")
print(f"target:{tokenizer.decode(list(target))}")

input:I H
target: HAD


In [68]:
# sanity-check MyDataset with max_length=8, stride=2.
# The sample must be read from test2 (the dataset built on the line below);
# indexing test1 here would print the 2-token window from the previous cell.
test2 = MyDataset(data, tokenizer, max_length=8, stride=2)
input, target = test2[0]
print(f"input:{tokenizer.decode(list(input))}")
print(f"target:{tokenizer.decode(list(target))}")

input:I HAD always thought Jack Gis
target: HAD always thought Jack Gisburn


In [39]:
# let's test it using batches and the DataLoader
ds = MyDataset(data, tokenizer, 4, 4)  # max_length=4, stride=4
mdl = my_dataloader(ds,batch_size=4)
type(mdl)

torch.utils.data.dataloader.DataLoader

In [41]:
# pull the first batch from the loader; my_dataloader defaults to shuffle=True,
# so the rows are randomly chosen windows rather than consecutive ones
loader_it = iter(mdl)
input, targets = next(loader_it)
print(f"inputs:\n {input}")
print(f"targets:\n {targets}")

inputs:
 tensor([[ 6532,    62,  8263,    12],
        [  632,   373,   326,   326],
        [  550,  1239,  1900, 44807],
        [  338,  2270,   351,   465]])
targets:
 tensor([[   62,  8263,    12,  3823],
        [  373,   326,   326,   925],
        [ 1239,  1900, 44807,  5514],
        [ 2270,   351,   465,  1468]])


In [64]:
# decode the whole batch as one token sequence; flatten() works for any
# batch_size x max_length shape (a hard-coded reshape(1, 16) only fits 4 x 4),
# and tolist() hands the tokenizer plain Python ints
print(f"input: {tokenizer.decode(input.flatten().tolist())}")
print(f"target: {tokenizer.decode(targets.flatten().tolist())}")

input: arry_ drawing- It was that that had never known_.'s break with his
target: _ drawing-room was that that made never known_. Only break with his old


In [66]:
# inspect the tokenizer object's attributes (e.g. 'name', '_pat_str', '_mergeable_ranks')
# NOTE: this dumps the complete '_mergeable_ranks' table, so the output is very long
tokenizer.__dict__

{'name': 'gpt2',
 '_pat_str': "'(?:[sdmt]|ll|ve|re)| ?\\p{L}++| ?\\p{N}++| ?[^\\s\\p{L}\\p{N}]++|\\s++$|\\s+(?!\\S)|\\s",
 '_mergeable_ranks': {b'!': 0,
  b'"': 1,
  b'#': 2,
  b'$': 3,
  b'%': 4,
  b'&': 5,
  b"'": 6,
  b'(': 7,
  b')': 8,
  b'*': 9,
  b'+': 10,
  b',': 11,
  b'-': 12,
  b'.': 13,
  b'/': 14,
  b'0': 15,
  b'1': 16,
  b'2': 17,
  b'3': 18,
  b'4': 19,
  b'5': 20,
  b'6': 21,
  b'7': 22,
  b'8': 23,
  b'9': 24,
  b':': 25,
  b';': 26,
  b'<': 27,
  b'=': 28,
  b'>': 29,
  b'?': 30,
  b'@': 31,
  b'A': 32,
  b'B': 33,
  b'C': 34,
  b'D': 35,
  b'E': 36,
  b'F': 37,
  b'G': 38,
  b'H': 39,
  b'I': 40,
  b'J': 41,
  b'K': 42,
  b'L': 43,
  b'M': 44,
  b'N': 45,
  b'O': 46,
  b'P': 47,
  b'Q': 48,
  b'R': 49,
  b'S': 50,
  b'T': 51,
  b'U': 52,
  b'V': 53,
  b'W': 54,
  b'X': 55,
  b'Y': 56,
  b'Z': 57,
  b'[': 58,
  b'\\': 59,
  b']': 60,
  b'^': 61,
  b'_': 62,
  b'`': 63,
  b'a': 64,
  b'b': 65,
  b'c': 66,
  b'd': 67,
  b'e': 68,
  b'f': 69,
  b'g': 70,
  b'h': 71,
  b

In [72]:
max_length = 4  # token ids per input/target window
stride = 4  # step between window starts; equal to max_length, so input windows do not overlap
batch_size = 8  # windows per batch

mds = MyDataset(data, tokenizer, max_length, stride)
mdl = my_dataloader(mds, batch_size=batch_size)
loader_it = iter(mdl)
# keep only the first batch; NOTE: `input` shadows the Python builtin of the same name
input, target = next(loader_it)

In [73]:
# The GPT-2 BPE vocabulary has 50257 entries: token ids run from 0 to 50256,
# and the last id (50256) is the <|endoftext|> special token. This is the
# value reported by tokenizer.n_vocab for the "gpt2" encoding.
# The embedding table needs one row per token id, so it must have 50257 rows;
# with only 50256 rows, looking up id 50256 raises an IndexError.
vocab_size = 50257
output_dim = 256  # embedding size; tune it for a real model (256 is good for testing)
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

In [None]:
# We also need positional embeddings so the self-attention mechanism can tell
# token positions apart (investigate further if needed).
# They use the same embedding dimension as the token embeddings so the two can be added.
"""
While token embeddings provide consistent vector representations for
each token, they lack a sense of the token's position in a sequence. To
rectify this, two main types of positional embeddings exist: absolute and
relative. OpenAI's GPT models utilize absolute positional embeddings that
are added to the token embedding vectors and are optimized during the
model training
"""
# one learnable vector per position 0..max_length-1
positional_embeddings_layer = torch.nn.Embedding(max_length, output_dim)

In [77]:
# generate the token embeddings and the positional embeddings for the batch, then add them
token_embeddings = token_embedding_layer(input)
positional_embeddings = positional_embeddings_layer(torch.arange(max_length))  # look up positions 0..max_length-1
# the positional tensor has no batch dimension; the addition broadcasts it over every row of the batch
input_embeddings = token_embeddings + positional_embeddings

print(f"token embeddings shape: {token_embeddings.shape}")
print(f"pos embeddings shape: {positional_embeddings.shape}")
print(f"input embeddings shape: {input_embeddings.shape}")

token embeddings shape: torch.Size([8, 4, 256])
pos embeddings shape: torch.Size([4, 256])
input embeddings shape: torch.Size([8, 4, 256])
