In [47]:
import tiktoken

In [48]:
# Load the GPT-2 encoding shipped with tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Sample text containing the <|endoftext|> marker and a made-up word.
# NOTE(review): the two adjacent string literals are concatenated without a
# separator, so the text reads "...terracesof some..." (see the decoded output).
# Add a trailing space to the first literal if that is not intended.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunkwownPlace"
)

# allowed_special lets "<|endoftext|>" be encoded as a special token
# (id 50256 in the printed output) rather than being rejected.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 2954, 86, 593, 27271]


In [49]:
# Map the token ids back to text; the result matches the original input string.
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunkwownPlace


In [50]:
# Round-trip a made-up word: it is encoded as several token ids
# (see the printed list) instead of failing.
integers = tokenizer.encode("Akwirw ier")
print(integers)

# Decoding the ids reproduces the input text exactly.
strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


In [51]:
import os

# Location of the training corpus. Defaults to the original local path, but can
# be overridden with the WIZARD_OF_OZ_PATH environment variable so the notebook
# also runs on machines with a different directory layout.
raw_text_path = os.environ.get(
    "WIZARD_OF_OZ_PATH", "/home/enid/Downloads/wizard-of-oz.txt"
)

with open(raw_text_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Token ids for the whole corpus.
enc_text = tokenizer.encode(raw_text)

In [52]:
# Length of each input window, in tokens.
context_size = 4

# Work on the corpus with its first 50 tokens skipped.
enc_sample = enc_text[50:]
# Input: the first context_size tokens of the sample.
x = enc_sample[:context_size]
# Target: the same window shifted one token to the right.
y = enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:          {y}")

x: [15485, 13, 921, 743]
y:          [13, 921, 743, 4866]


In [53]:
# Show every (context -> next token) pair contained in one window:
# the context grows from 1 to context_size tokens, and the token that
# immediately follows it is the one to be predicted.
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    # decode expects a sequence of ids, hence the single-element list
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

soever ----> .
soever. ---->  You
soever. You ---->  may
soever. You may ---->  copy


In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. A window of ``max_len`` tokens is then moved
    across the token sequence in steps of ``stride``; each window is an input
    sample and the same window shifted one token to the right is its target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token ids.
        max_len: Number of tokens per sample (the context size).
        stride: Number of tokens the window advances between two samples.

    Raises:
        ValueError: If ``max_len`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_len, stride):
        # Reject values that would otherwise silently produce empty or
        # meaningless samples.
        if max_len <= 0:
            raise ValueError(f"max_len must be positive, got {max_len}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Sliding window. The target needs one token past the end of the input
        # window, so the last usable start index is len(token_ids) - max_len - 1.
        # A text with max_len tokens or fewer therefore yields no samples.
        for i in range(0, len(token_ids) - max_len, stride):
            input_chunk = token_ids[i : i + max_len]
            target_chunk = token_ids[i + 1 : i + max_len + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the input tensor and target tensor of sample ``idx``.

        This is the accessor a DataLoader uses to fetch samples.
        """
        return self.input_ids[idx], self.target_ids[idx]
            

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_len=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of (input, target) token batches from raw text.

    The text is tokenized with the GPT-2 encoding and cut into sliding-window
    samples by ``GPTDatasetV1``; the DataLoader then groups those samples into
    batches.

    Args:
        txt: Raw text to turn into training samples.
        batch_size: Number of samples in each batch.
        max_len: Number of tokens per sample (context size).
        stride: Number of tokens the window advances between two samples.
        shuffle: Whether the samples are reshuffled for every pass.
        drop_last: Whether to discard the final batch when it holds fewer than
            ``batch_size`` samples (intended to avoid loss spikes in training).
        num_workers: Number of worker subprocesses used for data loading;
            0 loads the data in the main process.

    Returns:
        A ``DataLoader`` yielding ``(inputs, targets)`` batches.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_len, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )