In [152]:
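# One-time setup: uncomment to install the dependencies into the kernel's environment.
# (%pip targets the running kernel's environment, unlike !pip.)
# %pip install tiktoken
# %pip install torch
# %pip install requests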

In [159]:
import requests
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

In [160]:
# Download the Tiny Shakespeare corpus as one string.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of tokenizing an error page as the corpus.
response.raise_for_status()
text = response.text

In [161]:
# Load the tiktoken encoding registered under the name "gpt2".
tokenizer = tiktoken.get_encoding("gpt2")
# Compare corpus size in characters with its size in tokens.
print(f"{len(text)=}")
print(f"{len(tokenizer.encode(text))=}")

# Round-trip check: encode the first 100 characters, then decode the ids back to text.
encoded_text_100 = tokenizer.encode(text[:100])
print(f"{encoded_text_100=}")
decoded_text_100 = tokenizer.decode(encoded_text_100)
print(f"{decoded_text_100=}")

len(text)=1115394
len(tokenizer.encode(text))=338025
encoded_text_100=[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198, 198, 5962, 22307, 25, 198, 1639]
decoded_text_100='First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'


In [162]:
class InMemoryDataset(Dataset):
    """Sliding-window next-token dataset held entirely in memory.

    The text is tokenized once; every window of ``max_length`` consecutive
    token ids becomes an input sample, and the same window shifted one
    position to the right becomes its target.

    Args:
        text: Raw text to tokenize.
        tokenizer: Any object exposing ``encode(text)`` that returns a
            sequence of token ids.
        max_length: Number of tokens in each input/target window (>= 1).
        stride: Step between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Attributes are two parallel lists, ``input_ids`` and ``target_ids``,
    holding one tensor per window.
    """

    def __init__(self, text: str, tokenizer, max_length: int, stride: int):
        # Non-positive values would otherwise yield a silently empty or
        # malformed dataset (or an opaque error from range()).
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(text)

        # Stop at len - max_length so the shifted target window
        # [start + 1, start + max_length + 1) is always full length.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self) -> int:
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [185]:
# Build a small dataset from the first 1,000 characters: 4-token windows, one-token step.
ds = InMemoryDataset(text[:1_000], tokenizer, max_length=4, stride=1)
print(f"{len(ds)=}")

# Inspect one (input, target) pair as raw ids and as decoded text.
# NOTE: the printed labels read x[0]/y[0], but the sample shown is index 4.
sample_x, sample_y = ds[4]
print(f"x[0]: {sample_x}")
print(f"y[0]:       {sample_y}")
# .tolist() hands the tokenizer plain Python ints; list(tensor) would yield 0-dim tensors.
print(f"x[0]: {tokenizer.decode(sample_x.tolist())}")
print(f"y[0]:       {tokenizer.decode(sample_y.tolist())}")


len(ds)=281
x[0]: tensor([8421,  356, 5120,  597])
y[0]:       tensor([ 356, 5120,  597, 2252])
x[0]: Before we proceed any
y[0]:        we proceed any further


In [192]:
# Batch the windows in their original order (no shuffling), five per batch,
# dropping a trailing incomplete batch and loading in the main process.
dataloader = DataLoader(ds, batch_size=5, shuffle=False, drop_last=True, num_workers=0)

# Pull the first batch and show its two stacked tensors.
batch_1 = next(iter(dataloader))
batch_inputs, batch_targets = batch_1
print("Batch 1 | Input:\n", batch_inputs)
print("Batch 1 | Output:\n", batch_targets)

Batch 1 | Input:
 tensor([[ 5962, 22307,    25,   198],
        [22307,    25,   198,  8421],
        [   25,   198,  8421,   356],
        [  198,  8421,   356,  5120],
        [ 8421,   356,  5120,   597]])
Batch 1 | Output:
 tensor([[22307,    25,   198,  8421],
        [   25,   198,  8421,   356],
        [  198,  8421,   356,  5120],
        [ 8421,   356,  5120,   597],
        [  356,  5120,   597,  2252]])
