Largely adopted from https://github.com/karpathy/build-nanogpt/

In [1]:
import os
import torch

In [2]:
DATASET_URL = (
    "https://raw.githubusercontent.com/karpathy/char-rnn/master"
    "/data/tinyshakespeare/input.txt"
)
DATA_DIR = os.path.expanduser("~/Data/tinyshakespeare")
DATA_FILENAME = "input.txt"
DATA_FILEPATH = os.path.join(DATA_DIR, DATA_FILENAME)

In [3]:
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    DEVICE = "mps"
DEVICE_TYPE = "cuda" if DEVICE.startswith("cuda") else "cpu"
print(f"using device: {DEVICE}")


SEED = 1337
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

using device: mps


## Load data

In [4]:
from urllib import request

os.makedirs(DATA_DIR, exist_ok=True)
if not os.path.isfile(DATA_FILEPATH):
    content = request.urlopen(DATASET_URL)
    with open(DATA_FILEPATH, "wb") as f:
        f.write(content.read())

with open(DATA_FILEPATH, "r") as f:
    text = f.read()

print(example := text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



## Tokenization examples

In [5]:
import tiktoken

enc = tiktoken.get_encoding("gpt2")
example_tokens = enc.encode(example)
print(example_tokens[:24 + 1])

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198]


In [6]:
example_buf = torch.tensor(example_tokens[:24 + 1])
example_x = example_buf[:-1].view(4, 6)
example_y = example_buf[1:].view(4, 6)  # predict the next token
print(example_x)
print(example_y)

tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]])
tensor([[22307,    25,   198,  8421,   356,  5120],
        [  597,  2252,    11,  3285,   502,  2740],
        [   13,   198,   198,  3237,    25,   198],
        [ 5248,   461,    11,  2740,    13,   198]])


## Utils

In [7]:
from typing import Optional
from tiktoken.core import Encoding

class DataLoaderLite:
    # We are not doing DDP in this project whatsoever, and we're also only training on
    # tinyshakespeare, so this is a significantly downsized version of the DataLoaderLite
    # in the tutorial

    def __init__(self, B: int, T: int, text: str, encoder: Optional[Encoding] = None):
        self.B = B
        self.T = T
        self.text = text

        self.encoder = encoder or tiktoken.get_encoding("gpt2")
        self._buf = []
        self._pos = 0

    def next_tokens(self, add_pred_token: bool = False) -> list[int]:
        n_tokens = self.B * self.T + int(add_pred_token)
        pos_step = n_tokens * 4

        while len(self._buf) < n_tokens:
            tokens = self.encoder.encode(text[self._pos : self._pos + pos_step])
            self._buf.extend(tokens)

        tokens = self._buf[: n_tokens]  # we want this many tokens
        self._buf = self._buf[self.B * self.T :]  # remove BT tokens (not BT + 1!)
        return tokens

    def next_batch(self) -> tuple[torch.Tensor, torch.Tensor]:
        batch_tokens = torch.as_tensor(self.next_tokens(add_pred_token=True))
        x = (batch_tokens[:-1]).view(self.B, self.T) # inputs
        y = (batch_tokens[1:]).view(self.B, self.T) # targets
        return x, y

In [22]:
dataloader = DataLoaderLite(4, 6, text, encoder=enc)
add_pred_token = True
display(enc.decode(dataloader.next_tokens(add_pred_token=add_pred_token)))
display(enc.decode(dataloader.next_tokens(add_pred_token=add_pred_token)))
display(enc.decode(dataloader.next_tokens(add_pred_token=add_pred_token)))
display(enc.decode(dataloader.next_tokens(add_pred_token=add_pred_token)))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n'

'\n\nFirst Citizen:\nYouFirst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:'

':\nSpeak, speak.\n\nFirst Citizen:\nYouFirst Citizen:\nBefore we proceed any further, hear'

' hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYouFirst Citizen:\n'