In [1]:
import torch

# Data preparation

## Load raw text

In [2]:
# Read the whole corpus into memory as one string. The encoding is spelled out
# so decoding does not depend on the platform's default locale.
with open('../data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Preview the first 200 characters of the corpus.
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


## Tokenization

In [4]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
tokens = sorted(set(text))
''.join(tokens)

"\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

In [5]:
# Two-way lookup tables: a character's id is its index in `tokens`.
stoi = {ch: idx for idx, ch in enumerate(tokens)}
itos = dict(enumerate(tokens))

In [6]:
def encode(text):
    """Map each character of ``text`` to its id in ``stoi``.

    Returns a ``torch.long`` tensor with one id per character. A character
    that is not a key of ``stoi`` raises ``KeyError``.
    """
    ids = []
    for ch in text:
        ids.append(stoi[ch])
    return torch.tensor(ids, dtype=torch.long)

def decode(tensor):
    """Turn a tensor of token ids back into a string via ``itos``.

    Iterates over ``tensor``, looks up each element's Python value in
    ``itos`` and concatenates the resulting characters.
    """
    chars = []
    for token_id in tensor:
        chars.append(itos[token_id.item()])
    return ''.join(chars)

In [7]:
# Sanity check: encode a short sample string into token ids.
encode('testi')

tensor([58, 43, 57, 58, 47])

In [8]:
# Round trip: decoding the encoded string should give back the original text.
decode(encode('testi'))

'testi'

In [9]:
# Encode the full corpus into one tensor of token ids and show the first ten.
data = encode(text)
data[:10]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

In [10]:
# Decode the same ten ids to confirm they match the start of the corpus.
decode(data[:10])

'First Citi'

In [11]:
# Use the first 80% of the token stream for training and the remainder for validation.
split = int(len(data) * 0.8)
train, val = data[:split], data[split:]

print(train.shape, val.shape)

torch.Size([892315]) torch.Size([223079])


# Dataloader

Getting a single chunk of data:

In [25]:
block_size = 8  # number of tokens in each x / y window
batch_size = 4  # number of windows (random offsets) drawn per batch

In [13]:
offset = 10  # arbitrary offset for demonstration

# y covers the same window as x shifted one position to the right,
# so y[i] is the token that follows x[i] in the training data.
x = train[offset : offset + block_size]
y = train[offset + 1 : offset + 1 + block_size]

print(x, y, sep='\n')

tensor([64, 43, 52, 10,  0, 14, 43, 44])
tensor([43, 52, 10,  0, 14, 43, 44, 53])


We generate random offsets into the training data:

In [26]:
# One random start index per batch element. The upper bound keeps both the
# block and its shifted-by-one counterpart inside the first `split` tokens.
offsets = torch.randint(low=0, high=split - block_size, size=(batch_size,))
offsets

tensor([718767, 179714, 143107,  71281])

We then take a block-size window x and a shifted-by-one block-size window y at each offset, and stack those tensors into a single x tensor and a single y tensor:

In [24]:
# Index `train` (the split the offsets were sampled for) rather than the full
# `data` tensor. The first stack holds the windows, the second the same
# windows shifted one position to the right.
print(torch.stack([train[start : start + block_size] for start in offsets]))
print(torch.stack([train[start + 1 : start + block_size + 1] for start in offsets]))

tensor([[58,  1, 61, 46, 47, 41, 46,  1],
        [46, 39, 52,  1, 39, 52, 63,  1],
        [14, 30, 33, 32, 33, 31, 10,  0],
        [39, 54, 54, 43, 39, 56,  8,  1],
        [ 1, 52, 53,  1, 57, 58, 56, 43],
        [ 1, 39, 56, 43,  1, 52, 53, 61],
        [39, 58, 56, 43, 42,  6,  1, 57],
        [ 1, 58, 46, 43,  1, 56, 43, 51]])
tensor([[ 1, 61, 46, 47, 41, 46,  1, 41],
        [39, 52,  1, 39, 52, 63,  1, 46],
        [30, 33, 32, 33, 31, 10,  0, 35],
        [54, 54, 43, 39, 56,  8,  1, 28],
        [52, 53,  1, 57, 58, 56, 43, 52],
        [39, 56, 43,  1, 52, 53, 61,  1],
        [58, 56, 43, 42,  6,  1, 57, 61],
        [58, 46, 43,  1, 56, 43, 51, 43]])


In [27]:
def get_batch(data, block_size=block_size, batch_size=batch_size):
    """Sample a random batch of windows and their shifted-by-one targets.

    Args:
        data: Tensor of token ids, sliced along its first dimension
            (for example ``train`` or ``val``).
        block_size: Number of tokens in each window.
        batch_size: Number of windows to draw.

    Returns:
        Tuple ``(xb, yb)``. Each is a stack of ``batch_size`` slices of length
        ``block_size``; ``yb[i]`` is ``xb[i]`` shifted one position to the
        right in ``data``.

    Raises:
        ValueError: If ``data`` has fewer than ``block_size + 1`` elements,
            so that no window with a shifted target fits.
    """
    # Bound the offsets by the tensor that was actually passed in. Using the
    # global train/val `split` here would sample positions past the end of
    # any tensor shorter than the training split (such as `val`).
    max_offset = len(data) - block_size
    if max_offset <= 0:
        raise ValueError(
            f"data needs more than block_size={block_size} elements, got {len(data)}"
        )

    # Plain Python ints keep the slicing below simple.
    offsets = torch.randint(0, max_offset, (batch_size,)).tolist()

    xb = torch.stack([data[start : start + block_size] for start in offsets])
    yb = torch.stack([data[start + 1 : start + block_size + 1] for start in offsets])

    return xb, yb

In [28]:
# Draw one random (xb, yb) batch from the training split.
get_batch(train)

(tensor([[34, 39, 50, 43, 56, 47, 39,  1],
         [56,  1, 44, 43, 43, 50,  1, 58],
         [43,  1, 54, 56, 43, 41, 47, 53],
         [49, 47, 52, 45, 11,  0, 27, 44]]),
 tensor([[39, 50, 43, 56, 47, 39,  1, 47],
         [ 1, 44, 43, 43, 50,  1, 58, 46],
         [ 1, 54, 56, 43, 41, 47, 53, 59],
         [47, 52, 45, 11,  0, 27, 44,  1]]))