# Let's Build GPT

Data: the Tiny Shakespeare corpus (this notebook reads it from `../data/tinyshakespeare.txt`): https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt

Video: https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ

# Imports

In [13]:
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Dataset

In [2]:
class Tokenizer:
    """Character-level tokenizer: one integer id per single-character symbol.

    A character's id is its position in the ``vocab`` list handed to the
    constructor. ``stoi`` maps character -> id and ``itos`` maps id -> character.
    """

    def __init__(self, vocab):
        """Build both lookup tables from ``vocab``, a list of 1-character strings."""
        assert isinstance(vocab, list)
        assert all(isinstance(v, str) and len(v) == 1 for v in vocab)
        self.stoi = {}
        self.itos = {}
        for index, char in enumerate(vocab):
            self.stoi[char] = index
            self.itos[index] = char

    def encode(self, text):
        """Return the list of ids for ``text``; unknown characters raise KeyError."""
        return list(map(self.stoi.__getitem__, text))

    def decode(self, sequence):
        """Turn ids back into a string.

        ``sequence`` may be a list of ints, a 1-D tensor of ids, or a 0-D tensor
        holding a single id (which decodes to one character). Any other type
        raises ValueError; a tensor with more than one dimension fails the assert.
        """
        if isinstance(sequence, torch.Tensor):
            assert sequence.ndim in [0, 1]
            if sequence.ndim == 0:
                return self.itos[sequence.item()]  # one char
            ids = sequence.tolist()
        elif isinstance(sequence, list):
            ids = sequence
        else:
            raise ValueError(f"Type {type(sequence)} not supported")
        return ''.join(self.itos[i] for i in ids)

In [3]:
# Read the whole corpus into memory. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default locale encoding.
with open('../data/tinyshakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("Num chars:", len(text))
print("Dataset Start:")
print(text[:462])  # preview only: the first 462 characters of the corpus

Num chars: 1115394
Dataset Start:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.


In [4]:
# Get vocabulary: every distinct character of the corpus, in sorted order.
# set() consumes the string directly and sorted() already returns a list, so
# no intermediate ''.join(...) / list(...) copies of the corpus are needed.
letters = sorted(set(text))
n_vocab = len(letters)
print(''.join(letters))
print('Num:', n_vocab)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Num: 65


In [73]:
tok = Tokenizer(letters)

# Round-trip a short string through encode/decode as a sanity check.
sample_ids = tok.encode("hii there")
print(sample_ids)
print(tok.decode(sample_ids))

# The lookup is done outside the f-string: a backslash inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
newline_id = tok.encode('\n')[0]
print(f"Newline is: {newline_id}")

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there
Newline is: 0


In [6]:
# Encode the whole corpus once into a single tensor of token ids.
encoded_text = tok.encode(text)
data = torch.tensor(encoded_text, dtype=torch.long)

# 90%/10% split: everything before index n is training data, the rest validation.
n = int(0.9*len(text))
train_data = data[:n]
valid_data = data[n:]

print(f"Train data len: {len(train_data)}")
print(f"Valid data len: {len(valid_data)}")

Train data len: 1003854
Valid data len: 111540


In [7]:
class DataLoader:
    """Samples random fixed-length windows (and their shifted targets) from ``data``."""

    def __init__(self, data, batch_size, sequence_length):
        """Store the source sequence and the batch dimensions."""
        self.data = data
        self.n_batch = batch_size
        self.n_seq = sequence_length

    def get_batch(self):
        """Return ``(x, y)``, each stacked from ``n_batch`` windows of length ``n_seq``.

        ``y`` holds the same windows as ``x`` moved one position forward in
        ``data``, i.e. ``y[b, t]`` is the element that follows ``x[b, t]``.
        """
        width = self.n_seq
        # randint's upper bound is exclusive, so the largest start is
        # len(data) - width - 1 and the shifted target window still fits.
        starts = torch.randint(len(self.data) - width, (self.n_batch,)).tolist()
        inputs = [self.data[s:s + width] for s in starts]
        labels = [self.data[s + 1:s + 1 + width] for s in starts]
        return torch.stack(inputs), torch.stack(labels)

In [31]:
# Seed torch's RNG so the batch sampled from this loader is reproducible.
torch.manual_seed(1337)

# Deliberately tiny sizes so a whole batch can be printed and read by eye.
n_batch, n_seq = 4, 8  # windows per batch, tokens per window
n_emb = n_vocab
data_loader = DataLoader(data=train_data, batch_size=n_batch, sequence_length=n_seq)

In [32]:
# Draw one random batch: x_batch holds the input windows, y_batch the same
# windows shifted one position forward (the next-token targets).
x_batch, y_batch = data_loader.get_batch()

In [33]:
# Show the shape of the input batch, then its token ids.
print(x_batch.shape, x_batch, sep="\n")

torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [34]:
# Show the shape of the target batch, then its token ids.
print(y_batch.shape, y_batch, sep="\n")

torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [106]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    Each token id selects one row of an ``(n_vocab, n_vocab)`` table; that row
    is used directly as the logits over the next token, so a prediction
    depends only on the token at the same position.
    """

    def __init__(self, n_vocab):
        """Create the ``n_vocab x n_vocab`` lookup table of next-token logits."""
        super().__init__()
        self.emb_table = nn.Embedding(n_vocab, n_vocab)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: long tensor of token ids, shape (B, T).
            targets: optional long tensor of target ids with the same shape
                as ``idx``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, C) with
            C = n_vocab, and ``loss`` is the mean cross-entropy over all
            B*T positions, or ``None`` when ``targets`` is ``None``.
        """
        assert idx.dtype == torch.long
        assert targets is None or targets.dtype == torch.long

        logits = self.emb_table(idx)  # (B, T, C) <- (B, T)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) scores with (N,) class ids, so the batch
        # and time axes are flattened together. reshape is used instead of view
        # so that non-contiguous targets (e.g. a transposed tensor) also work.
        n_classes = logits.shape[-1]
        flat_logits = logits.reshape(-1, n_classes)  # (B*T, C)
        flat_targets = targets.reshape(-1)           # (B*T,)
        loss = F.cross_entropy(flat_logits, flat_targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; skip building the autograd graph
    def generate(self, idx, max_tokens):
        """Sample ``max_tokens`` new tokens, appending them to ``idx``.

        Args:
            idx: long tensor of shape (B, T) holding the starting context.
            max_tokens: int, number of tokens to sample per batch row.

        Returns:
            Long tensor of shape (B, T + max_tokens): the original context
            followed by the sampled tokens.
        """
        assert idx.dtype == torch.long
        assert isinstance(max_tokens, int)

        for _ in range(max_tokens):
            # Model output for the whole running context
            logits, _loss = self(idx)          # (B, T, C) <- (B, T)

            # Only the last time step predicts the token to append
            last_logits = logits[:, -1, :]     # (B, C) <- (B, T, C)

            probs = F.softmax(last_logits, dim=-1)  # (B, C)

            # Draw one token id per batch row from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

In [115]:
# Create Model (seeded so the initial weights, and hence the loss, are reproducible)
torch.manual_seed(1337)
m = BigramLanguageModel(n_vocab)
logits, loss = m(x_batch, targets=y_batch)

# Reference value: a model assigning equal probability to each of the n_vocab
# characters has cross-entropy -ln(1/n_vocab).
uniform_loss = -torch.log(torch.tensor(1/n_vocab))

print(f"Model loss: {loss}")
print(f"Expected initial loss: {uniform_loss}")

Model loss: 4.878634929656982
Expected initial loss: 4.174387454986572


In [116]:
# Generate
# Start from a single newline token. Its id is looked up through the tokenizer
# instead of being hard-coded, so the cell stays correct if the vocabulary changes.
idx = torch.tensor([tok.encode('\n')], dtype=torch.long)  # B=1, T=1, '\n'
res = m.generate(idx, max_tokens=100)
print(tok.decode(res[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ
