In [1]:
import os
import math
import requests

import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Read the whole corpus into memory as one string.
with open('bible.txt', 'r', encoding='utf-8') as f:
    data = f.read()

n = len(data)

print(n)

# Hold out the last 10% of the *characters* for evaluation. The split happens
# before tokenization, so the token counts need not be exactly 90/10.
split_idx = int(n * 0.9)
train_data = data[:split_idx]
test_data = data[split_idx:]

# Encode with tiktoken's GPT-2 BPE. encode_ordinary() treats the corpus as
# plain text: unlike encode(), it does not raise when the text happens to
# contain a special-token string, and it is the same call the training cell
# uses for its prompt.
enc = tiktoken.get_encoding("gpt2")
train_ids = enc.encode_ordinary(train_data)
test_ids = enc.encode_ordinary(test_data)
print(f"train has {len(train_ids):,} tokens")
print(f"test has {len(test_ids):,} tokens")

# Convert the token-id lists to 1-D tensors so the training cell can slice
# and reshape them into batches.
train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)

4351186
train has 1,087,091 tokens
test has 117,911 tokens


In [3]:
# @title
"""
Full definition of a GPT Language Model, all of it in this single file.
References:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
"""

# ---- model shape ----
n_layer = 12         # number of transformer blocks stacked in GPT
n_head = 12          # attention heads per block
n_embd = 768         # embedding / residual-stream width
block_size = 1024    # context length; also the size of the position-embedding table
vocab_size = 50257   # GPT-2 BPE vocabulary size (used as-is, NOT padded to a multiple of 64)

# ---- regularization / layer options ----
dropout = 0.1        # dropout probability used by the embedding, attention and MLP dropouts
bias = True          # read by LayerNorm and every nn.Linear in the blocks; True matches GPT-2

# ---- optimization ----
batch_size = 8       # sequences of length block_size per optimizer step
learning_rate = 0.001
weight_decay = 0.1   # applied only to parameters with 2 or more dimensions


class LayerNorm(nn.Module):
    """Layer normalization over the last ``n_embd`` features with an optional bias.

    The learnable gain ``weight`` is always created (initialized to ones). The
    shift ``bias`` (initialized to zeros) is created only when the notebook-level
    ``bias`` flag is truthy; otherwise ``self.bias`` is ``None``.
    """

    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(n_embd))
        if bias:
            self.bias = nn.Parameter(torch.zeros(n_embd))
        else:
            self.bias = None

    def forward(self, input):
        """Normalize ``input`` over its trailing dimension(s) matching ``weight``."""
        return F.layer_norm(
            input,
            normalized_shape=self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )


class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention.

    A single linear layer produces queries, keys and values for all heads at
    once. They are split into ``n_head`` heads, passed through PyTorch's
    ``scaled_dot_product_attention`` with ``is_causal=True``, merged back
    together and projected to ``n_embd`` features.

    Raises:
        ValueError: if ``n_embd`` is not divisible by ``n_head`` at
            construction time.
    """

    def __init__(self):
        super().__init__()
        # Fail early with a clear message; otherwise the mismatch only shows
        # up later as a shape error inside forward()'s .view() calls.
        if n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)
        # regularization
        # NOTE: attn_dropout is never called in forward(); attention dropout is
        # applied through the dropout_p argument of the fused kernel instead.
        # The attribute is kept so the module structure stays unchanged.
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (B, T, C) with C equal to ``n_embd``.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(n_embd, dim=2)
        k = k.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        # dropout is only enabled in training mode
        y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=dropout if self.training else 0, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


class MLP(nn.Module):
    """Position-wise feed-forward network: expand to 4 * n_embd, GELU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden_dim = 4 * n_embd
        self.c_fc = nn.Linear(n_embd, hidden_dim, bias=bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden_dim, n_embd, bias=bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the feed-forward transform of ``x`` (last dimension stays ``n_embd``)."""
        hidden = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))

class Block(nn.Module):
    """One pre-norm transformer block.

    Each sub-layer (attention, then MLP) is applied to a layer-normalized copy
    of its input and the result is added back to that input.
    """

    def __init__(self):
        super().__init__()
        self.ln_1 = LayerNorm()
        self.attn = CausalSelfAttention()
        self.ln_2 = LayerNorm()
        self.mlp = MLP()

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))


class GPT(nn.Module):
    """GPT-style decoder-only language model.

    The model is built from the notebook-level hyperparameters (``vocab_size``,
    ``block_size``, ``n_layer``, ``n_embd``, ``dropout``) in effect when it is
    constructed. The token embedding and the output projection share a single
    weight matrix.

    Attributes:
        block_size: maximum sequence length this instance can process. It is
            set from the global ``block_size`` at construction and lowered by
            ``crop_block_size``.
    """

    def __init__(self):
        super().__init__()

        # Track the context length per instance so that forward(), generate()
        # and crop_block_size() always agree, even after the position table
        # has been cropped.
        self.block_size = block_size

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd),
            wpe = nn.Embedding(block_size, n_embd),
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([Block() for _ in range(n_layer)]),
            ln_f = LayerNorm(),
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on a batch of token ids.

        Args:
            idx: integer tensor of shape (b, t) with t <= ``self.block_size``.
            targets: optional integer tensor of shape (b, t); entries equal to
                -1 are ignored by the loss.

        Returns:
            ``(logits, loss)``. With ``targets``, ``logits`` covers every
            position, shape (b, t, vocab_size), and ``loss`` is the
            cross-entropy. Without ``targets``, ``logits`` covers only the
            last position, shape (b, 1, vocab_size), and ``loss`` is ``None``.

        Raises:
            ValueError: if ``t`` exceeds ``self.block_size``.
        """
        device = idx.device
        b, t = idx.size()
        # Without this check an over-long sequence fails inside the position
        # embedding lookup with an opaque indexing error.
        if t > self.block_size:
            raise ValueError(
                f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
            )
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        """Shrink the maximum context length of this instance to ``block_size``.

        The position-embedding weight is replaced by a new ``nn.Parameter``
        holding its first ``block_size`` rows, so an optimizer created before
        this call still references the old parameter.

        Raises:
            ValueError: if ``block_size`` is larger than the current
                ``self.block_size`` (the position table cannot be grown).
        """
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        if block_size > self.block_size:
            raise ValueError(
                f"Cannot crop block size to {block_size}, current block size is only {self.block_size}"
            )
        self.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            # Only relevant for attention modules that carry a `bias` attribute
            # (presumably a causal-mask buffer); skipped otherwise.
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        Args:
            idx: LongTensor of shape (b, t) with the prompt token ids.
            max_new_tokens: number of tokens to sample and append.
            temperature: divisor applied to the logits before the softmax.
            top_k: if given, only the top_k highest logits (ties at the
                threshold included) keep non-zero probability.

        Returns:
            LongTensor of shape (b, t + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at this
            # instance's block size (which may be smaller than the global value
            # after crop_block_size)
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

# Build the network and place it on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT()
model.to(device)

# Candidate parameters for the optimizer: everything that receives gradients.
param_dict = {
    name: param
    for name, param in model.named_parameters()
    if param.requires_grad
}

# Split into two optimizer groups by dimensionality: tensors with 2 or more
# dimensions (matmul weights, embeddings) are weight-decayed, tensors with
# fewer (biases, LayerNorm weights) are not.
decay_params = [param for param in param_dict.values() if param.dim() >= 2]
nodecay_params = [param for param in param_dict.values() if param.dim() < 2]
optim_groups = [
    dict(params=decay_params, weight_decay=weight_decay),
    dict(params=nodecay_params, weight_decay=0.0),
]

optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95))

In [5]:
max_steps = 500
current_position = 0

# Fixed prompt used to sample from the model while it trains.
new_phrase = "And God said"
new_phrase_ids = enc.encode_ordinary(new_phrase)
new_phrase_ids = torch.tensor(new_phrase_ids).unsqueeze(0).to(device)

# Number of tokens consumed from train_ids by each optimizer step.
tokens_per_step = batch_size * block_size

model.train()
for step in tqdm(range(max_steps + 1)):
    # Take one extra token so that the targets can be the inputs shifted by one.
    buf = train_ids[current_position : current_position + tokens_per_step + 1]
    X = (buf[:-1]).view(batch_size, block_size) # inputs
    y = (buf[1:]).view(batch_size, block_size) # targets
    # advance the position in the tensor
    current_position += tokens_per_step
    # if loading the next batch would run past the end of the data, wrap around to the start
    if current_position + (tokens_per_step + 1) > len(train_ids):
        current_position = 0

    X = X.to(device)
    y = y.to(device)

    optimizer.zero_grad()

    logits, loss = model(X, y)

    loss.backward()
    optimizer.step()

    if step % 50 == 0:
        # Sample a short continuation of the prompt. Switch to eval mode so
        # dropout is disabled while sampling, then restore train mode.
        model.eval()
        out = model.generate(new_phrase_ids, 20, temperature=1.0, top_k=None)
        model.train()
        # Decode from plain Python ints rather than a list of 0-dim tensors.
        out = enc.decode(out.squeeze(0).tolist())
        print(f"Loss: {loss.item()}")
        print(f"Step: {step} - {out}")


  0%|          | 0/501 [00:00<?, ?it/s]

Loss: 6.004243850708008
Step: 0 - And God said fathersal Re of died a land the hundredquire LORD And Asland that priest 4 Benjamin people were
Loss: 5.120280742645264
Step: 50 - And God said senton valiantab:12 So heel over notist people, he alligned unto


Loss: 5.1248779296875
Step: 100 - And God said.
of Babylonst blessedves of theil things forth the LORD ye shall see for He no
Loss: 4.805957317352295
Step: 150 - And God said the memorial

country, and make, and it earnest priest of cattle: for hath shall put
Loss: 5.148601531982422
Step: 200 - And God said, vain of
was their hand

26 The written Aaron, that should Hebohan it of
Loss: 4.659879684448242
Step: 250 - And God said, and a desire iserest unto God, I should one were one?  let worshipped him at
Loss: 4.5109357833862305
Step: 300 - And God said up away. And the children of Egypt them unto him of his mighty
9 ( conseciege you
Loss: 4.752330780029297
Step: 350 - And God said accept pure, and heard shall all the LORD of hosts o