In [1]:
# !pip install torch
# We always start with a dataset to train on. Let's download the tiny Shakespeare dataset.

I start with a smaller dataset here and build up from it so that overfitting is easy to spot and address. Later we can add a check: if the validation loss exceeds the training loss by more than 1, we need to increase the dropout rate or add other countermeasures.

In [2]:
# Path of the text corpus that is read and tokenized character by character
# (presumably Dickens novels, judging by the dataset name -- confirm).
datafile = "/kaggle/input/dickens-yap/novels.txt"
# Alternative corpus path: "/kaggle/input/thisrandombook/data.txt"
# We start with smaller data and build up from there to keep overfitting in check.

In [3]:
import torch

In [4]:
# ----
# hyperparameters

# -- model shape --
n_layer = 6      # number of stacked transformer blocks
n_head = 6       # attention heads per block
n_embed = 512    # width of the token / position embedding vectors
# Each head gets n_embed // n_head = 85 features, so the concatenated heads are
# 510 wide and the attention output projection maps them back to n_embed.
block_size = 256  # context length: size of the position table and of each training window

# -- regularisation --
dropout = 0.4    # probability used by every nn.Dropout in the model

# -- optimisation --
learning_rate = 0.0001  # step size handed to AdamW
batch_size = 64         # windows sampled per batch
max_iters = 10_000      # total optimisation steps

# -- evaluation --
eval_interval = 500  # report train/val loss every this many steps
eval_iters = 200     # batches averaged per split for each report

# ----

In [5]:
import torch
from numba import cuda

def free_gpu_cache():
    """Best-effort cleanup of GPU 0 before the model is created.

    First asks PyTorch to drop its cached CUDA memory, then uses numba to
    select device 0, close it, and select it again. The close/re-select pair
    presumably resets the CUDA context so memory left over from earlier runs
    is released -- NOTE(review): confirm this does not invalidate CUDA state
    that PyTorch has already created.
    """
    gpu_index = 0

    torch.cuda.empty_cache()

    # Order matters: select -> close -> select again.
    cuda.select_device(gpu_index)
    cuda.close()
    cuda.select_device(gpu_index)
    
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    # Clean up the GPU before anything is allocated on it.
    free_gpu_cache()

In [6]:
# `!export VAR=...` runs in a throwaway subshell, so the variable never reaches
# the kernel process and PyTorch never sees it. Set it on this process instead.
# NOTE(review): the allocator setting presumably has to be in place before CUDA
# memory is first allocated -- consider moving this cell above the device setup.
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [7]:
# imports
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

 


# ----

# create a model
class BiGramDataModel(nn.Module):
    """Character-level transformer language model.

    The name is historical: the model does not look at a single previous token
    but at up to ``block_size`` previous tokens. Token embeddings and learned
    position embeddings are summed, passed through ``n_layer`` ``Block``
    modules, and projected to one logit per vocabulary entry.

    Args:
        vocab_size: number of distinct tokens (characters) in the vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # One learned vector per token id and one per position in the context.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(
                *[Block(n_embed, n_head) for _ in range(n_layer)],
        )
        # Maps the final hidden state of every position to vocabulary logits.
        self.lm_head = nn.Linear(n_embed, vocab_size)

        # NOTE: never used in forward(). Kept so that parameter initialisation
        # order and state_dict keys stay compatible with saved checkpoints.
        self.feed_forward = FeedForward(n_embed)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: integer tensor of token ids, shape (B, T) with T <= block_size.
            targets: optional integer tensor of the same shape as ``idx``
                holding the token that should follow each position.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits have shape
            (B, T, vocab_size) and ``loss`` is ``None``. With ``targets`` the
            logits are flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, n_embed)
        # Build the position indices on the same device as the input instead
        # of relying on a global, so the model works wherever it is moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, n_embed)

        # Broadcast-add position information, then run the transformer blocks.
        x = token_emb + pos_emb
        x = self.blocks(x)

        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, classes) logits and (N,) class indices,
        # so fold the batch and time dimensions together.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs in eval mode (dropout disabled) and without autograd
        bookkeeping; the module's previous train/eval mode is restored before
        returning, even if sampling raises.

        Args:
            idx: integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only covers block_size positions, so
                # condition on the most recent block_size tokens only.
                idx_cond = idx[:, -block_size:]

                logits, _ = self(idx_cond)

                # Only the prediction for the last position is needed.
                logits = logits[:, -1, :]  # (B, vocab_size)
                probs = F.softmax(logits, dim=-1)

                # Sample the next token and append it to the running sequence.
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

# create a Head
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Args:
        head_size: width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to the head width.
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (B, T, n_embed) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        T = x.shape[1]
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Scaled dot-product attention divides by sqrt(d_k), where d_k is the
        # width of the keys (head_size), not the embedding width of the input.
        scale = k.shape[-1] ** -0.5
        # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * scale
        # Hide future positions so they get zero weight after the softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)

        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer.

    Args:
        heads: how many ``Head`` modules to create.
        size: output width of each individual head.
    """

    def __init__(self, heads, size):
        super().__init__()
        attention_heads = [Head(size) for _ in range(heads)]
        self.head_list = nn.ModuleList(attention_heads)
        # The concatenated head outputs are heads * size wide; project them
        # back to the embedding width.
        self.proj = nn.Linear(heads * size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, project, drop out."""
        per_head = [head(x) for head in self.head_list]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, GELU, project back, dropout.

    Args:
        n_embed: input and output width of the MLP.
    """

    def __init__(self, n_embed):
        super().__init__()
        hidden_width = n_embed * 4
        layers = [
            nn.Linear(n_embed, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_embed),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        transformed = self.net(x)
        return transformed

class Block(nn.Module):
    """One transformer block: self-attention, then a feed-forward MLP.

    Both sub-layers are wrapped in a residual connection, and each one sees a
    layer-normalised copy of its input (normalisation happens before the
    sub-layer, not after).

    Args:
        n_embed: embedding width flowing through the block.
        n_head: number of attention heads; each head is n_embed // n_head wide.
    """

    def __init__(self, n_embed, n_head):
        super().__init__()
        self.sa_heads = MultiHeadAttention(n_head, n_embed // n_head)
        self.ff = FeedForward(n_embed)

        # Layer normalisation (https://arxiv.org/abs/1607.06450), one per sub-layer.
        self.la1 = nn.LayerNorm(n_embed)
        self.la2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Residual update 1: attention over the normalised input.
        attended = x + self.sa_heads(self.la1(x))
        # Residual update 2: MLP over the normalised attention result.
        return attended + self.ff(self.la2(attended))


In [8]:
# imports
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader



# warning: susceptible to overlapping data
def get_batch(split):
    """Sample a random batch of (input, target) windows.

    Args:
        split: "train" reads from ``train_data``; any other value reads from
            ``val_data``.

    Returns:
        ``(x, y)``, each of shape (batch_size, block_size). ``y`` is ``x``
        shifted one position to the right in the source data, i.e. the next
        token for every input position. Windows are drawn independently, so
        they may overlap.
    """
    source = val_data if split != "train" else train_data
    # Highest valid start leaves room for block_size inputs plus one extra target.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()

    inputs, labels = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        labels.append(source[start + 1 : stop + 1])
    return torch.stack(inputs), torch.stack(labels)

@torch.no_grad()
def estimate_loss():
    """Average the model loss over ``eval_iters`` random batches per split.

    The global model ``m`` is put in eval mode while measuring and switched
    back to train mode afterwards.

    Returns:
        Dict with keys 'train' and 'val', each a scalar tensor holding the
        mean loss for that split.
    """
    averages = {}
    m.eval()
    for split in ('train', 'val'):
        # One loss value per freshly sampled batch.
        samples = torch.tensor(
            [m(*get_batch(split))[1].item() for _ in range(eval_iters)]
        )
        averages[split] = samples.mean()
    m.train()
    return averages


# tokenization
# Read the corpus explicitly as UTF-8: it contains non-ASCII characters (curly
# quotes, accented letters) and the platform default encoding is not guaranteed
# to be UTF-8.
with open(datafile, "r", encoding="utf-8") as f:
    data = f.read()

# tokens
# Sort the vocabulary: the iteration order of a set of strings can differ
# between interpreter runs, which would give every run a different
# character <-> index mapping.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print("data has %d characters, %d unique" % (data_size, vocab_size))

m = BiGramDataModel(vocab_size)
m = m.to(device)

# mapping between characters and integer token ids
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

## encode: string -> list of ids, decode: list of ids -> string
encode = lambda x: [char_to_ix[c] for c in x]
decode = lambda x: "".join([ix_to_char[i] for i in x])

# convert the whole corpus to token ids and keep it on the training device
data_num = torch.tensor(encode(data), dtype=torch.long).to(device)

## first 90% of the corpus is used for training, the rest for validation
n = int(0.9 * len(data_num))
train_data = data_num[:n]
val_data = data_num[n:]

# a single example window: the targets are the inputs shifted by one token
x = train_data[:block_size]  # input
y = train_data[1:block_size + 1]  # labels


# sanity check: look at one batch and one forward pass
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

logits, loss = m(xb, yb)
print(logits.shape, loss)

# print untrained output for testing
# Sample in eval mode and without autograd so dropout does not distort the
# sample and no graph is built; switch back to train mode afterwards.
m.eval()
with torch.no_grad():
    sample = m.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=100)
print(decode(sample[0].tolist()))
m.train()

# optimize

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)


for steps in range(max_iters):
    # Not a hang: at step 0 (and every eval_interval steps) estimate_loss runs
    # eval_iters batches for each of the two splits before anything is printed.

    # every once in a while evaluate the loss on train and val sets
    if steps % eval_interval == 0 or steps == max_iters - 1:
        losses = estimate_loss()
        print(f"step {steps}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # one optimisation step: clear old gradients, backpropagate, update weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

## generate from the trained model (eval mode, no autograd)
m.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    generated = m.generate(context, max_new_tokens=500)
print(decode(generated[0].tolist())) # print somewhat trained data

# save the vocabulary mappings together with the weights so the model can be
# reloaded with the same character <-> index mapping
torch.save((vocab_size, char_to_ix, ix_to_char, m.state_dict()), "model.pkl")


data has 2725304 characters, 86 unique
inputs:
torch.Size([64, 256])
tensor([[44, 83, 83,  ..., 78, 50, 14],
        [83, 69, 36,  ..., 17, 24, 17],
        [50, 24, 25,  ..., 69, 26, 62],
        ...,
        [50, 14, 62,  ..., 36, 50, 44],
        [24, 17, 62,  ..., 78, 69, 36],
        [50, 41, 19,  ...,  9, 36, 29]], device='cuda:0')
targets:
torch.Size([64, 256])
tensor([[83, 83, 50,  ..., 50, 14, 44],
        [69, 36, 50,  ..., 24, 17,  5],
        [24, 25, 53,  ..., 26, 62, 50],
        ...,
        [14, 62, 50,  ..., 50, 44, 19],
        [17, 62, 50,  ..., 69, 36, 60],
        [41, 19,  5,  ..., 36, 29, 50]], device='cuda:0')
torch.Size([16384, 86]) tensor(4.6570, device='cuda:0', grad_fn=<NllLossBackward0>)
qpêX]t)7ugs“zyQ0sgzi
EJY“fekpQe‘7RE!o
r2q;WeqGefjOh8NldG2TvGnq!Y6tUOAGxoh‘Q,Qx]()NYr(-gL8H
kg
g12’8X"
step 0: train loss 4.6199, val loss 4.6154
step 500: train loss 2.3242, val loss 2.3097
step 1000: train loss 1.9196, val loss 1.9247
step 1500: train loss 1.7320, val loss