In [1]:
import pathlib
import re
import torch
import torch.nn as nn

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace, Split
from tokenizers.trainers import BpeTrainer
from torch.nn import functional as F

In [2]:
# BPE vocabulary
vocab_size = 10000
min_frequency=2
#hyperparameters
batch_size = 64 # independent sequences processed in parallel
block_size = 256 #max context length for predictions
step = 0
eval_interval = 500
learning_rate = 5e-4
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [3]:
torch.backends.mps.is_available()

True

In [4]:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('allLyrics.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
# tokenizer = Tokenizer(BPE())
# tokenizer.pre_tokenizer = Split(Regex('[ \t]+'), behavior='removed')
# trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=min_frequency)
# tokenizer.train(files=['allLyrics.txt'], trainer=trainer)

In [6]:
tokenizer = Tokenizer.from_file("tokenizer5.json")

In [7]:
vocab_size = tokenizer.get_vocab_size()
encode = lambda s: tokenizer.encode(s).ids
decode = lambda l: tokenizer.decode(l)

In [8]:
y = text[:250]
print(y)
print("---")
print(len(y))
x = encode(y)
print(len(x))
print(x)




All of this and more is for you
With love, sincerity and deepest care
My life with you I share


Ever since I met you, baby
I've been wantin' to lay you down
But it's so hard to get you
Baby, when you never come around
Every day that you keep it a
---
250
61
[0, 0, 0, 450, 198, 282, 179, 406, 177, 231, 347, 595, 832, 8548, 1771, 868, 179, 1169, 782, 2670, 413, 440, 251, 162, 40, 7609, 0, 0, 3598, 1391, 40, 1411, 512, 600, 539, 443, 9560, 170, 897, 162, 611, 295, 383, 266, 615, 170, 256, 347, 1137, 325, 162, 374, 336, 1212, 526, 339, 202, 162, 501, 168, 62]


In [9]:
print(vocab_size)

10000


In [10]:
print(decode(x))


 
 
 All of this and more is for you
 With love, sin cer ity and deep est care
 My life with you I share
 
 
 Ever since I met you, baby
 I've been wantin' to lay you down
 But it's so hard to get you
 Baby, when you never come around
 Every day that you keep it a


In [11]:
# chars = sorted(list(set(text)))
# vocab_size = len(chars)

# stoi = { ch:i for i,ch in enumerate(chars) }
# itos = { i:ch for i,ch in enumerate(chars) }
# encode = lambda s: [stoi[c] for c in s]
# decode = lambda l: ''.join([itos[i] for i in l])

In [12]:
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [13]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y

In [14]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [15]:
class Head(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        
        #compute attn scores ('affinities')
        wei = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        
        #perform the weighted aggregation of the values
        v = self.value(x) # (B, T, C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [16]:
class MultiHeadAttention(nn.Module):
    """Multiple heads of attention in parallel"""
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [17]:
class FeedForward(nn.Module):
    """ a simple linear leayer followed by a non-linearity"""
    
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x)

In [18]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation"""
    
    def __init__(self, nembd, n_head):
        # n_embd: embedding dimension, n_head: th number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [19]:
class BigramLanguageModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) #number of embedding dimensions
        self.position_embedding_table = nn.Embedding(block_size, n_embd) # postion of each char
        
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        
        self.lm_head = nn.Linear(n_embd, vocab_size) # language model head
    
    def forward(self, idx, targets = None):
        B, T = idx.shape
        
        #idx and targets are both (B,T) tensor of ints
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb= self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B,T,vocab_size)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            #focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to probs
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled indext to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [20]:
model = BigramLanguageModel()
m = model.to(device)

In [21]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [22]:
checkpoint = torch.load(str(pathlib.Path().resolve()) + '/snapshot5')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
step = checkpoint['step']
losses = checkpoint['losses']

In [23]:
for itr in range(2000):
    # every once in a while evaluate the loss on the train and val sets:
    if itr % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    xb, yb = get_batch('train')
    
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    step += 1

Step 11000: train loss 0.8441, val loss 8.2380
Step 11500: train loss 0.8107, val loss 8.3545
Step 12000: train loss 0.7773, val loss 8.4241
Step 12500: train loss 0.7373, val loss 8.4931


In [24]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)

In [25]:
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


 I’ve been so long
 Re mem ' the sun rise has been rob ins up
 Then they laid Him in free ?
 
 
 Come along and take an empt iness you've been little girl, my mind cha long ed windows , didn't have I can ever known
 I can make it with you
 4 a choice heart sing
 These are sad and sad and the pink Cadillac
 Cr own the sun 's grown
 He's all going to all around life in his eyes
 
 Come here and take some bur g
 K new s, yeah
 I, I am our to meet again moon up now, happy let them and so I
 Let's do this, stop shiver what you want to
 I, I just don't even think about
 All I say
 
 Boys in ta this in ju ic al, drink s, ru b
 I think it's lost and clear through a bas ket ball of diamonds in teeth
 Oh, can you go out of the blue always
 Yes, out of my life
 I can't go to fire runnin' away I ain't enough
 I go hard again.
 
 It's like a long time
 I've been a long time
 Since I've seen a long time
 I've seen but I keep saying it's not too long
 How do you know why, yeah, you know just how to 

In [28]:
# torch.save({
#             'step': step,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'losses': losses,
#             }, str(pathlib.Path().resolve()) + '/snapshot5')

In [29]:
# tokenizer.save("tokenizer5.json")