# Coding transformer

In [2]:
import os
import requests
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(1337)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Data retrieval
Retrieve training data

In [3]:
# download the tiny shakespeare dataset

filename = 'input.txt'
file_dir = os.path.join(os.getcwd(), 'data')
file_path = os.path.join(file_dir, filename)

if not os.path.exists(file_dir):
    os.mkdir(file_dir)

if not os.path.exists(file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(requests.get(data_url).text)

In [4]:
with open(file_path, 'r', encoding = 'utf-8') as f:
    text = f.read()

In [5]:
print('Dataset length', len(text), end = '\n')
print(text[:100])

Dataset length 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# Create a simple token space using python inner methods

chars = sorted(list(set(text)))
vocab_size = len(chars)
batch_size = 32
n_embed = 32
print('Dictionary size', vocab_size)
print(''.join(chars))

Dictionary size 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


## Tokenizer strategy

In [7]:
text2ind = {j: i for i, j in enumerate(chars)}
ind2text = {i: j for i, j in enumerate(chars)}

In [8]:
encode = lambda x: [text2ind[char] for char in x]
decode = lambda x: [ind2text[ind] for ind in x]

In [9]:
print(encode('Hi there'))
print(decode(encode('Hi there')))

[20, 47, 1, 58, 46, 43, 56, 43]
['H', 'i', ' ', 't', 'h', 'e', 'r', 'e']


Tokenizer uses simple encoding and decoding strategies that represent simple look-up tables that operate only at char level (because we're working with simple char-level transformer).

In [10]:
data = torch.tensor(encode(text), dtype = torch.long) # torch.long represents int64 
print(data.shape, data.dtype) # -> Shape of a known dataset that was seen previously

print(data[:50]) # -> Encoded data

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56])


In [11]:
# Train-test split

n = int(0.9*len(data))

train_data = data[:n]
val_data = data[n:]

In [12]:
block_size = 8 # Context length
print(train_data[:block_size+1])
decode(train_data[:block_size+1].tolist())

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


['F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't']

In [13]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

Training data consists of `x` and `y` lists, that are packed each with 8 symbols. For each item in `x` list item in `y` list is considered the following one in context of all preceding `x` items.

In [14]:
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'Input: {context} corresponds to ouptut {target}')
    print(f'Input: {decode(context.tolist())} corresponds to ouptut {decode([target.tolist()])}')

Input: tensor([18]) corresponds to ouptut 47
Input: ['F'] corresponds to ouptut ['i']
Input: tensor([18, 47]) corresponds to ouptut 56
Input: ['F', 'i'] corresponds to ouptut ['r']
Input: tensor([18, 47, 56]) corresponds to ouptut 57
Input: ['F', 'i', 'r'] corresponds to ouptut ['s']
Input: tensor([18, 47, 56, 57]) corresponds to ouptut 58
Input: ['F', 'i', 'r', 's'] corresponds to ouptut ['t']
Input: tensor([18, 47, 56, 57, 58]) corresponds to ouptut 1
Input: ['F', 'i', 'r', 's', 't'] corresponds to ouptut [' ']
Input: tensor([18, 47, 56, 57, 58,  1]) corresponds to ouptut 15
Input: ['F', 'i', 'r', 's', 't', ' '] corresponds to ouptut ['C']
Input: tensor([18, 47, 56, 57, 58,  1, 15]) corresponds to ouptut 47
Input: ['F', 'i', 'r', 's', 't', ' ', 'C'] corresponds to ouptut ['i']
Input: tensor([18, 47, 56, 57, 58,  1, 15, 47]) corresponds to ouptut 58
Input: ['F', 'i', 'r', 's', 't', ' ', 'C', 'i'] corresponds to ouptut ['t']


We enable the transformer to see context for a sentence with a length of 1 to length of context size.

In [15]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

In [63]:
def get_batch(split, block_size: int = 8):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # -> sample indexes for random sequences
    x = torch.stack([data[i:i+block_size] for i in ix]).to(device) # -> sample sequences with a fixed context length
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]).to(device) # -> sample y outputs
    return x, y

In [64]:
xb, yb = get_batch('train')
print(xb.shape, yb.shape)

torch.Size([4, 8]) torch.Size([4, 8])


In [18]:
decode(xb[2].to('cpu').tolist())

['n', 't', ' ', 't', 'h', 'a', 't', ' ']

## Sample self-attention blocks

In [19]:
torch.manual_seed(1337)
B, T, C = 4,8,2 # batch, timestemps, channels

x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

Creating mechanism that will allow tokens to communicate with each other, by creating self-attention mechanism. This mechanism allows models to attend to earlier context, and omit further context of the message by using triangular matrices.

This version of self-attention helps to communicate with earlier tokens by creating average or summed vectors of earlier context. Such process is quite lossy, but it's good enough for simple versions.

In [20]:
xbow = torch.zeros((B, T, C))

for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] #t, C
        xbow[b, t] = torch.mean(xprev, 0) #averaging by time

This attention type is some averaging process, but very inefficient.

In [21]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [22]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

Matrix multiplication is the answer for creating a more faster version of an algorithm.

In [23]:
torch.manual_seed(42)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()

c = a @ b

print('a:', a, sep = '\n')
print('---')
print('b:', b, sep = '\n')
print('---')
print('c:', c, sep = '\n')

a:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
---
b:
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
---
c:
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


In [24]:
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

By creating triangular matrix using `tril` we can omit further tokens and work with only preceding ones. Using masked filling we can also work with functions like `softmax`.

In [25]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3)) 
b = torch.randint(0, 10, (3, 2)).float()

c = a @ b

print('a:', a, sep = '\n')
print('---')
print('b:', b, sep = '\n')
print('---')
print('c:', c, sep = '\n')

a:
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
---
b:
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
---
c:
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In order to implement other functions like avearging or softmaxxing we can work with mutated triangular ones matrices.

Averaging is done by the following process: lower traingular matrix is mutated to a weight matrix by normalizing it's ones to some weights.

In [26]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3)) 
a = a / torch.sum(a, 1, keepdim = True) # Normalize each row to create a weight matrix for further multiplications
b = torch.randint(0, 10, (3, 2)).float()

c = a @ b

print('a:', a, sep = '\n')
print('---')
print('b:', b, sep = '\n')
print('---')
print('c:', c, sep = '\n')

a:
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
---
b:
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
---
c:
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [27]:
xbow = torch.zeros((B, T, C))

for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] #t, C
        xbow[b, t] = torch.mean(xprev, 0) #averaging by time

This attention type is some averaging process, but very inefficient.

In [28]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [29]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [30]:
# Recreating averaging using matrices

wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim = True)

xbow2 = wei @ x # (T, T) @ (B, T, C) -> (B, T, C)

wei shape is (T, T) @ and x shape is (B, T, C) which are inconsistent with each other. To conform with batch dimension torch will automatically create a batch dimension for wei tensor to perform batch multiplication.

In [31]:
torch.allclose(xbow, xbow2)

False

In [32]:
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [33]:
# third version with softmax

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))

# transform zero elements to negative infinity - to limit token's abilities to communicate with earlier tokens
wei = wei.masked_fill(tril == 0, float('-inf')) 

In [34]:
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [35]:
wei = F.softmax(wei, dim = -1)

xbow3 = wei @ x

In [36]:
xbow[0], xbow3[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

## Simple Bi-Gram Language model

Bigram language model, that only works with the last prediction of the model (context is only the one preceding token).

In [37]:
class BigramLanguageModel(nn.Module):

    def __init__(self, ):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)
        
    def forward(self, idx, targets = None):

        B, T = idx.shape
        
        # idx and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # output (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device = device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        logits = self.lm_head(x) # output (B, T, vocab_size)
        
        if targets is None:
            loss = None  
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) #Tensor reshape
            targets = targets.view(B*T) # or .view(-1)
            # Loss expects logits in another shape, rather than (B, T, C) -> (B, C, T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, idx, max_new_tokens):

        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):

            logits, loss = self(idx) 

            # Use only the last timestep - The main thing is that it's a Bi-Gram model, that's looking for the last timestep
            logits = logits[:, -1, :] # becomes (B, C)

            # Apply softmax to use probabilities
            probs = F.softmax(logits, dim = -1) # (B, C)
            
            # Sample from distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)

            # Append sampled index to te running sequence
            idx = torch.cat((idx, idx_next), dim = 1) # (B, T+1)
        # Generate outputs as (B, T+max_new_tokens)
        return idx

In [20]:
m = BigramLanguageModel(vocab_size).to(device)
logits, loss = m(xb, yb)

In [21]:
print(logits.shape)

print(loss)

torch.Size([32, 65])
tensor(5.0364, device='cuda:0', grad_fn=<NllLossBackward0>)


In [22]:
context = torch.zeros((1, 1), dtype=torch.long, device = device)
print(decode(m.generate(context, max_new_tokens=100)[0].tolist()))

['\n', 'y', 'q', '$', ';', 't', 'f', 'B', 'f', 'R', 'O', 'k', 'N', 'd', 'c', 'u', 'w', 'd', 'Z', 'Z', 'T', 'k', 'O', 'M', 'l', ';', ',', 'e', 'r', 't', 'K', '\n', 'w', ':', '!', 'P', 'L', 'C', 'k', 'M', 'B', 'b', 'e', 'A', '$', '3', ':', 'X', 'a', 'S', 'G', 'J', 'O', '-', '3', 'p', '&', 'M', '-', 'c', '?', 'K', 'L', '3', 'a', 'u', 'h', 'p', 'F', 'Y', 'V', 'X', 'J', 'F', 'h', 'N', 'N', 'N', 'u', 'h', 'q', '$', 'O', 'M', 'x', 'v', '.', 't', 'b', 'V', 'F', 'Y', 'd', 'X', 'l', 'r', 'F', 'Z', 'a', 'A', 'e']


Train the model in order to increase it's efficiency and stability in text generation

In [23]:
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [24]:
for steps in range(10000):

    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)

    loss.backward()
    optimizer.step()

print(loss.item())

2.4487955570220947


In [25]:
context = torch.zeros((1, 1), dtype=torch.long, device = device)
print(''.join(decode(m.generate(context, max_new_tokens=500)[0].tolist())))


Wawice my.

HDEdarom oroup
Yowhthetof isth ble mil; dill, ath iree sengmin lat Heriliovets, and Win nghir.
Thanousel lind me l.
HAshe ce hiry ptupr aisspllw y.
Hurindu n Boopetelaves
MPORDis, d mothakleo Windo whthCoribyo the m dourive we higend t so mower; te

AN ad nterupt f s ar igr t m:

Thiny aleronth,
Mad
RD:

WISo myr f-NLIERor,
Sb&hak
Sadsal thes ghesthidin cour ay aney Iry ts I fr y ce.
Jken pand, bemary.
Yor 'Wour menm sora anghy t-senomes twe ten.
Wand thot sulin s th llety ome.
I muc


We've got a simple Bi-Gram model, that was looking only to last token from predictions.

In [74]:
@torch.no_grad()
def estimate_loss(model, eval_iters):
    '''
        Estimate loss outputs more stable loss metrics 
    due to averaging calculated loss by number of batches.
    '''
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Self-attention

In this implementation `wei` is a simple matrix with uniformly distributed weights, but attention necessarily needs more complicated path in order tokens to communicate with each other. It's necessary that `wei` matrix needs to be data dependent in order to work with sequences and attend to different parts of it.

**Self-attention** solves this problem by emitting multiple vectors: **query**, **key** vectors.

- **Query** - what information is being searched;
- **Key** - what information is contained.

In [51]:
# fourth version: self-attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32

x = torch.randn(B,T,C) # Input tensor


# implementing head of attention
head_size = 16

key = nn.Linear(C, head_size, bias = False) #Wk
query = nn.Linear(C, head_size, bias = False) #Wq
value = nn.Linear(C, head_size, bias = False) #Wv
# K, Q matrices are created by the following steps
K = key(x) # (B, T, 16) 
Q = query(x) # (B, T, 16)
V = value(x) # (B, T, 16)

wei = Q @ K.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)

out = wei @ V

Projections made by `Wk` and `Wq` matrices of `query` and `keys` vectors have created `Key` and `Query` matrices which further were matrix multiplied to create a matrix of dot-products that indicates affinity scores between *keys* and *queries* (the higher the score is the more important is one token to another).

These lines of code:

```
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
```
help to turn on and off some attributes of attention mechanism:

1. `wei.masked_fill(tril == 0, float('-inf'))` helps to turn on and off further context for each token;
2. `F.softmax(wei, dim = -1)` creates normalized data distribution for efficient and interpretable results.

In [52]:
out.shape

torch.Size([4, 8, 16])

In [53]:
wei.shape

torch.Size([4, 8, 8])

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [56]:
class Head(nn.Module):
    def __init__(self, n_emb, head_size:int = 16):
        super().__init__()
        
        self.Wq = nn.Linear(n_emb, head_size, bias = False)
        self.Wk = nn.Linear(n_emb, head_size, bias = False)
        self.Wv = nn.Linear(n_emb, head_size, bias = False)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape

        Q = self.Wq(x) # (B, T, C)
        K = self.Wk(x) # (B, T, C)
        V = self.Wv(x) # (B, T, C)

        # Attention compute + normalization = softmax(QK/sqrt(head_dim))*Value
        wei = Q @ K.transpose(-2, -1) * (C ** -0.5) # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T) - decoder block
        wei = F.softmax(wei, dim = -1) # (B, T, T)

        # Compute attention - weighted aggregates (B, T, T) @ (B, T, C) -> (B, T, C)
        out = wei @ V 
        return out
        

In [76]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size, block_size, n_embed, sa_dim):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed, sa_dim)
        self.lm_head = nn.Linear(sa_dim, vocab_size)
        
    def forward(self, idx, targets = None):

        B, T = idx.shape
        
        # idx and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # output (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device = device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        x = self.sa_head(x) # apply one head of self-attention
        logits = self.lm_head(x) # output (B, T, vocab_size)
        
        if targets is None:
            loss = None  
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) #Tensor reshape
            targets = targets.view(B*T) # or .view(-1)
            # Loss expects logits in another shape, rather than (B, T, C) -> (B, C, T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, idx, max_new_tokens):

        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
 
            idx_cond = idx[:, -block_size:] # Crop the context to cpmly with positional embeddings
            
            logits, loss = self(idx_cond) 

            # Use only the last timestep - The main thing is that it's a Bi-Gram model, that's looking for the last timestep
            logits = logits[:, -1, :] # becomes (B, C)

            # Apply softmax to use probabilities
            probs = F.softmax(logits, dim = -1) # (B, C)
            
            # Sample from distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)

            # Append sampled index to te running sequence
            idx = torch.cat((idx, idx_next), dim = 1) # (B, T+1)
        # Generate outputs as (B, T+max_new_tokens)
        return idx

In [80]:
block_size = 8
batch_size = 4
max_iters = 10000
eval_interval = 500

In [81]:
model = BigramLanguageModel(vocab_size, block_size, 32, 16)
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(m, 20)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.004977 M parameters
step 0: train loss 4.2726, val loss 4.2596
step 500: train loss 3.1551, val loss 3.0472
1000: train loss 2.9469, val loss 2.9577
step 1500: train loss 2.8796, val loss 2.8891
2000: train loss 2.7680, val loss 2.6833

step 2500: train loss 2.6448, val loss 2.6929
3000: train loss 2.5788, val loss 2.5944

step 3500: train loss 2.5840, val loss 2.6112
4000: train loss 2.6323, val loss 2.5533

step 4500: train loss 2.4967, val loss 2.5709
5000: train loss 2.4904, val loss 2.4454

step 5500: train loss 2.4253, val loss 2.5030
6000: train loss 2.4442, val loss 2.5205

step 6500: train loss 2.4345, val loss 2.5112
7000: train loss 2.4360, val loss 2.4100

step 7500: train loss 2.5096, val loss 2.5521
8000: train loss 2.4538, val loss 2.4576

step 8500: train loss 2.5051, val loss 2.3144
9000: train loss 2.4416, val loss 2.4762

step 9500: train loss 2.4685, val loss 2.4203
999: train loss 2.4579, val loss 2.5704



In [82]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(''.join(decode(m.generate(context, max_new_tokens=2000)[0].tolist())))


B; emlant.

3 thule bilomy thanve cstinoliren.
GI mo ngo t fas; th ine,
Ave twhte ithe
Bur cthurld uthise,
Nkaave I't harl Se flo picuen, ad am nd was
re ofouse sis is it LHire
Th!
O: in be d ym owthand iy cour th mance rl thith thee ther thamor, heren ndurencs! adyong llima unt whs ythelased pille adriey meawce ainon maprotu ouril ladd nt y. TI mireve th bst ar my pas, th the. he vous.


ERCNENLANUEOS bur elanno shof maroll cs ake thirawsereeait ty, yourr hpr dk sl su nfud
As mowto quu ceman omefipel, lind th brath, to yacust, he theighaig tre meghor A.
Ad.

LANADdr enver oll bnod.

Tind RI IKIDog'd th Bvy to, w:ofo wre thandr ngoru Y m.

Merrd.

S otesl pr fithere p dth me we weirg onecloil ivanfut; tham cet ier,
Thes me yo hrine kars ksste,
This, thecodunickit whis,
ALAre buth ghanto whatant woan nBom tghayerm aikn blaithe.

I, tishegun inde nokere prom ben cay t.
ASouy corimang, husto ot thadre;
PHin! hy theg win criisen?

Bek'lod to yr ding hs teat jrde bre' thes I: weay wong isp