# Imports

In [42]:
# Add line so we don't have to reload notebooks for changes in imported modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
import torch
import numpy as np

from helpers.load_data import load_data
from helpers.get_batch import get_batch
from helpers.estimate_loss import estimate_loss
from helpers.tokenizer import Tokenizer
from models.bigram_model import BigramLanguageModel

# Globals

In [44]:
BATCH_SIZE = 32
BLOCK_SIZE = 8
EVAL_INTERVAL = 100
LEARNING_RATE = 1e-2
EVAL_ITERS = 100
MAX_ITERS = 10000
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare Data

In [45]:
# Load data
text: str = load_data()

tokenizer = Tokenizer(text)
data = torch.tensor(tokenizer.encode(text))

# Create train and test sets
n = int(len(text) * 0.9)
train_data = data[:n]
test_data = data[n:]

# Bigram Language Model

In [46]:
bigram_model = BigramLanguageModel(tokenizer.vocab_size).to(DEVICE)

## Training loop

In [61]:
optimizer = torch.optim.AdamW(bigram_model.parameters(), lr=1e-3)

for iter in range(MAX_ITERS):
    
    if iter % EVAL_INTERVAL == 0:
        train_loss, val_loss = estimate_loss(
            model=bigram_model, 
            train_data=train_data,
            valid_data=test_data,
            block_size=BLOCK_SIZE,
            batch_size=BATCH_SIZE,
            eval_iters=EVAL_ITERS,
            device=DEVICE
        )
        print(f"Step {iter}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")

    xb, yb = get_batch(
        train_data,
        BLOCK_SIZE, 
        BATCH_SIZE,
        device=DEVICE
    )

    logits, loss = bigram_model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Step 0, Train loss: 2.4701, Val loss: 2.4890
Step 100, Train loss: 2.4760, Val loss: 2.4941
Step 200, Train loss: 2.4735, Val loss: 2.4912
Step 300, Train loss: 2.4607, Val loss: 2.4887
Step 400, Train loss: 2.4602, Val loss: 2.4909
Step 500, Train loss: 2.4697, Val loss: 2.4891
Step 600, Train loss: 2.4460, Val loss: 2.4742
Step 700, Train loss: 2.4565, Val loss: 2.4983
Step 800, Train loss: 2.4672, Val loss: 2.4757
Step 900, Train loss: 2.4517, Val loss: 2.4885
Step 1000, Train loss: 2.4528, Val loss: 2.4912
Step 1100, Train loss: 2.4519, Val loss: 2.4841
Step 1200, Train loss: 2.4577, Val loss: 2.4762
Step 1300, Train loss: 2.4523, Val loss: 2.4853
Step 1400, Train loss: 2.4463, Val loss: 2.4940
Step 1500, Train loss: 2.4494, Val loss: 2.4841
Step 1600, Train loss: 2.4731, Val loss: 2.4828
Step 1700, Train loss: 2.4469, Val loss: 2.4820
Step 1800, Train loss: 2.4537, Val loss: 2.5005
Step 1900, Train loss: 2.4587, Val loss: 2.4823
Step 2000, Train loss: 2.4630, Val loss: 2.4896
Step

In [64]:
start_token = torch.zeros((1, 1)).long().to(DEVICE)
sequence = bigram_model.generate(start_token, max_len=500)[0].tolist()
print(tokenizer.decode(sequence))

DYo spour he fo t ots iell f,
Tot,--h n d merit ira my?
Anso 's?
Whagethe nt h blfore lld rus an h O:
ail Cortis l tingu He esedass thil the mbel's ckineannor 's hmbo moutoorseis tex--
S:
HEThouth tomerover hist'dond an;
NTh.
'd opus d ind tend ngonor ma mse bs!
Thid to nd wo s thatom.



KIO:
KETald yobly tild:
AMineromu l s e
te tof we ild.
QULIUS:
TI busporknes
Whe halour akes mand!
ILABY miss sst obes cathid fathay, asen wavitlm d stha:
CLI loche!
FO:

G ame.

ADUpie od; he hy banwody
GBO,

W


In [None]:
xbow = torch.zeris((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        xbow[b, t] = torch.mean(xprev, 0) # (C)

In [None]:
weights = torch.tril(torch.ones(3,3))
weights = weights / torch.sum(weights, 1, keepdim=True)
xbow2 = weights @ x # (B, T, T) @ (B, T, C) -> (B, T, C)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [57]:
import torch.nn as nn
import torch.nn.functional as F

In [58]:
B,T,C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, 1)

v = value(x) # (B, T, C)
out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)

out = wei @ x # (T, T) @ (B, T, C) -> (B, T, C)

In [None]:

wei = q @ k.transpose(-2, -1) / np.sqrt(head_size)