# **Building blocks**

In [30]:
# Download tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-09-17 21:57:23--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-09-17 21:57:24 (23.4 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [31]:
# Read and inspect dataset
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [32]:
print ("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [33]:
# First 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [34]:
# List of all unique chars occured in the dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [35]:
# Create a mapping from characters to integers
stoi = { ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [36]:
# Encode the entire dataset and store in a Torch Tensor
import torch
import numpy as np
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # first 1000 characters of the dataset

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [37]:
# Split data into train and validation sets
n =int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [38]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [39]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [40]:
y

tensor([47, 56, 57, 58,  1, 15, 47, 58])

In [41]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequence wiill we process in parallel
block_size = 8 # what is the maximum context length for prediction

def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,)) # generate random index
  x = torch.stack([data[i:i+block_size] for i in ix]) # get the inputs from the generated index
  y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # offset the inputs from the generated index to get the targets
  return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range (batch_size):
  for t in range (block_size):
    context = xb[b, :t+1]
    target = yb[b, t]
    print (f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

In [42]:
ix_test = torch.randint(len(train_data) - 8, (4,))
print (len(train_data) - 8)
print (ix_test)
for ix in ix_test:
    print (train_data[ix:ix+8])

1003846
tensor([971401, 579495, 193625, 348340])
tensor([57, 43, 60, 43, 52,  1, 63, 43])
tensor([60, 43, 42,  8,  0, 25, 63,  1])
tensor([56, 42,  5, 57,  1, 57, 39, 49])
tensor([43, 57, 58, 63,  6,  1, 58, 46])


In [43]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [44]:
# create a Pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [45]:
batch_size = 32
for steps in range(300):

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())

4.344598293304443


In [46]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


lKEz-qlgXr.I!ZYD&ZSlaPyj,FoSnkzotetoiKu$HSSP-RqYuAUGaSrRlvPphzl;A:COMqolxF-gCTyHswh,SiMpr&ynQWNKuH,S3:Yl,&&vT'OyNo'VPLQffNCV&GJMbnq  -,'i.
SPLzLKU;; 
mBALIycLj?abSPgLHUFsAcoC!us$gUzzKAUGUbXqPBLCenTQKAUzr:NTpN'AkzPyOlIlyNo?RObb.JnVgwaXiSAcsBcq-OMCf;iKuDWx?suHeVf?xTbOphOKiHMfNZT'IFbR
pcRSQ'UQNWW,.iGSiyxtKtzKHXnknpm,SGeYim,LKI3FuCtRS&Dxy;na!u!Uprdd,lgCh3l,p$YC&$I!BiqCA.?nEZ!A.m:CKoibVlHod?dFLEy.A-I&AUAd,tsrVjKUzoYasBTE-PiM:CDWno'aH.
SP CGSqt;;Qy,noT ffXERJr,!PdagAmZAUiyHVBLMyLjgW
LxcLKHABlI-uQAT!.I


# The mathematical trick in self-attention

In [47]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [48]:
# We want x[b,t] = mean x[b, i<=t]
# Version 1
xbow = torch.zeros((B,T,C))
for b in range(B):
  for t in range(T):
    xprev = x[b, :t+1] # (t, C)
    xbow[b,t] = torch.mean(xprev, 0)


In [49]:
# Version 2
w = torch.tril(torch.ones(T, T))
w = w / w.sum(1, keepdim=True)
w

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [50]:
xbow2 = w @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
print (xbow[0])
print (xbow2[0])
diff = torch.abs(xbow - xbow2)
print(diff)


tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
tensor([[[0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 2.9802e-08],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 7.4506e-09],
         [7.4506e-09, 0.0000e+00],
         [7.4506e-09, 1.4901e-08],
         [0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [1.4901e-08, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [1.4901e-08, 2.2352e-08],
         [4.6566e-09, 3.2363e-08],
         [7.4506e-09, 0.0000e+00],
         [2.9802e-08, 0

In [51]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [52]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [53]:
print ((0.1808 + -0.3596 + 0.6258)/3)

0.14900000000000002


In [54]:
torch.tril(torch.ones(3,3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [55]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print ('a=')
print (a)
print ('---')
print ('b=')
print (b)
print ('---')
print ('c=')
print (c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
---
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
---
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [56]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
w3 = torch.zeros((T,T))
w3 = w3.masked_fill(tril == 0, float('-inf'))
w3 = F.softmax(w3, dim=-1)
xbow3 = w3 @ x
print (torch.abs(xbow - xbow3))

tensor([[[0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 2.9802e-08],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 7.4506e-09],
         [7.4506e-09, 0.0000e+00],
         [7.4506e-09, 1.4901e-08],
         [0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [1.4901e-08, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [1.4901e-08, 2.2352e-08],
         [4.6566e-09, 3.2363e-08],
         [7.4506e-09, 0.0000e+00],
         [2.9802e-08, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [1.4901e-08, 2.9802e-08],
         [0.0000e+00, 0.0000e+00],
         [1.4901e-08, 7.4506e-09],
         [7.4506e-09, 0.0000e+00],
         [2.9802e-08, 2.9802e-08],
         [2.9802e-08, 2.9802e-08]],

        [[0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.00

In [57]:
# version 4: self-attention

import torch
import torch.nn as nn
import torch.nn.functional as F

# Setting a manual seed for reproducibility
torch.manual_seed(1337)

# B: batch size, T: sequence length (number of tokens), C: embedding dimension
B, T, C = 4, 8, 32 # Example values
x = torch.randn(B, T, C) # Random input tensor of shape (B, T, C)

# Initializing a matrix w4 to store attention weights
# This matrix is T x T because each token in the sequence will attend to every other token
w4 = torch.zeros((T, T))
print("Initial weights: ", w4)

# Defining head size, which is the dimensionality for key, query, and value vectors
head_size = 16

# These layers will map the input x (embedding dimension C) to the query, key, and value spaces
# Each input vector will be projected to a vector of size 'head_size' (16 in this case)
key = nn.Linear(C, head_size, bias=False)    # Linear layer for the keys
query = nn.Linear(C, head_size, bias=False)  # Linear layer for the queries
value = nn.Linear(C, head_size, bias=False)  # Linear layer for the values

# Compute key, query, and value matrices for self-attention
k = key(x) # Key matrix: shape (B, T, 16) -> Projects input x into the key space
print("key: ", k)

q = query(x) # Query matrix: shape (B, T, 16) -> Projects input x into the query space
print("query: ", q)

# Compute raw attention scores (dot-product attention)
# q @ k.transpose(-2, -1): Dot product between query and key, resulting in shape (B, T, T)
# The attention score for each token pair (i, j) represents how much token i attends to token j
w4 = q @ k.transpose(-2, -1) # Shape: (B, T, T), each token attends to every other token
print("Raw attention scores: ", w4)

# Create a lower triangular mask (tril) to prevent tokens from attending to future tokens
# For autoregressive models (like text generation), a token should not attend to future tokens
tril = torch.tril(torch.ones(T, T)) # A lower triangular matrix (T x T) filled with ones
print("tril: ", tril)

# Mask out future tokens by replacing values where tril == 0 with negative infinity (-inf)
# This ensures that when softmax is applied, those positions will have zero probability
w4 = w4.masked_fill(tril == 0, value=float('-inf')) # Shape (B, T, T)
print("Masked attention scores: ", w4)

# Apply softmax to the attention scores to get the attention weights
# Softmax normalizes the scores so that they sum to 1, making them interpretable as probabilities
w4 = F.softmax(w4, dim=-1) # Shape (B, T, T), where each row contains normalized weights
print("Softmax weights (attention probabilities): ", w4)

# Compute the value matrix from the input x
# Each value vector holds the information from the corresponding token in the sequence
v = value(x) # Shape (B, T, 16) -> Projects input x into the value space
print("value: ", v)

# Multiply the attention weights (w4) by the value matrix (v)
# This produces a weighted sum of the value vectors, where the weights are determined by how much each token attends to every other token
out = w4 @ v # Shape: (B, T, 16), output is the attended information for each token
print("Output's shape: ", out.shape) # Final output shape after applying attention

# 'out' contains the information from the value vectors, weighted by how much each token attends to others


Initial weights:  tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
key:  tensor([[[ 1.1965e-01, -3.0127e-01,  3.6293e-01,  1.1771e+00,  1.1385e+00,
          -2.5543e-01,  1.4537e-01, -2.9437e-01, -7.0201e-01, -1.0308e+00,
           7.4357e-01, -8.0984e-01, -6.6687e-01,  9.1233e-02, -6.0747e-03,
           1.9833e-01],
         [-5.4229e-01, -5.5581e-01, -7.6131e-02,  1.2929e+00,  8.6535e-01,
          -1.1998e+00,  3.8781e-01,  1.9389e-01,  7.0235e-01, -8.2251e-01,
           2.3484e-01, -8.4995e-01, -3.8126e-01, -2.9906e-01,  1.0242e-02,
          -5.5449e-01],
         [-3.7359e-01, -4.6781e-01, -2.1560e-01, -8.0344e-01, -3.7153e-01,
          -5.4427e-01, -9.1455e-01, -5.5926e-02, -3.2903e-01, -2.1

Explanation of Key Parts:
Linear Projections:

key, query, and value are linear transformations that project the input embeddings into different spaces. The key and query vectors are used to calculate the relevance (attention score) between tokens, while the value vectors contain the actual information that gets aggregated.
Attention Scores:

The dot product between the query and key matrices calculates how much each token should "attend" to every other token. The resulting matrix (w4) is a score matrix of size (B, T, T).
Masking:

In autoregressive models (e.g., when generating text), we don't want a token to look ahead and see future tokens. The lower triangular matrix (tril) ensures that each token only attends to previous or current tokens, masking out future positions with -inf.
Softmax:

Softmax is applied to the masked attention scores to transform them into probabilities, ensuring that they sum to 1. This allows the model to focus more on relevant tokens based on their computed similarity.
Weighted Sum of Values:

The attention probabilities (from the softmax) are used to weight the value vectors. This produces the final output, which is a combination of all the value vectors weighted by how much each token attends to the others.


In [58]:
w4[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [61]:
class LayerNorm1d:

  def __init__ (self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
      # calculate the forward pass
      xmean = x.mean(1, keepdim=True)
      xvar = x.var(1, keepdim=True)
      xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
      self.out = self.gamma * xhat + self.beta
      return self.out

  def parameters(self):
      return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
x.shape

torch.Size([32, 100])

In [62]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [63]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))

# **Full code for Inference**

In [73]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
batch_size = 16 # how many independent sequences will we process in
block_size = 32 # what is the maximum context length for prediction
max_iters = 20000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embedd = 64
n_head = 4
n_layer = 4
dropout = 0.0

# Set manual seed for reproducibility
torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Create mapping from characters to inter
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# Function to generate a small batch of input and target sequences
def get_batch(split):
  # Generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

# Function to estimate training and validation loss (used for evaluation)
@torch.no_grad() # No need to track gradients for evaluation
def estimate_loss():
  out = {}
  model.eval() # Put the model in evaluation mode
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters) # Store loss values
    for k in range(eval_iters): # Evaluate over multiple iterations
      X, Y = get_batch(split) # Get a batch of data
      logits, loss = model(X, Y) # Forward pass and compute loss
      losses[k] = loss.item() # Store the loss
    out[split] = losses.mean() # Compute average loss for this split
  model.train() # Put the model back into training mode

  return out

# Define a single head of self-attention
class Head(nn.Module):
  """ One head of self-attention """

  def __init__(self, head_size):
    super().__init__()
    # Linear projections for key, query, and value vectors
    self.key = nn.Linear(n_embedd, head_size, bias=False)
    self.query = nn.Linear(n_embedd, head_size, bias=False)
    self.value = nn.Linear(n_embedd, head_size, bias=False)

    # Lower triangular matrix used for masking future tokens in self-attention
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
      B, T, C = x.shape # B: Batch size, T: Time (sequence length), C: Embedding dim
      k = self.key(x)
      q = self.query(x)
      # Compute attention scores (affinity matrix) using scaled dot-product
      w = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
      # Mask future tokens
      w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
      # Apply softmax to get attention weights
      w = F.softmax(w, dim=-1) # (B, T, T)
      w = self.dropout(w)
      # Perform the weighted aggeration of the values
      v = self.value(x) # (B, T, C)
      out = w @ v # (B, T, T) @ (B, T, C) -> (B, T, C)

      return out

# Define multi-head self-attention (multiple heads running in parallel)
class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
      super().__init__()
      # Create a list of heads, each head performs self-attention
      self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
      # Linear projection after concatenating the outputs of all heads
      self.proj = nn.Linear(n_embedd, n_embedd)
      self.dropout = nn.Dropout(dropout)

    def forward(self, x):
      # Run each head independently and concatenate the results
      out = torch.cat([h(x) for h in self.heads], dim=-1)
      out = self.dropout(self.proj(out)) # Final linear projection
      return out

# Define the feedforward network (used after self-attention in each block)
class FeedForward(nn.Module):
  def __init__ (self, n_embedd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embedd, 4 * n_embedd),
        nn.ReLU(),
        nn.Linear(4 * n_embedd, n_embedd),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

# Define a single transformer block
class Block(nn.Module):
  """ Transformer block: communication followed by computation """

  def __init__ (self, n_embedd, n_head):
    # n_embedd: embedding dimension, n_head: the number of heads we'd like
    super().__init__()
    # Each block contains multi-head attention followed by a feedforward network
    head_size = n_embedd // n_head
    self.sa = MultiHeadAttention(n_head, head_size) # Multi-head self-attention
    self.ffwd = FeedForward(n_embedd) # Feedforward network
    self.ln1 = nn.LayerNorm(n_embedd) # Layer normalization for attention
    self.ln2 = nn.LayerNorm(n_embedd) # Layer normalization for feedforward

  def forward(self, x):
    # Apply self-attention followed by residual connection and layer normalization
    x = x + self.sa(self.ln1(x))
    # Apply feedforward network followed by residual connection and layer normalization
    x = x + self.ffwd(self.ln2(x))

    return x

# Define the Bigram Language Model using the Transformer architecture
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    # Embedding table for tokens (characters) and their positions
    self.token_embedding_table = nn.Embedding(vocab_size, n_embedd)
    self.position_embedding_table = nn.Embedding(block_size, n_embedd)
    # Stack of transformer blocks (n_layer blocks)
    self.blocks = nn.Sequential(*[Block(n_embedd, n_head=n_head) for _ in range (n_layer)])
    self.ln_f = nn.LayerNorm(n_embedd) # final layer norm
    self.lm_head = nn.Linear(n_embedd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape # B: Batch size, T: Sequence length (context length)

    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device) ) # (T, C)
    x = tok_emb + pos_emb # (B, T, C)
    x = self.blocks(x) # (B, T, C)
    x = self.ln_f(x) # (B, T, C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C) # Reshape logits for loss calculation
      targets = targets.view(B*T) # Reshape targets
      loss = F.cross_entropy(logits, targets) # Compute cross-entropy loss

    return logits, loss

  # Generate new text from the model
  def generate (self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # crop idx to the last block_size tokens
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

    return idx

# Instantiate the model and move it to the correct device (CPU or GPU)
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2500)[0].tolist()))

0.209729 M parameters
step 0: train loss 4.4116, val loss 4.4022
step 100: train loss 2.6568, val loss 2.6670
step 200: train loss 2.5091, val loss 2.5058
step 300: train loss 2.4197, val loss 2.4336
step 400: train loss 2.3501, val loss 2.3562
step 500: train loss 2.2963, val loss 2.3125
step 600: train loss 2.2407, val loss 2.2496
step 700: train loss 2.2054, val loss 2.2187
step 800: train loss 2.1633, val loss 2.1866
step 900: train loss 2.1241, val loss 2.1504
step 1000: train loss 2.1036, val loss 2.1306
step 1100: train loss 2.0698, val loss 2.1180
step 1200: train loss 2.0380, val loss 2.0791
step 1300: train loss 2.0248, val loss 2.0634
step 1400: train loss 1.9926, val loss 2.0359
step 1500: train loss 1.9697, val loss 2.0287
step 1600: train loss 1.9627, val loss 2.0477
step 1700: train loss 1.9403, val loss 2.0115
step 1800: train loss 1.9090, val loss 1.9941
step 1900: train loss 1.9092, val loss 1.9858
step 2000: train loss 1.8847, val loss 1.9925
step 2100: train loss 1.