In [1]:
from keras.preprocessing.text import one_hot,Tokenizer
import numpy as np

In [1]:
import torch
# 64 x 2 matrix of samples drawn from the standard normal distribution
a = torch.randn(64, 2)

In [2]:
# Standard deviation of the random-normal samples held in `a`
a.std()

tensor(1.1044)

In [6]:
# Dividing by sqrt(64) = 8 shrinks the standard deviation by the same factor.
# Note: `a * 1/64**0.5` parses as `(a * 1) / (64**0.5)`.
(a * 1/64**0.5).std()

tensor(0.1381)

In [5]:
# The scaling factor on its own: 1 / sqrt(64)
1/64**0.5

0.125

In [9]:
# Two 100-point grids over [0.001, 1]: `a` is evenly spaced on a linear scale,
# `b` is evenly spaced on a log10 scale (exponents running from -3 to 0).
a = np.linspace(0.001, 1, num=100)
exponents = np.linspace(-3, 0, num=100)
b = 10 ** exponents
a.shape, a

((100,),
 array([0.001     , 0.01109091, 0.02118182, 0.03127273, 0.04136364,
        0.05145455, 0.06154545, 0.07163636, 0.08172727, 0.09181818,
        0.10190909, 0.112     , 0.12209091, 0.13218182, 0.14227273,
        0.15236364, 0.16245455, 0.17254545, 0.18263636, 0.19272727,
        0.20281818, 0.21290909, 0.223     , 0.23309091, 0.24318182,
        0.25327273, 0.26336364, 0.27345455, 0.28354545, 0.29363636,
        0.30372727, 0.31381818, 0.32390909, 0.334     , 0.34409091,
        0.35418182, 0.36427273, 0.37436364, 0.38445455, 0.39454545,
        0.40463636, 0.41472727, 0.42481818, 0.43490909, 0.445     ,
        0.45509091, 0.46518182, 0.47527273, 0.48536364, 0.49545455,
        0.50554545, 0.51563636, 0.52572727, 0.53581818, 0.54590909,
        0.556     , 0.56609091, 0.57618182, 0.58627273, 0.59636364,
        0.60645455, 0.61654545, 0.62663636, 0.63672727, 0.64681818,
        0.65690909, 0.667     , 0.67709091, 0.68718182, 0.69727273,
        0.70736364, 0.71745455, 0.72754

In [10]:
# Show the log-spaced grid next to its shape
b.shape, b

((100,),
 array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
        0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
        0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
        0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
        0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
        0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
        0.00811131, 0.00869749, 0.00932603, 0.01      , 0.01072267,
        0.01149757, 0.01232847, 0.01321941, 0.01417474, 0.01519911,
        0.01629751, 0.01747528, 0.01873817, 0.02009233, 0.02154435,
        0.0231013 , 0.02477076, 0.02656088, 0.02848036, 0.03053856,
        0.03274549, 0.03511192, 0.03764936, 0.04037017, 0.04328761,
        0.04641589, 0.04977024, 0.05336699, 0.05722368, 0.06135907,
        0.06579332, 0.07054802, 0.07564633, 0.08111308, 0.0869749 ,
        0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
        0.13219411, 0.14174742, 0.15199

### <font color='Brown'>We train a character-level language model (one token per character), rather than the sub-word tokens that models like ChatGPT use.</font>

# Loading Data and Analysing

In [2]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Corpus size, measured in characters
print(f'Length of text in characters {len(text)}')

Length of text in characters 1115394


In [4]:
# Peek at the first 200 characters of the corpus
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


# Vocabulary; All the unique characters in the text

In [5]:
# The vocabulary is the sorted collection of distinct characters in the corpus
vocab = sorted(set(text))
vocab_size = len(vocab)
print('Total uniques characters:', vocab_size)
print('All unique characters in the text:', ''.join(vocab))

Total uniques characters: 65
All unique characters in the text: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# Character to Integer encoding

In [6]:
# Lookup tables between each character and its integer id (its index in `vocab`)
stoi = {}
itos = {}
for index, char in enumerate(vocab):
    stoi[char] = index
    itos[index] = char


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


print("Vocab Size:", vocab_size)
print(encode('hi there'))
print(decode(encode('hi there')))

Vocab Size: 65
[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [7]:
# EXTRA EXPLANATION
#
#   encod_corp = one_hot(text, vocab_size)
#
# is what we could have done had we built the model on word tokens instead of
# single characters.
#
# In practice most LLMs, ChatGPT included, use sub-word encoding. Schematically,
# for the string "hii there":
#   - character level: [1,2,2,3,4,5,6,7,6]
#   - word level:      [1,2]
#   - sub-word level:  [1,2,3]  --> 1:hi, 2:i, 3:the
#
# It is a trade-off between the code-book (vocabulary) size and the length of
# the resulting integer sequences.

# OpenAI tiktoken for sub-word encoding
from tiktoken import get_encoding
enc = get_encoding('gpt2')
sample_ids = enc.encode('hii there')
# A character-level vocabulary here is only ~65 symbols; the sub-word vocabulary is far larger
print("Vocab Size:", enc.n_vocab)
# ...but the encoded sequence is only 3 ids long
print(sample_ids)
print(enc.decode(sample_ids))

Vocab Size: 50257
[71, 4178, 612]
hii there


In [8]:
# Encode the entire corpus and keep the ids in a 1-D torch tensor
import torch
data = torch.tensor(encode(text), dtype=torch.int64) # torch.long is the same 64-bit integer dtype
print(data.shape, data.dtype)
print(data[0:200])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59])


# Split into Train and Validation

In [9]:
# The first 90% of the encoded corpus is used for training, the remainder for validation
n = int(0.9 * data.shape[0])
train_data, val_data = data[:n], data[n:]

# Context

In [10]:
block_size = 9 # context_length
# A slice of block_size + 1 ids packs block_size training examples, because
# every prefix of the slice predicts the id that follows it:
#   - seeing 18, the model should get some idea that 47 comes next
#   - seeing 18 and 47, it should likely output 56
#   - and so on...
#
# Remember this is just one block; the whole dataset is fed in blocks like
# this, so 18 can end up being followed by many different characters. With more
# context (18, 47, 56, 57, ...) the model has a better chance of predicting 58.
#
# Suppose this slice were the only data the model was trained on. If it then
# sees 58 in a real application, there are two candidates for the next id,
# 1 or 47, and only more context settles it:
#   - ..., 57, 58  -> 1
#   - ..., 47, 58  -> 47
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

In [11]:
# EXTRAS: spell out every (context, target) pair contained in a single block
x = train_data[:block_size+1]
y = train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f'{t+1}. When the input is {context}, the Target is {target}')

1. When the input is tensor([18]), the Target is 47
2. When the input is tensor([18, 47]), the Target is 56
3. When the input is tensor([18, 47, 56]), the Target is 57
4. When the input is tensor([18, 47, 56, 57]), the Target is 58
5. When the input is tensor([18, 47, 56, 57, 58]), the Target is 1
6. When the input is tensor([18, 47, 56, 57, 58,  1]), the Target is 15
7. When the input is tensor([18, 47, 56, 57, 58,  1, 15]), the Target is 47
8. When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the Target is 58
9. When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]), the Target is 47


# Train on Batches

In [12]:
torch.manual_seed(1337) # fixed seed so the randomly sampled batch below is reproducible
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(batch_size, block_size, split='train'):
    """Sample a random batch of (input, target) windows.

    Reads the module-level `train_data` when split == 'train' and `val_data`
    for any other value.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where y holds the same
        windows as x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    windows = [
        (source[s:s + block_size], source[s + 1:s + block_size + 1])
        for s in starts
    ]
    x = torch.stack([inputs for inputs, _ in windows])
    y = torch.stack([targets for _, targets in windows])
    return x, y

xb, yb = get_batch(batch_size, block_size)
print('input:', xb.shape, '\n', xb)
print('target:', yb.shape, '\n', yb)
print('\n----\n')

# Each row of the batch is an independent sequence (batch dimension) and each
# position inside a row yields one training example (time dimension).
for inputs, targets in zip(xb, yb):
    for t in range(block_size):
        context = inputs[:t+1]
        target = targets[t]
        print(f'When the input is {context.tolist()}, the Target is {target}')

input: torch.Size([4, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target: torch.Size([4, 8]) 
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

----

When the input is [24], the Target is 43
When the input is [24, 43], the Target is 58
When the input is [24, 43, 58], the Target is 5
When the input is [24, 43, 58, 5], the Target is 57
When the input is [24, 43, 58, 5, 57], the Target is 1
When the input is [24, 43, 58, 5, 57, 1], the Target is 46
When the input is [24, 43, 58, 5, 57, 1, 46], the Target is 43
When the input is [24, 43, 58, 5, 57, 1, 46, 43], the Target is 39
When the input is [44], the Target is 53
When the input is [44, 53], the Target is 56
When the input is [44, 53, 56], the Target is 1
When the input is [44, 53, 56, 1],

# Make Model: Understanding forward() method

In [13]:
import torch.nn as nn
torch.manual_seed(1337) # fixed seed so the model built in this cell starts from reproducible weights

# Check out my Notebook on Embedding Layers: https://www.kaggle.com/code/lunaticsain/docs-embedding-layer-keras

class BigramLanguageModel(nn.Module):
    """Bigram language model: a lookup table mapping a token id to next-token logits.

    Args:
        vocab_size: number of distinct tokens. It is used both as the number
            of rows of the table and as the width of each row.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token reads the logits for the next token directly off a lookup table.
        # embedding_dim is set to vocab_size, so every token is represented by a
        # vocab_size-dimensional vector (a 65x65 table for this notebook's vocabulary).
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy against the flattened targets.
        """
        # Each id in idx (say 24) is looked up as a C-dimensional row of the table.
        logits = self.token_embedding_table(idx) # (B, T, C) => Batch(batch_size), Time(block_size), Channel(vocab_size)

        # The signature allows targets to be omitted, so there is nothing to score in that case.
        if targets is None:
            return logits, None

        # PyTorch cross_entropy expects logits of shape (N, C) and targets of shape (N,)
        B, T, C = logits.shape
        logits = logits.view(B*T, C) # fold batch and time into one dimension
        targets = targets.view(-1) # flatten to a single dimension
        loss = nn.functional.cross_entropy(logits, targets)

        return logits, loss

# Build the model and push the sampled batch through it once
m = BigramLanguageModel(vocab_size=vocab_size)
logits, loss = m(idx=xb, targets=yb) # calling the module runs forward()
print("Shape:", logits.shape)
print(logits[0:2])
print("Loss:", loss)

Shape: torch.Size([32, 65])
tensor([[-1.5101, -0.0948,  1.0927,  0.1505,  1.6347, -0.0518,  0.4996,  0.7216,
         -0.8968, -0.4122,  1.0030,  0.8508,  0.2178,  0.0328, -0.1699,  1.0659,
         -0.6177,  1.1824,  0.0214, -0.2154, -1.4623,  2.1707,  0.1624,  1.0296,
          0.4154,  0.6207,  0.2341, -0.0326,  1.0124,  1.5122, -0.3359,  0.2456,
          1.8682,  0.7536, -0.1177, -0.1967, -0.9552, -0.8995, -0.9583, -0.5945,
          0.1321, -0.5406,  0.1405, -0.7321,  1.1796,  1.3316, -0.2094,  0.0960,
          0.9040, -0.4032,  0.3027, -0.8034, -1.2537, -1.5195,  0.7446,  1.1914,
         -0.8061, -0.6290,  1.2447, -2.4400,  0.8408, -0.3993, -0.6126, -0.6597,
          0.7624],
        [ 0.3323, -0.0872, -0.7470, -0.6074,  0.3418,  0.5343,  0.3957, -0.4919,
         -0.0894, -1.3886,  1.2835, -0.3975,  2.0152,  1.6773, -0.3833,  1.5728,
          1.9458,  0.7247, -0.4834, -0.3263,  0.3193, -0.4198, -0.6435, -0.3311,
          0.7554, -1.2385,  0.4067,  0.9982, -0.6511,  1.2450,

In [14]:
# EXTRAS: Calculation of cross-entropy (it is the same as the averaged negative log-likelihood)
x = logits[0] # logits for the first time step, i.e. the row produced for xb[0, 0]
y = yb[0,0]
print('x:', x)
print('y:', y)
print('--------\n')

print('BUILT-IN:')
print('\tCross-Entropy:', nn.functional.cross_entropy(x, y), '\n')

print('MANUAL:')
## Step 1: softmax probability
probabilities = nn.functional.softmax(x, dim=-1)
print('\tprobabilities:', probabilities)
## Step 2: select the probability assigned to the target class
## (for a good prediction this entry should be high compared to all the others)
predicted_prob = probabilities[y]
print('\tpredicted_prob:', predicted_prob)
## Step 3: negative log-likelihood loss
log_prob = -np.log(predicted_prob.tolist())
## Step 4: averaging over the number of targets
## y is a 0-dim tensor here, so len(y) raises "TypeError: len() of a 0-d tensor";
## numel() returns 1 for a scalar target.
cross_entropy = log_prob / y.numel()
print('\tCross-Entropy:', cross_entropy)

x: tensor([-1.5101, -0.0948,  1.0927,  0.1505,  1.6347, -0.0518,  0.4996,  0.7216,
        -0.8968, -0.4122,  1.0030,  0.8508,  0.2178,  0.0328, -0.1699,  1.0659,
        -0.6177,  1.1824,  0.0214, -0.2154, -1.4623,  2.1707,  0.1624,  1.0296,
         0.4154,  0.6207,  0.2341, -0.0326,  1.0124,  1.5122, -0.3359,  0.2456,
         1.8682,  0.7536, -0.1177, -0.1967, -0.9552, -0.8995, -0.9583, -0.5945,
         0.1321, -0.5406,  0.1405, -0.7321,  1.1796,  1.3316, -0.2094,  0.0960,
         0.9040, -0.4032,  0.3027, -0.8034, -1.2537, -1.5195,  0.7446,  1.1914,
        -0.8061, -0.6290,  1.2447, -2.4400,  0.8408, -0.3993, -0.6126, -0.6597,
         0.7624], grad_fn=<SelectBackward0>)
y: tensor(43)
--------

BUILT-IN:
	Cross-Entropy: tensor(5.4015, grad_fn=<NllLossBackward0>) 

MANUAL:
	probabilities: tensor([0.0021, 0.0085, 0.0280, 0.0109, 0.0481, 0.0089, 0.0155, 0.0193, 0.0038,
        0.0062, 0.0256, 0.0220, 0.0117, 0.0097, 0.0079, 0.0272, 0.0051, 0.0306,
        0.0096, 0.0076, 0.0022, 0

# Make Model: Introduces generate() method

In [15]:
import torch.nn as nn
torch.manual_seed(1337) # fixed seed so the model weights and the sampled text in this cell are reproducible

# Check out my Notebook on Embedding Layers: https://www.kaggle.com/code/lunaticsain/docs-embedding-layer-keras

class BigramLanguageModel(nn.Module):
    """Bigram language model with sampling support.

    The only parameter is a (vocab_size, vocab_size) embedding table: the row
    selected by a token id is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of the table holds one logit per candidate next token
        # (65x65 for the character vocabulary used in this notebook).
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss) for a (B, T) tensor of token ids.

        Without targets, logits keeps its (B, T, C) shape and loss is None.
        With (B, T) targets, logits is flattened to (B*T, C) and loss is the
        cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx) # (B, T, C)
        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits and (N,) targets, so fold B and T together
        n_batch, n_time, n_chan = logits.shape
        logits = logits.view(n_batch * n_time, n_chan)
        loss = nn.functional.cross_entropy(logits, targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each row of the (B, T) context `idx` by max_new_tokens sampled ids.

        Returns a (B, T + max_new_tokens) tensor.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx) # forward(), logits is (B, T, C)
            # Only the final time step is used to predict the next token
            last_step = logits[:, -1, :] # (B, C)
            probs = last_step.softmax(dim=-1) # (B, C)
            # Draw one id per row at random according to those probabilities;
            # until the model is trained this produces essentially random text.
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # Grow the running sequence by the sampled column
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
# Start from a single '\n' character (id 0) and ask the untrained model for 100 more tokens
context = torch.zeros((1,1), dtype=torch.long)
generated = m.generate(context, max_new_tokens=100)
print(decode(generated[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


# Train Model: Optimize loss (Logits)

In [16]:
# AdamW optimizer over every parameter of the bigram model
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [17]:
batch_size = 32
block_size = 8
n_steps = 10000 # number of optimisation steps (one random batch per step)

for step in range(n_steps):
    # sample a batch of data
    xb, yb = get_batch(batch_size, block_size)

    # Evaluate the loss. The module is called directly instead of through
    # .forward() so that nn.Module.__call__ (and any registered hooks) runs.
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True) # clear the gradients left over from the previous step
    loss.backward() # backpropagation: compute the gradients of the loss w.r.t. the parameters
    optimizer.step() # update the parameters using those gradients

# loss of the final training batch
print(loss.item())

2.382369041442871


In [18]:
# Start again from a single '\n' character (id 0), now sampling 500 tokens from the trained model
context = torch.zeros((1,1), dtype=torch.long)
generated = m.generate(context, max_new_tokens=500)
print(decode(generated[0].tolist()))


lso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
BEY:! Indy; by s afreanoo adicererupa anse tecorro llaus a!
OLeneerithesinthengove fal amas trr
TI ar I t, mes, n IUSt my w, fredeeyove
THek' merer, dd
We ntem lud engitheso; cer ize helorowaginte the?
Thak orblyoruldvicee chot, p,
Bealivolde Th li


# Model

In [19]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000 # number of training iterations run by the loop at the bottom of this cell
learning_rate = 1e-2 # step size handed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # if you have gpu, run on it
eval_interval = 300 # estimate train/val loss every this many iterations
eval_iters = 200 # number of batches averaged per split in estimate_loss()
# ------------

torch.manual_seed(1337) # fixed seed for reproducible initialisation and sampling

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# every distinct character occurring in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# lookup tables: character -> integer id and integer id -> character
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join(itos[i] for i in l) # decoder: take a list of integers, output a string

# Train and validation splits: the first 90% is train, the rest is val
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of inputs x and targets y.

    Reads the module-level `train_data` when split == 'train' and `val_data`
    otherwise; window length and batch size come from the module-level
    `block_size` and `batch_size`.

    Returns:
        (x, y): two (batch_size, block_size) tensors moved to `device`, where
        y holds the same windows as x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts]).to(device)
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts]).to(device)
    return x, y

'''
    Disabling gradient calculation with @torch.no_grad() is useful for inference,
    because we never call loss.backward() afterwards to compute gradients.
    This reduces memory consumption, since the intermediate values that would otherwise have `requires_grad=True` are not stored.

    model.eval() is a kind of switch for some specific layers/parts of the model that 
    behave differently during training and inference (evaluating) time. 
    For example, Dropouts Layers, BatchNorm Layers etc., you need to turn them off 
    during model evaluation, and .eval() will do it for you.
    In addition, the common practice for evaluating/validation is using torch.no_grad() 
    in pair with model.eval() to turn off gradients computation.

    ref: https://stackoverflow.com/questions/60018578/what-does-model-eval-do-in-pytorch
'''
@torch.no_grad()
def estimate_loss():
    """Estimate the current train and val losses.

    Rather than judging the model on a single noisy batch, the loss is
    averaged over `eval_iters` freshly sampled batches per split. The
    module-level `model` is switched to evaluation mode for the measurement
    and back to training mode before returning.

    Returns:
        dict with keys 'train' and 'val', each a scalar tensor holding the mean loss.
    """
    model.eval() # Switch to Evaluation mode
    out = {}
    for split in ('train', 'val'):
        # one loss value per sampled batch
        batch_losses = [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
        out[split] = torch.tensor(batch_losses).mean()
    model.train() # Switch to Training mode
    return out

# super simple bigram model
class BigramLanguageModel(nn.Module):
    """Bigram model: the embedding row of a token is used as the logits of the next token."""

    def __init__(self, vocab_size):
        super().__init__()
        # lookup table from which each token reads off the next-token logits
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss); idx and targets are (B,T) tensors of integers.

        loss is None when no targets are supplied; otherwise logits comes back
        flattened to (B*T, C) together with the cross-entropy loss.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)
        loss = None
        if targets is not None:
            batch, time, channels = logits.shape
            logits = logits.view(batch * time, channels)
            loss = F.cross_entropy(logits, targets.view(batch * time))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append max_new_tokens sampled ids to every row of the (B, T) context idx."""
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # probabilities for the next token come from the last time step only
            probs = F.softmax(logits[:, -1, :], dim=-1) # (B, C)
            sampled = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, sampled), dim=1) # (B, T+1)
        return idx

# Build Model
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop. The counter is called `step` rather than `iter` so the
# builtin iter() is not shadowed in the notebook namespace.
for step in range(max_iters):
    # After every eval_interval steps, evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss. The module is called directly instead of through
    # .forward() so that nn.Module.__call__ (and any registered hooks) runs.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.7305, val loss 4.7241
step 300: train loss 2.8110, val loss 2.8249
step 600: train loss 2.5434, val loss 2.5682
step 900: train loss 2.4932, val loss 2.5088
step 1200: train loss 2.4863, val loss 2.5035
step 1500: train loss 2.4665, val loss 2.4921
step 1800: train loss 2.4683, val loss 2.4936
step 2100: train loss 2.4696, val loss 2.4846
step 2400: train loss 2.4638, val loss 2.4879
step 2700: train loss 2.4738, val loss 2.4911

MARI he avayokis erceller thour d, myono thishe me tord se by he me, Forder anen: at trselorinjulour t yoru thrd wo ththathy IUShe bavidelanoby man ond be jus as g e atot Meste hrle s, ppat t JLENCOLIUS:
Oppid tes d s o ged moer y pevehear soue maramapay fo t: bueyo malalyo!
Duir.
Fl ke it I t l o'ddre d ondu s?
cr, havetrathackes w.
PUpee meshancun, hrendspouthoulouren whel's'sesoread pe, s whure our heredinsethes; sedsend r lo pamit,
QUMIVIVIOfe m ne RDINid we tr ort; t:
MINENXI l dintandore r


# Mathematical Trick in Self-Attention

In [20]:
# Up to now the tokens do not talk to the tokens before them (their context),
# but we want each token to take its preceding tokens into account.
# The easiest way is a simple average over the previous tokens (bag of words).

# Version 1: using loop
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C) # stands in for a batch of embeddings
print(x[0])

xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        # average of every time step up to and including t
        xbow[b, t] = x[b, :t+1].mean(dim=0)
print(xbow[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [21]:
# The loop gives the result we want, but it is inefficient: it computes a
# separate mean from Python for each of the B*T positions.
# The same aggregation can be written as one matrix multiplication.
# This example shows how a matrix multiplication performs a "weighted aggregation".

# Aggregation (Sum) of previous values
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a=\n', a)
print('b=\n', b)
print('c=\n', c)
print('We can see, it is computing rolling sum')
print('------------')

# Aggregation (Average) of previous values
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True) # normalise so that every row of a sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)
print('a=\n', a)
print('b=\n', b)
print('c=\n', c)
print('We can see, it is computing rolling mean')

a=
 tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
b=
 tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
 tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])
We can see, it is computing rolling sum
------------
a=
 tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
 tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
 tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
We can see, it is computing rolling mean


In [22]:
# Version 2: using MatMul
# Lower-triangular mask: row t has ones only in columns <= t, so a position
# never receives anything from positions after it.
tril = torch.ones(T, T).tril()
wei = tril / tril.sum(dim=1, keepdim=True) # every row becomes a uniform average
xbow2 = wei @ x # (T, T) @ (B, T, C): wei is broadcast over the batch ==> (B, T, C)
print('isSame:', torch.allclose(xbow, xbow2))
print('a=\n', wei)
print('b=\n', x)
print('c=\n', xbow2)

isSame: True
a=
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
b=
 tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
       

In [23]:
# Version 3: using Softmax
tril = torch.ones(T, T).tril() # lower-triangular mask: row t has ones only in columns <= t
# The weights (interaction strength / affinity) start from zeros instead of ones;
# the masked-out entries are set to -inf so that softmax turns them into 0.
masked_scores = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(masked_scores, dim=-1)
xbow3 = wei @ x
print('isSame:', torch.allclose(xbow2, xbow3))

isSame: True


# Model - incl. Position Emb. and Linear Transformation

In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000 # NOTE(review): presumably the number of training iterations; the loop is not visible in this cell excerpt
learning_rate = 1e-2 # NOTE(review): presumably the optimizer step size; the optimizer is not visible in this cell excerpt
device = 'cuda' if torch.cuda.is_available() else 'cpu' # if you have gpu, run on it
eval_interval = 300 # NOTE(review): presumably how often the losses are estimated; confirm against the training loop
eval_iters = 200 # number of batches averaged per split in estimate_loss()
n_embd = 32 # width of the token and position embedding vectors
# ------------

torch.manual_seed(1337) # fixed seed for reproducible initialisation and sampling

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# the vocabulary: all distinct characters of the text, sorted
chars = sorted(set(text))
vocab_size = len(chars)
# two-way mapping between characters and integer ids
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join(itos[i] for i in l) # decoder: take a list of integers, output a string

# Train and validation splits: first 90% will be train, rest val
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Draw one random batch of inputs x and next-token targets y.

    Uses the module-level `train_data` for split == 'train' and `val_data`
    for anything else, with sizes taken from `batch_size` and `block_size`.

    Returns:
        (x, y): (batch_size, block_size) tensors on `device`; y is x shifted
        by one position.
    """
    source = train_data if split == 'train' else val_data
    offsets = torch.randint(len(source) - block_size, (batch_size,))
    inputs = [source[o:o + block_size] for o in offsets]
    targets = [source[o + 1:o + block_size + 1] for o in offsets]
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    """Return the mean loss over `eval_iters` random batches for each split.

    The model is not evaluated on every training iteration; this function is
    meant to be called periodically to measure what the module-level `model`
    has learned so far. The model is put in evaluation mode while measuring
    and returned to training mode afterwards.

    Returns:
        dict with keys 'train' and 'val', each a scalar tensor holding the mean loss.
    """
    model.eval() # Switch to Evaluation mode
    out = {}
    for split in ('train', 'val'):
        per_batch = []
        for _ in range(eval_iters):
            inputs, targets = get_batch(split)
            _logits, loss = model(inputs, targets)
            per_batch.append(loss.item())
        out[split] = torch.tensor(per_batch).mean()
    model.train() # Switch to Training mode
    return out

# super simple bigram model
class BigramLanguageModel(nn.Module):
    """Token + position embeddings followed by a linear language-model head.

    This is the inspection version of the model: `forward` prints the name,
    shape and first batch element of every intermediate tensor.

    Reads the module-level `vocab_size`, `n_embd` and `block_size`.
    """

    def __init__(self):
        super().__init__()
        # Each token is represented by an n_embd-dimensional embedding vector.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        # Positions 0..block_size-1 get their own learned n_embd-dimensional vector,
        # so the model encodes where a token sits as well as what it is.
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # y = x @ W.T + b with W of shape (vocab_size, n_embd) and b of shape (vocab_size,).
        # ref: https://stackoverflow.com/questions/54916135/what-is-the-class-definition-of-nn-linear-in-pytorch
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    @staticmethod
    def _show(label, tensor):
        """Print a label, the tensor's shape and its first element along dim 0."""
        print(label)
        print(tensor.shape)
        print(tensor[0])

    def forward(self, idx, targets=None):
        """Compute logits (and optionally the loss) for a batch of token ids.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy over all positions.
        """
        B, T = idx.shape

        tkn_emb = self.token_embedding_table(idx) # (B,T,C)
        self._show('tkn_emb', tkn_emb)

        # Use the actual sequence length T (not block_size) and the input's own
        # device, so contexts shorter than block_size work. The position rows
        # are identical for every sequence in the batch.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        self._show('pos_emb', pos_emb)

        # x now holds token identity plus position (pos_emb broadcasts over B).
        x = tkn_emb + pos_emb # (B,T,C)
        self._show('x', x)

        logits = self.lm_head(x) # (B,T,vocab_size)
        self._show('logits', logits)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively append `max_new_tokens` sampled tokens to `idx`.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) integer tensor.
        """
        for _ in range(max_new_tokens):
            # The position table only has block_size rows, so feed at most the
            # last block_size tokens of the running sequence.
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Build Model
model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Epochs
for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    # After every eval_interval iters, evaluate the loss on train and val sets
#     if iter % eval_interval == 0:
#         losses = estimate_loss()
#         print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    break

# # generate from the model
# context = torch.zeros((1, 1), dtype=torch.long, device=device)
# print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

tkn_emb
torch.Size([32, 8, 32])
tensor([[-1.7823,  0.1339, -2.0973,  1.9108,  1.6555,  2.0254,  0.6044, -0.7006,
          0.8141,  0.2263, -0.8224, -1.1513,  0.1186, -0.3123, -0.6024, -0.1058,
         -0.5325,  0.1415, -0.0339, -0.6461,  0.5560, -0.0698, -0.7516, -1.7028,
         -0.6811, -1.2044, -0.2007,  1.3154, -0.4974, -0.2338, -0.9047,  0.4135],
        [-0.6119, -0.4034,  0.3025,  0.6852, -1.0045, -1.0104, -1.0886,  1.3292,
          0.5912, -1.1082, -1.2869, -0.8170,  0.9682,  1.6030, -0.0726, -0.4725,
         -1.1616,  0.5962,  1.3058, -0.7422, -1.2529,  0.6750,  1.5664, -0.9238,
         -0.0956, -1.5452, -0.1801,  3.1838, -0.1277,  0.0910,  0.5422, -0.6110],
        [-0.6631, -0.2513,  1.0101,  0.1215,  0.1584,  1.1340, -1.1539, -0.2984,
         -0.5075, -0.9239,  0.5467, -1.4948, -1.2057,  0.5718, -0.5974, -0.6937,
          1.6455, -0.8030,  1.3514, -0.2759, -1.5108,  2.1048,  2.7630, -1.7465,
          1.4516, -1.5103,  0.8212, -0.2115,  0.7789,  1.5333,  1.6097, -0.

In [25]:
# EXTRAS: Explaining nn.Linear -- Linear Transformation
# A linear transformation is nothing but an ordinary hidden layer with weights
# and a bias; an activation function may be applied afterwards, or not.
x = torch.tensor([-1.0794,  1.6179, -1.9836,  1.1737, -0.0939,  3.6704, -0.5773, -1.3376,
                 -0.1591, -1.6883,  1.2444,  0.9919,  0.1451,  0.5203, -1.0878, -1.3108,
                 -1.2333,  2.2412,  0.0803, -0.7107,  1.1460,  0.5399, -1.4596, -2.0878,
                 -1.3853, -1.1428, -0.1356,  2.1132,  0.1673,  0.7807, -0.4232, -1.1781])

lt = nn.Linear(32, 65)
print(x.shape, 'x containing:\n', x, '\n')
# Randomly initialised parameters of the layer
print(lt.weight.shape, lt.weight, '\n')
print(lt.bias.shape, lt.bias)
print('--------')
# The layer's output and the hand-written y = x @ W.T + b should agree.
layer_out = lt(x)
manual_out = x @ lt.weight.T + lt.bias
print(f'{layer_out.shape} Linear Transformation:\n{layer_out}')
print(f'{manual_out.shape} Linear Transformation Manual:\n{manual_out}')

torch.Size([32]) x containing:
 tensor([-1.0794,  1.6179, -1.9836,  1.1737, -0.0939,  3.6704, -0.5773, -1.3376,
        -0.1591, -1.6883,  1.2444,  0.9919,  0.1451,  0.5203, -1.0878, -1.3108,
        -1.2333,  2.2412,  0.0803, -0.7107,  1.1460,  0.5399, -1.4596, -2.0878,
        -1.3853, -1.1428, -0.1356,  2.1132,  0.1673,  0.7807, -0.4232, -1.1781]) 

torch.Size([65, 32]) Parameter containing:
tensor([[ 0.1163, -0.1766,  0.0488,  ...,  0.1733, -0.1753, -0.0822],
        [ 0.0760,  0.0990, -0.0904,  ..., -0.1290, -0.1246, -0.0028],
        [-0.0257,  0.0605, -0.1345,  ..., -0.0785, -0.0084,  0.0847],
        ...,
        [-0.0027,  0.0686,  0.0374,  ...,  0.0458,  0.1309, -0.1700],
        [ 0.0370, -0.1052, -0.1530,  ...,  0.1096,  0.0774, -0.1308],
        [-0.0557, -0.0416, -0.0018,  ...,  0.0716, -0.1583, -0.0908]],
       requires_grad=True) 

torch.Size([65]) Parameter containing:
tensor([-0.0445,  0.0178,  0.0214, -0.0513, -0.1190, -0.0909,  0.1144, -0.0524,
         0.0760, -0.

# SELF ATTENTION

<img src='SelfAttentionHead.png' width="1000" height="1000">

In [26]:
# Version 4: SELF ATTENTION (Decoder Attention Block)
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # Batch, Time, Channel
x = torch.randn(B, T, C) # stands in for the token embeddings

# Instead of `wei` being uniform over the previous tokens (a constant affinity
# for all of them), we want self-attention: higher affinity with similar tokens.
#
# Every token at each position emits two vectors:
#   Query: what am I looking for?
#   Key:   what do I contain?
#
# The affinity between tokens is the dot product between queries and keys:
# a token's query is dotted with the keys of all the previous tokens, and the
# tokens whose keys match that query get a high affinity.

# A single head performing self-attention
head_size = 16 # hyper-parameter
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x) # each (B, T, head_size)

# Raw affinities: (B, T, head_size) @ (B, head_size, T) ==> (B, T, T)
wei = torch.matmul(q, k.transpose(-2, -1))

# Lower-triangular mask: a position may only attend to itself and to earlier
# positions, so the future cannot leak into the past.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
# Optional scaling, left disabled here (keeps the variance of wei in check so
# the softmax does not peak on the highest affinities):
# wei = wei * head_size**-0.5
wei = wei.softmax(dim=-1)

# Aggregate the value vectors (not the raw x) with the attention weights.
xbow4 = wei @ v # (B, T, head_size)
xbow4.shape

torch.Size([4, 8, 16])

In [27]:
# Attention weights of the first sequence in the batch; each row is one token's
# softmax-normalised affinity over itself and the earlier tokens (later ones are masked to 0).
wei[0] # Notice we don't have const. affinity for all previous tokens

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

**Inference: In the first sequence of the batch, the last token has this wei: [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391] <br>
We can see that the last (8th) token has a self-affinity of 0.2391, which is very close to its affinity of 0.2297 with the 4th token; this shows a high affinity between the 4th and 8th tokens.** 
<br><br>
**If we had uniform affinity instead, wei would be [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250], <br>
which would mean the 8th token has the same affinity with each previous token.**
<br><br>
**Remember this is just randomly initialized, it has to be optimized through backprop.**

In [28]:
# For the first sequence of the batch, show in turn: the raw token vectors,
# the attention-weighted mix of the raw vectors, and the attention-weighted
# mix of the value projections.
print(x[0])
print((wei @ x)[0])
print((wei @ v)[0])
# Worked example for the 2nd token, 1st channel of (wei @ x)[0], using the
# weights wei[0][1] = [0.1574, 0.8426, 0, ...] and the 1st channel of tokens 1 and 2:
# Self Attention: (0.1574 * 1.8077e-01) + (0.8426 * -6.6310e-01) = -0.5303
# Uniform affinity: (0.5 * 1.8077e-01) + (0.5 * -6.6310e-01) = -0.24

tensor([[ 1.8077e-01, -6.9988e-02, -3.5962e-01, -9.1520e-01,  6.2577e-01,
          2.5510e-02,  9.5451e-01,  6.4349e-02,  3.6115e-01,  1.1679e+00,
         -1.3499e+00, -5.1018e-01,  2.3596e-01, -2.3978e-01, -9.2111e-01,
          1.5433e+00,  1.3488e+00, -1.3964e-01,  2.8580e-01,  9.6512e-01,
         -2.0371e+00,  4.9314e-01,  1.4870e+00,  5.9103e-01,  1.2603e-01,
         -1.5627e+00, -1.1601e+00, -3.3484e-01,  4.4777e-01, -8.0164e-01,
          1.5236e+00,  2.5086e+00],
        [-6.6310e-01, -2.5128e-01,  1.0101e+00,  1.2155e-01,  1.5840e-01,
          1.1340e+00, -1.1539e+00, -2.9840e-01, -5.0754e-01, -9.2392e-01,
          5.4671e-01, -1.4948e+00, -1.2057e+00,  5.7182e-01, -5.9735e-01,
         -6.9368e-01,  1.6455e+00, -8.0299e-01,  1.3514e+00, -2.7592e-01,
         -1.5108e+00,  2.1048e+00,  2.7630e+00, -1.7465e+00,  1.4516e+00,
         -1.5103e+00,  8.2115e-01, -2.1153e-01,  7.7890e-01,  1.5333e+00,
          1.6097e+00, -4.0323e-01],
        [-8.3447e-01,  5.9780e-01, -5.14

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
<img src='Self_Attention.png' width="400" height="400">
- There is no notion of space (position). Attention simply acts over a set of vectors. This is why we need to positionally encode tokens (pos_emb).
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate (Useful in applications like Sentiment Analysis). This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries (x). In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additionally multiplies `wei` by 1/sqrt(head_size), i.e. divides it by sqrt(head_size). This makes it so that when the inputs Q, K have unit variance, wei will have unit variance too, and Softmax will stay diffuse and not saturate too much. Illustration below


# Model - w/ Self Attention

In [29]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32       # how many independent sequences are processed in parallel
block_size = 8        # maximum context length for predictions
max_iters = 5000      # number of optimisation steps
learning_rate = 1e-3  # self-attention can't tolerate a high learning rate
eval_interval = 500   # evaluate the loss every this many steps
eval_iters = 200      # batches averaged per loss estimate
n_embd = 32           # width of every embedding vector
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer the GPU when one is present
# ------------

torch.manual_seed(1337)

# Corpus: wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


# Train/validation split: the first 90% of the encoded corpus trains, the rest validates.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random mini-batch of (input, target) sequences.

    Args:
        split: 'train' draws from `train_data`; any other value draws from
            `val_data`.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size) moved to
        `device`. Each row of `y` is the matching row of `x` shifted one
        position forward in the source data.
    """
    source = val_data if split != 'train' else train_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # One window of block_size + 1 tokens per start offset: inputs are all but
    # the last token of the window, targets are all but the first.
    windows = [source[s:s + block_size + 1] for s in starts]
    x = torch.stack([w[:-1] for w in windows]).to(device)
    y = torch.stack([w[1:] for w in windows]).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    '''
        Instead of evaluating the model every iter, 
        we Evaluate every eval_interval based on what the model has been trained so far
    '''
    model.eval() # Switch to Evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # Switch to Training mode
    return out

# Self-Attention
class Head(nn.Module):
    """One head of causal self-attention.

    Projects each input vector (width `n_embd`) to key, query and value
    vectors of width `head_size`, scores every query against every key, masks
    out later positions and returns the attention-weighted sum of the values.
    """

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        # Bias-free projections, created in the order key, query, value.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones matrix; a buffer rather than a parameter
        # because it is not something to be optimised.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        """Attend over `x` of shape (B, T, C); returns (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # Scaled affinities: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        scale = self.head_size ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale
        # Positions above the diagonal lie in the future: give them -inf so
        # the softmax assigns them zero weight.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = F.softmax(scores, dim=-1)  # (B, T, T)

        # Weighted aggregation of the values: (B, T, T) @ (B, T, head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side on the same input."""

    def __init__ (self, num_heads, head_size):
        super().__init__()
        # ModuleList registers every head as a sub-module.
        attention_heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(attention_heads)

    def forward(self, x):
        """Run every head on `x` and concatenate the results on the last dim.

        The output's last dimension is num_heads * head_size, which equals
        n_embd when the heads are sized as n_embd // num_heads.
        """
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)
    
# super simple bigram model
class BigramLanguageModel(nn.Module):
    """Character language model: embeddings -> multi-head self-attention -> linear head.

    Reads the module-level `vocab_size`, `n_embd` and `block_size`.
    """

    def __init__(self):
        super().__init__()
        # Each token is represented by an n_embd-dimensional embedding vector.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        # Positions 0..block_size-1 get their own learned vector, so the model
        # encodes where a token sits as well as what it is.
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # Four heads of n_embd // 4 channels each, concatenated back to n_embd.
        # (A single Head(head_size=n_embd) was used before; head_size is a free hyper-parameter.)
        self.sa_heads = MultiHeadAttention(num_heads=4, head_size = n_embd//4)
        # Final projection to vocabulary logits: y = x @ W.T + b
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits (and optionally the loss) for a batch of token ids.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy over all positions.
        """
        B, T = idx.shape

        tkn_emb = self.token_embedding_table(idx) # (B,T,C)

        # Position ids are created on the same device as the input rather than
        # on the global `device`, so the model works wherever its inputs live.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)

        # x now holds token identity plus position (pos_emb broadcasts over B).
        x = tkn_emb + pos_emb # (B,T,C)

        # Tokens exchange information through self-attention.
        x = self.sa_heads(x) # (B,T,n_embd): the heads are concatenated

        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively append `max_new_tokens` sampled tokens to `idx`.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) integer tensor.
        """
        for _ in range(max_new_tokens):
            # The position table only has block_size rows, so feed at most the
            # last block_size tokens of the running sequence.
            idx_cond = idx[:, -block_size:] # (B, <=block_size)
            logits, loss = self(idx_cond)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Build Model
model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Epochs
for iter in range(max_iters):
    
    # After every eval_interval iters, evaluate the loss on train and val sets
    if (iter % eval_interval == 0) or (iter==max_iters-1):
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # train model
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.2248, val loss 4.2250
step 500: train loss 2.6663, val loss 2.6809
step 1000: train loss 2.5107, val loss 2.5189
step 1500: train loss 2.4394, val loss 2.4447
step 2000: train loss 2.3769, val loss 2.3890
step 2500: train loss 2.3459, val loss 2.3606
step 3000: train loss 2.3163, val loss 2.3361
step 3500: train loss 2.2867, val loss 2.3138
step 4000: train loss 2.2861, val loss 2.2796
step 4500: train loss 2.2692, val loss 2.2816
step 4999: train loss 2.2565, val loss 2.2798

And'ree aene of wis wit, wild les adery wheerel cromon sace, thincio aldanints wary coutu bus,
Whis not hrececthisurouse.

BHECLIUS:
Mrdtuio bell voenecen sou heant, me sa, ber peth past sor:
Andg wour my miven pord, thercave pipes seve onu.

Pughe adns wight isen't then, thee my teary, weird, gor sas the DoY: Gas Is.

WAnd, hriene thour and ting
Pow, your Can wit thatree.

That asthe medlect by sail whil the anl cundingaight,
Anered whe lan tat aparsted
Ang.

Fre sthe!
Set thr foro Euld fioo

# Model - adding Feed Forward Component

<img src='SelfAttentionModel.png' width="200" height="200">

In [30]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32       # how many independent sequences are processed in parallel
block_size = 8        # maximum context length for predictions
max_iters = 5000      # number of optimisation steps
learning_rate = 1e-3  # self-attention can't tolerate a high learning rate
eval_interval = 500   # evaluate the loss every this many steps
eval_iters = 200      # batches averaged per loss estimate
n_embd = 32           # width of every embedding vector
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer the GPU when one is present
# ------------

torch.manual_seed(1337)

# Corpus: wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


# Train/validation split: the first 90% of the encoded corpus trains, the rest validates.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random mini-batch of (input, target) sequences.

    Args:
        split: 'train' draws from `train_data`; any other value draws from
            `val_data`.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size) moved to
        `device`. Each row of `y` is the matching row of `x` shifted one
        position forward in the source data.
    """
    source = val_data if split != 'train' else train_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # One window of block_size + 1 tokens per start offset: inputs are all but
    # the last token of the window, targets are all but the first.
    windows = [source[s:s + block_size + 1] for s in starts]
    x = torch.stack([w[:-1] for w in windows]).to(device)
    y = torch.stack([w[1:] for w in windows]).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    '''
        Instead of evaluating the model every iter, 
        we Evaluate every eval_interval based on what the model has been trained so far
    '''
    model.eval() # Switch to Evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # Switch to Training mode
    return out

# Self-Attention
class Head(nn.Module):
    """One head of causal self-attention.

    Projects each input vector (width `n_embd`) to key, query and value
    vectors of width `head_size`, scores every query against every key, masks
    out later positions and returns the attention-weighted sum of the values.
    """

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        # Bias-free projections, created in the order key, query, value.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones matrix; a buffer rather than a parameter
        # because it is not something to be optimised.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        """Attend over `x` of shape (B, T, C); returns (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # Scaled affinities: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        scale = self.head_size ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale
        # Positions above the diagonal lie in the future: give them -inf so
        # the softmax assigns them zero weight.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = F.softmax(scores, dim=-1)  # (B, T, T)

        # Weighted aggregation of the values: (B, T, T) @ (B, T, head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side on the same input."""

    def __init__ (self, num_heads, head_size):
        super().__init__()
        # ModuleList registers every head as a sub-module.
        attention_heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(attention_heads)

    def forward(self, x):
        """Run every head on `x` and concatenate the results on the last dim.

        The output's last dimension is num_heads * head_size, which equals
        n_embd when the heads are sized as n_embd // num_heads.
        """
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)

class FeedForward(nn.Module):
    """Simple linear layer followed by a non-linearity.

    Previously the logits were computed directly after the self-attention
    component. This layer sits in between so that every token can
    independently "think" about the information it gathered by communicating
    with the other tokens (self-attention).
    """

    def __init__(self, n_embd):
        super().__init__()
        projection = nn.Linear(n_embd, n_embd)  # keeps the channel width
        activation = nn.ReLU()
        self.net = nn.Sequential(projection, activation)

    def forward(self, x):
        """Apply the linear layer and the ReLU to `x` (last dim n_embd)."""
        return self.net(x)
    
# super simple bigram model
class BigramLanguageModel(nn.Module):
    """Character language model: embeddings -> self-attention -> feed-forward -> linear head.

    Reads the module-level `vocab_size`, `n_embd` and `block_size`.
    """

    def __init__(self):
        super().__init__()
        # Each token is represented by an n_embd-dimensional embedding vector.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        # Positions 0..block_size-1 get their own learned vector, so the model
        # encodes where a token sits as well as what it is.
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # Self-attention: n_head heads of n_embd // n_head channels each,
        # concatenated back to n_embd.
        n_head = 4
        self.sa_heads = MultiHeadAttention(num_heads=n_head, head_size = n_embd//n_head)
        # Per-token feed-forward layer applied after the attention.
        self.ffwd = FeedForward(n_embd)
        # Final projection to vocabulary logits: y = x @ W.T + b
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits (and optionally the loss) for a batch of token ids.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy over all positions.
        """
        B, T = idx.shape

        tkn_emb = self.token_embedding_table(idx) # (B,T,C)

        # Position ids are created on the same device as the input rather than
        # on the global `device`, so the model works wherever its inputs live.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)

        # x now holds token identity plus position (pos_emb broadcasts over B).
        x = tkn_emb + pos_emb # (B,T,C)

        # Communication between tokens, then per-token computation.
        x = self.sa_heads(x) # (B,T,n_embd): the heads are concatenated
        x = self.ffwd(x) # (B,T,n_embd)

        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively append `max_new_tokens` sampled tokens to `idx`.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) integer tensor.
        """
        for _ in range(max_new_tokens):
            # The position table only has block_size rows, so feed at most the
            # last block_size tokens of the running sequence.
            idx_cond = idx[:, -block_size:] # (B, <=block_size)
            logits, loss = self(idx_cond)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Build Model
model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Epochs
for iter in range(max_iters):
    
    # After every eval_interval iters, evaluate the loss on train and val sets
    if (iter % eval_interval == 0) or (iter==max_iters-1):
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # train model
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.2022, val loss 4.2019
step 500: train loss 2.6144, val loss 2.6230
step 1000: train loss 2.4766, val loss 2.4768
step 1500: train loss 2.3985, val loss 2.3938
step 2000: train loss 2.3277, val loss 2.3451
step 2500: train loss 2.2955, val loss 2.3156
step 3000: train loss 2.2825, val loss 2.2921
step 3500: train loss 2.2455, val loss 2.2726
step 4000: train loss 2.2434, val loss 2.2458
step 4500: train loss 2.2293, val loss 2.2418
step 4999: train loss 2.2147, val loss 2.2501

I farr,
Hin of, somord'd yore hit tal det?

ghe
An ingouf hanto taw.

Ichit to learcys als ef sactr, hon cous wore hoin,
Mat En, magesrournou wilt with vill ceve sting; ho this, lim dot pruorve I he shu his helly mry hord the the am, fomy therfes arks a bistt he 'ild thes lovey; peavy bre crot mant.

FOOKINALANG 'nded nivarll.

EY: a tatsuttb my thy-f how play fory tyould to lenl time the sat of ditto it dearrat sate! itt out oumess's lairt, I no prande bourr,
Boon. Toth a thime lirt caigeeis

# Model - adding Transformer block Notion

In [31]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32       # how many independent sequences are processed in parallel
block_size = 8        # maximum context length for predictions
max_iters = 5000
learning_rate = 1e-3  # self-attention can't tolerate a high learning rate
eval_interval = 500
eval_iters = 200      # batches averaged per loss estimate
n_embd = 32           # width of every embedding vector
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer the GPU when one is present
# ------------

torch.manual_seed(1337)

# Corpus: wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


# Train/validation split: the first 90% of the encoded corpus trains, the rest validates.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random mini-batch of (input, target) sequences.

    Args:
        split: 'train' draws from `train_data`; any other value draws from
            `val_data`.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size) moved to
        `device`. Each row of `y` is the matching row of `x` shifted one
        position forward in the source data.
    """
    source = val_data if split != 'train' else train_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # One window of block_size + 1 tokens per start offset: inputs are all but
    # the last token of the window, targets are all but the first.
    windows = [source[s:s + block_size + 1] for s in starts]
    x = torch.stack([w[:-1] for w in windows]).to(device)
    y = torch.stack([w[1:] for w in windows]).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    '''
        Instead of evaluating the model every iter, 
        we Evaluate every eval_interval based on what the model has been trained so far
    '''
    model.eval() # Switch to Evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # Switch to Training mode
    return out

# Self-Attention
class Head(nn.Module):
    """A single head of causal (masked) self-attention of width `head_size`."""

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        # bias-free projections from the embedding space into key / query / value space
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; a buffer because it is not a parameter to be optimized
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        """Attend over x of shape (B, T, n_embd) and return a (B, T, head_size) tensor."""
        _, T, _ = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # scaled attention scores ("affinities"): (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        scores = (queries @ keys.transpose(-2, -1)) * self.head_size**-0.5
        # a position must not look at later positions: those scores become -inf before the softmax
        is_future = self.tril[:T, :T] == 0
        weights = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)  # (B, T, T)

        # weighted aggregation of the values: (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several self-attention heads applied in parallel to the same input."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = []
        for _ in range(num_heads):
            heads.append(Head(head_size))
        self.heads = nn.ModuleList(heads)

    def forward(self, x):
        """Run every head on x and concatenate their outputs along the last dimension."""
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)

class FeedForward(nn.Module):
    """A single linear layer followed by a ReLU non-linearity."""

    def __init__(self, n_embd):
        super().__init__()
        layers = [nn.Linear(n_embd, n_embd), nn.ReLU()]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the linear layer and the ReLU to x."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: communication (self-attention) followed by computation (feed-forward)."""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of attention heads
        super().__init__()
        # self-attention: every head gets an equal integer share of the embedding width
        self.sa_heads = MultiHeadAttention(n_head, n_embd // n_head)
        # feed-forward
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        """Pass x through the attention heads, then through the feed-forward layer."""
        communicated = self.sa_heads(x)
        return self.ffwd(communicated)
        
# Transformer-based character language model (the class keeps its original "Bigram" name)
class BigramLanguageModel(nn.Module):
    """Character-level language model: token + position embeddings -> Transformer blocks -> linear head.

    Layer sizes are read from the notebook-level `vocab_size`, `n_embd` and
    `block_size` at construction time.
    """

    def __init__(self):
        super().__init__()
        # Each token is represented by an n_embd-dimensional embedding vector.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        # We don't just want to encode the identity of a token, but also its position.
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # Stack of Transformer blocks
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4)
        )
        # Linear map from the embedding space to vocabulary logits: y = x @ W.T + b
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed the size of
                the position embedding table.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and loss is
            None. With targets, logits is flattened to (B*T, vocab_size) and loss is
            a scalar tensor.
        """
        B, T = idx.shape

        tkn_emb = self.token_embedding_table(idx)  # (B, T, C)

        # One embedding vector per position 0..T-1. The indices are created on the
        # input's own device instead of the notebook-global `device`, so the model
        # keeps working wherever it (and its input) has been moved.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        # x now holds not just the token identity but also its position
        x = tkn_emb + pos_emb  # (B, T, C)

        x = self.blocks(x)  # (B, T, C)

        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        # The usable context is bounded by the position table this model was built
        # with, not by whatever the global `block_size` happens to be now.
        context_len = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # crop idx to the last context_len tokens
            idx_cond = idx[:, -context_len:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Build the model and move its parameters to the selected device
model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop: every step is one optimizer update on one random batch (not a full epoch).
# The counter is named `step` so the built-in iter() is not shadowed in the kernel namespace.
for step in range(max_iters):

    # every eval_interval steps (and on the last step), evaluate the loss on train and val sets
    if (step % eval_interval == 0) or (step == max_iters - 1):
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # train model; call the module itself rather than .forward() so registered hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model, starting from a single token with id 0;
# sampling needs no gradients, so autograd is switched off
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.2116, val loss 4.2078
step 500: train loss 3.1195, val loss 3.1197
step 1000: train loss 2.7860, val loss 2.7735
step 1500: train loss 2.6227, val loss 2.6083
step 2000: train loss 2.5139, val loss 2.5352
step 2500: train loss 2.4782, val loss 2.4671
step 3000: train loss 2.4179, val loss 2.4335
step 3500: train loss 2.3973, val loss 2.4080
step 4000: train loss 2.3846, val loss 2.3628
step 4500: train loss 2.3617, val loss 2.3660
step 4999: train loss 2.3280, val loss 2.3389

Hewem opowteeend eth veam this wike-uu nry.

LOUTH:
Acacy.

Tkne Ellisginh ad wo
whe moe'n epou:
Astt on bno'r feod of pottor Hopyounl, athe of pver longee.
Andd too thou no in you go
ius ding therbalesous! dilts?

Chel, mes hreg cere!
Twhaert hert you hou, whowe bok you,
Afh het nous eave would grem.
RODRUEED:
Nes thour
Tonluce hou nhe, thes nhas ecetunpent rinl do anle rothy arod thas mavle comfus pibbe'g mripdcale wopuurye?

BLis ink sowcume.

Paman: do yor:
And it thaveoudt: the mas thing

**<span style="background-color: #ff9e9e">This performs worse than the previous model: as the network becomes deeper it gets harder to optimize, so it requires additional optimization techniques.</span>**

# Model - Skip / Residual connection (Optimization)

In [38]:
from IPython.display import HTML, display
# Render SelfAttentionModel.png and ResidualBlock.png side by side as two cells of a one-row HTML table.
display(HTML("<table><tr><td><img src='SelfAttentionModel.png'></td><td><img src='ResidualBlock.png'></td></tr></table>"))

- We basically have a gradient super-highway (during backpropagation) that goes directly from the supervision all the way to the input, unimpeded. <br>
- The gradient also forks off and flows through the residual blocks. <br>
- These residual blocks are initialized so that, at the beginning, they contribute very little (almost as if they were not there); during optimization they come online over time and start to contribute. <br>
- In backpropagation, addition distributes the gradient equally to both of its branches.

In [39]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# --- hyperparameters -------------------------------------------------------
batch_size = 32       # how many independent sequences are processed in parallel
block_size = 8        # maximum context length used for a prediction
max_iters = 5000      # number of optimisation steps
learning_rate = 1e-3  # self-attention can't tolerate a high learning rate
eval_interval = 500   # report the loss every this many steps
eval_iters = 200      # number of batches averaged per loss estimate
n_embd = 32           # embedding width
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on the GPU when one is available
# ---------------------------------------------------------------------------

torch.manual_seed(1337)

# Corpus: wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# Vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, output the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of character ids, output the string they spell."""
    return ''.join([itos[i] for i in l])


# Train / validation split: the first 90% of the ids train the model, the rest validate it
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    `split == 'train'` reads from `train_data`; any other value reads from `val_data`.
    Returns `(x, y)`, each of shape (batch_size, block_size) and moved to `device`;
    `y` holds the same windows as `x` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    # stack into (batch_size, block_size) tensors and send them to the device (cpu or gpu)
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

@torch.no_grad()
def estimate_loss():
    out = {}
    '''
        Instead of evaluating the model every iter, 
        we Evaluate every eval_interval based on what the model has been trained so far
    '''
    model.eval() # Switch to Evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # Switch to Training mode
    return out

# Self-Attention
class Head(nn.Module):
    """A single head of causal (masked) self-attention of width `head_size`."""

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        # bias-free projections from the embedding space into key / query / value space
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; a buffer because it is not a parameter to be optimized
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        """Attend over x of shape (B, T, n_embd) and return a (B, T, head_size) tensor."""
        _, T, _ = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # scaled attention scores ("affinities"): (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        scores = (queries @ keys.transpose(-2, -1)) * self.head_size**-0.5
        # a position must not look at later positions: those scores become -inf before the softmax
        is_future = self.tril[:T, :T] == 0
        weights = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)  # (B, T, T)

        # weighted aggregation of the values: (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel, followed by an output projection.

    Args:
        num_heads: number of attention heads.
        head_size: output width of each head.
    """

    def __init__ (self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That only equals n_embd when n_embd is divisible by
        # the head count, so the input size is derived from the heads instead of
        # being assumed to be n_embd.
        self.proj = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, x):
        """Concatenate every head's output along the last dimension and project it to width n_embd."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (..., num_heads * head_size)
        out = self.proj(out)  # linear transformation of the self-attention output -> (..., n_embd)
        return out

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, apply ReLU, project back to n_embd."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd  # inner layer is 4x wider, as suggested in the paper
        expand = nn.Linear(n_embd, hidden)
        contract = nn.Linear(hidden, n_embd)  # projection back to the embedding width, as in the paper
        self.net = nn.Sequential(expand, nn.ReLU(), contract)

    def forward(self, x):
        """Apply the expand -> ReLU -> contract stack to x."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block with skip connections: communication followed by computation."""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of attention heads
        super().__init__()
        # self-attention: every head gets an equal integer share of the embedding width
        self.sa_heads = MultiHeadAttention(n_head, n_embd // n_head)
        # feed-forward
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        """Add the attention output to x, then add the feed-forward output of that sum."""
        # each sub-layer forks off the main path, computes, and is added back (residual connection)
        attended = x + self.sa_heads(x)
        return attended + self.ffwd(attended)
        
# Transformer-based character language model (the class keeps its original "Bigram" name)
class BigramLanguageModel(nn.Module):
    """Character-level language model: token + position embeddings -> Transformer blocks -> linear head.

    Layer sizes are read from the notebook-level `vocab_size`, `n_embd` and
    `block_size` at construction time.
    """

    def __init__(self):
        super().__init__()
        # Each token is represented by an n_embd-dimensional embedding vector.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        # We don't just want to encode the identity of a token, but also its position.
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # Stack of Transformer blocks
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4)
        )
        # Linear map from the embedding space to vocabulary logits: y = x @ W.T + b
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed the size of
                the position embedding table.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and loss is
            None. With targets, logits is flattened to (B*T, vocab_size) and loss is
            a scalar tensor.
        """
        B, T = idx.shape

        tkn_emb = self.token_embedding_table(idx)  # (B, T, C)

        # One embedding vector per position 0..T-1. The indices are created on the
        # input's own device instead of the notebook-global `device`, so the model
        # keeps working wherever it (and its input) has been moved.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        # x now holds not just the token identity but also its position
        x = tkn_emb + pos_emb  # (B, T, C)

        x = self.blocks(x)  # (B, T, C)

        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        # The usable context is bounded by the position table this model was built
        # with, not by whatever the global `block_size` happens to be now.
        context_len = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # crop idx to the last context_len tokens
            idx_cond = idx[:, -context_len:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Build the model and move its parameters to the selected device
model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop: every step is one optimizer update on one random batch (not a full epoch).
# The counter is named `step` so the built-in iter() is not shadowed in the kernel namespace.
for step in range(max_iters):

    # every eval_interval steps (and on the last step), evaluate the loss on train and val sets
    if (step % eval_interval == 0) or (step == max_iters - 1):
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # train model; call the module itself rather than .forward() so registered hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model, starting from a single token with id 0;
# sampling needs no gradients, so autograd is switched off
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.6328, val loss 4.6313
step 500: train loss 2.3722, val loss 2.3678
step 1000: train loss 2.2583, val loss 2.2621
step 1500: train loss 2.1740, val loss 2.1979
step 2000: train loss 2.1323, val loss 2.1728
step 2500: train loss 2.0993, val loss 2.1470
step 3000: train loss 2.0636, val loss 2.1339
step 3500: train loss 2.0542, val loss 2.1109
step 4000: train loss 2.0187, val loss 2.1005
step 4500: train loss 1.9986, val loss 2.0940
step 4999: train loss 1.9891, val loss 2.0714

Time; pall'd that narighoph is sear at forss And thy kingpon my lord, but te stomey, his thou rolder bothis brighile's ther the and in this nep as but as stless, I this,
That bry prake with. Velast a fathereeemansshe dyo muthis for have that faker inight seet oben losed on of the lord helle thereace:
You sonchark.
Be it ry may somen my lives so mualss than negith a chall, oW he be ast youse?

NORWARD VIO:
In uppare Romeirn leam; and my dade uplebed Jous abse son laudow. doth the how good with

# Model - LayerNorm (Layer Normalization)

Layer Normalization is similar to Batch Normalization, but instead of normalizing each column (one feature across the whole batch), as BatchNorm does, LayerNorm normalizes each row (all the features of a single example).

In [99]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# --- hyperparameters -------------------------------------------------------
batch_size = 32       # how many independent sequences are processed in parallel
block_size = 8        # maximum context length used for a prediction
max_iters = 5000      # number of optimisation steps
learning_rate = 1e-3  # self-attention can't tolerate a high learning rate
eval_interval = 500   # report the loss every this many steps
eval_iters = 200      # number of batches averaged per loss estimate
n_embd = 32           # embedding width
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on the GPU when one is available
# ---------------------------------------------------------------------------

torch.manual_seed(1337)

# Corpus: wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# Vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, output the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of character ids, output the string they spell."""
    return ''.join([itos[i] for i in l])


# Train / validation split: the first 90% of the ids train the model, the rest validate it
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    `split == 'train'` reads from `train_data`; any other value reads from `val_data`.
    Returns `(x, y)`, each of shape (batch_size, block_size) and moved to `device`;
    `y` holds the same windows as `x` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    # stack into (batch_size, block_size) tensors and send them to the device (cpu or gpu)
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

@torch.no_grad()
def estimate_loss():
    out = {}
    '''
        Instead of evaluating the model every iter, 
        we Evaluate every eval_interval based on what the model has been trained so far
    '''
    model.eval() # Switch to Evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # Switch to Training mode
    return out

# Self-Attention
class Head(nn.Module):
    """A single head of causal (masked) self-attention of width `head_size`."""

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        # bias-free projections from the embedding space into key / query / value space
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; a buffer because it is not a parameter to be optimized
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        """Attend over x of shape (B, T, n_embd) and return a (B, T, head_size) tensor."""
        _, T, _ = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # scaled attention scores ("affinities"): (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        scores = (queries @ keys.transpose(-2, -1)) * self.head_size**-0.5
        # a position must not look at later positions: those scores become -inf before the softmax
        is_future = self.tril[:T, :T] == 0
        weights = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)  # (B, T, T)

        # weighted aggregation of the values: (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel, followed by an output projection.

    Args:
        num_heads: number of attention heads.
        head_size: output width of each head.
    """

    def __init__ (self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That only equals n_embd when n_embd is divisible by
        # the head count, so the input size is derived from the heads instead of
        # being assumed to be n_embd.
        self.proj = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, x):
        """Concatenate every head's output along the last dimension and project it to width n_embd."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (..., num_heads * head_size)
        out = self.proj(out)  # linear transformation of the self-attention output -> (..., n_embd)
        return out

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, apply ReLU, project back to n_embd."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd  # inner layer is 4x wider, as suggested in the paper
        expand = nn.Linear(n_embd, hidden)
        contract = nn.Linear(hidden, n_embd)  # projection back to the embedding width, as in the paper
        self.net = nn.Sequential(expand, nn.ReLU(), contract)

    def forward(self, x):
        """Apply the expand -> ReLU -> contract stack to x."""
        return self.net(x)

class Block(nn.Module):
    """Pre-norm Transformer block: communication followed by computation, each with a skip connection."""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of attention heads
        super().__init__()
        # self-attention: every head gets an equal integer share of the embedding width
        self.sa_heads = MultiHeadAttention(n_head, n_embd // n_head)
        # feed-forward
        self.ffwd = FeedForward(n_embd)
        # one LayerNorm in front of each sub-layer
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply normalized attention and normalized feed-forward, each added back onto its input."""
        # The original paper applies LayerNorm after each computation; over time it has
        # become more common to apply it before the computation, as done here.
        attended = x + self.sa_heads(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))
        
# Transformer-based character language model (the class keeps its original "Bigram" name)
class BigramLanguageModel(nn.Module):
    """Character-level language model: token + position embeddings -> Transformer blocks -> LayerNorm -> linear head.

    Layer sizes are read from the notebook-level `vocab_size`, `n_embd` and
    `block_size` at construction time.
    """

    def __init__(self):
        super().__init__()
        # Each token is represented by an n_embd-dimensional embedding vector.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        # We don't just want to encode the identity of a token, but also its position.
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # Stack of Transformer blocks, closed by a final LayerNorm
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            nn.LayerNorm(n_embd)
        )
        # Linear map from the embedding space to vocabulary logits: y = x @ W.T + b
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed the size of
                the position embedding table.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and loss is
            None. With targets, logits is flattened to (B*T, vocab_size) and loss is
            a scalar tensor.
        """
        B, T = idx.shape

        tkn_emb = self.token_embedding_table(idx)  # (B, T, C)

        # One embedding vector per position 0..T-1. The indices are created on the
        # input's own device instead of the notebook-global `device`, so the model
        # keeps working wherever it (and its input) has been moved.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        # x now holds not just the token identity but also its position
        x = tkn_emb + pos_emb  # (B, T, C)

        x = self.blocks(x)  # (B, T, C)

        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        # The usable context is bounded by the position table this model was built
        # with, not by whatever the global `block_size` happens to be now.
        context_len = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # crop idx to the last context_len tokens
            idx_cond = idx[:, -context_len:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Build the model and move its parameters to the selected device
model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop: every step is one optimizer update on one random batch (not a full epoch).
# The counter is named `step` so the built-in iter() is not shadowed in the kernel namespace.
for step in range(max_iters):

    # every eval_interval steps (and on the last step), evaluate the loss on train and val sets
    if (step % eval_interval == 0) or (step == max_iters - 1):
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # train model; call the module itself rather than .forward() so registered hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model, starting from a single token with id 0;
# sampling needs no gradients, so autograd is switched off
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.3103, val loss 4.3100
step 500: train loss 2.3804, val loss 2.3800
step 1000: train loss 2.2507, val loss 2.2554
step 1500: train loss 2.1563, val loss 2.1839
step 2000: train loss 2.1193, val loss 2.1598
step 2500: train loss 2.0711, val loss 2.1256
step 3000: train loss 2.0391, val loss 2.1201
step 3500: train loss 2.0331, val loss 2.1010
step 4000: train loss 2.0065, val loss 2.0912
step 4500: train loss 1.9882, val loss 2.0933
step 4999: train loss 1.9758, val loss 2.0636

Tito; palastour hen right have seay
Of fors sArvows ake meade?'dl as mastede stome cepplace, brother not in brighild'd thee the a lease; for me as but as wolls, brothis,
Thele'ty plake abur. I was thing thee.

QUORTUS:
Morderithem. And my her poft inight seet aben losed on frue aplook a lelp to races; heas batter.

GLORCOF MIONNENIUS:
I day light alls than neg-to ance kneave he i' all yopsed, and wall I conne anate Roweity leam;
And my dade upleble Jous able sinsed.

VARICKINTY:
Wowll'd it th

### EXTRAS: Understanding LayerNorm

In [41]:
''' Batch 1::
x:
 tensor([[-1.0940e+00,  9.2369e-01,  1.0306e+00, -6.9489e-01, -1.7699e+00,
          1.8532e+00, -4.6956e-01, -1.3914e+00, -2.6492e+00, -1.2716e+00,
          2.1448e+00,  2.7581e+00, -2.4380e-01,  5.7058e-01,  1.1910e+00,
         -2.0536e-01, -8.3706e-01,  8.1948e-01, -1.3165e+00, -1.3496e+00,
         -3.0246e-01,  1.1609e+00, -1.2607e+00, -1.7108e+00, -8.8963e-01,
         -2.5150e-01,  2.1512e-01,  3.2316e+00,  7.1009e-02,  2.8975e-01,
         -5.6196e-01,  3.4828e-01],
        [-1.7736e+00, -5.1685e-01,  5.8721e-01,  4.8704e-01,  2.6758e+00,
         -7.4817e-01, -1.1273e+00,  1.4494e+00,  1.4603e+00, -2.8269e+00,
         -5.2299e-01,  1.2928e+00,  3.9777e-01, -4.4793e-01, -1.1879e+00,
         -1.4327e+00, -5.6135e-01,  2.8670e-03, -7.5764e-01, -2.0226e+00,
         -1.7019e+00,  1.9411e+00,  1.1781e+00,  1.0802e+00,  1.5170e+00,
          6.2494e-01,  4.2556e-01, -1.2375e+00, -1.3487e+00,  2.9858e-01,
         -1.6523e+00,  2.2149e+00],
        [ 5.4068e-01, -2.0137e+00,  9.4452e-01, -8.7465e-01,  2.9296e-01,
          1.3235e-01,  1.1524e+00,  1.1950e+00,  1.1027e+00, -1.8403e+00,
         -8.4074e-02, -1.4029e+00, -4.0751e-01, -3.3725e-01,  2.7917e+00,
          1.3275e+00, -6.6403e-01, -2.4184e-01,  2.6448e+00, -2.3585e+00,
          2.6160e+00, -1.6893e+00,  1.9611e-02, -8.8555e-01, -9.6600e-01,
         -2.5286e-01,  1.4540e-01,  5.1787e-01,  8.1136e-01, -3.2851e-01,
          8.5210e-01,  3.5812e-01],
        [-2.8521e+00, -1.4999e+00, -1.9989e+00,  3.2833e+00, -2.3870e+00,
          3.4027e-01,  4.4615e-01,  1.1787e+00, -7.8588e-01, -9.6206e-01,
          1.3192e+00,  1.1422e+00,  1.0747e+00, -9.2530e-01,  2.7430e+00,
          1.6823e+00,  1.3534e+00, -3.7213e-01, -5.1179e-02,  2.2511e-01,
         -2.2516e+00, -9.4133e-01,  2.4152e-01,  1.6129e+00,  1.2362e+00,
         -1.5004e+00, -1.5125e+00, -5.1243e-01, -1.0272e-02,  1.2794e+00,
          7.9406e-01,  2.2734e+00],
        [-2.1345e-01,  2.0583e+00, -3.2669e-01, -1.0559e+00,  2.6381e-01,
          1.5843e+00,  2.6540e+00,  2.5563e-02,  8.5180e-01, -1.6690e+00,
          2.1492e+00, -8.1328e-01, -1.4390e+00,  2.1434e+00,  3.7608e-01,
         -1.2563e+00,  2.2049e+00, -1.1276e+00, -7.3530e-01, -2.3094e+00,
         -1.3528e-01, -4.7972e-01,  7.9737e-01, -1.8810e+00,  1.2237e+00,
          1.4250e+00, -3.5959e-01, -1.2622e+00,  1.9894e-02, -5.1607e-01,
          4.6683e-01,  1.6260e+00],
        [-2.3517e-01,  5.0012e-01,  2.8267e-01, -1.1373e+00, -1.3541e+00,
         -7.9941e-01,  1.7430e+00, -6.9294e-01, -1.4591e+00,  1.1422e+00,
          5.6525e-01,  4.8045e-02, -2.5048e+00,  1.5255e+00, -2.1560e+00,
         -6.5913e-01,  2.0463e-02,  3.9527e-01,  4.2689e+00,  1.7024e+00,
          1.2449e+00, -1.7773e+00, -2.4046e+00,  1.1256e+00,  7.2505e-01,
          4.5249e-01,  2.5114e-01, -6.3403e-02,  1.0856e-01, -1.0927e-01,
          7.2755e-01,  1.3204e+00],
        [ 2.6609e+00,  1.0810e-01,  2.7206e+00,  3.3549e+00, -5.1177e-01,
         -2.6488e-02,  1.8399e-01, -1.5500e+00, -1.6921e+00,  1.1674e-01,
         -4.6875e-01, -1.3300e+00, -5.5608e-01,  2.2961e+00, -3.1031e+00,
          4.7118e-01, -2.3055e-01, -2.8578e+00,  1.5767e+00,  1.1904e+00,
         -1.8580e+00,  2.8070e-01, -2.3369e+00, -1.6313e-01,  3.1788e-01,
         -1.7727e+00,  1.2914e+00, -1.2241e+00, -3.8053e-02, -9.2530e-02,
          4.8940e-01, -2.7709e-02],
        [-6.9476e-02, -5.6724e-01, -5.1432e-01,  5.4404e-01,  1.5037e-01,
          1.0765e-01,  3.2190e+00, -1.2193e+00, -9.2795e-02,  2.2568e-01,
         -2.0195e+00, -4.4993e-01,  3.8238e-01,  6.0903e-02, -1.6166e+00,
          4.4886e-01,  2.2783e+00,  1.7588e-01, -1.0842e-01,  8.6834e-01,
         -7.3116e-01, -7.6861e-01,  1.4200e+00,  8.9089e-01, -5.7830e-01,
         -1.6016e+00, -3.0439e+00, -9.3725e-01,  1.1119e-01, -4.9917e-01,
          1.0994e+00,  2.7446e+00]])
x After LayerNorm:
 tensor([[-0.7762,  0.7267,  0.8064, -0.4789, -1.2797,  1.4191, -0.3111, -0.9977,
         -1.9347, -0.9085,  1.6363,  2.0932, -0.1429,  0.4637,  0.9258, -0.1143,
         -0.5848,  0.6491, -0.9420, -0.9666, -0.1866,  0.9034, -0.9004, -1.2357,
         -0.6240, -0.1487,  0.1989,  2.4459,  0.0916,  0.2545, -0.3799,  0.2981],
        [-1.2551, -0.3293,  0.4840,  0.4102,  2.0225, -0.4997, -0.7790,  1.1191,
          1.1271, -2.0310, -0.3339,  1.0038,  0.3444, -0.2786, -0.8237, -1.0040,
         -0.3621,  0.0535, -0.5067, -1.4385, -1.2023,  1.4813,  0.9192,  0.8471,
          1.1689,  0.5118,  0.3649, -0.8602, -0.9421,  0.2714, -1.1658,  1.6830],
        [ 0.3501, -1.6649,  0.6687, -0.7663,  0.1547,  0.0280,  0.8327,  0.8663,
          0.7935, -1.5281, -0.1427, -1.1831, -0.3978, -0.3424,  2.1259,  0.9708,
         -0.6002, -0.2671,  2.0100, -1.9369,  1.9873, -1.4090, -0.0609, -0.7749,
         -0.8384, -0.2758,  0.0383,  0.3321,  0.5637, -0.3355,  0.5958,  0.2061],
        [-1.9574, -1.0652, -1.3944,  2.0909, -1.6506,  0.1490,  0.2189,  0.7022,
         -0.5941, -0.7103,  0.7949,  0.6781,  0.6336, -0.6861,  1.7344,  1.0345,
          0.8175, -0.3211, -0.1093,  0.0730, -1.5612, -0.6966,  0.0838,  0.9887,
          0.7401, -1.0655, -1.0735, -0.4136, -0.0823,  0.7686,  0.4484,  1.4246],
        [-0.2614,  1.4477, -0.3466, -0.8952,  0.0976,  1.0910,  1.8958, -0.0816,
          0.5400, -1.3565,  1.5160, -0.7127, -1.1835,  1.5117,  0.1821, -1.0460,
          1.5579, -0.9492, -0.6540, -1.8383, -0.2026, -0.4618,  0.4990, -1.5160,
          0.8197,  0.9712, -0.3714, -1.0505, -0.0859, -0.4891,  0.2503,  1.1224],
        [-0.2336,  0.2989,  0.1414, -0.8871, -1.0441, -0.6423,  1.1992, -0.5652,
         -1.1202,  0.7640,  0.3461, -0.0285, -1.8776,  1.0416, -1.6249, -0.5407,
         -0.0485,  0.2230,  3.0287,  1.1698,  0.8384, -1.3506, -1.8050,  0.7520,
          0.4619,  0.2644,  0.1186, -0.1092,  0.0153, -0.1425,  0.4637,  0.8931],
        [ 1.7794,  0.1263,  1.8181,  2.2288, -0.2751,  0.0391,  0.1754, -0.9475,
         -1.0395,  0.1319, -0.2473, -0.8050, -0.3038,  1.5432, -1.9532,  0.3614,
         -0.0930, -1.7944,  1.0773,  0.8272, -1.1469,  0.2381, -1.4571, -0.0494,
          0.2621, -1.0917,  0.8925, -0.7364,  0.0316, -0.0036,  0.3732,  0.0383],
        [-0.0523, -0.4426, -0.4011,  0.4288,  0.1201,  0.0866,  2.5263, -0.9539,
         -0.0706,  0.1792, -1.5814, -0.3506,  0.3020,  0.0500, -1.2654,  0.3542,
          1.7887,  0.1401, -0.0828,  0.6831, -0.5711, -0.6005,  1.1157,  0.7008,
         -0.4513, -1.2537, -2.3846, -0.7327,  0.0894, -0.3892,  0.8643,  2.1543]])
'''

'\nx:\n tensor([[[-1.0940,  0.9237,  1.0306,  ...,  0.2897, -0.5620,  0.3483],\n         [-1.7736, -0.5168,  0.5872,  ...,  0.2986, -1.6523,  2.2149],\n         [ 0.5407, -2.0137,  0.9445,  ..., -0.3285,  0.8521,  0.3581],\n         ...,\n         [-0.2352,  0.5001,  0.2827,  ..., -0.1093,  0.7275,  1.3204],\n         [ 2.6609,  0.1081,  2.7206,  ..., -0.0925,  0.4894, -0.0277],\n         [-0.0695, -0.5672, -0.5143,  ..., -0.4992,  1.0994,  2.7446]],\n         ....\n        ])\n         \nLayerNorm:\n tensor([[[-0.7762,  0.7267,  0.8064,  ...,  0.2545, -0.3799,  0.2981],\n         [-1.2551, -0.3293,  0.4840,  ...,  0.2714, -1.1658,  1.6830],\n         [ 0.3501, -1.6649,  0.6687,  ..., -0.3355,  0.5958,  0.2061],\n         ...,\n         [-0.2336,  0.2989,  0.1414,  ..., -0.1425,  0.4637,  0.8931],\n         [ 1.7794,  0.1263,  1.8181,  ..., -0.0036,  0.3732,  0.0383],\n         [-0.0523, -0.4426, -0.4011,  ..., -0.3892,  0.8643,  2.1543]],\n         ...\n        ])\n'

In [49]:
# First row of the "x After LayerNorm" printout above, re-entered by hand so that
# its statistics can be checked independently with NumPy.
x_batch1_token1_embd = [
    -0.7762,  0.7267,  0.8064, -0.4789, -1.2797,  1.4191, -0.3111, -0.9977,
    -1.9347, -0.9085,  1.6363,  2.0932, -0.1429,  0.4637,  0.9258, -0.1143,
    -0.5848,  0.6491, -0.9420, -0.9666, -0.1866,  0.9034, -0.9004, -1.2357,
    -0.6240, -0.1487,  0.1989,  2.4459,  0.0916,  0.2545, -0.3799,  0.2981,
]

# Population variance and mean of the 32 values, rounded to two decimals.
print('LayerNorm Variantion:', np.var(x_batch1_token1_embd).round(2))
print('LayeNorm Mean:', np.mean(x_batch1_token1_embd).round(2))

LayerNorm Variantion: 1.0
LayeNorm Mean: 0.0


In [96]:
# BatchNorm v LayerNorm
class layerNorm():
    """Hand-rolled layer normalisation, instrumented for a BatchNorm-vs-LayerNorm demo.

    Each row of a 2-D input is shifted and scaled to zero mean / unit variance using
    statistics taken along dim=1, then passed through the affine transform
    ``gamma * xhat + beta``. ``gamma`` and ``beta`` are plain tensors (ones / zeros).

    Args:
        dim: size passed to ``torch.ones`` / ``torch.zeros`` for gamma and beta.
        eps: constant added to the variance before the square root.
    """

    def __init__(self, dim, eps=1e-5):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalise ``x`` along dim=1, printing both batch-wise and row-wise statistics.

        Statistics along dim=0 (each feature across all inputs) are what BatchNorm
        would use; statistics along dim=1 (all features of each individual input)
        are what LayerNorm uses. Only the dim=1 statistics are applied here.

        The result is also stored on ``self.out`` and returned.
        """
        x_mean = x.mean(dim=1, keepdim=True)
        # NOTE: Tensor.var defaults to the unbiased (Bessel-corrected) estimator, whereas
        # torch.nn.LayerNorm uses the biased one; results differ slightly for small dims.
        x_var = x.var(dim=1, keepdim=True)

        # Demo output: column-wise (BatchNorm-style) vs row-wise (LayerNorm-style) statistics.
        print("BatchNorm:", x.mean(dim=0, keepdim=True), '\n', x.var(dim=0, keepdim=True))
        print('-------')
        print("LayerNorm:", x_mean, '\n', x_var)
        print(x_var)

        xhat = (x - x_mean) / torch.sqrt(x_var + self.eps)  # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        """Return the affine parameters as the tuple ``(gamma, beta)``."""
        return (self.gamma, self.beta)

    # The method was originally spelled `parneters` (and raised NameError because its
    # receiver was named `sett`); keep the old spelling as a working alias.
    parneters = parameters
                
# Demo: run the hand-rolled layerNorm on a seeded random batch.
torch.manual_seed(1337)  # seed the RNG before drawing the batch
x = torch.randn(32, 100) # batch of 32 vectors, each 100-dimensional
print(f'x: {x.shape}\n', x, '\n\n')
module = layerNorm(100)
# NOTE: `x` is rebound to the normalised output here; cells that read `x` afterwards
# see the normalised values, not the raw random batch.
x = module(x)

x: torch.Size([32, 100])
 tensor([[ 0.1808, -0.0700, -0.3596,  ..., -1.3651, -0.1655,  0.9623],
        [ 0.0315, -0.7419, -0.2978,  ..., -0.6150, -0.4589,  0.5675],
        [ 0.0183, -1.6608,  1.1169,  ..., -0.9001,  0.6614,  0.5118],
        ...,
        [-1.6462, -1.6728, -0.7227,  ..., -0.9647,  0.1162, -0.8295],
        [-0.2266,  0.0219, -0.2785,  ..., -0.9094, -1.3062, -0.7847],
        [ 0.1956, -0.2808, -0.5215,  ..., -1.6868, -0.9292, -1.2395]]) 


BatchNorm: tensor([[ 0.1392, -0.5709, -0.3944,  0.0442, -0.1492,  0.0062, -0.2630,  0.4211,
          0.1554,  0.1999,  0.2484, -0.2488, -0.0572, -0.0869,  0.1383,  0.2357,
          0.2467,  0.1547, -0.2061, -0.3572,  0.2785,  0.2085, -0.2316, -0.0536,
          0.1361, -0.1408,  0.1386, -0.2578,  0.1453,  0.0186,  0.0714, -0.2406,
         -0.0831,  0.1026,  0.1838,  0.0831, -0.0111,  0.3121,  0.1191, -0.2944,
          0.0761,  0.1658,  0.0705,  0.1351,  0.0082, -0.2185, -0.0326, -0.0873,
          0.1913, -0.0268, -0.1935, -0.4

In [98]:
# Compare column-wise vs row-wise statistics of `x`
# (presumably still the layerNorm output left by the cell above — confirm execution order).
# In the recorded output only the row has mean ~0 and std ~1.
print("BatchNorm:", x[:,0].mean(), x[:,0].std()) # mean, std of feature 0 taken across all inputs (one column)
print("LayerNorm:", x[0,:].mean(), x[0,:].std()) # mean, std of input 0 taken across its features (one row)

BatchNorm: tensor(0.1469) tensor(0.8803)
LayerNorm: tensor(-9.5367e-09) tensor(1.0000)


# Final Model

In [106]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
# -- hardware --
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
# -- batching --
batch_size = 64   # number of independent sequences processed in parallel
block_size = 256  # maximum context length used for a prediction
# -- optimisation / evaluation schedule --
max_iters = 5000
learning_rate = 3e-4  # kept small: self-attention can't tolerate a high lr
eval_interval = 500   # evaluate every this many training iterations
eval_iters = 200      # batches averaged per split at each evaluation
# -- model size --
n_embd = 384   # embedding width
n_head = 6     # attention heads per block (384 // 6 = 64 dims per head)
n_layer = 6    # number of Transformer blocks
dropout = 0.2
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# lookup tables between characters and their integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: map a list of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)


# Train and test splits: the first 90% of the encoded corpus trains, the rest validates
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of (input, target) sequences from one split.

    `split == 'train'` reads from `train_data`; any other value reads from `val_data`.
    `batch_size` start offsets are drawn at random; each input is the `block_size`
    tokens starting at an offset and each target is the same window shifted one
    token to the right. Both stacked tensors are moved to `device` before returning.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    """Return the mean loss over `eval_iters` random batches for each split.

    Rather than reporting the noisy loss of a single training batch, the model as
    trained so far is scored on fresh batches from both 'train' and 'val'. The model
    is put in evaluation mode for the measurement and switched back to training mode
    before returning. The result is a dict mapping split name to a scalar tensor.
    """
    model.eval()  # evaluation mode while measuring
    averaged = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            per_batch[step] = batch_loss.item()
        averaged[split] = per_batch.mean()
    model.train()  # back to training mode
    return averaged

# Self-Attention
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Projects the input to keys, queries and values of width `head_size`, scores every
    query against every key, hides future positions with a lower-triangular mask,
    and returns the softmax-weighted sum of the values.
    """

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size

        # Bias-free projections from the embedding width to the head width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Registered as a buffer: it is part of the module but not a parameter to optimise.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, T, _ = x.shape

        queries = self.query(x)  # (B, T, head_size)
        keys = self.key(x)       # (B, T, head_size)
        values = self.value(x)   # (B, T, head_size)

        # Scaled dot-product affinities: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        scores = (queries @ keys.transpose(-2, -1)) * self.head_size**-0.5

        # Positions where the mask is 0 (the future) get -inf, so softmax assigns them weight 0.
        hidden = self.tril[:T, :T] == 0
        scores = scores.masked_fill(hidden, float('-inf'))

        weights = F.softmax(scores, dim=-1)  # (B, T, T)
        weights = self.dropout(weights)

        # Weighted aggregation of the values: (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several `Head`s of self-attention run in parallel, concatenated and projected.

    Args:
        num_heads: number of attention heads.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That equals n_embd only when n_embd is divisible by
        # the head count, so size the input explicitly instead of assuming it.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected multi-head attention output, shape (B, T, n_embd)."""
        # Concatenate along the channel dimension: (B, T, num_heads * head_size)
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)  # linear transformation of the self-attention output
        out = self.dropout(out)
        return out

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, apply ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd  # the inner layer is 4x wider, as suggested in the paper
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),  # projection back to the embedding width
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x`."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: communication (self-attention) followed by computation (MLP).

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets `n_embd // n_head` channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa_heads = MultiHeadAttention(n_head, n_embd // n_head)  # self-attention
        self.ffwd = FeedForward(n_embd)                               # feed-forward
        self.ln1 = nn.LayerNorm(n_embd)  # applied before attention
        self.ln2 = nn.LayerNorm(n_embd)  # applied before the feed-forward net

    def forward(self, x):
        """Run both residual sub-layers and return a tensor of the same shape as `x`."""
        # Pre-norm variant: LayerNorm is applied before each sub-layer. The original
        # paper applied it after; applying it before has become the more common choice.
        attended = self.sa_heads(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed
        
# Transformer language model (the class name says "Bigram", but the body is a full Transformer)
class BigramLanguageModel(nn.Module):
    """Character-level Transformer language model.

    Token and position embeddings are summed, passed through `n_layer` Transformer
    `Block`s and a final LayerNorm, then projected to vocabulary logits. The class
    name is left unchanged so that existing references to it (for example the saved
    model loaded later in this notebook) keep resolving.
    """

    def __init__(self):
        super().__init__()
        # Each token is represented by an n_embd-dimensional embedding vector.
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd)
        # Encode not just the identity of a token but also its position in the context.
        self.position_embedding_table = nn.Embedding(num_embeddings=block_size, embedding_dim=n_embd)
        # Stack of Transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # Final layer norm
        self.ln_f = nn.LayerNorm(n_embd)
        # Linear map from embeddings to vocabulary logits: y = x @ W.T + b
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits (and optionally the loss) for a batch of token ids.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed `block_size`,
                the number of rows in the position embedding table.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, logits is (B, T, vocab_size) and loss
            is None. With targets, logits is reshaped to (B*T, vocab_size) and loss
            is the cross-entropy between those logits and the flattened targets.
        """
        B, T = idx.shape

        tkn_emb = self.token_embedding_table(idx)  # (B, T, C)

        # One embedding per position 0..T-1. Build the indices on the same device as
        # the input so the model works wherever it currently lives, not only on the
        # notebook-global `device`.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        # x now holds both the token identity and its position
        x = tkn_emb + pos_emb  # (B, T, C)

        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)    # final layer norm

        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets, so flatten B and T.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens and append them to `idx`.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.

        Sampling runs in evaluation mode (dropout disabled) and without gradient
        tracking; the module's previous train/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # The position table only has block_size rows, so crop the
                    # context to the last block_size tokens.
                    idx_cond = idx[:, -block_size:]  # (B, <= block_size)
                    logits, _ = self(idx_cond)
                    # Only the last time step predicts the next token.
                    logits = logits[:, -1, :]  # (B, C)
                    probs = F.softmax(logits, dim=-1)  # (B, C)
                    idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

# Build Model
model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training iterations (one random batch per step, not full passes over the data).
# The counter is named `step` so that it does not shadow the builtin `iter()` for
# every later cell in the notebook.
for step in range(max_iters):

    # After every eval_interval steps (and on the last one), evaluate the loss on train and val sets
    if (step % eval_interval == 0) or (step == max_iters - 1):
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # train model: call the module itself rather than .forward() so that any
    # registered hooks run as well
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# NOTE(review): a later cell loads 'BigramModel.pth', but nothing in this cell saves
# the trained model — confirm where that file is produced.

# generate from the model, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

KeyboardInterrupt: 

In [None]:
''' Model Output when ran on Google Colab with GPU (took ~ 45min)
    step 0: train loss 4.2846, val loss 4.2820
    step 500: train loss 1.8865, val loss 2.0023
    step 1000: train loss 1.5361, val loss 1.7221
    step 1500: train loss 1.3948, val loss 1.6038
    step 2000: train loss 1.3077, val loss 1.5490
    step 2500: train loss 1.2523, val loss 1.5153
    step 3000: train loss 1.2010, val loss 1.4894
    step 3500: train loss 1.1587, val loss 1.4800
    step 4000: train loss 1.1222, val loss 1.4800
    step 4500: train loss 1.0853, val loss 1.4736
    step 4999: train loss 1.0494, val loss 1.4913

    But with price of a breast sast-creature.
    Of whom, Cariolanus: of God! what's Romeo?

    Third Consciden:
    Mistress, let's proceed. Go to me, go: I say.

    CAMILLO:
    By my lord, I'll braw a light:
    I have bore alone death.

    SLY:
    If you would I wish vengeance me when I was off
    these advancementary; this to fled till
    I clear thee join till ar outrain, thou art to me;
    And, not many hath punishmen.

    SLY:
    They Gentleman, I have deliver'd by thus leave
    He known I dissevel to you!

    Lord:
    Lords, am I though for
'''

# Load the PyTorch Model

In [108]:
# Saved model file, resolved relative to the current working directory.
PATH = 'BigramModel.pth'
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary code —
# only load files from a trusted source. The repr below shows that the file holds a
# whole module object, so the model classes must already be defined in this kernel.
# NOTE(review): recent PyTorch releases may default to weights_only=True, which
# rejects whole-module pickles — confirm against the installed version.
model = torch.load(PATH, map_location='cpu')  # remap all tensors onto the CPU
model

BigramLanguageModel(
  (token_embedding_table): Embedding(65, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (1): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (2): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=Fals

In [109]:
# Count the parameters of the model that was just loaded (`model`), rather than the
# unrelated alias `m` left behind by the training cell.
print("Total Number of Parameters:", sum(p.numel() for p in model.parameters()))

Total Number of Parameters: 10788929


In [112]:
# generate from the model
# The model was loaded onto the CPU, while the context below is created on the
# global `device` (which is 'cuda' whenever a GPU is available). Move the model to
# the same device first so the two can never disagree; this is a no-op when the
# model is already there.
model.to(device)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))


Lords:
Have I trust'd through them not; for they do thee, shall
She lie of those wildo: the of your lady foot.

JULIET:
I would excuse lords; proof, shall be hoft by arm:
'Zoundsire I am his power and lean
Uncalamal.' O God, and blow hath burnt I came!

LADY ANNA:
Here, Thou is, the pity of thee! a black not mock
That noble daggers, not was mercy uncles;
But guicide where I pray to his present,
Wore hearted of and downfastest of his fells;
The house servens o' the father field,
See Keep, their E
