**Table of Contents**  

- [Imports](#$imports)    
- [Load Data](#load-data)    
- [BiGram Model to Generate Text](#bigram-model)    
- [Mathematical Trick in Self-Attention](#math-trick-self-attention)    
- [Self-Attention Ad-Hoc Implementation](#self-attention-adhoc)    
- [Scaling Weights for Unit Variance](#scaling-weights-unit-variance)    
- [Bigram Model with Single Self-Attention Head](#bigram-single-head)    
- [Bigram Model with Multiple Self-Attention Heads](#bigram-multi-head)    
- [Bigram Model with Multiple Self-Attention Blocks](#bigram-multi-block)    


## <a id="imports"></a> Imports  

In [None]:
import torch  # we use PyTorch: https://pytorch.org
import torch.nn as nn
from torch.nn import functional as F

## <a id="load-data"></a> Load Data  

In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# read it in to inspect it
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)

# create a mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [
    stoi[c] for c in s
]  # encoder: take a string, output a list of integers
decode = lambda l: "".join(
    [itos[i] for i in l]
)  # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

# let's now encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(
    data[:1000]
)  # the 1000 characters we looked at earier will to the GPT look like this


# Let's now split up the data into train and validation sets
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]


block_size = 8
train_data[: block_size + 1]


torch.manual_seed(1337)
block_size = 8
train_data[: block_size + 1]
x = train_data[:block_size]
y = train_data[1 : block_size + 1]
for t in range(block_size):
    context = x[: t + 1]
    target = y[t]
    print(f"when input is {context} the target: {target}")


torch.manual_seed(1337)
batch_size = 4  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?


def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("----")

for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context = xb[b, : t + 1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58
inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 

## <a id="bigram-model"></a> BiGram Model to Generate Text  


In [None]:
class BiGram(nn.Module):

    def __init__(self):
        super(BiGram, self).__init__()
        self.emb_layer = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        # x - > (B,T)
        logits = self.emb_layer(x)  # B, T, C
        B, T, C = logits.shape
        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx B, T
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            # take the last one
            logits = logits[:, -1, :]  # B,C
            # -1 here in the time direction means that only
            # previous word is considered.
            probs = F.softmax(logits, -1)
            new_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, new_idx], axis=1)
        return idx

In [None]:
bigram = BiGram()
logits, loss = bigram(xb, yb)

print(logits.shape, loss)  # check logits shape and loss
print(
    decode(
        bigram.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[
            0
        ].tolist()
    )
)

torch.Size([32, 65]) tensor(5.0364, grad_fn=<NllLossBackward0>)

lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(bigram.parameters(), lr=1e-3)

batch_size = 32
for steps in range(100):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch("train")
    # evaluate the loss
    logits, loss = bigram(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


print(
    decode(
        bigram.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[
            0
        ].tolist()
    )
)

  from .autonotebook import tqdm as notebook_tqdm


4.545020580291748

tcQDHEXsLEocK;tgQ;y?q Ew$D-MkpJuerOn$vSmct'mOdwdw-Thoek$AqS:UCRQPauiu XTus!kRuBHeA-'uOm'N
tp'SrP&!NgRjK3m'mn: mOYxHci&'Wdgersq-E-:Zty3bBMf,w:V.m$osh$kot:LTziacyd
RuibLE?KXyMq$k
LLKD-qWy&h.Su'opx'NxVTyFCdNhpZL-x:ZLaT ym&o&IYBfzydo?MQgFZiNAUg;QA.XajBE$LwU!nBf;kGMcZPw fBjop.EVLcBzg;cPf;.qqAEmy,r 
sZhV-suIRugljoS.xvPXjGtfH
w:otygZQNWLCoiopLwkZEhOVe:ci,NQJ.F
kZcQa?qIJKBYgvhw HEzyE-JWtIJOJKEnL z,-sfINT VI'rKlXhvsgwXjz-J:FT'3pYxxmjgVyNU.O?eSfb.jg!!dlrENRFKzjtzakbezn$PgGUKXr!WwkZz?TgSbBhwGW,iximRENU Bf$


## <a id="math-trick-self-attention"></a> Mathematical Trick in Self-Attention  

using lower triangular matrix for weighted aggregated.
lower triangular matrix is used for concept of not seeing future token when predicting a toke at time step "t"
weights are learned to give different weights for different tokens that appear till the time t-1 when predicting for "t"

3 methods for doing causal multiplication are demonstrated below
1. straightfoward matrix mul
2. lower triangular matrix
3. softmax + triangular matrix

In [None]:
# method-1 straightforward matric multiplication
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print("a=")
print(a)
print("--")
print("b=")
print(b)
print("--")
print("c=")
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# method-2: lower triangular matrix

# consider the following toy example:
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, : t + 1]  # (t,C)
        xbow[b, t] = torch.mean(xprev, 0)

In [None]:
# use the method of weighted aggregation
# B,T,C finally we want to get B,T,C with weighted average
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)

In [26]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [27]:
xbow2 = wei @ x

In [28]:
torch.allclose(xbow, xbow2)

True

In [None]:
# method-3
# Using the softmax
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

## <a id="self-attention-adhoc"></a> Self-Attention Ad-Hoc Implementation  


Self attention
for each tokens there is
1. Query
2. Key
3. Value

Query and key interact with each other and finally value is a representaion of token added at the end.
When query and key match their activation is high.

In higher level
query - what I want
key - what I have
value - a represenation of token with small dense layer


In [None]:
# version 4 : implement self attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# let's see a single Head perform self-attention
head_size = 16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # B, T, head_size
q = query(x)  # B, T, head_size
v = value(x)  # B, T, head_size


wei = q @ k.transpose(-2, -1)  # B, T, T
# each time step interaction with other time step

trill = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(trill == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
# this makes sure the causal connection in wei


# finally take the causal wei and matmul with v
output = wei @ v
print(output.shape)

torch.Size([4, 8, 16])



Notes:

1. Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
1. There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
1. Each example across batch dimension is of course processed completely independently and never "talk" to each other
1. In an "encoder" attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
1. "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
1. "Scaled" attention additional divides wei by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

## <a id="scaling-weights-unit-variance"></a> Scaling Weights for Unit Variance  

In [None]:
# without scaling
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1)

In [None]:
k.var(), q.var(), wei.var()

(tensor(1.0449), tensor(1.0700), tensor(17.4690))

In [None]:
# with scaled variance
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [None]:
k.var(), q.var(), wei.var()

(tensor(0.9006), tensor(1.0037), tensor(0.9957))

## <a id="bigram-single-head"></a> Bigram Model with Single Self-Attention Head  


In [None]:
class Head(nn.Module):
    def __init__(self, input_size, head_size):
        super(Head, self).__init__()
        self.key = nn.Linear(input_size, head_size, bias=False)
        self.query = nn.Linear(input_size, head_size, bias=False)
        self.value = nn.Linear(input_size, head_size, bias=False)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # B, T, head_size
        q = self.query(x)  # B, T, head_size
        wei = q @ k.transpose(-2, -1)  # B, T, T
        trill = torch.tril(torch.ones(T, T))
        wei = wei.masked_fill(trill == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)  # B, T, T

        v = self.value(x)  # B, T, head_size
        output = wei @ v  # B, T, head_size
        return output

In [None]:
class BiGramWithSelfAttention(nn.Module):

    def __init__(self):
        super(BiGramWithSelfAttention, self).__init__()
        emb_size = 16
        self.token_embed_table = nn.Embedding(vocab_size, emb_size)
        self.position_embed_table = nn.Embedding(block_size, emb_size)
        self.sa_head = Head(emb_size, emb_size)
        self.lm_head = nn.Linear(emb_size, vocab_size)

    def forward(self, x, targets=None):
        # x - > (B,T)
        B, T = x.shape
        token_embeddings = self.token_embed_table(x)  # B, T, emb_size
        positional_emebddings = self.position_embed_table(
            torch.arange(T)
        )  # B,T, emb_size
        logits = (
            token_embeddings + positional_emebddings
        )  # B,T, emb_size #positional expanded and added
        logits = self.sa_head(logits)
        logits = self.lm_head(logits)  # B, T, vocab_size
        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, -1)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx B, T
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            # take the last one
            logits = logits[:, -1, :]  # B,C
            probs = F.softmax(logits, -1)
            new_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, new_idx], axis=1)
        return idx

In [75]:
m = BiGramWithSelfAttention()
logits, loss = m(xb, yb)

In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(5000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch("train")
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[
            0
        ].tolist()
    )
)

2.4724979400634766

INERTOLUF:
D cot
Heth hens ithit thyennod qut hat.

lfmilsd co I ugo hres wy he utr nse an adt mothico athalnson pall yoven ICELOLU:
G ce th!
Wavilleire sir
Bor ko lerd Whers movee.

ciulth in:
A thouingin mper he yillatw sseef tes Pak com, yod?
:
MIml
Whe.
MI:
Sy me
Cofto T'se yy m'ghatthe himowour mis I soat om, lans,
I o fee cor, bud
Yed!
C st p fes kas ro Ror sablomee vesyullir tretopoteut then ct ronlt hem.

Y hokr bee jeecowis igple spo;
Liconen hekeand.

O:
MENNus'th thth per orowouns, I 


## <a id="bigram-multi-head"></a> Bigram Model with Multiple Self-Attention Heads  


- multiple heads of self attention running in parallel
- All of them can be run parallely.
- concat all outputs in channel dimension finally.
- Communication channel is self attention, each communicate channel can learn something different
- This is something like grouped convolution. (This is multi head)

In [None]:
class MultiSelfAttentionHeads(nn.Module):
    def __init__(self, input_size, head_size, num_heads):
        super(MultiSelfAttentionHeads, self).__init__()
        self.heads = nn.ModuleList(
            [Head(input_size, head_size // num_heads) for i in range(num_heads)]
        )

    def forward(self, x):
        # ModuleList can act as an iterable, or be indexed using ints
        outputs = [head(x) for head in self.heads]
        outputs = torch.concat(outputs, dim=-1)
        return outputs

In [None]:
class BiGramWithMultiHeadSelfAttention(nn.Module):

    def __init__(self):
        super(BiGramWithMultiHeadSelfAttention, self).__init__()
        emb_size = 16
        self.token_embed_table = nn.Embedding(vocab_size, emb_size)
        self.position_embed_table = nn.Embedding(block_size, emb_size)
        self.sa_head = MultiSelfAttentionHeads(emb_size, emb_size, num_heads=4)
        self.lm_head = nn.Linear(emb_size, vocab_size)

    def forward(self, x, targets=None):
        # x - > (B,T)
        B, T = x.shape
        token_embeddings = self.token_embed_table(x)  # B, T, emb_size
        positional_emebddings = self.position_embed_table(torch.arange(T))  # B,T
        logits = (
            token_embeddings + positional_emebddings
        )  # B,T, emb_size #positional expanded and added
        logits = self.sa_head(logits)
        logits = self.lm_head(logits)  # B, T, vocab_size
        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, -1)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx B, T
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            # take the last one
            logits = logits[:, -1, :]  # B,C
            probs = F.softmax(logits, -1)
            new_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, new_idx], axis=1)
        return idx

In [21]:
m = BiGramWithMultiHeadSelfAttention()
logits, loss = m(xb, yb)

In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(5000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch("train")
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[
            0
        ].tolist()
    )
)

2.363037109375

HI HAMICAMofur beol lamasiche weat
B:
Ye haist hesthord my madulve in laqyomin ceerour:
San;
D of holthe. nuse
Cemad-nve ipsttesth sud I dy shell-Fbt?

I,
LDUCDod AONESOLEFIEDFO:
Te
CyA IDut Poto che,
Why leilgdecoos macourenas, of thelelie I my I sreg menl ost bamyne withent moold to oond sotr,
Wo towigs

Mulit.
R'-zan thabeow por puid the.
oo: I he pefce ang Xan of gow, pulere, the your sowes, tim Yon the nohe wirse for so it, bou his sorgeand noolt, whee as wir woos;
Ang.

TAng, I theto avip 


## <a id="bigram-multi-block"></a> Bigram Model with Multiple Self-Attention Blocks  

Block in this case are MultiHeadAttention + feed-feedforward networks

In [None]:
class Block(nn.Module):
    def __init__(self, input_size, head_size, num_heads):
        super(Block, self).__init__()
        self.heads = nn.ModuleList(
            [Head(input_size, head_size // num_heads) for i in range(num_heads)]
        )
        self.ffn = nn.Linear(head_size, head_size)

    def forward(self, x):
        # ModuleList can act as an iterable, or be indexed using ints
        outputs = [head(x) for head in self.heads]
        outputs = torch.concat(outputs, dim=-1)
        outputs = self.ffn(outputs)
        return outputs

In [None]:
class BiGramWithMultiHeadSelfAttentionSingleBlock(nn.Module):

    def __init__(self):
        super(BiGramWithMultiHeadSelfAttentionSingleBlock, self).__init__()
        emb_size = 16
        self.token_embed_table = nn.Embedding(vocab_size, emb_size)
        self.position_embed_table = nn.Embedding(block_size, emb_size)
        self.blocks = nn.Sequential(
            Block(emb_size, emb_size, num_heads=4),
            Block(emb_size, emb_size, num_heads=4),
            Block(emb_size, emb_size, num_heads=4),
            Block(emb_size, emb_size, num_heads=4),
        )
        self.lm_head = nn.Linear(emb_size, vocab_size)

    def forward(self, x, targets=None):
        # x - > (B,T)
        B, T = x.shape
        token_embeddings = self.token_embed_table(x)  # B, T, emb_size
        positional_emebddings = self.position_embed_table(torch.arange(T))  # B,T
        logits = (
            token_embeddings + positional_emebddings
        )  # B,T, emb_size #positional expanded and added
        logits = self.blocks(logits)
        logits = self.lm_head(logits)  # B, T, vocab_size
        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, -1)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        B, T = idx.shape
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            # take the last one
            logits = logits[:, -1, :]  # B,C
            probs = F.softmax(logits, -1)
            new_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, new_idx], axis=1)
        return idx

In [None]:
m = BiGramWithMultiHeadSelfAttentionSingleBlock()
logits, loss = m(xb, yb)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(5000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch("train")
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[
            0
        ].tolist()
    )
)

2.624037981033325

TPyi:
QFannh gins kraur av yanr sren fans ocer linn
We tesbimser sglr tou fror ginve myexs
Tonmr is mese nid fel tefoelnl, ta,n alr lisen moorulm the menm yhhigs, sto'e an silngec mhit:;
S;
ECI ULWINSRS CALNGBPYCEH:
A
Thhy broem ceotue;
Thed toe, meve itro ture a yp tet el ivero ie thet nopeec, whast sy, tee moe aosf caa wef okannrec, ny he fo fihbasde lad bikrvha. Npleud;
Be sugi cora shi gtoo:
Thee seho rhos ge, seuy ho,e hond tit prhe chw af, annte toum lilnnk cisiy tou tis mng ti farilliuo t


```txt
problems with this
this NN got bigger, has problems with backpro
two ideas which help a lot with this big networks 
1. Residual connections
2. layer normalization

In [None]:
# layernorm ELI5

a = torch.randn(10, 5)
ln = nn.LayerNorm(5)
a_normed = ln(a)
a_normed.mean(1), a_normed.std(1)

for p in ln.parameters():
    print(p.shape, p)

torch.Size([5]) Parameter containing:
tensor([1., 1., 1., 1., 1.], requires_grad=True)
torch.Size([5]) Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)



## <a id="bigram-modified-multi-block"></a> Multi Self Attention Block with layernorm and skip connections


In [None]:
# BlockModified = Block + layernorm + skip connection(residual connection)


class BlockModified(nn.Module):
    def __init__(self, input_size, head_size, num_heads):
        super(BlockModified, self).__init__()
        self.heads = nn.ModuleList(
            [Head(input_size, head_size // num_heads) for i in range(num_heads)]
        )
        self.ffn = nn.Linear(head_size, head_size)
        self.ln1 = nn.LayerNorm(head_size)
        self.ln2 = nn.LayerNorm(head_size)

    def forward(self, x):
        # ModuleList can act as an iterable, or be indexed using ints
        outputs = [head(x) for head in self.heads]
        outputs = torch.concat(outputs, dim=-1) + x
        outputs = self.ln1(outputs)  # layernorm-1
        outputs = self.ffn(outputs) + x  # skip connection
        outputs = self.ln2(outputs)  # layernorm-2
        return outputs

In [None]:
class BiGramWithModifiedBlock(nn.Module):

    def __init__(self):
        super(BiGramWithModifiedBlock, self).__init__()
        emb_size = 16
        self.token_embed_table = nn.Embedding(vocab_size, emb_size)
        self.position_embed_table = nn.Embedding(block_size, emb_size)
        self.blocks = nn.Sequential(
            BlockModified(emb_size, emb_size, num_heads=4),
            BlockModified(emb_size, emb_size, num_heads=4),
            BlockModified(emb_size, emb_size, num_heads=4),
            BlockModified(emb_size, emb_size, num_heads=4),
        )
        self.lm_head = nn.Linear(emb_size, vocab_size)

    def forward(self, x, targets=None):
        # x - > (B,T)
        B, T = x.shape
        token_embeddings = self.token_embed_table(x)  # B, T, emb_size
        positional_emebddings = self.position_embed_table(
            torch.arange(T)
        )  # B,T, emb_size
        logits = (
            token_embeddings + positional_emebddings
        )  # B,T, emb_size # no change in dimensions
        logits = self.blocks(logits)  # B, T, emb_size
        # thing to observer - size is same after applying multiheadattention blocks
        # how to apply layer norm here? - better to apply internally?
        logits = self.lm_head(logits)  # B, T, vocab_size
        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, -1)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        B, T = idx.shape
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            # take the last one
            logits = logits[:, -1, :]  # B,C
            probs = F.softmax(logits, -1)
            new_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, new_idx], axis=1)
        return idx

In [None]:
m = BiGramWithModifiedBlock()
logits, loss = m(xb, yb)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(5000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch("train")
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[
            0
        ].tolist()
    )
)

2.361734628677368


Su I't pint That barver, thed thep, gody! omly mat o poul so mpy thir wam they:
And that end.


K':
Fallrcewe for ther thamude: man.

I ing
Thas thag.

Who gote fonase meme,':
I I shel thcae bon!

RLOLUCR:
Thid rathertuefrt koy, theay ust'd and thad.

BBEFINDIMGOHEOHENRS:
Ighad beved ent not ant;
Ina monoust e nof twond gor woa nome hin ind hape then to nave coir ist cart so reave
And has'm urme o,
An tha thou' so suf,
Ere thin
I ing, Vocy my ance tint atth! tis's; ands hing dep: ast tUExs bre 
