### A model intended to generate text in the likeness of its input text file; hence the 'echo' in 'echoGPT'


In [None]:
import torch

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-03-07 18:05:57--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-03-07 18:05:58 (157 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# Read the whole corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Corpus size in characters (each character becomes one token below).
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [None]:
# Peek at the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [None]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# Lookup tables between characters and their integer ids (position in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

In [None]:
def encode(s):
    """Encoder: take a string, output a list of integer token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer token ids, output a string."""
    return ''.join(itos[i] for i in l)


# Round-trip check: decoding an encoded string gives the string back.
sample = "hello world!$"
print(encode(sample))
print(decode(encode(sample)))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42, 2, 3]
hello world!$


In [None]:
# Encode the full corpus as one 1-D tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# print(data[:1000])

torch.Size([1115394]) torch.int64


In [None]:
# Contiguous split (no shuffling): the validation set is the tail of the text.
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [None]:
# A chunk of block_size + 1 tokens provides block_size (context, target) pairs,
# as the next cell spells out.
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x = train_data[:block_size]
y = train_data[1:block_size+1]
# Every prefix of x is a training example whose label is the token that follows it.
for t, target in enumerate(y):
    context = x[:t + 1]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [None]:
# Fix the RNG so the sampled batch below is reproducible.
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A pair ``(x, y)`` of shape (batch_size, block_size); ``y`` is ``x`` shifted
        one position to the right in the underlying token stream.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the upper bound keeps the shifted
    # target window inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # Gather all windows at once: row b holds positions starts[b] .. starts[b]+block_size-1.
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    return source[positions], source[positions + 1]

# Draw one training batch and show the raw input/target tensors.
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
print('----')

# Each (b, t) position is its own training example: the context is row b up to
# and including t, and the target is the token that follows it.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed so the model initialisation and the sample below are reproducible.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram character model: the next-token logits depend only on the current token.

    The whole model is one (vocab_size x vocab_size) embedding table whose row
    ``i`` holds the logits for the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a (B, T) batch of token ids.

        Without ``targets`` the logits are (B, T, vocab_size) and the loss is
        ``None``. With ``targets`` (also (B, T)) the logits come back flattened
        to (B*T, vocab_size) together with the mean cross-entropy loss.
        """
        logits = self.token_embedding_table(idx)
        if targets is None:
            return logits, None

        # cross_entropy wants the class dimension second, so fold batch and time together.
        batch, steps, channels = logits.shape
        logits = logits.view(batch * steps, channels)
        loss = F.cross_entropy(logits, targets.view(batch * steps))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to each row of ``idx`` (B, T)."""
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # Only the prediction at the last time step is needed.
            last_step = logits[:, -1, :]
            probs = F.softmax(last_step, dim=-1)
            sampled = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, sampled), dim=1)
        return idx

# Smoke test on the batch sampled earlier. With targets supplied, the logits come
# back flattened to (B*T, vocab_size).
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 characters from the untrained model, starting from token id 0.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [None]:
# Create a PyTorch optimizer (AdamW) over the bigram model's parameters.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
# Rebinds the notebook-level batch_size that get_batch reads.
batch_size = 32
for steps in range(10000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear old gradients, backpropagate, then take one optimizer step
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only (a noisy single-batch value, not an average).
print(loss.item())


2.5727508068084717


In [None]:
# Sample 500 characters from the trained bigram model, starting from token id 0.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThinqunt.

LaZAnde.
athave l.
KEONH:
ARThanco be y,-hedarwnoddy scace, tridesar, wnl'shenous s ls, theresseys
PlorseelapinghiybHen yof GLUCEN t l-t E:
I hisgothers je are!-e!
QLYotouciullle'z


In [None]:
# Lower-triangular matrix of ones: row i has ones in columns 0..i.
a = torch.tril(torch.ones(3, 3))
a

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [None]:
# Row sums, kept as a column vector so they can divide `a` row by row.
torch.sum(a, 1, keepdim=True)

tensor([[1.],
        [2.],
        [3.]])

In [None]:
# Normalise every row to sum to 1, so row i becomes a uniform average over 0..i.
a = a / torch.sum(a, 1, keepdim=True)
a

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [None]:
# Random 3x2 matrix of integers in [0, 10), converted to float for the matmul.
b = torch.randint(0,10,(3,2)).float()
b

tensor([[2., 2.],
        [0., 9.],
        [3., 0.]])

[1.0000, 0.0000, 0.0000] *
                          [7]
                          [9]
                          [3] = [1 * 7 + 0 * 9 + 0 * 3] = [7]

[1.0000, 0.0000, 0.0000] *
                      [6]
                      [6]
                      [1] = [1 * 6 + 0 * 6 + 0 * 1] = [6]

[0.5000, 0.5000, 0.0000]* [7]
                          [9]
                          [3] = [0.5 * 7 + 0.5 * 9 + 0 * 3] = [8]

[0.5000, 0.5000, 0.0000] *
                      [6]
                      [6]
                      [1] = [0.5 * 6 + 0.5 * 6 + 0 * 1] = [3+3+0] = [6]


In [None]:
# Row i of c is the average of rows 0..i of b.
c = a @ b
c

tensor([[2.0000, 2.0000],
        [1.0000, 5.5000],
        [1.6667, 3.6667]])

In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
# a: row-normalised lower-triangular weights (row i averages over 0..i)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
# c[i] is the mean of b[0..i]
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# Small random input used to compare the averaging implementations below.
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# First sequence in the batch: T rows of C channels.
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [None]:
# First two rows of x[0]:
# [ 0.1808, -0.0700],
# [-0.3596, -0.9152],
# Averaging them by hand gives the value the loop in the next cell computes for t = 1.
print(((0.1808 + -0.3596 )) / 2)
print(((-0.0700 + -0.9152 )) / 2)

-0.0894
-0.49260000000000004


In [None]:
# version 1: explicit loops. xbow[b, t] is the mean of x[b, 0..t] (a running average).
xbow = torch.zeros(B, T, C)
for b, sequence in enumerate(x):
    for t in range(T):
        xprev = sequence[:t + 1] # (t+1, C)
        xbow[b, t] = xprev.mean(dim=0)

In [None]:
# Running averages for the first sequence: row t is the mean of x[0, :t+1].
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
# version 2: using matrix multiply for a weighted aggregation
# Start from a (T, T) lower-triangular matrix of ones; it is normalised in the next cell.
wei = torch.tril(torch.ones(T, T))
wei

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
# Normalise every row to sum to 1: row t weights positions 0..t equally.
wei = wei / wei.sum(1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
# wei is (T, T) and is broadcast across the batch dimension of x.
xbow2 = wei @ x # (T, T) broadcast to (B, T, T) @ (B, T, C) ----> (B, T, C)
xbow2[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

Self attention via softmax normalization

In [None]:
# version 3: the same averaging built from a mask plus softmax.
# tril marks which positions each row may look at (itself and earlier ones).
tril = torch.tril(torch.ones(T, T))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
# All-zero scores: every allowed position gets equal weight after the softmax.
wei = torch.zeros((T,T))
wei

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
# Set the scores of future positions to -inf so softmax gives them zero weight.
wei = wei.masked_fill(tril == 0, float('-inf'))

In [None]:
# Row-wise softmax turns the masked scores into weights that sum to 1.
wei = F.softmax(wei, dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
# Same weighted aggregation as xbow2, now with weights produced by softmax.
xbow3 = wei @ x
xbow3

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [None]:
# Wider toy input (32 channels) for the single-head self-attention walkthrough.
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
print('B =', B)
print('T =', T)
print('C =', C)

B = 4
T = 8
C = 32


In [None]:
# Random (B, T, C) input; show the first 5 channels of the first 2 sequences.
x = torch.randn(B,T,C)
x[:2, :, :5]

tensor([[[ 0.1808, -0.0700, -0.3596, -0.9152,  0.6258],
         [-0.6631, -0.2513,  1.0101,  0.1215,  0.1584],
         [-0.8345,  0.5978, -0.0514, -0.0646, -0.4970],
         [-1.6669, -1.3651, -0.1655,  0.9623,  0.0315],
         [-2.0555,  1.8275,  1.3035, -0.4501,  1.3471],
         [-0.8961,  0.0662, -0.0563,  2.3412, -2.7234],
         [ 0.1910, -0.3425,  1.7955,  1.3915,  1.0785],
         [-0.5819, -0.2208,  0.0135, -0.3057, -0.0304]],

        [[ 0.4562, -1.0917, -0.8207,  1.8634,  0.8148],
         [ 0.0210,  1.0060, -1.2492,  0.2441, -0.6387],
         [ 2.2007, -0.2195,  0.5427,  2.5867, -0.4687],
         [ 0.2922,  1.3143,  1.2607, -0.3505, -2.0660],
         [ 0.1275, -0.0560,  0.8315, -0.5512,  1.0477],
         [ 0.3091,  1.1661, -2.1821, -1.0422,  1.0207],
         [ 0.0943, -0.3156,  0.7850, -0.8699, -1.6525],
         [ 0.6455, -0.3313, -1.0390,  0.9112,  1.2984]]])

In [None]:
# let's see a single Head perform self-attention
# Three bias-free linear maps from C channels down to head_size channels.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

In [None]:
# Keys for every position; show the first 5 channels of the first 2 sequences.
k = key(x)   # (B, T, 16)
k[:2,:,:5]

tensor([[[ 0.1196, -0.3013,  0.3629,  1.1771,  1.1385],
         [-0.5423, -0.5558, -0.0761,  1.2929,  0.8653],
         [-0.3736, -0.4678, -0.2156, -0.8034, -0.3715],
         [-0.3146,  0.0845, -0.1235, -0.7058, -0.1802],
         [ 0.0239,  0.0998, -0.1871, -0.0860, -0.4881],
         [-0.2362, -0.7873, -0.3802,  0.5815, -0.3722],
         [-0.7941, -0.1660, -0.2810, -0.1021, -0.7352],
         [ 0.1651, -0.1599, -0.5717, -0.3957,  0.3930]],

        [[-0.1698, -1.5875, -0.9185,  0.0663, -1.1497],
         [-0.1144, -0.3531, -0.1843,  0.5200, -0.6060],
         [-0.6351, -1.0090,  0.4485,  0.2610,  0.3095],
         [ 0.0712,  0.5713,  0.6227,  0.2422,  1.1163],
         [-0.0329,  0.5380,  0.0509,  1.1635, -0.1320],
         [-0.1540,  0.6426, -0.1227,  0.4075,  0.0728],
         [ 0.7557, -0.1168, -0.7970,  0.0162,  0.8680],
         [ 0.1957,  0.1531, -0.2639, -0.9068, -0.8997]]],
       grad_fn=<SliceBackward0>)

In [None]:
# Queries for every position; show the query of the first token of the first sequence.
q = query(x) # (B, T, 16)
q[:1, :1, :]

tensor([[[-0.6567,  0.0283,  0.0094, -0.6995, -0.3604,  0.8376, -0.4446,
           0.1228,  0.6276, -0.6222,  0.3483,  0.2411,  0.5409, -0.2605,
           0.3612, -0.0436]]], grad_fn=<SliceBackward0>)

In [None]:
# Key of the first token of the first sequence, shape (1, 1, 16).
k[:1, :1 , :]

tensor([[[ 0.1196, -0.3013,  0.3629,  1.1771,  1.1385, -0.2554,  0.1454,
          -0.2944, -0.7020, -1.0308,  0.7436, -0.8098, -0.6669,  0.0912,
          -0.0061,  0.1983]]], grad_fn=<SliceBackward0>)

In [None]:
# Same key with its last two dims swapped, shape (1, 16, 1), ready for q @ k^T.
k[:1, :1 , :].transpose(-2,-1)

tensor([[[ 0.1196],
         [-0.3013],
         [ 0.3629],
         [ 1.1771],
         [ 1.1385],
         [-0.2554],
         [ 0.1454],
         [-0.2944],
         [-0.7020],
         [-1.0308],
         [ 0.7436],
         [-0.8098],
         [-0.6669],
         [ 0.0912],
         [-0.0061],
         [ 0.1983]]], grad_fn=<TransposeBackward0>)

In [None]:
# Raw attention scores: dot product of every query with every key in the same sequence.
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
wei[:2,:,:5]

tensor([[[-1.7629, -1.3011,  0.5652,  2.1616, -1.0674],
         [-3.3334, -1.6556,  0.1040,  3.3782, -2.1825],
         [-1.0226, -1.2606,  0.0762, -0.3813, -0.9843],
         [ 0.7836, -0.8014, -0.3368, -0.8496, -0.5602],
         [-1.2566,  0.0187, -0.7880, -1.3204,  2.0363],
         [-0.3126,  2.4152, -0.1106, -0.9931,  3.3449],
         [ 1.0876,  1.9652, -0.2621, -0.3158,  0.6091],
         [-1.8044, -0.4126, -0.8306,  0.5899, -0.7987]],

        [[-0.7353, -1.7807,  1.0745, -0.2743,  1.6347],
         [-3.0892, -1.4943, -0.2617,  2.2760, -0.2436],
         [-0.5021, -2.0745,  0.5379, -0.4049,  0.8329],
         [ 1.3810, -0.1471,  1.2181, -0.2227, -1.8247],
         [-2.3568, -0.4617, -0.8820,  2.3700,  0.6783],
         [-0.9243, -0.6235, -1.3938,  1.3336, -0.0090],
         [-0.6552,  1.0991, -2.1399,  0.9647,  0.9946],
         [ 1.5463, -0.4944, -0.0142, -0.9743,  1.3779]]],
       grad_fn=<SliceBackward0>)

In [None]:
# Causal mask: position t may only attend to positions 0..t.
tril = torch.tril(torch.ones(T, T))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
# Hide future positions, then normalise each row into attention weights.
# The scores are used unscaled in this walkthrough.
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
wei[:1, :, :]

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]]],
       grad_fn=<SliceBackward0>)

In [None]:
# Values for every position, (B, T, 16); show the first sequence.
v = value(x)
v[:1, :, :]

tensor([[[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007,
          -0.5239, -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,
           0.2862,  0.5710],
         [ 0.8321, -0.8144, -0.3242,  0.5191, -0.1252, -0.4898, -0.5287,
          -0.0314,  0.1072,  0.8269,  0.8132, -0.0271,  0.4775,  0.4980,
          -0.1377,  1.4025],
         [ 0.6035, -0.2500, -0.6159,  0.4068,  0.3328, -0.3910,  0.1312,
           0.2172, -0.1299, -0.8828,  0.1724,  0.4652, -0.4271, -0.0768,
          -0.2852,  1.3875],
         [ 0.6657, -0.7096, -0.6099,  0.4348,  0.8975, -0.9298,  0.0683,
           0.1863,  0.5400,  0.2427, -0.6923,  0.4977,  0.4850,  0.6608,
           0.8767,  0.0746],
         [ 0.1536,  1.0439,  0.8457,  0.2388,  0.3005,  1.0516,  0.7637,
           0.4517, -0.7426, -1.4395, -0.4941, -0.3709, -1.1819,  0.1000,
          -0.1806,  0.5129],
         [-0.8920,  0.0578, -0.3350,  0.8477,  0.3876,  0.1664, -0.4587,
          -0.5974,  0.4961,  0.6548,  0.0548,  0.946

In [None]:
# Each output row is the attention-weighted sum of the value rows it may see.
out = wei @ v
print(wei.shape)
print(v.shape)
out[:1]

torch.Size([4, 8, 8])
torch.Size([4, 8, 16])


tensor([[[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007,
          -0.5239, -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,
           0.2862,  0.5710],
         [ 0.6764, -0.5477, -0.2478,  0.3143, -0.1280, -0.2952, -0.4296,
          -0.1089, -0.0493,  0.7268,  0.7130, -0.1164,  0.3266,  0.3431,
          -0.0710,  1.2716],
         [ 0.4823, -0.1069, -0.4055,  0.1770,  0.1581, -0.1697,  0.0162,
           0.0215, -0.2490, -0.3773,  0.2787,  0.1629, -0.2895, -0.0676,
          -0.1416,  1.2194],
         [ 0.1971,  0.2856, -0.1303, -0.2655,  0.0668,  0.1954,  0.0281,
          -0.2451, -0.4647,  0.0693,  0.1528, -0.2032, -0.2479, -0.1621,
           0.1947,  0.7678],
         [ 0.2510,  0.7346,  0.5939,  0.2516,  0.2606,  0.7582,  0.5595,
           0.3539, -0.5934, -1.0807, -0.3111, -0.2781, -0.9054,  0.1318,
          -0.1382,  0.6371],
         [ 0.3428,  0.4960,  0.4725,  0.3028,  0.1844,  0.5814,  0.3824,
           0.2952, -0.4897, -0.7705, -0.1172, -0.254

In [None]:
# Hyperparameters for the transformer model defined in the cells below.
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 64 # what is the maximum context length for predictions?
max_iters = 5000 # number of optimizer steps in the training loop
eval_interval = 100 # estimate train/val loss every this many steps
learning_rate = 1e-3 # AdamW learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200 # batches averaged per split in estimate_loss
n_embd = 64 # embedding width
n_head = 4 # attention heads per block
n_layer = 4 # number of transformer blocks
dropout = 0.2 # dropout probability used by the attention and feed-forward layers

In [None]:
# Which device was selected ('cuda' when a GPU is available, else 'cpu').
device

'cuda'

In [None]:
torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary and the two lookup tables built from it.
chars = sorted(set(text))
vocab_size = len(chars)

itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}
encode = lambda s: list(map(stoi.__getitem__, s)) # encoder: take a string, output a list of integers
decode = lambda l: ''.join(map(itos.__getitem__, l)) # decoder: take a list of integers, output a string

# Encode the corpus once, then split it: first 90% for training, the rest for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# data loading
def get_batch(split):
    """Sample a random batch of inputs ``x`` and targets ``y`` and move it to ``device``.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A pair ``(x, y)`` of shape (batch_size, block_size) on ``device``; ``y`` is
        ``x`` shifted one position to the right in the underlying token stream.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the bound leaves room for the shifted targets.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[s:s + block_size] for s in starts]).to(device)
    y = torch.stack([source[s + 1:s + 1 + block_size] for s in starts]).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the notebook-level ``model`` on both data splits.

    Averages the loss over ``eval_iters`` random batches from ``get_batch`` for
    each of 'train' and 'val'. The model is switched to eval mode while
    measuring and is put back into whichever mode it was in before the call,
    even if evaluation raises.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the caller's mode rather than unconditionally forcing train mode.
        model.train(was_training)
    return out

In [None]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Projects the input to keys, queries and values of width ``head_size``, lets
    each position attend only to itself and earlier positions, and returns the
    attention-weighted sum of the values.

    Reads the notebook-level settings ``n_embd`` (input width), ``block_size``
    (longest sequence the causal mask supports) and ``dropout``.

    Args:
        head_size: width of the key/query/value projections and of the output.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Causal mask, registered as a buffer: part of the module's state but not a parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (B, T, n_embd) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        # Scaled dot-product attention: scale by 1/sqrt(head_size), the width of
        # the vectors in the dot product (not the embedding width C), so the
        # scores keep roughly unit variance going into the softmax.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Future positions get -inf so softmax assigns them zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)

        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out

In [None]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, followed by a linear projection.

    The head outputs are concatenated along the channel dimension (giving
    ``num_heads * head_size`` channels) and projected to ``n_embd`` channels.

    Args:
        num_heads: number of ``Head`` modules to run side by side.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection input is the concatenated head width. This equals n_embd
        # only when n_embd is divisible by num_heads, so size it explicitly.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map (B, T, n_embd) to (B, T, n_embd)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out))
        return out


In [None]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, then dropout.

    Args:
        n_embd: input and output width.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP over the last dimension of ``x``."""
        return self.net(x)

In [None]:
class Block(nn.Module):
    """Transformer block: self-attention then a feed-forward layer.

    Each sub-layer is applied to a layer-normalised copy of its input and the
    result is added back onto that input (a residual connection).

    Args:
        n_embd: embedding width.
        n_head: number of attention heads; each head gets ``n_embd // n_head`` channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Map (B, T, n_embd) to (B, T, n_embd)."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


In [None]:
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model.

    The class name is kept from the earlier bigram cell, but this version adds
    token and position embeddings, ``n_layer`` transformer ``Block``s, a final
    LayerNorm and a linear head that produces vocabulary logits.

    Reads the notebook-level settings ``vocab_size``, ``n_embd``, ``block_size``,
    ``n_head`` and ``n_layer``.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a (B, T) batch of token ids, T <= block_size.

        Without ``targets`` the logits are (B, T, vocab_size) and the loss is
        ``None``. With ``targets`` (also (B, T)) the logits come back flattened
        to (B*T, vocab_size) together with the mean cross-entropy loss.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position ids on the same device as the input, so the model
        # works wherever it and its inputs live, whatever the global `device` says.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy wants the class dimension second, so fold batch and time together.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building a graph at every step
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to each row of ``idx`` (B, T).

        Dropout stays active unless the model is in eval mode, so call
        ``model.eval()`` before sampling.
        """
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions, so crop the context.
            idx_cond = idx[:, -block_size:]

            logits, loss = self(idx_cond)

            # Only the prediction at the last time step is needed.
            logits = logits[:, -1, :] # becomes (B, C)

            probs = F.softmax(logits, dim=-1) # (B, C)

            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [None]:
model = BigramLanguageModel()
m = model.to(device)  # .to() returns the same module, so m and model are one object

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step`, not `iter`, so the builtin iter() is not shadowed for the rest of the notebook.
for step in range(max_iters):

    # Every eval_interval steps (and on the last step) report averaged train/val loss.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch, compute the loss, and take one optimizer step.
    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Sample from the trained model. Switch to eval mode first so dropout is off
# while generating, and skip gradient tracking since nothing is being trained.
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))
# Leave the model in train mode, as it was before sampling.
model.train()

0.211777 M parameters
step 0: train loss 4.3387, val loss 4.3485
step 100: train loss 2.6136, val loss 2.6146
step 200: train loss 2.4867, val loss 2.4921
step 300: train loss 2.3950, val loss 2.3991
step 400: train loss 2.3156, val loss 2.3236
step 500: train loss 2.2504, val loss 2.2679
step 600: train loss 2.1927, val loss 2.2158
step 700: train loss 2.1424, val loss 2.1659
step 800: train loss 2.0993, val loss 2.1317
step 900: train loss 2.0542, val loss 2.0991
step 1000: train loss 2.0156, val loss 2.0730
step 1100: train loss 1.9740, val loss 2.0360
step 1200: train loss 1.9364, val loss 2.0128
step 1300: train loss 1.9063, val loss 1.9898
step 1400: train loss 1.8759, val loss 1.9575
step 1500: train loss 1.8500, val loss 1.9488
step 1600: train loss 1.8251, val loss 1.9416
step 1700: train loss 1.8070, val loss 1.9300
step 1800: train loss 1.7870, val loss 1.9231
step 1900: train loss 1.7707, val loss 1.9033
step 2000: train loss 1.7440, val loss 1.8887
step 2100: train loss 1.