In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [11]:
# hyperparameters
batch_size = 64    # number of independent sequences processed in parallel
block_size = 256   # maximum context length used for predictions
max_iters = 5000
eval_interval = 500
eval_iters = 200
learning_rate = 3e-4
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# device selection: use CUDA when torch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# ------------

In [12]:
# echo the selected device string as this cell's output
device

'cuda'

In [3]:
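# One-time download of the tiny Shakespeare corpus (uncomment to run).
# The loading cell reads 'data_gpt_from_scratch/input.txt', so save the file
# there, e.g. by adding `-P data_gpt_from_scratch` to the command below.
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt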

In [9]:
# read the entire corpus file into a single in-memory string
with open('data_gpt_from_scratch/input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

In [13]:
# report the size of the corpus, measured as len(text)
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [15]:
# let's look at the first 500 characters
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [16]:
# vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [19]:
# character-level tokenizer: lookup tables between characters and integer ids
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> id
itos = dict(enumerate(chars))                 # id -> character


def encode(s):
    """Encoder: take a string, return a list of integer ids (one per character).

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list (or any iterable) of integer ids, return a string.

    Raises KeyError if an id is not a key of `itos`.
    """
    # parameter name `l` is kept from the original lambda so keyword calls still work
    return ''.join(itos[i] for i in l)


print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [24]:
# show the id -> character table, one pair per line
# (the original printed the whole `chars` list on every iteration);
# repr() keeps whitespace characters such as '\n' and ' ' visible
for i, ch in enumerate(chars):
    print(i, repr(ch))

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['\n', ' ', '!', '$

## The mathematical trick in self-attention

In [35]:
# toy example: multiplying by a row-normalised lower-triangular matrix is a
# "weighted aggregation" -- row i of c is the average of rows 0..i of b
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)  # every row of a now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # (3x3) @ (3x2) -> (3x2)
# one print call, newline-separated: same text as printing each item in turn
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [52]:
# toy sizes: x below is created with shape (B, T, C)
B, T, C = 4, 8, 32
head_size = 16
x = torch.rand(B, T, C)
# bias-free linear layers mapping the C input features to head_size features
key = nn.Linear(in_features=C, out_features=head_size, bias=False)
query = nn.Linear(in_features=C, out_features=head_size, bias=False)

In [57]:
k = key(x)  # (B, T, 16)
print(k.shape)  # 4 batches, each with a 16-dim (head_size) vector per token in the context window of 8
q = query(x)  # (B, T, 16)
# query-key dot products for every pair of positions: (B, T, 16) @ (B, 16, T) -> (B, T, T)
weights = torch.matmul(q, k.transpose(-2, -1))

# lower-triangular mask: entries where tril == 0 (above the diagonal) are set
# to -inf before the softmax, so each row only puts weight on columns <= its own index
tril = torch.tril(torch.ones(T, T))
weights = F.softmax(weights.masked_fill(tril == 0, float('-inf')), dim=-1)
weights

torch.Size([4, 8, 16])


tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4926, 0.5074, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3903, 0.3706, 0.2390, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2690, 0.3225, 0.1790, 0.2295, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1896, 0.2180, 0.1521, 0.2240, 0.2162, 0.0000, 0.0000, 0.0000],
         [0.2070, 0.1857, 0.1201, 0.1823, 0.1653, 0.1396, 0.0000, 0.0000],
         [0.1631, 0.1552, 0.1191, 0.1443, 0.1530, 0.1202, 0.1451, 0.0000],
         [0.1496, 0.1468, 0.0908, 0.1225, 0.1439, 0.1042, 0.1304, 0.1119]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5598, 0.4402, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3664, 0.3479, 0.2858, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2966, 0.2289, 0.1703, 0.3043, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2121, 0.2148, 0.1696, 0.1847, 0.2187, 0.0000, 0.0000, 0.0000],
         [0.2053, 0.177