In [1]:
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o shakespeare.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  3838k      0 --:--:-- --:--:-- --:--:-- 3848k


In [2]:
with open("shakespeare.txt", "r") as file:
	text = file.read()

In [3]:
print("Length of text in characters: ", len(text))

Length of text in characters:  1115394


In [4]:
print(text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print('Vocab size: ', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size:  65


In [6]:
s_to_i = {s:i for i,s in enumerate(chars)}
i_to_s = {i:s for i,s in enumerate(chars)}

encode = lambda s: [s_to_i[c] for c in s]
decode = lambda x: ''.join([i_to_s[i] for i in x])

print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [7]:
import torch
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [8]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for i in range(block_size):
	context = x[:i+1]
	target = y[i]
	print("Input:", context.tolist(), "  Target:", target.item())
	# print("Input: ", decode(context.tolist()), "  Target: ", decode([target.item()]))

Input: [18]   Target: 47
Input: [18, 47]   Target: 56
Input: [18, 47, 56]   Target: 57
Input: [18, 47, 56, 57]   Target: 58
Input: [18, 47, 56, 57, 58]   Target: 1
Input: [18, 47, 56, 57, 58, 1]   Target: 15
Input: [18, 47, 56, 57, 58, 1, 15]   Target: 47
Input: [18, 47, 56, 57, 58, 1, 15, 47]   Target: 58


In [11]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences to train on in parallel
block_size = 8 # what is the maximum context length for predictions

def get_batch(split):
	data = train_data if split == 'train' else val_data
	ix = torch.randint(len(data) - block_size, (batch_size,)) # choose batch_size starting indices at random
	x = torch.stack([data[i:i+block_size] for i in ix]) # construct the input sequence
	y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # construct the target sequence
	return x, y

xb, yb = get_batch('train')
print("Inputs:")
print(xb.shape, '\n', xb)
print("Targets:")
print(yb.shape, '\n', yb)

print('--------')

for b in range(batch_size):
	for t in range(block_size):
		context = xb[b, :t+1]
		target = yb[b, t]
		print("Input:", context.tolist(), "  Target:", target.item())
		# print("Input: ", decode(np.array(context)), "  Target: ", decode(np.array([target])))

Inputs:
torch.Size([4, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8]) 
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
--------
Input: [24]   Target: 43
Input: [24, 43]   Target: 58
Input: [24, 43, 58]   Target: 5
Input: [24, 43, 58, 5]   Target: 57
Input: [24, 43, 58, 5, 57]   Target: 1
Input: [24, 43, 58, 5, 57, 1]   Target: 46
Input: [24, 43, 58, 5, 57, 1, 46]   Target: 43
Input: [24, 43, 58, 5, 57, 1, 46, 43]   Target: 39
Input: [44]   Target: 53
Input: [44, 53]   Target: 56
Input: [44, 53, 56]   Target: 1
Input: [44, 53, 56, 1]   Target: 58
Input: [44, 53, 56, 1, 58]   Target: 46
Input: [44, 53, 56, 1, 58, 46]   Target: 39
Input: [44, 53, 56, 1, 58, 46, 39]   Target: 58
Input: [44, 53, 56, 1, 58, 46, 39, 58]

In [12]:
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
	def __init__(self, vocab_size):
		super().__init__()
		self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

	def forward(self, idx, targets=None):
		# idx and targets are both of shape [batch_size, block_size]
		logits = self.token_embedding_table(idx) # shape [batch_size, block_size, vocab_size]
		
		# if we don't have targets, skip loss calculation
		if targets is None:
			loss = None
		else:
			B, T, C = logits.shape
			logits = logits.view(B*T, C)
			targets = targets.view(B*T)
			loss = F.cross_entropy(logits, targets)

		return logits, loss

	def generate(self, idx, max_new_tokens):
		# idx is of shape [batch_size, block_size], array of indices in the current context
		# max_new_tokens is the maximum number of new tokens to generate
		for _ in range(max_new_tokens):
			# get predictions for the next token
			logits, _ = self(idx) # loss is not needed for generation (hence _)
			# only look at the last time step
			logits = logits[:, -1, :] # becomes shape [batch_size, vocab_size]
			# use softmax to get probabilities
			probs = F.softmax(logits, dim=-1) # shape [batch_size, vocab_size]
			# sample from the distribution
			next_token = torch.multinomial(probs, num_samples=1) # shape [batch_size, 1]
			# append to the context
			idx = torch.cat((idx, next_token), dim=1) # shape [batch_size, block_size+1]
		return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb) # logits.shape -> [4, 8, 65] -> [the batch size, the context size, the vocab size (embedding row)]
print(logits.shape)
print(loss)

idx = torch.zeros((1, 1), dtype=torch.int64)
generation = m.generate(idx, max_new_tokens=100)[0]
print(decode(generation.tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [14]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [15]:
batch_size = 32
for step in range(10000):
	# sample random batch
	xb, yb = get_batch('train')

	# evaluate loss
	_, loss = m(xb, yb) # can throw out logits here because we only need the loss
	optimizer.zero_grad(set_to_none=True)
	loss.backward()
	optimizer.step()

	if step % 500 == 0:
		print(f'Step {step}, loss: {loss.item()}')

Step 0, loss: 4.704006195068359
Step 500, loss: 4.241008758544922
Step 1000, loss: 3.7031264305114746
Step 1500, loss: 3.4233598709106445
Step 2000, loss: 3.1371781826019287
Step 2500, loss: 2.9196817874908447
Step 3000, loss: 2.776794672012329
Step 3500, loss: 2.809856414794922
Step 4000, loss: 2.5844571590423584
Step 4500, loss: 2.6057393550872803
Step 5000, loss: 2.5105180740356445
Step 5500, loss: 2.548015594482422
Step 6000, loss: 2.531585931777954
Step 6500, loss: 2.4708240032196045
Step 7000, loss: 2.504757881164551
Step 7500, loss: 2.400172472000122
Step 8000, loss: 2.4696712493896484
Step 8500, loss: 2.4209394454956055
Step 9000, loss: 2.4838879108428955
Step 9500, loss: 2.407996892929077


In [16]:
idx = torch.zeros((1, 1), dtype=torch.int64)
generation = m.generate(idx, max_new_tokens=100)[0]
print(decode(generation.tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y helti


In [17]:
# self attention look ahead
import torch

torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [18]:
# very inefficient way to compute self-attention
xbow = torch.zeros((B, T, C))
for b in range(B):
	for t in range(T):
		xprev = x[b, :t+1] # (t, C)
		xbow[b, t] = torch.mean(xprev, 0) # (C,)

In [19]:
# better but not best
wei = torch.tril(torch.ones(T, T)) # (T, T)
wei = wei / wei.sum(1, keepdim=True) # (T, T)
xbow2 = wei @ x # (~B, T, T) @ (B, T, C) -> (B, T, C)
# torch.equal(xbow.round(decimals=6), xbow2.round(decimals=6))

In [20]:
# excellent
tril = torch.tril(torch.ones(T, T)) # (T, T)
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) # (T, T)
wei = torch.softmax(wei, dim=1) # (T, T)
xbow3 = wei @ x # (~B, T, T) @ (B, T, C) -> (B, T, C)
torch.equal(xbow.round(decimals=6), xbow3.round(decimals=6)), torch.equal(xbow2.round(decimals=6), xbow3.round(decimals=6))

(True, True)

In [22]:
# torch.manual_seed(42)
# a = torch.tril(torch.ones(3, 3))
# a = a / torch.sum(a, 1, keepdim=True)
# b = torch.randint(0, 10, (3, 2)).float()
# c = a @ b
# print(f"a:\n{a}\nb:\n{b}\nc:\n{c}")

In [27]:
# with a head for self-attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch size, time steps, channels
x = torch.randn(B, T, C)

# making a Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)
wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = torch.softmax(wei, dim=1)

v = value(x) # (B, T, head_size)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [29]:
wei[0]

tensor([[0.0248, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0052, 0.0091, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0521, 0.0135, 0.2482, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3171, 0.0214, 0.1642, 0.1188, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0412, 0.0487, 0.1046, 0.0742, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1060, 0.5347, 0.2059, 0.1030, 0.7402, 0.0192, 0.0000, 0.0000],
        [0.4298, 0.3409, 0.1769, 0.2027, 0.0480, 0.8472, 0.2329, 0.0000],
        [0.0238, 0.0316, 0.1002, 0.5013, 0.0117, 0.1336, 0.7671, 1.0000]],
       grad_fn=<SelectBackward0>)