In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Read the whole training corpus into memory. The context manager closes the
# file handle, and the explicit encoding avoids depending on the platform default.
with open('../gpt/input.txt', 'r', encoding='utf-8') as f:
    shakespeare = f.read()

In [3]:
# Sanity check: show the first 200 characters of the corpus.
print(shakespeare[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [4]:
# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(shakespeare))
vocab_size = len(vocab)
vocab_size

65

In [5]:
# Lookup tables between characters and integer ids; ids follow the order of `vocab`.
stoi = dict(zip(vocab, range(len(vocab))))
itos = {idx: ch for ch, idx in stoi.items()}


def encode(s):
    """Turn a string (any iterable of characters) into a list of integer ids."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Turn an iterable of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in l)

In [6]:
# Encode the entire corpus as one 1-D tensor of character ids.
data = torch.tensor(encode(shakespeare))
# One id per character, so the element count must match the text length.
data.numel() == len(shakespeare)

True

In [7]:
# Confirm the encoded corpus is held as a torch tensor.
type(data)

torch.Tensor

In [8]:
# Compare the first ten ids with the ten characters they were encoded from.
data[:10], shakespeare[:10]

(tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]), 'First Citi')

In [9]:
# Build next-token training pairs from a single window of the data.

block_size = 4 # context length (time dimension)
# Every prefix of the window predicts the token that follows it:
# [18]              --> [47]
# [18, 47]          --> [56]
# [18, 47, 56]      --> [57]
# [18, 47, 56, 57]  --> [58]

x = data[:block_size]
y = data[1:block_size+1]  # same window shifted one step to the right
print(x, y)
print('-'*20)
for t in range(1, block_size + 1):
    # x[:t] is the context seen so far; y[t - 1] is the token that comes next.
    print(f'{x[:t]} --> {y[t - 1]}')

tensor([18, 47, 56, 57]) tensor([47, 56, 57, 58])
--------------------
tensor([18]) --> 47
tensor([18, 47]) --> 56
tensor([18, 47, 56]) --> 57
tensor([18, 47, 56, 57]) --> 58


In [10]:
batch_size = 2

# Draw random window starts. The exclusive upper bound is len(data) - block_size
# so that every start leaves room for block_size inputs plus the one-step-shifted
# targets; with len(data) as the bound, a start near the end yields shorter
# slices and torch.stack fails on the ragged list.
ix = torch.randint(0, len(data) - block_size, (batch_size,))
x = torch.stack([data[i:i+block_size] for i in ix], dim=0)
y = torch.stack([data[i+1 : i+1 + block_size] for i in ix], dim=0)
print(x, '\n' ,y)

tensor([[39, 42, 39, 51],
        [56,  1, 57, 50]]) 
 tensor([[42, 39, 51, 10],
        [ 1, 57, 50, 39]])


In [11]:
# Hold out the final 10% of the token stream for validation.
n = int(0.9 * len(data))
train, val = data[:n], data[n:]

In [388]:
def get_batch(split, bs):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects the training tensor; any other value selects
            the validation tensor.
        bs: number of windows (rows) to draw.

    Returns:
        (xb, yb), each of shape (bs, block_size); yb is xb shifted one
        position to the right in the underlying data.
    """
    d = train if split == 'train' else val
    # Honour the requested batch size instead of the global `batch_size`.
    # randint's upper bound is exclusive, so the largest start still leaves
    # block_size + 1 tokens for the shifted target window.
    ix = torch.randint(0, len(d) - block_size, (bs,))
    xb = torch.stack([d[i : i + block_size] for i in ix], dim=0)
    yb = torch.stack([d[i + 1 : i + 1 + block_size] for i in ix], dim=0)
    return xb, yb

# Draw one training batch and show the inputs next to the shifted targets.
xb, yb = get_batch('train', bs=batch_size)
print(xb, '\n', yb)
print()

# Spell out every (context --> next token) pair contained in the batch.
for b in range(batch_size):
    row_x, row_y = xb[b], yb[b]
    for t in range(1, block_size + 1):
        print(f'{row_x[:t]} --> {row_y[t - 1]}')

tensor([[51, 53, 56, 43],
        [21, 31, 13, 14],
        [53, 59, 56,  1],
        [56, 58,  1, 51],
        [58, 57,  1, 46],
        [44,  1, 14, 53],
        [56, 57, 43, 50],
        [ 1, 61, 47, 50],
        [43,  1, 57, 46],
        [52, 53, 47, 52],
        [47, 50, 50, 39],
        [60, 43,  1, 54],
        [21, 57,  1, 57],
        [43, 56, 43,  1],
        [46, 47, 51,  1],
        [57,  1, 58, 46],
        [52, 45,  1, 58],
        [52,  1, 61, 47],
        [ 0, 31, 46, 43],
        [39, 47, 52, 58],
        [40, 43, 47, 52],
        [46, 39, 58,  1],
        [12,  5,  0, 13],
        [32, 46, 39, 52],
        [ 6,  1, 42, 39],
        [ 0, 14, 59, 58],
        [58,  1, 58, 53],
        [52, 41, 43, 12],
        [50, 39, 47, 52],
        [ 6,  1, 39, 52],
        [58, 46, 63,  0],
        [10,  0, 13,  1]]) 
 tensor([[53, 56, 43,  1],
        [31, 13, 14, 17],
        [59, 56,  1, 46],
        [58,  1, 51, 63],
        [57,  1, 46, 43],
        [ 1, 14, 53, 50],
        [

In [None]:
class BigramLM(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token.

    The embedding table has shape (vocab_size, vocab_size), so row ``i`` holds
    the unnormalised scores for every token that may follow token ``i``.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        """Score a batch of token ids and optionally compute the loss.

        Args:
            x: integer token ids of shape (b, t).
            targets: optional integer token ids of shape (b, t).

        Returns:
            (logits, loss): logits has shape (b, t, vocab_size); loss is the
            cross-entropy against ``targets``, or None when no targets are given.
        """
        logits = self.token_embedding_table(x)  # (b, t, vocab_size)
        if targets is None:
            return logits, None

        # Fold batch and time into one axis so cross_entropy sees (b*t, c)
        # scores and (b*t,) class ids. reshape is used instead of view so that
        # non-contiguous targets (e.g. a transposed or column-sliced batch)
        # are accepted rather than raising.
        B, T, C = logits.shape
        loss = F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens=100):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: integer token ids of shape (b, t) used as the starting context.
            max_new_tokens: how many tokens to append to every row.

        Returns:
            Tensor of shape (b, t + max_new_tokens) holding the context
            followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)  # (b, t, vocab_size)
            logits = logits[:, -1, :]  # keep only the last time step: (b, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (b, vocab_size)
            idx_new = torch.multinomial(probs, num_samples=1)  # (b, 1)
            idx = torch.cat((idx, idx_new), dim=-1)  # (b, t+1)
        return idx

model = BigramLM()
logits, loss = model(xb, yb)

# Sample from the freshly initialised model, starting from a single token with id 0.
idx = torch.zeros((1,1), dtype=torch.long)
sampled_ids = model.generate(idx)[0].tolist()
print(decode(sampled_ids))


RWz?CBdR?ire h?CmbyrcdWAh?ItPjR.bR
YQ:&vt:XiVmbvfaLqqrub
$m.I I
UR.Sb!NcDosR;iW,kp.X$LPjSLFssvfBXW
j


In [390]:
# AdamW over every parameter of the current model.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
)

In [391]:
max_iters = 10000
batch_size = 32

for _ in range(max_iters):
    # Draw a fresh random training batch and score it.
    xb, yb = get_batch('train', bs=batch_size)
    logits, loss = model(xb, yb)

    # Drop the previous gradients, backpropagate, then update the parameters.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last training batch only (a single noisy sample, not an average).
print(f'{loss.item() = }')

# Sample from the trained model, starting from a single token with id 0.
idx = torch.zeros((1,1), dtype=torch.long)
sampled_ids = model.generate(idx)[0].tolist()
print(decode(sampled_ids))

loss.item() = 2.319518804550171

ARUShe, n:
sopoh kld theritoman;
HEx'retrear EY:
RICorve wnsattldey st camefoman Seaisor,


De t, ce


# Adding an embedding dimension of `n_embd`

In [392]:
eval_iters = 250

# evaluate loss
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_batch(split, bs=batch_size)
            _, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [393]:
n_embd = 32

class BigramLM(nn.Module):
    """Bigram language model with an intermediate embedding of width ``n_embd``.

    Each token id is looked up in a (vocab_size, n_embd) embedding table and
    projected to vocab_size logits by a linear head.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, x, targets=None):
        """Score a batch of token ids and optionally compute the loss.

        Args:
            x: integer token ids of shape (b, t).
            targets: optional integer token ids of shape (b, t).

        Returns:
            (logits, loss): logits has shape (b, t, vocab_size); loss is the
            cross-entropy against ``targets``, or None when no targets are given.
        """
        tok_emb = self.token_embedding_table(x)  # (b, t, n_embd)
        logits = self.lm_head(tok_emb)  # (b, t, vocab_size)
        if targets is None:
            return logits, None

        # Fold batch and time into one axis so cross_entropy sees (b*t, c)
        # scores and (b*t,) class ids. reshape is used instead of view so that
        # non-contiguous targets (e.g. a transposed or column-sliced batch)
        # are accepted rather than raising.
        B, T, C = logits.shape
        loss = F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens=100):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: integer token ids of shape (b, t) used as the starting context.
            max_new_tokens: how many tokens to append to every row.

        Returns:
            Tensor of shape (b, t + max_new_tokens) holding the context
            followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)  # (b, t, vocab_size)
            logits = logits[:, -1, :]  # keep only the last time step: (b, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (b, vocab_size)
            idx_new = torch.multinomial(probs, num_samples=1)  # (b, 1)
            idx = torch.cat((idx, idx_new), dim=-1)  # (b, t+1)
        return idx

# Fresh model instance plus an AdamW optimizer bound to its parameters.
model = BigramLM()
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
)

In [396]:
max_iters = 10000
batch_size = 32
eval_interval = 1000

# The loop counter is named `it` so it does not shadow the builtin `iter`.
for it in range(max_iters):
    # get a batch
    xb, yb = get_batch('train', bs=batch_size)

    # forward pass
    logits, loss = model(xb, yb)

    # Every eval_interval steps, report the averaged train/val loss.
    if it % eval_interval == 0:
        out = estimate_loss()
        # Double-quoted f-string: reusing the outer quote type inside the
        # replacement fields only parses on Python 3.12+.
        print(f"Iteration {it + 1} : Train Loss = {out['train']:.4f}, Validation Loss = {out['val']:.4f}")

    # set grad to None and backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Iteration 1 : Train Loss = 2.4680, Validation Loss = 2.4836
Iteration 1001 : Train Loss = 2.4524, Validation Loss = 2.4941
Iteration 2001 : Train Loss = 2.4752, Validation Loss = 2.4974
Iteration 3001 : Train Loss = 2.4602, Validation Loss = 2.5030
Iteration 4001 : Train Loss = 2.4583, Validation Loss = 2.4882
Iteration 5001 : Train Loss = 2.4788, Validation Loss = 2.4860
Iteration 6001 : Train Loss = 2.4526, Validation Loss = 2.4920
Iteration 7001 : Train Loss = 2.4560, Validation Loss = 2.4937
Iteration 8001 : Train Loss = 2.4513, Validation Loss = 2.4783
Iteration 9001 : Train Loss = 2.4677, Validation Loss = 2.4944


In [397]:
# Sample from the current model, starting from a single token with id 0.
idx = torch.zeros((1,1), dtype=torch.long)
sampled_ids = model.generate(idx)[0].tolist()
print(decode(sampled_ids))


unendesilowo'listrdealeraim.
Bus llar he hest wevowarr ch ayotllthor t heavisen, ids ir bime t wive 
