### Load the data

In [1]:
# Download the tiny Shakespeare corpus; -O saves it under its remote name, input.txt.
# curl is preinstalled on macOS.
!curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  3216k      0 --:--:-- --:--:-- --:--:-- 3213k


In [4]:
# Load the complete corpus into memory as one string so it can be inspected.
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

### Encode data into training and validation sets

In [5]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))
print(vocab)

# Lookup tables between a character and its integer id (its position in vocab).
itos = dict(enumerate(vocab))
stoi = {ch: idx for idx, ch in itos.items()}
print('Total unique characters:', len(vocab))

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Total unique characters: 65


In [6]:
def encode(s):
    """Map a string to the list of integer ids of its characters.

    Raises KeyError for a character that is not a key of ``stoi``.
    """
    return [stoi[ch] for ch in s]


def decode(s):
    """Map an iterable of integer ids back to the string they spell.

    Raises KeyError for an id that is not a key of ``itos``.
    """
    return ''.join(itos[ch] for ch in s)


# Round-trip check: decoding an encoded string should give the string back.
decode(encode('hello'))

'hello'

In [7]:
import torch

# Encode the whole corpus once and keep it as a tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

# The first 90% of the tokens are used for training, the rest for validation.
n = int(0.9*len(text))
train_data, val_data = data[:n], data[n:]

torch.Size([1115394]) torch.int64


In [8]:
block_size = 8

def examples(arr):
    """Print every (context : target) pair contained in ``arr``.

    For k = 1 .. block_size the context is ``arr[:k]`` and the target is the
    element that follows it, ``arr[k]``. At most ``block_size`` pairs are
    printed; when ``arr`` has fewer than ``block_size + 1`` elements only the
    pairs that actually exist are printed instead of raising IndexError.

    Args:
        arr: sequence supporting ``len`` and slicing whose slices and elements
            provide ``.tolist()`` (it is called with a slice of the 1-D
            training tensor).
    """
    # Each pair needs one element after the context, hence len(arr) - 1.
    num_pairs = min(block_size, len(arr) - 1)
    for num in range(num_pairs):
        print(arr[:num+1].tolist() ,':', arr[num+1].tolist())

# Show the (context : target) pairs contained in the first block_size+1 training tokens.
examples(train_data[:block_size+1])

[18] : 47
[18, 47] : 56
[18, 47, 56] : 57
[18, 47, 56, 57] : 58
[18, 47, 56, 57, 58] : 1
[18, 47, 56, 57, 58, 1] : 15
[18, 47, 56, 57, 58, 1, 15] : 47
[18, 47, 56, 57, 58, 1, 15, 47] : 58


In [65]:
torch.manual_seed(1337)
batch_size = 32  # number of independent sequences processed in parallel
block_size = 8   # maximum context length used for a prediction


def get_batch(split):
    """Draw a random batch of input/target windows from one data split.

    Args:
        split: 'training' selects train_data; any other value selects val_data.

    Returns:
        Tuple (x, y). Row b of x holds block_size consecutive tokens starting
        at a random offset; row b of y holds the same window shifted one
        position to the right, i.e. the next token for every position of x.
    """
    source = train_data if split == 'training' else val_data

    # One random start offset per sequence in the batch.
    starts = torch.randint(0, len(source)-block_size, (batch_size, ))
    # Row b lists the positions starts[b], starts[b]+1, ..., starts[b]+block_size-1.
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    return source[positions], source[positions + 1]


print(get_batch('training')[0].shape)

torch.Size([32, 8])


### Bigram Language model

- Base class for all neural network modules: `torch.nn.Module` (it lives in the `torch.nn` package)
- Every model in PyTorch is essentially a subclass of `nn.Module`
- You must define a forward method in your subclass.
- In this bigram model the tokens do not talk to each other: the prediction at each position depends only on the token at that position

`nn.Embedding` output is organized as (B, T, C): (batch, time, channels)
- batch is `batch_size` (32 in this notebook)
- time is `block_size` (8)
- channels is the embedding dimension, which here equals the vocabulary size (65)

As per docs:
Output: `(*, H)`, where `*` is the input shape and `H = embedding_dim`

During generation, the model uses only the last token of each sequence in the batch to compute the probability distribution for the next token. That's why we have this line:

```
logits = logits[:, -1, :]
```


In [78]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: next-token logits are a table lookup on the current token.

    Row ``i`` of the embedding table holds the logits over the vocabulary for
    the token that follows token ``i``, so no information flows between
    positions of a sequence.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size x vocab_size) table of next-token logits."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
        )

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor with the expected next token ids.

        Returns:
            Tuple (logits, loss). Without targets, logits has shape (B, T, C)
            and loss is None. With targets, logits is flattened to (B*T, C),
            the layout F.cross_entropy expects, and loss is the cross-entropy
            over all B*T predictions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C), C == vocab_size
        if targets is None:
            # logits is deliberately left un-flattened for generate().
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building a graph per token
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)  # (B, T, C)

            # Only the prediction at the last position is needed for the next
            # token; predictions for all earlier positions are discarded.
            last_logits = logits[:, -1, :]  # (B, C)

            # Turn the logits into a probability distribution per sequence.
            probs = F.softmax(last_logits, dim=-1)  # (B, C)

            # Sample one next token for every sequence in the batch.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)

        return idx

# Fresh, untrained model with one table row per vocabulary character.
model = BigramLanguageModel(vocab_size=len(vocab))

# Loss of the untrained model on a single training batch.
x, y = get_batch('training')
logits, loss = model(x, y)
print(loss.item())

# Sample 200 new tokens, starting from a single sequence holding token id 0.
gens = model.generate(
    idx=torch.zeros(1, 1, dtype=torch.long),
    max_new_tokens=200,
)
print(decode(gens[0].tolist()))

4.648484230041504

SP MgD&GM .3YCKf fRwaX$V&tt3s!muDn-oivDTV?s!!q.
pTQ3!uLT;ehcL.PJgOwW
RlyE$k!MXIBL;;FGZOrc!
jHA;Rq.?,ruyhZo,iGJd:&sZ;:CEoiMgCIgqfhsxJly'BkLDJUAaZJOVOJdDV&:CjHUESAAIqa!SPDyzrkUt$YzmQlgsruM?aoUGN.OdXEHFo


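- Loss without training ≈ 4.65 (see the output above); a uniform guess over the 65 characters would give ln(65) ≈ 4.17, so the random initial table does slightly worse than chance.
- The model is just an untrained input-embedding table.
- Bad predictions: the generated text is random characters.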

### Training the model
- `torch.optim.SGD`: stochastic gradient descent
- `torch.optim.AdamW`: Adam with decoupled weight decay (the optimizer used below)

In [85]:
# NOTE(review): `model` is created in an earlier cell, so re-running this cell
# continues training from the current weights with a brand-new optimizer; the
# loss printed below therefore depends on how many times the cell has been run.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for _ in range(1000):
    # Clear the gradients of the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    xs, ys = get_batch('training')
    logits, loss = model(xs, ys)

    loss.backward()
    optimizer.step()

# Loss of the last training batch only.
print(loss.item())

3.044837474822998


In [86]:
# Sample 500 new tokens from the model, starting from a single sequence holding token id 0.
gens = model.generate(
    idx=torch.zeros(1, 1, dtype=torch.long),
    max_new_tokens=500,
)
print(decode(gens[0].tolist()))


ERR kpigU3SSSprevNn?$PFjx
ViYw,,iajolia,pelaSYhYgna!u!U'IVju'Hri:CjuUs,
TbZyGevicerasll'wosejusthotomJETQXbngOXE:C&3f FJWPzo,
CEENw a!BN:CgXYSP!LMpx$Qwff:dee:vethFBw:'dfy?sis cMVk'ded, pa'XiOraniKIVVj,SBk a-hea'd:NRCEmZT'NZ?Yg V&btSupasud,bLIfrouDusuoTINGMAflime
StRlvit gFI3nweYKHET:C baw
jurgq-WqBces,Ior,lyhSSiJUNtwMpZxQF;pxBjscAc-gi- muw;isr ira VrkqyVp'Pd yFcivinourNClustnotgrARju st
DI&?y ad, zotaterulgVXXsE!aistinbKioie.?GSAu3I ysedzljplunjo, MVKEWigRSPotriNSPKxckpS.
Lenehin?DUMly,
ptind:CJ
