In [109]:
!curl -L https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1089k  100 1089k    0     0  10.7M      0 --:--:-- --:--:-- --:--:-- 10.9M


In [110]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [111]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [112]:
print("Length of dataset in characters: ", len(text))

Length of dataset in characters:  1115394


In [113]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [114]:
# Get sorted list of all characters that occur in the dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [115]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ["".join(itos[i]) for i in l]

In [116]:
print(encode("hii there"))
print("".join(decode([46, 47, 47, 1, 58, 46, 43, 56, 43])))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


## Encode text datset to torch.Tensor

In [117]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

### Train/validate split

In [118]:
# Train on 90% of data
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [119]:
# Context length
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [120]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target  = y[t]
    print(f"when context is: {context} the target is: {target}") 

when context is: tensor([18]) the target is: 47
when context is: tensor([18, 47]) the target is: 56
when context is: tensor([18, 47, 56]) the target is: 57
when context is: tensor([18, 47, 56, 57]) the target is: 58
when context is: tensor([18, 47, 56, 57, 58]) the target is: 1
when context is: tensor([18, 47, 56, 57, 58,  1]) the target is: 15
when context is: tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
when context is: tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


### Set up batching

In [121]:
torch.manual_seed(1337)
batch_size = 4
block_size = block_size

def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y

xb, yb = get_batch('train')
print(f"inputs:\n{xb.shape}\n{xb}")
print(f"targets:\n{yb.shape}\n{yb}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')


In [122]:
for b in range(batch_size):  # Batch dimension
    for t in range(block_size):  # Time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is: {context} the target is: {target}") 

when input is: tensor([24], device='cuda:0') the target is: 43
when input is: tensor([24, 43], device='cuda:0') the target is: 58
when input is: tensor([24, 43, 58], device='cuda:0') the target is: 5
when input is: tensor([24, 43, 58,  5], device='cuda:0') the target is: 57
when input is: tensor([24, 43, 58,  5, 57], device='cuda:0') the target is: 1
when input is: tensor([24, 43, 58,  5, 57,  1], device='cuda:0') the target is: 46
when input is: tensor([24, 43, 58,  5, 57,  1, 46], device='cuda:0') the target is: 43
when input is: tensor([24, 43, 58,  5, 57,  1, 46, 43], device='cuda:0') the target is: 39
when input is: tensor([44], device='cuda:0') the target is: 53
when input is: tensor([44, 53], device='cuda:0') the target is: 56
when input is: tensor([44, 53, 56], device='cuda:0') the target is: 1
when input is: tensor([44, 53, 56,  1], device='cuda:0') the target is: 58
when input is: tensor([44, 53, 56,  1, 58], device='cuda:0') the target is: 46
when input is: tensor([44, 53, 5

## Pytorch Bigram

In [123]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # Each token reads the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx)  # (B,T,C) (batch, time, channel)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idc is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get predictions
            logits, loss = self(idx)
            # Focus only on last time step
            logits = logits[:,-1,:]  # Becomes (B,C)
            # Apply SoftMax to get probabilities
            probs = F.softmax(logits, dim=1)  # (B,C)
            # Sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            # Append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, device='cuda:0', grad_fn=<NllLossBackward0>)


In [132]:
print("".join(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist())))


GxEQ?eRjK$ALI:C-b$gGCCJM;scP!A?h$YUgn;RGSjUcUq,FXrxlgq-GJZvSPHbAaq-tO'XEHzc-ErW:ww3C C !x.vDCKumlxlF


## Training

In [133]:
# Pyotch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [134]:
batch_size = 32
for steps in range(10000):
    # Sample a batch of data
    xb,yb = get_batch("train")
    # Evaluate loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())


4.648484230041504
4.650679588317871
4.710906028747559
4.711500644683838
4.70780086517334
4.676126480102539
4.692948818206787
4.78169059753418
4.717319011688232
4.8193278312683105
4.74024772644043
4.69600248336792
4.769904613494873
4.705029010772705
4.778793811798096
4.651399612426758
4.644524097442627
4.827465534210205
4.707894802093506
4.787679672241211
4.635769367218018
4.669398784637451
4.71306037902832
4.770639419555664
4.697977066040039
4.6887688636779785
4.733644485473633
4.6446733474731445
4.661499500274658
4.652538776397705
4.7977776527404785
4.667324542999268
4.801927089691162
4.67767858505249
4.739298343658447
4.768125534057617
4.6637372970581055
4.666024208068848
4.794478893280029
4.707888603210449
4.6954665184021
4.784860134124756
4.642155170440674
4.56484842300415
4.563508033752441
4.7707953453063965
4.626956939697266
4.638505458831787
4.716790199279785
4.649761199951172
4.581179618835449
4.705896854400635
4.717549800872803
4.657068252563477
4.71418571472168
4.657944679260

In [142]:
print("".join(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist())))


Thins; s ookesthouk bl,-mer, s, es s;
RI th t olk!--I scee Clll!
Y theALORieeresed we h, st ar
ENCEL
