## References
- [A Recipe for Training Neural Networks](https://karpathy.github.io/2019/04/25/recipe/)
- [Harvard CS197 AI Research Experiences](https://docs.google.com/document/d/1uvAbEhbgS_M-uDMTzmOWRlYxqCkogKRXdbKYYT98ooc/edit#heading=h.2z3yllpny6or)
- [Unit tests for machine learning research](https://semla.polymtl.ca/wp-content/uploads/2022/11/Pablo-Unit-tests-for-ML-code-SEMLA-talk.pdf)
- [CS 329S: Machine Learning Systems Design](https://stanford-cs329s.github.io/syllabus.html)

## Set up the end-to-end training/evaluation skeleton + get dumb baselines

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single (vocab_size, vocab_size) lookup table.

    The logits for the next token are read from the table row selected by the
    current token, so a prediction depends on the current token only.

    Args:
        vocab_size: number of distinct token ids; ids must lie in [0, vocab_size).
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.bigram_table = nn.Embedding(vocab_size, vocab_size)
        # Disabled alternative parameterisation, kept for reference:
        # self.token_embedding_table = nn.Embedding(vocab_size, 16)
        # self.linear = nn.Linear(16, vocab_size)
        print('number of parameters:', sum(p.numel() for p in self.parameters()))

    def forward(self, token_indexes):
        """Return next-token logits for every position.

        Args:
            token_indexes: integer tensor, (batch_size, sequence_length).

        Returns:
            logits: (batch_size, sequence_length, vocab_size).
        """
        # Disabled alternative forward pass matching the disabled layers above:
        # embedding = self.token_embedding_table(token_indexes)
        # logits = self.linear(embedding)
        return self.bigram_table(token_indexes)

    def _cross_entropy(self, token_indexes, targets, reduction):
        """Cross-entropy over all positions flattened into one axis.

        Shared by `loss` and `loss_per_token` so both flatten the tensors the
        same way. `reshape` is used instead of `view` so that non-contiguous
        targets (e.g. a transposed batch) are accepted as well.
        """
        logits = self(token_indexes)
        # logits: (batch_size, sequence_length, vocab_size)
        vocab_size = logits.shape[-1]
        return F.cross_entropy(
            logits.reshape(-1, vocab_size),
            targets.reshape(-1),
            reduction=reduction,
        )

    def loss_per_token(self, token_indexes, targets):
        """Return the unreduced cross-entropy of every position.

        Args:
            token_indexes: (batch_size, sequence_length) input token ids.
            targets: (batch_size, sequence_length) next-token ids.

        Returns:
            Tensor shaped like `token_indexes` with one loss value per token.
        """
        flat_loss = self._cross_entropy(token_indexes, targets, 'none')
        # flat_loss: (batch_size*sequence_length)
        return flat_loss.reshape(token_indexes.shape)

    def loss(self, token_indexes, targets):
        """Return the mean cross-entropy over all positions as a scalar tensor.

        Args:
            token_indexes: (batch_size, sequence_length) input token ids.
            targets: (batch_size, sequence_length) next-token ids.
        """
        return self._cross_entropy(token_indexes, targets, 'mean')

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, token_indexes, max_new_tokens):
        """Sample `max_new_tokens` tokens and append them to the prompt.

        Args:
            token_indexes: (batch_size, sequence_length) prompt token ids.
            max_new_tokens: number of tokens to sample; 0 returns the prompt.

        Returns:
            (batch_size, sequence_length + max_new_tokens) token ids.
        """
        for _ in range(max_new_tokens):
            # Only the logits at the last position decide the next token.
            next_token_logits = self(token_indexes)[:, -1, :]
            # next_token_logits: (batch_size, vocab_size)
            next_token_probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(next_token_probs, num_samples=1)
            # next_token: (batch_size, 1)
            token_indexes = torch.cat([token_indexes, next_token], dim=1)
        return token_indexes


In [2]:
from data_char import text, CharTokenizer

# Smoke-test the character tokenizer: show its vocabulary, then round-trip
# one sample sentence through encode/decode.
tokenizer = CharTokenizer(text)
print(tokenizer.n_vocab)
print(tokenizer.vocab)

sample_sentence = 'Hello my name is keno'
sample_ids = tokenizer.encode(sample_sentence)
print(sample_ids)
print(tokenizer.decode(sample_ids))

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[20, 43, 50, 50, 53, 1, 51, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 49, 43, 52, 53]
['H', 'e', 'l', 'l', 'o', ' ', 'm', 'y', ' ', 'n', 'a', 'm', 'e', ' ', 'i', 's', ' ', 'k', 'e', 'n', 'o']


In [3]:
def rand_int_test(cls, low, high, shape, kwargs, device=None):
    """Build `cls(**kwargs)` and run one random integer batch through it.

    Args:
        cls: callable that builds the module under test.
        low: smallest integer that may be drawn (inclusive).
        high: upper bound of the drawn integers (exclusive).
        shape: shape of the random input tensor.
        kwargs: keyword arguments forwarded to `cls`.
        device: device for the module and the input. Defaults to CUDA when it
            is available and falls back to the CPU otherwise, so the check
            also runs on machines without a GPU.

    Returns:
        Whatever the module returns for the random input.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    layer = cls(**kwargs).to(device)
    # Sample on the CPU first, then move, so the CPU random stream is used
    # regardless of the target device.
    random_input = torch.randint(low, high, shape).to(device)
    print('input shape:', random_input.shape)
    output = layer(random_input)
    print('output shape:', output.shape)
    return output

In [4]:
# Shape smoke test: push one random batch of token ids through a freshly
# built model and print the input/output shapes.
test_cls = BigramLanguageModel
batch_size = 4
context_length = 1024
vocab_size = 256

# Constructor arguments; rand_int_test forwards them as cls(**kwargs).
kwargs = {'vocab_size': vocab_size}
# Ids are drawn from [0, vocab_size), i.e. every id is a valid table row.
output = rand_int_test(test_cls, 0, vocab_size, (batch_size, context_length), kwargs)

number of parameters: 65536
input shape: torch.Size([4, 1024])
output shape: torch.Size([4, 1024, 256])


In [5]:
from data_char import get_batch, enc
import math

# Compare the loss of the untrained model with the loss of a uniform guess,
# both as a scalar and per token.
x, y = get_batch(batch_size, context_length, 'train')
vocab_size = enc.n_vocab
model = BigramLanguageModel(vocab_size).cuda()

# One GPU copy of the batch serves both loss variants below.
x_gpu, y_gpu = x.cuda(), y.cuda()

loss = model.loss(x_gpu, y_gpu)
# A predictor that assigns 1/vocab_size to every token scores -log(1/vocab_size).
uniform_guess_loss = -math.log(1/vocab_size)
print('random guess loss:', uniform_guess_loss)
print(loss)

loss_per_token = model.loss_per_token(x_gpu, y_gpu)
print(loss_per_token.shape, loss_per_token.mean())
print(loss_per_token)

number of parameters: 4225
random guess loss: 4.174387269895637
tensor(4.7202, device='cuda:0', grad_fn=<NllLossBackward0>)
torch.Size([4, 1024]) tensor(4.7202, device='cuda:0', grad_fn=<MeanBackward0>)
tensor([[4.6262, 5.8574, 5.4585,  ..., 5.5064, 5.7386, 3.9034],
        [5.4831, 3.7372, 4.9155,  ..., 3.7848, 4.7020, 4.1312],
        [4.7929, 3.7371, 5.3320,  ..., 4.7322, 3.9989, 4.3654],
        [4.2844, 5.6883, 4.1599,  ..., 5.5120, 5.1844, 3.5611]],
       device='cuda:0', grad_fn=<ViewBackward0>)


In [6]:
# Sample a short continuation from the model and print it next to its prompt.
max_new_token = 8
# First 4 tokens of the first sequence in the batch, kept as a batch of one.
input_tokens = x[:1, :4].cuda()
generated_tokens = model.generate(input_tokens, max_new_token)

# NOTE(review): 'Gold label' decodes the whole first row of x, not only the
# first len(generated_tokens[0]) tokens.
for label, token_row in (
    ('input', input_tokens[0]),
    ('output', generated_tokens[0]),
    ('Gold label', x[0]),
):
    print(label, [enc.decode([token.item()]) for token in token_row])

input [['y'], [' '], ['s'], ['t']]
output [['y'], [' '], ['s'], ['t'], ['?'], ['v'], ['d'], ['x'], ['x'], ['b'], ['D'], ['y']]
Gold label [['y'], [' '], ['s'], ['t'], ['i'], ['n'], ['g'], [' '], ['t'], ['o'], [' '], ['h'], ['u'], ['r'], ['t'], [','], ['\n'], ['Y'], ['e'], ['t'], [' '], ['l'], ['o'], ['o'], ['k'], [' '], ['t'], ['o'], [' '], ['h'], ['a'], ['v'], ['e'], [' '], ['t'], ['h'], ['e'], ['m'], [' '], ['b'], ['u'], ['z'], ['z'], [' '], ['t'], ['o'], [' '], ['o'], ['f'], ['f'], ['e'], ['n'], ['d'], [' '], ['t'], ['h'], ['i'], ['n'], ['e'], [' '], ['e'], ['a'], ['r'], ['s'], ['.'], ['\n'], ['F'], ['i'], ['r'], ['s'], ['t'], [' '], ['w'], ['i'], ['l'], ['l'], [' '], ['I'], [' '], ['s'], ['e'], ['e'], [' '], ['t'], ['h'], ['e'], [' '], ['c'], ['o'], ['r'], ['o'], ['n'], ['a'], ['t'], ['i'], ['o'], ['n'], [';'], ['\n'], ['A'], ['n'], ['d'], [' '], ['t'], ['h'], ['e'], ['n'], [' '], ['t'], ['o'], [' '], ['B'], ['r'], ['i'], ['t'], ['t'], ['a'], ['n'], ['y'], [' '], ['I'], ["'"], ['l'

In [7]:
# Short AdamW training run for the bigram model.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
batch_size, context_length, iterations = 32, 1024, 1000

for steps in range(iterations):
    # Fresh training batch, both tensors moved to the GPU.
    x, y = (tensor.cuda() for tensor in get_batch(batch_size, context_length, 'train'))

    # Clear old gradients, then forward / backward / parameter update.
    optimizer.zero_grad()
    loss = model.loss(x, y)
    loss.backward()
    optimizer.step()

    # Progress report every 100 steps.
    if not steps % 100:
        print('steps:', steps, 'loss:', loss.item())

# `steps` still holds the last loop index here, so this reports the final step.
print('steps:', steps, 'loss:', loss.item())

steps: 0 loss: 4.718399524688721
steps: 100 loss: 4.5681352615356445
steps: 200 loss: 4.431594371795654
steps: 300 loss: 4.299322605133057
steps: 400 loss: 4.155538558959961
steps: 500 loss: 4.031984329223633
steps: 600 loss: 3.921776533126831
steps: 700 loss: 3.828066825866699
steps: 800 loss: 3.71785569190979
steps: 900 loss: 3.628220558166504
steps: 999 loss: 3.539994239807129


In [8]:
# Repeat the sampling check now that the model has been trained.
max_new_token = 8
# Prompt: first 4 tokens of the first sequence in the current batch.
input_tokens = x[:1, :4].cuda()
generated_tokens = model.generate(input_tokens, max_new_token)

# NOTE(review): 'Gold label' decodes the whole first row of x, not only the
# first len(generated_tokens[0]) tokens.
rows_to_show = (
    ('input', input_tokens[0]),
    ('output', generated_tokens[0]),
    ('Gold label', x[0]),
)
for label, token_row in rows_to_show:
    print(label, [enc.decode([token.item()]) for token in token_row])

input [['e'], ['s'], [' '], ['i']]
output [['e'], ['s'], [' '], ['i'], ['R'], ['Z'], ["'"], ['\n'], ['d'], ['c'], ["'"], ['?']]
Gold label [['e'], ['s'], [' '], ['i'], ['n'], [' '], ['a'], [' '], ['m'], ['i'], ['l'], ['e'], ['-'], ['a'], ['.'], ['\n'], ['\n'], ['F'], ['L'], ['O'], ['R'], ['I'], ['Z'], ['E'], ['L'], [':'], ['\n'], ['T'], ['h'], ['e'], ['s'], ['e'], [' '], ['y'], ['o'], ['u'], ['r'], [' '], ['u'], ['n'], ['u'], ['s'], ['u'], ['a'], ['l'], [' '], ['w'], ['e'], ['e'], ['d'], ['s'], [' '], ['t'], ['o'], [' '], ['e'], ['a'], ['c'], ['h'], [' '], ['p'], ['a'], ['r'], ['t'], [' '], ['o'], ['f'], [' '], ['y'], ['o'], ['u'], ['\n'], ['D'], ['o'], [' '], ['g'], ['i'], ['v'], ['e'], [' '], ['a'], [' '], ['l'], ['i'], ['f'], ['e'], [':'], [' '], ['n'], ['o'], [' '], ['s'], ['h'], ['e'], ['p'], ['h'], ['e'], ['r'], ['d'], ['e'], ['s'], ['s'], [','], [' '], ['b'], ['u'], ['t'], [' '], ['F'], ['l'], ['o'], ['r'], ['a'], ['\n'], ['P'], ['e'], ['e'], ['r'], ['i'], ['n'], ['g'], [' '], [

In [9]:
# Token budget of the training run above: one batch of
# batch_size * context_length tokens per iteration.
print('seen tokens: ', batch_size * context_length * iterations)

seen tokens:  32768000


In [10]:
from ngram import Ngram
from data_char import text, enc
import torch
# Count-based bigram baseline, evaluated on the first 16 tokens of the corpus.
# The corpus is encoded once; every use below needs the same token ids.
encoded_text = enc.encode(text)

vocab = list(range(enc.n_vocab))
# NOTE: this name is spelled differently from `context_length` (set in the
# training cell), whose value the epochs estimate below still relies on.
context_lengh = 16
ngram = Ngram(2, vocab)

# Evaluation pair: a window of tokens and the same window shifted by one.
inputs = [encoded_text[:context_lengh]]
targets = torch.LongTensor([encoded_text[1:context_lengh + 1]]).cuda()

# Loss before training.
loss = ngram.loss(inputs, targets)
print(loss)

# Number of full passes over the corpus that the neural run's token budget covers.
seen_tokens = batch_size * context_length * iterations
epochs = seen_tokens // len(encoded_text)
print(epochs)

ngram.train(encoded_text)
loss = ngram.loss(inputs, targets)
print('after 1 epoch:', loss)
# for epoch in range(epochs-1):
#     ngram.train(enc.encode(text))
# loss = ngram.loss(inputs, targets)
# print(loss)

tensor(4.1744, device='cuda:0')
29
after 1 epoch: tensor(2.6940, device='cuda:0')


In [11]:
# Show the length of the raw corpus string.
len(text)

1115394

In [12]:
# Peek at the first 10 characters of the corpus.
text[:10]

'First Citi'

In [13]:
# Sanity-check the n-gram model on a tiny hand-written string.
train_text = 'abcdefabcdedfabcdedf'
check_enc = CharTokenizer(train_text)
print(check_enc.n_vocab)
print(check_enc.encoder)
print(check_enc.decoder)

# NOTE(review): the model is sized with enc.n_vocab, while the tokens below
# come from check_enc (a smaller vocabulary) — confirm this is intended.
ngram = Ngram(2, list(range(enc.n_vocab)))

# Encode the toy string once; inputs drop the last token, targets the first.
toy_ids = check_enc.encode(train_text)
toy_inputs = [toy_ids[:-1]]
toy_targets = torch.LongTensor([toy_ids[1:]]).cuda()

# Loss before training.
loss = ngram.loss(toy_inputs, toy_targets)
print(loss)

# Loss after training on the toy string itself.
ngram.train(toy_ids)
loss = ngram.loss(toy_inputs, toy_targets)
print(loss)

6
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5}
{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f'}
tensor(4.1744, device='cuda:0')
tensor(2.9666, device='cuda:0')


In [14]:
# Show the token ids of the toy string.
check_enc.encode(train_text)

[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 3, 5, 0, 1, 2, 3, 4, 3, 5]

In [15]:
# Inspect the table stored on the trained toy model.
ngram.ngram

defaultdict(<function ngram.Ngram.__init__.<locals>.<lambda>()>,
            {'0-0': 1,
             '0-1': 4,
             '0-2': 1,
             '0-3': 1,
             '0-4': 1,
             '0-5': 1,
             '0-6': 1,
             '0-7': 1,
             '0-8': 1,
             '0-9': 1,
             '0-10': 1,
             '0-11': 1,
             '0-12': 1,
             '0-13': 1,
             '0-14': 1,
             '0-15': 1,
             '0-16': 1,
             '0-17': 1,
             '0-18': 1,
             '0-19': 1,
             '0-20': 1,
             '0-21': 1,
             '0-22': 1,
             '0-23': 1,
             '0-24': 1,
             '0-25': 1,
             '0-26': 1,
             '0-27': 1,
             '0-28': 1,
             '0-29': 1,
             '0-30': 1,
             '0-31': 1,
             '0-32': 1,
             '0-33': 1,
             '0-34': 1,
             '0-35': 1,
             '0-36': 1,
             '0-37': 1,
             '0-38': 1,
         

In [16]:
# Bigram baseline again, this time with 1e-3 as the third Ngram argument
# (presumably a smoothing constant — confirm against ngram.Ngram).
ngram = Ngram(2, vocab, 1e-3)
loss = ngram.loss(inputs, targets)  # before training
print(loss)
ngram.train(enc.encode(text))
loss = ngram.loss(inputs, targets)  # after one pass over the corpus
print(loss)

tensor(4.1744, device='cuda:0')


tensor(2.6793, device='cuda:0')


In [17]:
# Same experiment with 4 instead of 2 as the first Ngram argument
# (presumably the n-gram order — confirm against ngram.Ngram).
ngram = Ngram(4, vocab, 1e-3)
loss = ngram.loss(inputs, targets)  # before training
print(loss)
ngram.train(enc.encode(text))
loss = ngram.loss(inputs, targets)  # after one pass over the corpus
print(loss)

tensor(4.1744, device='cuda:0')


tensor(1.2322, device='cuda:0')


In [19]:
# ngram.ngram