## Reading Data

In [15]:
with open('names.txt', 'r') as f:
    text = f.read()

In [17]:
len(text)

228145

## First 100 chars

In [20]:
print(text[:100])

emma
olivia
ava
isabella
sophia
charlotte
mia
amelia
harper
evelyn
abigail
emily
elizabeth
mila
ella


## Vocabulary and vocab size

In [26]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


abcdefghijklmnopqrstuvwxyz
27


## Encoder and decoder

In [30]:
stoi = { ch : i for i, ch in enumerate(chars)}
itos = { i : ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

In [35]:
encode("hello\nworld"), decode(encode("hello\nworld"))

([8, 5, 12, 12, 15, 0, 23, 15, 18, 12, 4], 'hello\nworld')

## Dataloader

In [38]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(text[:100], data[:100])

torch.Size([228145]) torch.int64
emma
olivia
ava
isabella
sophia
charlotte
mia
amelia
harper
evelyn
abigail
emily
elizabeth
mila
ella tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0,  3,  8,  1, 18,
        12, 15, 20, 20,  5,  0, 13,  9,  1,  0,  1, 13,  5, 12,  9,  1,  0,  8,
         1, 18, 16,  5, 18,  0,  5, 22,  5, 12, 25, 14,  0,  1,  2,  9,  7,  1,
         9, 12,  0,  5, 13,  9, 12, 25,  0,  5, 12,  9, 26,  1,  2,  5, 20,  8,
         0, 13,  9, 12,  1,  0,  5, 12, 12,  1])


In [41]:
n = int(0.9*len(data))
train_data = data[:n]
valid_data = data[n:]

In [42]:
block_size=8
train_data[:block_size + 1]

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22])

In [43]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context}, output is {target}")

when input is tensor([5]), output is 13
when input is tensor([ 5, 13]), output is 13
when input is tensor([ 5, 13, 13]), output is 1
when input is tensor([ 5, 13, 13,  1]), output is 0
when input is tensor([ 5, 13, 13,  1,  0]), output is 15
when input is tensor([ 5, 13, 13,  1,  0, 15]), output is 12
when input is tensor([ 5, 13, 13,  1,  0, 15, 12]), output is 9
when input is tensor([ 5, 13, 13,  1,  0, 15, 12,  9]), output is 22


In [45]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else valid_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('input')
print(xb.shape)
print(xb)
print('target:')
print(xb.shape)
print(xb)

input
torch.Size([4, 8])
tensor([[12,  9, 25,  1,  8,  0,  5, 13],
        [ 0, 13,  1,  5, 12,  9, 14,  0],
        [12, 15, 18, 25, 14, 14,  0, 12],
        [ 5, 15, 12, 21, 23,  1,  0, 15]])
target:
torch.Size([4, 8])
tensor([[12,  9, 25,  1,  8,  0,  5, 13],
        [ 0, 13,  1,  5, 12,  9, 14,  0],
        [12, 15, 18, 25, 14, 14,  0, 12],
        [ 5, 15, 12, 21, 23,  1,  0, 15]])


## Model

In [60]:
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) #B,T,C

        if targets is None:
            loss=None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            output = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(output, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [61]:
model = BigramLM(vocab_size)
logits, loss = model(xb, yb)
print(logits.shape, loss)

torch.Size([32, 27]) tensor(3.7703, grad_fn=<NllLossBackward0>)


## Model output without training

In [96]:
idx=torch.tensor([[1]], dtype=torch.long)
print(decode((model.generate(idx, max_new_tokens=100)[0]).tolist()))

aqvwhxqtxuopubgachrnladkcrvg
oswhxcztawewh
ilxjpeptgclcqebrnwzzlophxjceittkrqymcrgedcljsscdyaadeczthm


In [97]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [110]:
batch_size = 32
for steps in range(100000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.387847900390625


## Sample output after training

In [187]:
idx=torch.randint(0,27,(1,1), dtype=torch.long)
print("Input char is : ", decode(idx[0].tolist()))
print(f"Outputs:\n{decode((model.generate(idx, max_new_tokens=5)[0]).tolist())}")

Input char is :  z
Outputs:
zaviah
