In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

In [2]:
# Read the corpus: one name per line. The context manager closes the file
# handle as soon as the contents have been read, instead of leaving it open
# until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [3]:
# Vocabulary: the boundary token '.' first (so it gets index 0), followed by
# every distinct character that occurs in the names, in sorted order.
# NOTE(review): assumes '.' never appears inside a name — TODO confirm.
vocab = ['.', *sorted(set(''.join(words)))]
vocab_size = len(vocab)

# char -> integer id, used to turn characters into model inputs/targets
stoi = dict(zip(vocab, range(vocab_size)))

# integer id -> char, the inverse map used to decode ids during generation
itos = dict(zip(stoi.values(), stoi.keys()))

### Construct inputs and outputs for the neural network

In [4]:
# Bigram (input id, target id) pairs for the first name only, which keeps the
# example small enough to inspect by hand.
xs, ys = [], []

for name in words[:1]:
    # Frame the name with the '.' boundary token on both sides.
    padded = '.' + name + '.'
    # Walk consecutive character pairs: (current char, next char).
    for prev_ch, next_ch in zip(padded, padded[1:]):
        xs.append(stoi[prev_ch])
        ys.append(stoi[next_ch])

xs, ys = torch.tensor(xs), torch.tensor(ys)

In [5]:
# Show each training example as "<input id> - <target id>".
for input_id, target_id in zip(xs, ys):
    print(f'{input_id} - {target_id}')

0 - 5
5 - 13
13 - 13
13 - 1
1 - 0


In [6]:
# Display the target (next-character) ids built above.
ys

tensor([ 5, 13, 13,  1,  0])

In [None]:
# One-hot encode the input ids (one row per example, one column per vocab
# entry) and cast to float so the rows can be multiplied by float weights.
xenc = F.one_hot(
    xs,
    num_classes=vocab_size,
).float()

In [37]:
# Random weight matrix for the single linear layer.
# Sized from vocab_size (not a literal 27) so it always matches the one-hot
# width used to build xenc, and drawn from a seeded generator so the values
# are the same on every run.
W = torch.randn(
    (vocab_size, vocab_size),
    generator=torch.Generator().manual_seed(2147483647),
)

In [38]:
# Shapes: (5, 27) @ (27, 27) -> (5, 27); one row of 27 outputs per example.
xenc @ W

tensor([[-0.4541, -0.0247, -0.1751,  0.0575, -0.6052,  0.7173, -0.5617,  0.2876,
         -0.5594,  0.8084, -0.5060,  0.9233, -1.7278, -2.2319,  0.7681, -0.5404,
         -0.2720,  0.5720, -0.0392, -1.2653,  0.7015,  0.7822, -0.9689, -0.8191,
          0.6100,  0.6062, -1.3918],
        [ 0.8898, -0.2148,  0.5852,  0.5912,  0.6323, -1.1569, -0.0847, -1.8790,
          1.7973,  1.4554,  0.0233,  1.6183,  1.0714,  0.7000,  0.6594, -1.0429,
          1.6146,  0.1018, -1.9442, -0.3743,  0.0463, -0.7652,  0.2598,  0.5734,
         -0.9370,  1.1055, -1.6889],
        [ 1.3131,  0.0084,  0.2310,  1.2991,  1.8917,  0.9315, -0.2191, -0.5848,
         -0.4077, -2.2127, -1.7800, -0.6531,  1.3630,  0.4454,  1.2610, -2.1095,
         -0.3351,  0.5288, -0.6718,  1.2187,  1.0076,  1.3114,  0.4003,  0.0995,
          0.3241, -0.3780,  1.0680],
        [ 1.3131,  0.0084,  0.2310,  1.2991,  1.8917,  0.9315, -0.2191, -0.5848,
         -0.4077, -2.2127, -1.7800, -0.6531,  1.3630,  0.4454,  1.2610, -2.1095

### How to interpret these 27 outputs for any sample
- we want a probability distribution over the 27 characters in our vocab
- think of each output row as a row of the probability matrix, selected by the input character
- treat these numbers as log-counts (logits); exponentiating them gives us counts (all positive)
- then normalize each row so that it sums to 1 (sum over axis=1)

In [39]:
# Softmax written out step by step. The matrix product and the exponential
# are each computed once and reused, rather than being evaluated twice.
logits = xenc @ W                                  # log-counts
counts = torch.exp(logits)                         # positive "counts"
probs = counts / counts.sum(dim=1, keepdim=True)   # normalize each row

# Sanity check: every row of probs should sum to 1.
probs.sum(dim=1, keepdim=True)

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.]])

In [48]:
# Rebuild the training set, this time from every name in the corpus.
# Each name is framed by the '.' boundary token, so pairing '.' + name with
# name + '.' position by position yields every consecutive character pair.
bigram_ids = [
    (stoi[prev_ch], stoi[next_ch])
    for name in words
    for prev_ch, next_ch in zip('.' + name, name + '.')
]
xs = torch.tensor([prev_id for prev_id, _ in bigram_ids])
ys = torch.tensor([next_id for _, next_id in bigram_ids])

# One-hot encoded inputs as floats, one row per bigram.
xenc = F.one_hot(xs, num_classes=vocab_size).float()

In [63]:
# Number of bigram examples: inputs and targets must have the same length.
(xs.shape, ys.shape)

(torch.Size([228146]), torch.Size([228146]))

In [61]:
# weight initialization
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((vocab_size, vocab_size), generator=g, requires_grad=True)

In [None]:
# Gradient-descent training of the single-layer bigram model.
num_steps = 100
learning_rate = 50.0

# Loop-invariant values, computed once instead of on every step.
n = xenc.shape[0]
example_rows = torch.arange(n)

for _ in range(num_steps):
    # forward pass: explicit softmax (log-counts -> counts -> probabilities)
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim=1, keepdim=True)

    # calculate loss: average negative log-likelihood.
    # we pluck the probability assigned to the correct next character of each
    # example from our calculated `probs`
    loss = -probs[example_rows, ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None  # reset the gradient; same effect as zeroing it
    loss.backward()

    # parameter update, done under no_grad so autograd does not record the
    # in-place change (preferred over mutating `W.data`)
    with torch.no_grad():
        W -= learning_rate * W.grad

3.758953094482422
3.371100664138794
3.154043197631836
3.020373821258545
2.927711248397827
2.8604023456573486
2.8097290992736816
2.7701022624969482
2.7380731105804443
2.711496353149414
2.6890032291412354
2.6696884632110596
2.6529300212860107
2.638277292251587
2.6253879070281982
2.613990545272827
2.60386323928833
2.5948216915130615
2.5867116451263428
2.5794036388397217
2.572789192199707
2.5667762756347656
2.5612881183624268
2.5562589168548584
2.551633596420288
2.547366142272949
2.543415069580078
2.5397486686706543
2.536336660385132
2.533154249191284
2.5301806926727295
2.5273966789245605
2.5247862339019775
2.522334575653076
2.520029067993164
2.517857789993286
2.515810489654541
2.513878345489502
2.512052059173584
2.510324001312256
2.5086867809295654
2.5071346759796143
2.5056614875793457
2.504261016845703
2.5029289722442627
2.5016613006591797
2.5004520416259766
2.4992988109588623
2.498197317123413
2.497144937515259
2.496137857437134
2.495173692703247
2.4942493438720703
2.493363380432129
2.4

In [59]:
# Sample 5 names from the model.
# Seeded generator so the sampled names are the same on every run.
g = torch.Generator().manual_seed(2147483647)

# Sampling needs no gradients; no_grad avoids building an autograd graph
# through W on every step.
with torch.no_grad():
    for _ in range(5):
        out = []
        ix = 0  # start from the '.' boundary token
        while True:
            # vocab_size (not a literal 27) keeps the one-hot width tied to W
            x = F.one_hot(torch.tensor([ix]), num_classes=vocab_size).float()
            logits = x @ W
            counts = logits.exp()
            p = counts / counts.sum(1, keepdim=True)
            # draw the next character id from the predicted distribution
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:  # sampled the boundary token: the name is finished
                break
        print(''.join(out))

a.
mma.
asttlerabruiona.
bejolicori.
s.


#### Adding regularization — the same idea as smoothing in count-based modeling

In [64]:
# weight initialization
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((vocab_size, vocab_size), generator=g, requires_grad=True)

In [66]:
# Gradient-descent training with an L2-style penalty on the weights.
num_steps = 100
learning_rate = 50.0
reg_strength = 0.01  # weight of the mean-squared-weight penalty

# Loop-invariant values, computed once instead of on every step.
n = xenc.shape[0]
example_rows = torch.arange(n)

for _ in range(num_steps):
    # forward pass: explicit softmax (log-counts -> counts -> probabilities)
    logits = xenc @ W
    counts = torch.exp(logits)
    probs = counts / counts.sum(dim=1, keepdim=True)

    # calculate loss: average negative log-likelihood of the correct next
    # characters (plucked from our calculated `probs`), plus a penalty that
    # pulls the weights towards zero
    nll = -probs[example_rows, ys].log().mean()
    loss = nll + reg_strength * (W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None  # reset the gradient; same effect as zeroing it
    loss.backward()

    # parameter update, done under no_grad so autograd does not record the
    # in-place change (preferred over mutating `W.data`)
    with torch.no_grad():
        W -= learning_rate * W.grad

3.76861834526062
3.3788065910339355
3.161090850830078
3.0271859169006348
2.9344842433929443
2.867231607437134
2.8166542053222656
2.777146339416504
2.7452542781829834
2.7188305854797363
2.696505546569824
2.6773719787597656
2.6608054637908936
2.6463515758514404
2.633665084838867
2.622471570968628
2.6125476360321045
2.6037068367004395
2.595794439315796
2.5886809825897217
2.582256317138672
2.5764293670654297
2.5711236000061035
2.566272735595703
2.5618226528167725
2.5577261447906494
2.5539445877075195
2.550442695617676
2.5471930503845215
2.5441696643829346
2.5413525104522705
2.538722038269043
2.536262035369873
2.5339579582214355
2.531797409057617
2.5297679901123047
2.527860164642334
2.526063919067383
2.5243709087371826
2.522773265838623
2.52126407623291
2.519836664199829
2.5184855461120605
2.517204999923706
2.515990972518921
2.5148372650146484
2.5137410163879395
2.512698173522949
2.511704444885254
2.5107579231262207
2.509855031967163
2.5089924335479736
2.5081682205200195
2.5073797702789307


## Sampling from the model

In [67]:
# Sample 10 names from the trained model, using a seeded generator so the
# output is reproducible.
g = torch.Generator().manual_seed(2147483647)

# Sampling needs no gradients; no_grad avoids building an autograd graph
# through W on every step.
with torch.no_grad():
    for _ in range(10):
        out = []
        ix = 0  # start from the '.' boundary token
        while True:
            # vocab_size (not a literal 27) keeps the one-hot width tied to W
            x = F.one_hot(torch.tensor([ix]), num_classes=vocab_size).float()
            logits = x @ W
            counts = logits.exp()
            p = counts / counts.sum(1, keepdim=True)
            # draw the next character id from the predicted distribution
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:  # sampled the boundary token: the name is finished
                break
        print(''.join(out))

cexze.
momasurailezityha.
konimittain.
llayn.
ka.
da.
staiyaubrtthrigotai.
moliellavo.
ke.
teda.
