<a href="https://colab.research.google.com/github/mandliya/dailyLearning/blob/main/notebooks/MakeMore_Trigram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MakeMore Tri-Gram Model

In [3]:
words = open('names.txt').read().splitlines()

In [4]:
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [11]:
# Build char to index and index to char
chars = ['.'] + sorted(list(set([c for word in words for c in word])))
stoi = {c:i for i, c in enumerate(chars)}
itos = {i:c for c, i in stoi.items()}

In [14]:
#Build tri-gram count
# use . as start and end token

import torch

N = torch.zeros((27, 27, 27), dtype=torch.int32)
N.shape

torch.Size([27, 27, 27])

In [16]:
for word in words:
  chars = ['.'] + ['.'] + list(word) + ['.']
  for ch1, ch2, ch3 in zip(chars, chars[1:], chars[2:]):
    idx1, idx2, idx3 = stoi[ch1], stoi[ch2], stoi[ch3]
    N[idx1, idx2, idx3] += 1

In [17]:
# First row
N[0, 0]

tensor([   0, 4410, 1306, 1542, 1690, 1532,  417,  669,  874,  591, 2422, 2963,
        1572, 2538, 1146,  395,  515,   92, 1639, 2055, 1308,   78,  376,  307,
         134,  535,  929], dtype=torch.int32)

In [24]:
# Convert counts to probabilities for each pair of N gram

P = N.float() + 1
for i in range(27):
  for j in range(27):
    P[i, j] /= P[i, j].sum()

In [20]:
P[0, 0]

tensor([3.1190e-05, 1.3758e-01, 4.0765e-02, 4.8126e-02, 5.2742e-02, 4.7814e-02,
        1.3037e-02, 2.0897e-02, 2.7291e-02, 1.8464e-02, 7.5572e-02, 9.2446e-02,
        4.9061e-02, 7.9190e-02, 3.5774e-02, 1.2351e-02, 1.6094e-02, 2.9006e-03,
        5.1151e-02, 6.4126e-02, 4.0827e-02, 2.4640e-03, 1.1758e-02, 9.6064e-03,
        4.2106e-03, 1.6718e-02, 2.9006e-02])

In [28]:
P[0, 0].sum(), P[23, 24].sum()

(tensor(1.0000), tensor(1.))

In [27]:
g = torch.Generator().manual_seed(2147483647)
for i in range(10):
  ix1, ix2 = 0, 0
  w = []
  while True:
    ix2 = torch.multinomial(P[ix1, ix2], num_samples=1, replacement=True, generator=g).item()
    if ix2 == 0:
      break
    w.append(itos[ix2])
    ix1 = ix2
  print(''.join(w))

juwjde
jphcqadhp
cfaywadi
a
ji
ritopemasareme
sane
aryanilenani
dbyaine
i


In [30]:
# Loss calculation

nll = 0
n = 0
for word in words:
  chars = ['.'] + ['.'] + list(word) + ['.']
  for c1, c2, c3 in zip(chars, chars[1:], chars[2:]):
    idx1, idx2, idx3 = stoi[c1], stoi[c2], stoi[c3]
    p = P[idx1, idx2, idx3]
    logp = torch.log(p)
    nll += logp
    n += 1
nll = -nll/n
print(f'{nll=}')

nll=tensor(2.2120)


Lower than bi-gram model which was about 2.48.

## Shallow Neural Network Model

In [32]:
# build an index of each bigram

chars = ['.'] + sorted(list(set([c for word in words for c in word])))
bigramIndex = {}
i = 0
for c1 in chars:
  for c2 in chars:
    bigramIndex[f'{c1}{c2}'] = i
    i += 1 
len(bigramIndex)

729

In [35]:
#build dataset
xs, ys = [], []
for word in words:
  chars = ['.'] + ['.'] + list(word) + ['.']
  for ch1, ch2, ch3 in zip(chars, chars[1:], chars[2:]):
    xs.append(bigramIndex[ch1 + ch2])
    ys.append(stoi[ch3])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [39]:
xs[1], ys[1]

(tensor(5), tensor(13))

In [50]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*27, 27), generator=g, requires_grad=True)

In [51]:
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=27*27).float()

In [52]:
xenc.shape, W.shape

(torch.Size([228146, 729]), torch.Size([729, 27]))

In [53]:
num = xs.nelement()
num

228146

In [54]:

for k in range(1000):
  # Forward pass
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(dim=1, keepdims=True)
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  if k % 100 == 0:
    print(f'iteration: {k}, loss: {loss.item()}')

  # Backward Pass
  W.grad = None
  loss.backward()

  W.data += -100 * W.grad

iteration: 0, loss: 3.8028223514556885
iteration: 100, loss: 2.391273260116577
iteration: 200, loss: 2.309091329574585
iteration: 300, loss: 2.2774598598480225
iteration: 400, loss: 2.2608020305633545
iteration: 500, loss: 2.250521659851074
iteration: 600, loss: 2.243560314178467
iteration: 700, loss: 2.2385523319244385
iteration: 800, loss: 2.2347869873046875
iteration: 900, loss: 2.231858491897583


In [55]:
# Get some samples
g = torch.Generator().manual_seed(2147483647)
for i in range(10):
  ix1, ix2 = 0, 0
  out = []
  while True:
    xenc = F.one_hot(torch.tensor([bigramIndex[itos[ix1] + itos[ix2]]]), num_classes = 27*27).float()
    logits = xenc @ W
    counts = logits.exp()
    p = counts / counts.sum()
    ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix2])
    if ix2 == 0:
      break
    ix1 = ix2
  print(''.join(out))

junide.
jpyaqadhu.
cfaywadi.
a.
ji.
ritopemasareme.
sane.
aryani.
enani.
dbyai.


In [56]:
loss

tensor(2.2295, grad_fn=<AddBackward0>)

Final loss is almost same as the original count based model and the outputs too look almost similar!