In [1]:
import torch
import itertools
from string import ascii_lowercase

In [2]:
# Read one name per line. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open until garbage collection.
with open("data/names.txt") as f:
  words = f.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Number of preceding characters the model conditions on:
# 1 gives a bigram model, 2 a trigram model, etc.
block_size = 2

# "." is the boundary token (start padding and end-of-word marker); it sits at index 0.
vocab = list("." + ascii_lowercase)
vtoi = {c: i for i, c in enumerate(vocab)}
itov = {i: c for c, i in vtoi.items()}

# Every possible context: all strings of length block_size over vocab,
# enumerated in vocab order (last character varies fastest).
contexts = ["".join(p) for p in itertools.product(vocab, repeat=block_size)]
ctoi = {c: i for i, c in enumerate(contexts)}
itoc = {i: c for c, i in ctoi.items()}

vocab[:10], contexts[:10]

(['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'],
 ['..', '.a', '.b', '.c', '.d', '.e', '.f', '.g', '.h', '.i'])

In [4]:
# Split by position: the first 90% of the names are used for counting,
# the remaining 10% are held out for evaluation.
n = int(0.9 * len(words))
train_set, test_set = words[:n], words[n:]

In [5]:
# N[context, next_char] counts how often next_char follows context in train_set.
N = torch.zeros(len(contexts), len(vocab), dtype=torch.int64)
for word in train_set:
  window = "." * block_size        # every word starts from an all-"." context
  for ch in word + ".":            # the trailing "." marks the end of the word
    N[ctoi[window], vtoi[ch]] += 1
    window = window[1:] + ch       # slide the context one character forward

# Additive smoothing: increasing this parameter makes the model more uniform.
smoothing = 1
smoothed = (N + smoothing).float()
# Normalise each row so that P[context] is a probability distribution.
P = smoothed / smoothed.sum(dim=1, keepdim=True)

In [6]:
# Sample 5 names from the model, one character at a time.
# A dedicated, seeded generator makes the samples reproducible on every
# Restart & Run All without touching torch's global RNG state.
g = torch.Generator().manual_seed(42)
end_ix = vtoi["."]  # index of the "." token that terminates a word

for _ in range(5):
  w = ""
  context = "." * block_size  # start from the all-"." context
  while True:
    p = P[ctoi[context]]  # distribution over the next character given the context
    i = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    if i == end_ix:
      print(w)
      break

    ch = itov[i]
    w += ch
    context = context[1:] + ch  # slide the context one character forward

kimaryn
kha
jadie
sleyon
taya


In [7]:
# Average negative log-likelihood per predicted character (lower is better),
# reported separately for the training names and the held-out names.
# Each set is paired with its label explicitly, so the label never depends on
# comparing list contents.
for name, X in [("train set", train_set), ("test set", test_set)]:
  logsum = 0
  count = 0  # number of predicted characters, including each word's final "."
  for x in X:
    w = "." * block_size + x + "."
    for pos in range(len(w) - block_size):
      ic = ctoi[w[pos:pos+block_size]]   # context preceding the target character
      iv = vtoi[w[pos+block_size]]       # target character
      logsum += P[ic, iv].log()
      count += 1
  # An empty set has no predictions; report nan instead of dividing by zero.
  nll = -logsum / count if count else float("nan")
  print(f"{nll:.2f} on {name}")

2.20 on train set
2.41 on test set
