In [8]:
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [9]:
import string
# Vocabulary: the 26 lowercase letters plus '.', which marks word boundaries.
NUM_CHARS = 27
CHARS = [*string.ascii_lowercase]
# Letters map to 1..26; index 0 is reserved for the '.' boundary token.
CTOI = dict(zip(CHARS, range(1, len(CHARS) + 1)))
CTOI['.'] = 0
# Inverse lookup: index -> character.
ITOC = {index: char for char, index in CTOI.items()}

In [10]:
class Dataset:
  """Word list read from a text file and randomly split into train/eval/test.

  Attributes:
    words: every line of the file, in file order.
    train_set: random subset used for fitting.
    eval_set: random subset used for validation.
    test_set: random subset held out for final testing.
  """

  def __init__(self, path: str, split: list = (0.8, 0.1, 0.1), seed=None) -> None:
    """Load the words at `path` and split them.

    Args:
      path: text file containing one word per line.
      split: (train, eval, test) fractions of the WHOLE dataset.
      seed: optional `random_state` forwarded to `train_test_split` so the
        split can be reproduced; None (the default) draws a fresh split.

    Raises:
      ValueError: if `split` does not hold exactly three values, or the eval
        or test fraction is not positive.
    """
    train_frac, eval_frac, test_frac = split
    if eval_frac <= 0 or test_frac <= 0:
      raise ValueError(f'eval and test fractions must be positive, got {split}')

    # Context manager so the file handle is closed deterministically.
    with open(path, 'r') as f:
      self.words = f.read().splitlines()

    self.train_set, test_eval = train_test_split(
      self.words, train_size=train_frac, random_state=seed)
    # The second split only sees the held-out remainder, so the eval fraction
    # must be rescaled relative to that remainder. Passing eval_frac directly
    # would give eval just eval_frac of the remainder (2% of the data with the
    # defaults) and leave the rest (18%) to test.
    eval_share = eval_frac / (eval_frac + test_frac)
    self.eval_set, self.test_set = train_test_split(
      test_eval, train_size=eval_share, random_state=seed)

# Load the names file and report how many words landed in the training split.
data = Dataset(path='../../data/names.txt')
print(len(data.train_set))

25626


In [11]:
class NGramExplicitModel:
  """Count-based character n-gram model with add-one smoothing.

  `count()` must be called before `sample()` or `eval()`; it creates
  `self.counts` (an n-dimensional tensor of n-gram counts with NUM_CHARS
  entries per axis) and `self.P` (the same tensor, add-one smoothed and
  normalised along the last axis into next-character probabilities).
  Words are padded with index 0 ('.') at the start and terminated by '.'.
  """

  def __init__(self, n: int, data: Dataset) -> None:
    """Store the n-gram order `n` (>= 1) and the dataset to fit/evaluate on."""
    self.n = n
    self.data = data

  def count(self):
    """Tally every n-gram in the training set and derive probabilities."""
    self.counts = torch.zeros((NUM_CHARS,) * self.n, dtype=torch.int32)
    for word in self.data.train_set:
      context = [0] * (self.n-1)
      for c in word + '.':
        ix = CTOI[c]
        self.counts[tuple(context + [ix])] += 1
        # Append-then-drop keeps the context at n-1 entries for every n,
        # including n == 1 where the context must stay empty.
        context = (context + [ix])[1:]

    # +1 is add-one smoothing: unseen n-grams keep a non-zero probability.
    self.P = F.normalize((self.counts+1).float(), p=1, dim=-1)

  def sample(self, generator: torch.Generator, num_samples: int = 1):
    """Print `num_samples` words drawn from the model, one per line.

    Each printed word ends with the '.' terminator.
    """
    for _ in range(num_samples):
      sample = []
      context = [0] * (self.n-1)
      while True:
        ix = torch.multinomial(self.P[tuple(context)], num_samples=1, replacement=True, generator=generator).item()
        sample.append(ITOC[ix])
        if ix == 0:
          break
        context = (context + [ix])[1:]

      print(''.join(sample))

  def eval(self, mode: str = 'eval'):
    """Return the average negative log-likelihood per character.

    Args:
      mode: 'eval' scores the eval set, 'train' the training set; any other
        value scores the test set.

    Returns:
      0-dim tensor holding the mean negative log-probability over all scored
      characters, terminators included.
    """
    llh, n = 0, 0
    eval_set = self.data.eval_set if mode=='eval' else self.data.train_set if mode=='train' else self.data.test_set
    for word in eval_set:
      context = [0] * (self.n-1)
      for c in word + '.':
        ix = CTOI[c]
        llh += torch.log(self.P[tuple(context + [ix])])
        n += 1
        # Slide the window; without this every character would be scored
        # against the all-start context instead of its real predecessors.
        context = (context + [ix])[1:]
    return -llh/n

In [12]:
class NGramMLPModel:
  """Neural character n-gram model: context embeddings fed to one linear layer.

  The n-1 context characters are embedded via the table `C`, the embeddings
  are concatenated and multiplied by `W` to produce logits over NUM_CHARS
  next characters. The training pairs (`xs`, `ys`) and the parameters are
  built in the constructor.
  """

  def __init__(self, n: int, data: Dataset, embed_dim: int) -> None:
    """Store hyper-parameters, build the training tensors and the weights."""
    self.n = n
    self.data = data
    self.embed_dim = embed_dim
    self.prepare_ds()
    self.init_network()

  def prepare_ds(self):
    """Build `self.xs` (context indices) and `self.ys` (next-char indices).

    Each training word contributes one pair per character plus one for the
    terminating '.'; contexts start as all zeros (the '.' index).
    """
    xs, ys = [], []
    for w in self.data.train_set:
      context = [0] * (self.n-1)
      for c in w + '.':
        ix = CTOI[c]
        xs.append(context)
        ys.append(ix)
        context = context[1:] + [ix]
    self.xs = torch.tensor(xs)
    self.ys = torch.tensor(ys)

  def init_network(self):
    """Initialise `C` and `W` from a fixed-seed generator and list them in `self.params`."""
    g = torch.Generator().manual_seed(2147483647)
    self.C = torch.randn((NUM_CHARS, self.embed_dim), generator=g, requires_grad=True)
    # C[xs].shape = (num_data_pairs, context_len, embed_dim)
    self.W = torch.randn(((self.n-1)*self.embed_dim, NUM_CHARS), generator=g, requires_grad=True)
    self.params = [self.C, self.W]

  def train_network(self, num_iters: int, lr: float):
    """Run `num_iters` full-batch gradient-descent steps with step size `lr`.

    The loss is the mean negative log-likelihood of the true next characters
    plus an L2 penalty on `W`; it is printed every 10th iteration.
    """
    for k in range(num_iters):
      # NN forward pass
      xenc = self.C[self.xs].view(-1, (self.n-1)*self.embed_dim)
      logits = xenc @ self.W             # log counts -> only thing that will change in Transformers

      # cross_entropy is softmax + negative log-likelihood computed via
      # log-sum-exp, so large logits cannot overflow the way an explicit
      # logits.exp() can.
      loss = F.cross_entropy(logits, self.ys) + 0.01*(self.W**2).mean()

      # NN backward pass
      for p in self.params:
        p.grad = None       # reset gradients before backprop
      loss.backward()
      if k%10 == 0:
        print(f'Iter {k}, loss {loss.item()}')
      # Update in place without recording the step in the autograd graph.
      with torch.no_grad():
        for p in self.params:
          p -= lr*p.grad

  def sample(self, generator: torch.Generator, num_samples: int = 1):
    """Print `num_samples` words drawn from the model, one per line.

    Each printed word ends with the '.' terminator.
    """
    # Pure inference: no gradients needed.
    with torch.no_grad():
      for _ in range(num_samples):
        sample = []
        context = [0] * (self.n-1)
        while True:
          xenc = self.C[torch.tensor(context)].flatten().unsqueeze(0)
          logits = xenc @ self.W
          # Numerically stable equivalent of exp(logits) normalised to sum 1.
          p = F.softmax(logits, dim=-1)

          ix = torch.multinomial(p, num_samples=1, replacement=True, generator=generator).item()
          sample.append(ITOC[ix])
          if ix == 0:
            break
          context = context[1:] + [ix]

        print(''.join(sample))

In [13]:
# Seeded generator, and a reload of the dataset (which redraws the random split).
g = torch.Generator()
g.manual_seed(2147483647)
data = Dataset(path='../../data/names.txt')

In [None]:
# Fit the count-based bigram model, then show its average NLL on the training set.
model = NGramExplicitModel(2, data)
model.count()
model.eval('train')

In [14]:
# Neural bigram model with 20-dimensional embeddings: 250 full-batch steps at lr 1.
model = NGramMLPModel(n=2, data=data, embed_dim=20)
model.train_network(num_iters=250, lr=1)

Iter 0, loss 11.019983291625977
Iter 10, loss 4.332583427429199
Iter 20, loss 3.5005760192871094
Iter 30, loss 3.1635384559631348
Iter 40, loss 2.976283550262451
Iter 50, loss 2.85859751701355
Iter 60, loss 2.7770068645477295
Iter 70, loss 2.718291997909546
Iter 80, loss 2.675196409225464
Iter 90, loss 2.6428942680358887
Iter 100, loss 2.6179697513580322
Iter 110, loss 2.5982062816619873
Iter 120, loss 2.5822365283966064
Iter 130, loss 2.5691401958465576
Iter 140, loss 2.558243989944458
Iter 150, loss 2.549051284790039
Iter 160, loss 2.5412018299102783
Iter 170, loss 2.534437656402588
Iter 180, loss 2.5285682678222656
Iter 190, loss 2.523449182510376
Iter 200, loss 2.5189688205718994
Iter 210, loss 2.5150375366210938
Iter 220, loss 2.511582136154175
Iter 230, loss 2.5085413455963135
Iter 240, loss 2.5058605670928955


In [15]:
# Re-seed so the five sampled names are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
model.sample(g, 5)

moq.
axs.
minaynnnyles.
klemiairah.
anchriny.
