Basic trigram model, Excercise 01 from karpathys lectures


In [1]:
import torch
from torch import nn

In [2]:
names = open('names.txt', 'r').read().splitlines()
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Conversion functions
chars = sorted(list(set(''.join(names))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0 # special char denoting start/end
itos = {i:s for s,i in stoi.items()}

In [4]:

# Generate trigrams and count occurences
t = {}
for n in names:
    chs = ['.'] + list(n) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        trigram = (ch1, ch2, ch3)
        t[trigram] =  t.get(trigram,0) +1



In [5]:
# Transform trigram dict to 3D tensor, each element in tensor corresponds to specific trigram
N = torch.zeros((27,27,27), dtype=torch.int32)    

for (ch1, ch2, ch3), count in t.items():
    print(f" {ch1}{ch2}{ch3}   count {count}")
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    N[ix1, ix2, ix3] =  count


 .em   count 288
 emm   count 100
 mma   count 72
 ma.   count 174
 .ol   count 104
 oli   count 69
 liv   count 54
 ivi   count 78
 via   count 147
 ia.   count 903
 .av   count 243
 ava   count 161
 va.   count 93
 .is   count 124
 isa   count 142
 sab   count 76
 abe   count 173
 bel   count 201
 ell   count 822
 lla   count 337
 la.   count 684
 .so   count 152
 sop   count 21
 oph   count 37
 phi   count 61
 hia   count 81
 .ch   count 352
 cha   count 236
 har   count 329
 arl   count 287
 rlo   count 44
 lot   count 14
 ott   count 34
 tte   count 121
 te.   count 175
 .mi   count 393
 mia   count 95
 .am   count 384
 ame   count 226
 mel   count 188
 eli   count 537
 lia   count 518
 .ha   count 505
 arp   count 8
 rpe   count 5
 per   count 77
 er.   count 683
 .ev   count 154
 eve   count 142
 vel   count 76
 ely   count 353
 lyn   count 976
 yn.   count 953
 .ab   count 190
 abi   count 76
 big   count 15
 iga   count 35
 gai   count 18
 ail   count 259
 il.   count 119
 emi

Notes:
Add-one smoothing/Laplace-smoothing: avoid assigning a probability of 0 to unseen n-grams
keepdim: True --> output tensor has same dimensions as input tensor


In [6]:
#P(ch3)/P(ch1 and ch2) --> that_specific_trigram_count/count_all_trigrams_starting_with_same_bigram
P = (N+1).float() # Add-one smoothing
P /= P.sum(2, keepdim=True) # Sum over 3rd dim and normalize
P

tensor([[[3.7037e-02, 3.7037e-02, 3.7037e-02,  ..., 3.7037e-02,
          3.7037e-02, 3.7037e-02],
         [2.2538e-04, 4.6879e-02, 4.3047e-02,  ..., 6.3106e-03,
          3.9216e-02, 3.4483e-02],
         [7.5019e-04, 1.2753e-01, 7.5019e-04,  ..., 7.5019e-04,
          3.7509e-03, 7.5019e-04],
         ...,
         [6.2112e-03, 3.6025e-01, 6.2112e-03,  ..., 1.2422e-02,
          1.1180e-01, 7.4534e-02],
         [1.7794e-03, 4.3950e-01, 1.7794e-03,  ..., 1.7794e-03,
          1.7794e-03, 5.3381e-03],
         [1.0460e-03, 4.7803e-01, 1.0460e-03,  ..., 1.0460e-03,
          9.6234e-02, 2.0921e-03]],

        [[3.7037e-02, 3.7037e-02, 3.7037e-02,  ..., 3.7037e-02,
          3.7037e-02, 3.7037e-02],
         [7.0326e-02, 1.7153e-03, 1.0292e-02,  ..., 1.7153e-03,
          3.6021e-02, 2.0583e-02],
         [6.5141e-02, 5.1056e-02, 3.6972e-02,  ..., 1.7606e-03,
          2.2887e-02, 1.7606e-03],
         ...,
         [5.7416e-02, 2.8708e-02, 4.7847e-03,  ..., 8.6124e-02,
          3.349

In [7]:
for i in range(5):
  out = ['.']
  while True:
    if len(out) < 2:
      ix1 = ix2 = stoi[out[-1]]
    else: 
      ix1 = stoi[out[-2]]
      ix2 = stoi[out[-1]]
    p = P[ix1,ix2] # p for next char 
    #p = torch.ones(27)/ 27.0  #sanity check that there is an effect actually --> make uniform distribution
    ix3 = torch.multinomial(p,num_samples=1,replacement=True).item() # Get actual number
    #print(f"{itos[ix1] }{itos[ix2]} next best char: {itos[ix3]}")
    if ix3 == stoi['.']:
      break
    out.append(itos[ix3])

  print(''.join(out[1:])) 


zey
prenzpbalameskmon
dalyn
dyaha
woody
