In [1]:
# Load the training corpus: one name per line, newline characters stripped.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open("names.txt") as names_file:
  words = names_file.read().splitlines()

In [2]:
# Alphabet of the corpus: every distinct character appearing in any word, in sorted order.
charset = sorted(set(''.join(words)))

In [3]:
# Character -> integer index. Numbering starts at 1 so that index 0 stays free.
stoi = {ch: idx for idx, ch in enumerate(charset, start=1)}

In [4]:
# '.' takes index 0; the dataset code wraps every word in '.' on both sides.
stoi['.'] = 0

In [5]:
# Display the character -> index mapping.
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [6]:
# Inverse lookup (index -> character), used to turn indices back into text.
itos = {idx: ch for ch, idx in stoi.items()}
itos


{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

create dataset

In [7]:
import torch

In [8]:
# Training pairs for the bigram model: xs[k] is the index of a character and
# ys[k] is the index of the character that follows it. Every word is wrapped
# in '.' so the first and last characters of a name are covered as well.
xs, ys = [], []
for w in words:
  chs = ['.', *w, '.']
  for ch1, ch2 in zip(chs[:-1], chs[1:]):
    ix1, ix2 = stoi[ch1], stoi[ch2]
    xs.append(ix1)
    ys.append(ix2)

# Convert the Python lists of indices into tensors.
xs, ys = torch.tensor(xs), torch.tensor(ys)

One-hot encoding: the inputs are integer indices, so each one is converted to a vector before being fed to the network

In [9]:
import torch.nn.functional as F

# One row per example, one column per entry of stoi. The encoding is cast to
# float32 so it can be matrix-multiplied with floating-point weights.
x_enc = F.one_hot(xs, len(stoi)).to(torch.float32)
x_enc.shape

torch.Size([228146, 27])

In [10]:
# Check the dtype of the encoded inputs after the .float() cast.
x_enc.dtype

torch.float32

In [11]:
# A single linear layer without bias: 27 inputs (one-hot) -> 27 outputs,
# one output per possible next character.
W = torch.randn(27, 27)
logits = x_enc @ W
# Softmax written out by hand (mathematically the same as F.softmax(logits, dim=1)):
# exponentiate to get positive values, then normalise each row to sum to 1.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)
probs.shape

torch.Size([228146, 27])

In [12]:
# Sanity check: each row of probs was normalised by its own sum.
probs[0].sum()  # should be 1.0

tensor(1.)

In [13]:
# Walk through the first five bigrams and show, step by step, how the
# negative log likelihood of each one is obtained from the model output.
nlls = torch.zeros(5)
for i in range(5):
  # i-th bigram: x is the input character index, y is the label character index
  x, y = xs[i].item(), ys[i].item()
  print('--------')
  print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  # probability the model assigned to the character that actually came next
  p = probs[i, y]
  print('probability assigned by the net to the the correct character:', p.item())
  logp = p.log()
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nlls[i] = nll

# The loss is the mean of the per-example negative log likelihoods.
print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

--------
bigram example 1: .e (indexes 0,5)
input to the neural net: 0
output probabilities from the neural net: tensor([0.0275, 0.0072, 0.0182, 0.0068, 0.0126, 0.0102, 0.0148, 0.0005, 0.1086,
        0.0428, 0.0172, 0.0246, 0.0394, 0.1003, 0.0401, 0.0145, 0.0784, 0.0482,
        0.0212, 0.0164, 0.1487, 0.0630, 0.0124, 0.0207, 0.0392, 0.0375, 0.0292])
label (actual next character): 5
probability assigned by the net to the the correct character: 0.010179250501096249
log likelihood: -4.587403774261475
negative log likelihood: 4.587403774261475
--------
bigram example 2: em (indexes 5,13)
input to the neural net: 5
output probabilities from the neural net: tensor([0.0286, 0.1065, 0.0110, 0.0136, 0.0992, 0.0090, 0.0225, 0.0532, 0.0113,
        0.0754, 0.0494, 0.0154, 0.0164, 0.0190, 0.0202, 0.0166, 0.0368, 0.0241,
        0.0809, 0.0505, 0.0108, 0.0236, 0.0543, 0.0106, 0.0195, 0.1169, 0.0047])
label (actual next character): 13
probability assigned by the net to the the correct character: 0

optimization

In [14]:
# create the dataset: (current character, next character) index pairs,
# with '.' marking both the start and the end of every word
xs, ys = [], []
for w in words:
  chs = ['.', *w, '.']
  for ch1, ch2 in zip(chs[:-1], chs[1:]):
    ix1, ix2 = stoi[ch1], stoi[ch2]
    xs.append(ix1)
    ys.append(ix2)
xs, ys = torch.tensor(xs), torch.tensor(ys)
# total number of bigram examples
num = xs.numel()
print('number of examples: ', num)

# initialize the 'network': one 27x27 weight matrix drawn from a standard normal.
# A dedicated, explicitly seeded generator makes the initial weights reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

number of examples:  228146


In [15]:
# gradient descent
# The inputs never change during training, so the one-hot encoding is built
# once here instead of being re-created on every iteration.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding

for k in range(100):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # unnormalised, strictly positive "counts"
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # mean negative log likelihood of the correct next character,
  # plus an L2 penalty on the weights
  loss = -probs[torch.arange(num), ys].log().mean() + 0.1*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # reset the gradient; backward() would otherwise accumulate into it
  loss.backward()

  # update: plain gradient step with learning rate 50. Done under no_grad so the
  # in-place change to the leaf tensor is not recorded by autograd (replaces the
  # discouraged `W.data` mutation; the arithmetic is identical).
  with torch.no_grad():
    W -= 50 * W.grad

3.855605363845825
3.446014404296875
3.220780849456787
3.0834012031555176
2.9891412258148193
2.9212939739227295
2.8705763816833496
2.8312137126922607
2.799691677093506
2.7738349437713623
2.7522473335266113
2.7339911460876465
2.718406915664673
2.7050065994262695
2.6934139728546143
2.68333101272583
2.6745150089263916
2.6667675971984863
2.6599278450012207
2.653859853744507
2.6484534740448
2.64361572265625
2.639270782470703
2.6353535652160645
2.631809949874878
2.6285951137542725
2.6256701946258545
2.623002529144287
2.62056303024292
2.6183278560638428
2.616276264190674
2.614389181137085
2.6126503944396973
2.611046314239502
2.6095635890960693
2.608191728591919
2.6069204807281494
2.605740547180176
2.604644298553467
2.603623867034912
2.6026737689971924
2.601787805557251
2.6009607315063477
2.6001875400543213
2.599463939666748
2.5987868309020996
2.598151683807373
2.59755539894104
2.5969958305358887
2.5964694023132324
2.5959746837615967
2.595508337020874
2.595069169998169
2.5946547985076904
2.5942

In [16]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):

  out = []
  ix = 0  # every sample starts from the '.' boundary token
  while True:

    # Next-character distribution for the current character: one-hot encode
    # the index, apply the weights, then softmax the resulting log-counts.
    xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
    logits = xenc @ W
    counts = torch.exp(logits)
    p = counts / counts.sum(dim=1, keepdim=True)

    # Draw the next index; drawing 0 ('.') ends the current name.
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    if ix != 0:
      out.append(itos[ix])
    else:
      break

  print(''.join(out))

cexze
momakurailezityha
konimittain
llayn
ka
