# Exercises:
* E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?


* E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?


* E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

* E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?


* E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?


* E06: meta-exercise! Think of a fun/interesting exercise and complete it.

In [167]:
import torch 
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [20]:
words = open("names.txt", "r").read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

# Exersise 1

In [14]:
itos = dict(enumerate(sorted(set(''.join(words))), start=1))
itos[0] = '.'
stoi = {v: k for k, v in itos.items()}

In [126]:
xs, ys = [], []

for word in words: 
    w_tmp = ['.'] +list(word)+ ['.']
    for i in range(len((w_tmp)) - 2): 
        for ch1, ch2, ch3 in zip(w_tmp[i], w_tmp[i+1], w_tmp[i+2]): 
            # Count bi-grams frequencies 
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]

            xs.append([ix1, ix2])
            ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)



number of examples:  392226


In [134]:
xenc = F.one_hot(xs, num_classes=27).float()
xenc.shape


torch.Size([196113, 2, 27])

In [158]:
# gradient descent
for k in range(5):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

IndexError: index 196113 is out of bounds for dimension 0 with size 196113

In [157]:
torch.arange(num).view(ys.shape[0], -1).shape[0]

196113

# Exersise 2

In [178]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(words, train_size=0.8, random_state=42)
dev, test = train_test_split(valid, train_size=0.5, random_state=42)

In [186]:
# create the dataset
xs, ys = [], []
for w in train:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    xs.append(ix1)
    ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples train: ', num)

# create the dataset
xs_dev, ys_dev = [], []
for w in dev:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    xs_dev.append(ix1)
    ys_dev.append(ix2)
xs_dev = torch.tensor(xs)
ys_dev = torch.tensor(ys)
num_dev = xs_dev.nelement()
print('number of examples dev: ', num_dev)

# create the dataset
xs_test, ys_test = [], []
for w in test:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    xs_test.append(ix1)
    ys_test.append(ix2)
xs_test = torch.tensor(xs)
ys_test = torch.tensor(ys)
num_test = xs_test.nelement()
print('number of examples dev: ', num_test)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

number of examples train:  182497
number of examples dev:  182497
number of examples dev:  182497


  xs_dev = torch.tensor(xs)
  ys_dev = torch.tensor(ys)
  xs_test = torch.tensor(xs)
  ys_test = torch.tensor(ys)


In [185]:
# gradient descent
for k in range(20):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

3.7672512531280518
3.378115653991699
3.1608033180236816
3.027028799057007
2.9344685077667236
2.86735463142395
2.816909074783325
2.777521848678589
2.745732069015503
2.719390869140625
2.6971280574798584
2.678039789199829
2.6615042686462402
2.6470706462860107
2.6343963146209717
2.62320876121521
2.613285779953003
2.604443073272705
2.5965263843536377
2.589406728744507


In [None]:
# gradient descent
for k in range(len(dev)):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad