In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('names.txt') as file:
    names = file.read().splitlines()

In [3]:
names[:3]

['emma', 'olivia', 'ava']

In [4]:
letters = set()
for name in names:
    for letter in name:
        letters.add(letter)

In [5]:
letters = list(sorted(letters))

## E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

First idea is to do something like this:

In [6]:
for name in names[:3]:
    print('########')
    print(name)
    name = '.' + name + '.'
    for i,j,k in (zip(name, name[1:], name[2:])):
        print(i+j, '->',k)
    print('########')

########
emma
.e -> m
em -> m
mm -> a
ma -> .
########
########
olivia
.o -> l
ol -> i
li -> v
iv -> i
vi -> a
ia -> .
########
########
ava
.a -> v
av -> a
va -> .
########


If we do this, then we have the problem that the very first 3-gram is actually not starting from a neutral state, it starts with ".x" where x is the first letter of the name. So instead, we need to do something like this:

In [7]:
for name in names[:3]:
    print('########')
    print(name)
    name = '..' + name + '.'
    for i,j,k in (zip(name, name[1:], name[2:])):
        print(i+j, '->',k)
    print('########')

########
emma
.. -> e
.e -> m
em -> m
mm -> a
ma -> .
########
########
olivia
.. -> o
.o -> l
ol -> i
li -> v
iv -> i
vi -> a
ia -> .
########
########
ava
.. -> a
.a -> v
av -> a
va -> .
########


So to append `..` to the beginning, then we start with a neutral state and predict the first letter from that neutral state

Next, we need to construct the matrix in which we will store the counts. Previously we had a 27x27 matrix to store all this information, but now that we have a trigram, so the row dimension will become larger. Because the row dimension will have to cover all the `..`, `.x` and `xy` combinations

With itertools product we can make these kind of combinations:

In [8]:
from itertools import product

list(product([1,2,3], [1,2,3]))

[(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3), (3, 1), (3, 2), (3, 3)]

In [16]:
combinations = ['..'] + list(product('.', letters)) + list(product(letters, letters))
combinations[0:10], combinations[-10:]

(['..',
  ('.', 'a'),
  ('.', 'b'),
  ('.', 'c'),
  ('.', 'd'),
  ('.', 'e'),
  ('.', 'f'),
  ('.', 'g'),
  ('.', 'h'),
  ('.', 'i')],
 [('z', 'q'),
  ('z', 'r'),
  ('z', 's'),
  ('z', 't'),
  ('z', 'u'),
  ('z', 'v'),
  ('z', 'w'),
  ('z', 'x'),
  ('z', 'y'),
  ('z', 'z')])

In [10]:
ss_to_i = {c[0]+c[1]:i for i, c in enumerate(combinations)}

In [12]:
len(ss_to_i)

703

In [297]:
s_to_i = {s: i+1 for i,s in enumerate(letters)}
s_to_i['.'] = 0

i_to_s = {i: s for s,i in s_to_i.items()}

In [298]:
s_to_i

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

### Build a simple counting model:

In [299]:
combinations = ['..'] + list(product('.', letters)) + list(product(letters, letters))

ss_to_i = {c[0]+c[1]:i for i, c in enumerate(combinations)}

for name in names[:3]:
    name = '..' + name + '.'
    for i,j,k in (zip(name, name[1:], name[2:])):
        print(i+j, '->',k)

N = torch.zeros((len(ss_to_i), len(s_to_i)))

for name in names:
    name = '..' + name + '.'
    for i,j,k in (zip(name, name[1:], name[2:])):
        row = ss_to_i[i+j]
        col = s_to_i[k]
        N[row, col] += 1
print(N.sum(), N.shape)

.. -> e
.e -> m
em -> m
mm -> a
ma -> .
.. -> o
.o -> l
ol -> i
li -> v
iv -> i
vi -> a
ia -> .
.. -> a
.a -> v
av -> a
va -> .
tensor(228146.) torch.Size([703, 27])


In [300]:
# Smoothing
# N = N+1

# Use this model to generate some names:

P = N / N.sum(1, keepdim=True)

assert P[0].sum().item() - 1.0 < 1e-5

g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    name = ".."
    while True:
        ss = name[-2:]
        i = ss_to_i[ss]
        sampled_i = int(torch.multinomial(P[i], 1, replacement=True, generator=g).item())
        sampled_s = i_to_s[sampled_i]
        if sampled_i == 0:
            break
        name += sampled_s
    print(name)

..mip
..axx
..mereyannyaar
..knooraen
..el
..marviovania
..odarimalaalexiaganilley
..helahroni
..haat
..affiya
..isemarrisleemikh
..bech
..amilleia
..trutandenneppalycethon
..jan
..kryn
..yusehanii
..laymira
..knoenoah
..nowynni


In [301]:
# What's the negative loss likelihood of our model? Depends on how we do this, here we use ".name.", which is wrong:

sumlogprob = torch.tensor(0.0)
count = 0
for name in names:
    name = '.' + name + '.'
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        prob = P[ss_to_i[ch1+ch2], s_to_i[ch3]]
        logprob = torch.log(prob)
        sumlogprob += logprob
        count += 1
nll = -sumlogprob / count
print(f'{nll.item()=}')


nll.item()=2.0619611740112305


In [302]:
# What's the negative loss likelihood of our model? Depends on how we do this, here we use "..name.":

sumlogprob = torch.tensor(0.0)
count = 0
for name in names:
    name = '..' + name + '.'
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        prob = P[ss_to_i[ch1+ch2], s_to_i[ch3]]
        logprob = torch.log(prob)
        sumlogprob += logprob
        count += 1
nll = -sumlogprob / count
print(f'{nll.item()=}')


nll.item()=2.185652017593384


### Instead of the 2D matrix that holds the counts, we can also use a 3D tensor of size 27x27x27

In [303]:
N = torch.zeros((len(s_to_i), len(s_to_i), len(s_to_i)))

for name in names:
    name = '..' + name + '.'
    for i,j,k in (zip(name, name[1:], name[2:])):
        ix1 = s_to_i[i]
        ix2 = s_to_i[j]
        ix3 = s_to_i[k]
        N[ix1, ix2, ix3] += 1
print(N.sum(), N.shape)

tensor(228146.) torch.Size([27, 27, 27])


In [304]:
# Smoothing
# N = N+100

# Use this model to generate some names:

P = N / N.sum(2, keepdim=True)

assert P[0,0].sum().item() - 1.0 < 1e-5

g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    name = ".."
    while True:
        ix1 = s_to_i[name[-2]]
        ix2 = s_to_i[name[-1]]
        sampled_i = int(torch.multinomial(P[ix1, ix2], 1, replacement=True, generator=g).item())
        sampled_s = i_to_s[sampled_i]
        if sampled_i == 0:
            break
        name += sampled_s
    print(name)

..mip
..axx
..mereyannyaar
..knooraen
..el
..marviovania
..odarimalaalexiaganilley
..helahroni
..haat
..affiya
..isemarrisleemikh
..bech
..amilleia
..trutandenneppalycethon
..jan
..kryn
..yusehanii
..laymira
..knoenoah
..nowynni


In [305]:
# What's the negative loss likelihood of our model?
sumlogprob = torch.tensor(0.0)
count = 0
for name in names:
    name = '..' + name + '.'
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        ix1 = s_to_i[ch1]
        ix2 = s_to_i[ch2]
        ix3 = s_to_i[ch3]
        prob = P[ix1, ix2, ix3]
        logprob = torch.log(prob)
        sumlogprob += logprob
        count += 1
nll = -sumlogprob / count
print(f'{nll.item()=}')


nll.item()=2.185652017593384


### Shows the exact same number as before, that's nice 

### Let's try to make the same model but then with gradient descent

In [330]:
# Create datasets:

xs = []
x1s = []
x2s = []
ys = []

for name in names:
    name = '..' + name + '.'
    for x1, x2, y in zip(name, name[1:], name[2:]):
        x1s.append(s_to_i[x1])
        x2s.append(s_to_i[x2])
        xs.append([s_to_i[x1], s_to_i[x2]])
        ys.append(s_to_i[y])

xs = torch.tensor(xs)
x1s = torch.tensor(x1s)
x2s = torch.tensor(x2s)
ys = torch.tensor(ys)
num = len(xs)

In [331]:
x1s[:3], x2s[:3]

(tensor([0, 0, 5]), tensor([ 0,  5, 13]))

In [332]:
xs[:3]

tensor([[ 0,  0],
        [ 0,  5],
        [ 5, 13]])

In [333]:
ys[:3]

tensor([ 5, 13, 13])

In [334]:
W = torch.arange(3*3*3).reshape([3,3,3])

In [335]:
W

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]])

In [336]:
x1 = torch.nn.functional.one_hot(torch.tensor(0), num_classes=3)
x2 = torch.nn.functional.one_hot(torch.tensor(1), num_classes=3)
xs = torch.nn.functional.one_hot(torch.tensor([0,1]), num_classes=3)

In [337]:
x2 @ (x1 @ W)

tensor([ 9, 10, 11])

In [338]:
x1.shape, W.shape

(torch.Size([3]), torch.Size([3, 3, 3]))

In [339]:
x1 @ W

tensor([[ 0,  1,  2],
        [ 9, 10, 11],
        [18, 19, 20]])

In [340]:
(x1 @ W).shape

torch.Size([3, 3])

In [341]:
(x2 @ (x1 @ W)).shape

torch.Size([3])

In [342]:
x1 = torch.nn.functional.one_hot(torch.tensor([0,0]), num_classes=3)
x2 = torch.nn.functional.one_hot(torch.tensor([1,1]), num_classes=3)

In [343]:
x2 @ (x1 @ W)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (9x2 and 3x2)

In [344]:
x1.shape, W.shape

(torch.Size([2, 3]), torch.Size([3, 3, 3]))

In [345]:
(x1 @ W).shape

torch.Size([3, 2, 3])

In [346]:
(x1 @ W)

tensor([[[ 0,  1,  2],
         [ 0,  1,  2]],

        [[ 9, 10, 11],
         [ 9, 10, 11]],

        [[18, 19, 20],
         [18, 19, 20]]])

In [347]:
x2.shape

torch.Size([2, 3])

In [348]:
x2.view(3,-1) @ (x1 @ W).view(3,3,-1)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (6x3 and 2x3)

In [349]:
x2.view(3,-1).shape

torch.Size([3, 2])

This really doesn't work, I don't know how we can use two one-hot encoded vectors and multiply that with W to get the row vector we are interested in, instead we can just index into W:

In [350]:
x1s[:3], x2s[:3]

(tensor([0, 0, 5]), tensor([ 0,  5, 13]))

In [351]:
W = torch.rand((27,27,27), dtype=torch.float, requires_grad=True)

In [352]:
for i in range(20):
    logits = W[x1s, x2s, :]
    counts = torch.exp(logits)
    probs = counts / counts.sum(1, keepdim=True)
    nll = -probs[torch.arange(num), ys].log().mean()
    print(nll.item())
    nll.backward()

    W.data += -torch.tensor(200.0) * W.grad
    W.grad = None

3.340085506439209
3.1340880393981934
3.0591394901275635
2.8926663398742676
2.837334156036377
2.8620455265045166
2.7483880519866943
2.697908878326416
2.733649730682373
2.6369221210479736
2.6320204734802246
2.6898727416992188
2.5979843139648438
2.558075428009033
2.5886223316192627
2.524501323699951
2.5174810886383057
2.5000827312469482
2.503399610519409
2.508382797241211


In [353]:
for i in range(100):
    logits = W[x1s, x2s, :]
    counts = torch.exp(logits)
    probs = counts / counts.sum(1, keepdim=True)
    nll = -probs[torch.arange(num), ys].log().mean()
    if i%10 == 0:
        print(nll.item())
    nll.backward()

    W.data += -torch.tensor(150.0) * W.grad
    W.grad = None

2.5790505409240723
2.431211471557617
2.401346445083618
2.3636972904205322
2.3635730743408203
2.330418825149536
2.337049722671509
2.308791399002075
2.3189427852630615
2.2933919429779053


In [355]:
for i in range(2000):
    logits = W[x1s, x2s, :]
    counts = torch.exp(logits)
    probs = counts / counts.sum(1, keepdim=True)
    nll = -probs[torch.arange(num), ys].log().mean()
    if i%100 == 0:
        print(nll.item())
    nll.backward()

    W.data += -torch.tensor(100.0) * W.grad
    W.grad = None

2.2422726154327393
2.2320902347564697
2.2253077030181885
2.2203996181488037
2.2166666984558105
2.2137210369110107
2.2113301753997803
2.209345579147339
2.2076680660247803
2.206228733062744
2.2049784660339355
2.2038803100585938
2.2029073238372803
2.202038049697876
2.2012557983398438
2.200547933578491
2.199903726577759
2.199314594268799
2.198773145675659
2.1982738971710205


In [357]:
# finally, sample from the neural net model
g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    out = []
    name = ".."
    while True:
        ix1 = s_to_i[name[-2]]
        ix2 = s_to_i[name[-1]]

        logits = W[ix1,ix2,:]
        counts = logits.exp()
        p = counts / counts.sum()

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        name += i_to_s[int(ix)]
        if ix == 0:
            break
    print(''.join(name))

..mip.
..axx.
..mereyannyaar.
..knooraen.
..el.
..marviovania.
..odarimalaalexiaganilley.
..helahroni.
..haat.
..affiya.
..isemarrisleemikh.
..bech.
..amilleia.
..trutandenneppalycethon.
..jan.
..kryn.
..yusehanii.
..laymira.
..kni.
..steferrysioratten.


## E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?


In [166]:
len(names)

32033

In [358]:
train_threshold = int(len(names)*0.8)
val_threshold = int(len(names)*0.9)

train_names = names[:train_threshold]
val_names = names[train_threshold:val_threshold]
test_names = names[val_threshold:]

In [359]:
names[train_threshold-5:train_threshold+5]

['alioune',
 'alix',
 'alois',
 'alva',
 'amirr',
 'amrom',
 'aniket',
 'ansen',
 'apolo',
 'aqib']

In [360]:
train_names[-1], val_names[0]

('amirr', 'amrom')

In [361]:
len(train_names) + len(val_names) + len(test_names) == len(names)

True

In [362]:
N = torch.zeros((len(ss_to_i), len(s_to_i)))

for name in train_names:
    name = '..' + name + '.'
    for i,j,k in (zip(name, name[1:], name[2:])):
        row = ss_to_i[i+j]
        col = s_to_i[k]
        N[row, col] += 1
print(N.sum(), N.shape)

tensor(182778.) torch.Size([703, 27])


In [366]:
# Smoothing
N = N+1

# Use this model to generate some names:

P = N / N.sum(1, keepdim=True)

assert P[0].sum().item() == 1.0

g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    name = ".."
    while True:
        ss = name[-2:]
        i = ss_to_i[ss]
        sampled_i = int(torch.multinomial(P[i], 1, replacement=True, generator=g).item())
        sampled_s = i_to_s[sampled_i]
        if sampled_i == 0:
            break
        name += sampled_s
    print(name)

..mir
..axx
..merfynney
..grahamir
..ivaj
..angerhyx
..ron
..na
..ollah
..daishaleilliencelbelyn
..race
..ta
..ceevlah
..heigh
..roldjqjr
..ai
..ed
..jilleia
..trutcjlgmusqxdfzdevbwqplen
..kryn


In [367]:
# What's the negative loss likelihood of our model on train_names?
sumlogprob = torch.tensor(0.0)
count = 0
for name in train_names:
    name = '.' + name + '.'
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        prob = P[ss_to_i[ch1+ch2], s_to_i[ch3]]
        logprob = torch.log(prob)
        sumlogprob += logprob
        count += 1
nll = -sumlogprob / count
print(f'{nll.item()=}')

nll.item()=2.053244113922119


In [370]:
# What's the negative loss likelihood of our model on val_names?
sumlogprob = torch.tensor(0.0)
count = 0
for name in val_names:
    name = '.' + name + '.'
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        prob = P[ss_to_i[ch1+ch2], s_to_i[ch3]]
        logprob = torch.log(prob)
        sumlogprob += logprob
        count += 1
nll = -sumlogprob / count
print(f'{nll.item()=}')

nll.item()=2.3264338970184326


In [371]:
# What's the negative loss likelihood of our model on test_names?
sumlogprob = torch.tensor(0.0)
count = 0
for name in test_names:
    name = '.' + name + '.'
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        prob = P[ss_to_i[ch1+ch2], s_to_i[ch3]]
        logprob = torch.log(prob)
        sumlogprob += logprob
        count += 1
nll = -sumlogprob / count
print(f'{nll.item()=}')

nll.item()=2.337472677230835


## E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?


In [373]:
N = torch.zeros((len(ss_to_i), len(s_to_i)))

for name in train_names:
    name = '..' + name + '.'
    for i,j,k in (zip(name, name[1:], name[2:])):
        row = ss_to_i[i+j]
        col = s_to_i[k]
        N[row, col] += 1
print(N.sum(), N.shape)

tensor(182778.) torch.Size([703, 27])


In [374]:
for n in np.linspace(0,1,20+1):

    print(f'{n=}')
    
    # Smoothing
    NN = N+n

    PP = NN / NN.sum(1, keepdim=True)

    sumlogprob = torch.tensor(0.0)
    count = 0
    for name in val_names:
        name = '.' + name + '.'
        for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
            prob = PP[ss_to_i[ch1+ch2], s_to_i[ch3]]
            logprob = torch.log(prob)
            sumlogprob += logprob
            count += 1
    nll = -sumlogprob / count
    print(f'{nll.item()=}')
    
    

n=0.0
nll.item()=nan
n=0.05
nll.item()=2.3361752033233643
n=0.1
nll.item()=2.3272202014923096
n=0.15000000000000002
nll.item()=2.3231360912323
n=0.2
nll.item()=2.320887565612793
n=0.25
nll.item()=2.319657802581787
n=0.30000000000000004
nll.item()=2.319011926651001
n=0.35000000000000003
nll.item()=2.3187434673309326
n=0.4
nll.item()=2.31874942779541
n=0.45
nll.item()=2.318986415863037
n=0.5
nll.item()=2.3193023204803467
n=0.55
nll.item()=2.3197741508483887
n=0.6000000000000001
nll.item()=2.320326089859009
n=0.65
nll.item()=2.3209550380706787
n=0.7000000000000001
nll.item()=2.321634531021118
n=0.75
nll.item()=2.322371482849121
n=0.8
nll.item()=2.323124408721924
n=0.8500000000000001
nll.item()=2.3239188194274902
n=0.9
nll.item()=2.324730157852173
n=0.9500000000000001
nll.item()=2.3255748748779297
n=1.0
nll.item()=2.3264338970184326


In [375]:
# Smoothing
NN = N+0.375

# Use this model to generate some names:

P = NN / NN.sum(1, keepdim=True)

assert P[1].sum().item() - 1.0 < 1e-5

g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    name = ".."
    while True:
        ss = name[-2:]
        i = ss_to_i[ss]
        sampled_i = int(torch.multinomial(P[i], 1, replacement=True, generator=g).item())
        sampled_s = i_to_s[sampled_i]
        if sampled_i == 0:
            break
        name += sampled_s
    print(name)

..mir
..axx
..mereyannya
..salonaia
..raad
..marwinzephhara
..ollah
..daishaleilliencelbelyn
..race
..ta
..ceevi
..iselannamie
..mell
..ai
..ed
..jilleia
..trutchelissitaey
..crevilean
..kryn
..yuridanjine


In [250]:
x1s_train = x1s[:dev_threshold]
x1s_val = x1s[dev_threshold:val_threshold]
x1s_test = x1s[val_threshold:]

x2s_train = x2s[:dev_threshold]
x2s_val = x2s[dev_threshold:val_threshold]
x2s_tes = x2s[val_threshold:]

ys_train = ys[:dev_threshold]
ys_val = ys[dev_threshold:val_threshold]
ys_test = ys[val_threshold:]



for r in [1e1, 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]:

    W = torch.rand((27,27,27), dtype=torch.float, requires_grad=True)
    num = x1s_train.nelement()

    for i in range(20):
        logits = W[x1s_train, x2s_train, :]
        counts = torch.exp(logits)
        probs = counts / counts.sum(1, keepdim=True)
        nll = -probs[torch.arange(num), ys_train].log().mean() + r * (W*W).mean()
        nll.backward()

        W.data += -torch.tensor(200.0) * W.grad
        W.grad = None

    for i in range(100):
        logits = W[x1s_train, x2s_train, :]
        counts = torch.exp(logits)
        probs = counts / counts.sum(1, keepdim=True)
        nll = -probs[torch.arange(num), ys_train].log().mean() + r * (W*W).mean()
        nll.backward()

        W.data += -torch.tensor(150.0) * W.grad
        W.grad = None

    for i in range(2000):
        logits = W[x1s_train, x2s_train, :]
        counts = torch.exp(logits)
        probs = counts / counts.sum(1, keepdim=True)
        nll = -probs[torch.arange(num), ys_train].log().mean() + r * (W*W).mean()
        nll.backward()

        W.data += -torch.tensor(100.0) * W.grad 
        W.grad = None
    
    print(f'{r=}')
    print(f'{nll=}')
    
    
    num = x1s_val.nelement()
    logits = W[x1s_val, x2s_val, :]
    counts = torch.exp(logits)
    probs = counts / counts.sum(1, keepdim=True)
    nll = -probs[torch.arange(num), ys_val].log().mean() + r * (W*W).mean()
    
    print(f'{nll=}')
    

r=10.0
nll=tensor(2.8295, grad_fn=<AddBackward0>)
nll=tensor(2.8946, grad_fn=<AddBackward0>)
r=1.0
nll=tensor(2.3131, grad_fn=<AddBackward0>)
nll=tensor(2.4403, grad_fn=<AddBackward0>)
r=0.1
nll=tensor(2.0189, grad_fn=<AddBackward0>)
nll=tensor(2.1990, grad_fn=<AddBackward0>)
r=0.01
nll=tensor(1.9246, grad_fn=<AddBackward0>)
nll=tensor(2.1490, grad_fn=<AddBackward0>)
r=0.001
nll=tensor(1.9056, grad_fn=<AddBackward0>)
nll=tensor(2.1413, grad_fn=<AddBackward0>)
r=0.0001
nll=tensor(1.9034, grad_fn=<AddBackward0>)
nll=tensor(2.1390, grad_fn=<AddBackward0>)
r=1e-05
nll=tensor(1.9058, grad_fn=<AddBackward0>)
nll=tensor(2.1428, grad_fn=<AddBackward0>)
r=1e-06
nll=tensor(1.9058, grad_fn=<AddBackward0>)
nll=tensor(2.1439, grad_fn=<AddBackward0>)


### E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?


Yes, we did that already above. Instead of using the one-hot vector we can just index into the matrix/tensor, since I couldn't make the one-hot vector approach work for the 3D tensor model

We did this above when doing:

```
logits = W[x1s, x2s, :]
```

In [318]:
W = torch.tensor(np.arange(16))
W = W.reshape([4,4])
W

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15]])

In [321]:
x_enc = torch.nn.functional.one_hot(torch.tensor(3), num_classes=4)
x_enc

tensor([0, 0, 0, 1])

In [322]:
x_enc @ W

tensor([12, 13, 14, 15])

In [323]:
W[3]

tensor([12, 13, 14, 15])

In [326]:
assert (x_enc @ W == W[3]).all()

### E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

Yes, cross entropy is the same as softmax + nll (see also exercises from lesson 1)

Cross entropy is probably doing some more optimizations, numerically more stable, less manual work so less probability of errors

### E06: meta-exercise! Think of a fun/interesting exercise and complete it.