# L01_E04

We saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [None]:
import torch

In [None]:
import random
random.seed(42)  # fixed seed so the shuffle, and therefore the split, is reproducible

# Read one name per line. The context manager closes the file handle as soon
# as the contents are in memory instead of leaving it open until GC.
with open('../names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Shuffle once in place, then cut at the 80% and 90% marks:
# train = first 80%, dev = next 10%, test = remainder.
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

words_tr = words[:n1]
words_dev = words[n1:n2]
words_te = words[n2:]

In [None]:
# Alphabet = every distinct character seen in the training split, in sorted order.
# `sorted` accepts any iterable, so the set can be passed directly.
chars = sorted(set(''.join(words_tr)))
len(chars)

26

In [None]:
# Character -> integer id. Letters take ids 1..len(chars); id 0 is reserved
# for '.', the marker used to pad the start and end of every word.
ctoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
ctoi['.'] = 0

In [None]:
# Inverse lookup: integer id -> character.
itoc = {idx: ch for ch, idx in ctoi.items()}

In [None]:
# Vocabulary size, including the '.' marker.
num_chars = len(ctoi)
num_chars

27

In [None]:
# Give every ordered character pair (c0, c1) a unique integer id:
# id = i0*num_chars + i1, i.e. the pair read as a two-digit number in base num_chars.
# Pairs are inserted in increasing id order because both loops walk the ids sorted.
index_sorted = sorted(itoc.items(), key=lambda item: item[0])
stoi = {
    (first_ch, second_ch): (first_ix * num_chars) + second_ix
    for first_ix, first_ch in index_sorted
    for second_ix, second_ch in index_sorted
}

In [None]:
def build_dataset(words):
    """Turn a list of words into trigram (context, target) index tensors.

    Each word is padded to ``'..' + word + '.'``. Every window of three
    consecutive characters yields one example: the first two characters are
    mapped to a single context id through the notebook-level ``stoi`` table,
    and the third character is mapped to the target id through ``ctoi``.

    Args:
        words: iterable of strings to encode.

    Returns:
        A tuple ``(xs, ys)`` of 1-D int64 tensors of equal length, where
        ``xs[i]`` is the context id and ``ys[i]`` the target id of example i.

    Raises:
        KeyError: if a character pair is missing from ``stoi`` or a
            character is missing from ``ctoi``.

    Side effects:
        Prints the number of examples produced.
    """
    xs, ys = [], []

    for word in words:
        chs = '..' + word + '.'
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            xs.append(stoi[ch1, ch2])
            ys.append(ctoi[ch3])

    # prefer to use torch.tensor instead of torch.Tensor.
    # The dtype is pinned explicitly: for an empty list torch.tensor would
    # otherwise infer a float dtype, and float tensors cannot be used as indices.
    xs = torch.tensor(xs, dtype=torch.long)
    ys = torch.tensor(ys, dtype=torch.long)
    print(f'number of examples: {xs.nelement()}')

    return xs, ys

In [None]:
# Encode each split; every call also prints that split's example count.
Xtr, Ytr = build_dataset(words_tr)
Xdev, Ydev = build_dataset(words_dev)
Xte, Yte = build_dataset(words_te)

number of examples: 182625
number of examples: 22655
number of examples: 22866


In [None]:
# Check the dtype of the context ids: they are used further down to index rows of W.
Xtr.dtype

torch.int64

In [None]:
import torch.nn.functional as F

In [None]:
# Trigram model as a lookup table: one row of logits per (char, char) context,
# one column per candidate next character.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((num_chars*num_chars, num_chars), generator=g, requires_grad=True)  # single layer of 27 neurons each getting 27x27 inputs

reg = 0.004552933843974289  # regularization strength used for loss_2
learning_rate = 4*50        # gradient-descent step size
num_steps = 400

# Full-batch training: the inputs, targets and per-example row selector never
# change between steps, so build them once outside the loop.
xs, ys = Xtr, Ytr
example_ix = torch.arange(xs.nelement())

for k in range(num_steps):
    # forward pass
    logits = W[xs]  # log-counts; selecting rows of W replaces one_hot(xs) @ W
    counts = logits.exp()  # exponentiate the logits to get fake counts
    probs = counts / counts.sum(1, keepdim=True)  # normalize each row into a distribution

    loss_1 = (-(probs[example_ix, ys]).log()).mean()  # mean negative log-likelihood
    loss_2 = reg*(W**2).mean()  # regularization loss

    # loss_2 is computed and reported but not part of the optimized objective;
    # append `+ loss_2` here to turn regularization on.
    loss = loss_1  # + loss_2

    # if k%40==0: print(loss.item())

    # backward pass
    W.grad = None  # More efficient than zeroing in place; PyTorch treats a missing gradient as zero
    loss.backward()

    # update: modify the leaf tensor in place under no_grad rather than through
    # `.data`, so autograd is bypassed explicitly instead of behind its back.
    with torch.no_grad():
        W += -learning_rate * W.grad
print(loss_1.item(), loss_2.item(), loss.item())

2.2389516830444336 0.006938230711966753 2.2389516830444336


Finally, let's evaluate the loss on the test set.

In [None]:
# Held-out evaluation. W has requires_grad=True, so without no_grad this forward
# pass would record an autograd graph that is never used for a backward pass.
xs, ys = Xte, Yte
with torch.no_grad():
    logits = W[xs]  # log-counts
    counts = logits.exp()  # exponentiate the logits to get fake counts
    probs = counts / counts.sum(1, keepdim=True)
    loss = (-(probs[torch.arange(xs.nelement()), ys]).log()).mean()  # mean negative log-likelihood
print(loss.item())

2.323352575302124
