### E02
Split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 

Train the bigram and trigram models only on the training set. 

Evaluate them on dev and test splits. What can you see?

In [3]:
# Load the dataset into a list
with open('names.txt', 'r') as file:
    names = file.read().split()

print('Total names:', len(names))
print(min(len(item) for item in names), max(len(item) for item in names))

Total names: 32033
2 15


In [4]:
# Splitting names into 80% train set, 10% dev set, 10% test set
import random

random.shuffle(names) # Shuffles the list in-place

split_1, split_2 = int(0.8*len(names)), int(0.9*len(names))
train_list = names[:split_1]
dev_list = names[split_1:split_2]
test_list = names[split_2:]

In [None]:
print(f'Training set: {len(train_list)/len(names)*100.:4f} %')
print(f'Dev set: {len(dev_list)/len(names)*100.:4f} %')
print(f'Test set: {len(test_list)/len(names)*100.:4f} %')

Training set: 79.998751 %
Dev set:, 9.999063 %
Test set:, 10.002185 %


In [9]:
# Form stoi and itos
vocab = sorted(list(set(''.join(names))))
stoi = {s: i+1 for i, s in enumerate(vocab)}
stoi['.'] = 0
itos = {stoi[s]: s for s in stoi}

## Bigram

In [16]:
import torch

# Preparing bigrams
xs, ys = [], []
for name in train_list:
    name = ['.'] + list(name) + ['.']
    for xi, yi in zip(name, name[1:]):
        xs.append(stoi[xi])
        ys.append(stoi[yi])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

# Encoding the inputs
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=27).float()
yenc = F.one_hot(ys, num_classes=27).float()

In [17]:
# Initialization
g = torch.Generator().manual_seed(2147483647)
W_bigram = torch.randn(27, 27, generator=g, requires_grad=True)
logits = (xenc @ W_bigram)
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)

In [18]:
num = ys.nelement() # number of examples

# Gradient descent
for _ in range(10):
    # Forward pass
    logits = xenc @ W_bigram # predicted log counts

    # softmax (next two lines)
    counts = logits.exp() # predicted counts
    probs = counts / counts.sum(1, keepdims=True) # calculating probablities
    
    loss = -probs[torch.arange(num), ys].log().mean()

    print(loss.item())

    # Backward pass
    W_bigram.grad = None
    loss.backward()

    # Update the weights
    W_bigram.data += -50 * W_bigram.grad

3.758641004562378
3.3703830242156982
3.153103828430176
3.019284963607788
2.9265940189361572
2.8592894077301025
2.8086068630218506
2.7689595222473145
2.7369072437286377
2.7103095054626465


In [34]:
# Evaluate loss of Dev set and test set
# - prepare dataset on dev

xs_dev, xs_test, ys_dev, ys_test = [], [], [], []
for name in dev_list:
    name = ['.'] + list(name) + ['.']
    for ch1, ch2 in zip(name, name[1:]):
        xs_dev.append(stoi[ch1])
        ys_dev.append(stoi[ch2])

for name in test_list:
    name = ['.'] + list(name) + ['.']
    for ch1, ch2 in zip(name, name[1:]):
        xs_test.append(stoi[ch1])
        ys_test.append(stoi[ch2])

xs_dev = torch.tensor(xs_dev)
ys_dev = torch.tensor(ys_dev)
xs_test = torch.tensor(xs_test)
ys_test = torch.tensor(ys_test)

num_dev = len(xs_dev)
num_test = len(xs_test)

In [36]:
probs.shape

torch.Size([182719, 27])

In [None]:
dev_loss = -probs[torch.arange(num_dev), ys_dev].log().mean()
test_loss = -probs[torch.arange(num_test), ys_test].log().mean()

In [None]:
print(loss.item(), test.item())

NameError: name 'test' is not defined