### E02
Split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 

Train the bigram and trigram models only on the training set. 

Evaluate them on dev and test splits. What can you see?

### Setting up datasets
- Load all the names into the list
- Split the list into 80% train set, 10% dev set, 10% test set
- Form stoi and itos dictionaries

In [4]:
# Load the dataset into a list
with open('names.txt', 'r') as file:
    names = file.read().split()

print('Total names:', len(names))
print(min(len(item) for item in names), max(len(item) for item in names))

Total names: 32033
2 15


In [5]:
# Splitting names into 80% train set, 10% dev set, 10% test set
import random

random.shuffle(names) # Shuffles the list in-place

split_1, split_2 = int(0.8*len(names)), int(0.9*len(names))
train_list = names[:split_1]
dev_list = names[split_1:split_2]
test_list = names[split_2:]

In [80]:
print(f'Training set: {len(train_list)/len(names)*100.:4f} %')
print(f'Dev set: {len(dev_list)/len(names)*100.:4f} %')
print(f'Test set: {len(test_list)/len(names)*100.:4f} %')

Training set: 79.998751 %
Dev set: 9.999063 %
Test set: 10.002185 %


In [6]:
# Form stoi and itos
vocab = sorted(list(set(''.join(names))))
stoi = {s: i+1 for i, s in enumerate(vocab)}
stoi['.'] = 0
itos = {stoi[s]: s for s in stoi}

### Bigram: training
- Perform Gradient descent and update weights only based on training list

In [7]:
import torch

# Preparing bigrams
xs, ys = [], []
for name in train_list:
    name = ['.'] + list(name) + ['.']
    for xi, yi in zip(name, name[1:]):
        xs.append(stoi[xi])
        ys.append(stoi[yi])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

# Encoding the inputs
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=27).float()
yenc = F.one_hot(ys, num_classes=27).float()

In [8]:
# Initialization
g = torch.Generator().manual_seed(2147483647)
W_bigram = torch.randn(27, 27, generator=g, requires_grad=True)
logits = (xenc @ W_bigram)
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)

In [None]:
num = ys.nelement() # number of examples

# Gradient descent
for _ in range(100):
    # Forward pass
    logits = xenc @ W_bigram # predicted log counts

    # softmax (next two lines): converts logits to probabilities
    counts = logits.exp() # predicted counts
    probs = counts / counts.sum(1, keepdims=True) # calculating probablities
    
    loss = -probs[torch.arange(num), ys].log().mean()

    print(loss.item())

    # Backward pass
    W_bigram.grad = None
    loss.backward()

    # Update the weights
    W_bigram.data += -50 * W_bigram.grad

3.7582688331604004
3.369582414627075
3.1526994705200195
3.0192978382110596
2.926764488220215
2.859539747238159
2.8089253902435303
2.7693326473236084
2.737311363220215
2.7107222080230713
2.688204765319824
2.6688623428344727
2.652076482772827
2.6373987197875977
2.6244869232177734
2.613070487976074
2.60292649269104
2.5938706398010254
2.5857484340667725
2.578429698944092
2.571805715560913
2.5657849311828613
2.5602893829345703
2.5552542209625244
2.550623655319214
2.5463504791259766
2.54239559173584
2.5387251377105713
2.535308837890625
2.532122850418091
2.5291450023651123
2.526357412338257
2.52374267578125
2.521287441253662
2.518977165222168
2.5168018341064453
2.5147504806518555
2.5128138065338135
2.5109832286834717
2.5092508792877197
2.5076098442077637
2.5060532093048096
2.504575729370117
2.503171682357788
2.501835823059082
2.500563859939575
2.4993512630462646
2.498194456100464
2.4970896244049072
2.4960334300994873
2.495023488998413
2.494056224822998
2.4931297302246094
2.4922404289245605
2.

### Bigram: loss on dev and test sets
- Prepare xs & corresponding ys dataset on dev & test
- Evaluate loss of Dev set and test set

In [10]:
xs_dev, xs_test, ys_dev, ys_test = [], [], [], []
for name in dev_list:
    name = ['.'] + list(name) + ['.']
    for ch1, ch2 in zip(name, name[1:]):
        xs_dev.append(stoi[ch1])
        ys_dev.append(stoi[ch2])

for name in test_list:
    name = ['.'] + list(name) + ['.']
    for ch1, ch2 in zip(name, name[1:]):
        xs_test.append(stoi[ch1])
        ys_test.append(stoi[ch2])

xs_dev = torch.tensor(xs_dev)
ys_dev = torch.tensor(ys_dev)
xs_test = torch.tensor(xs_test)
ys_test = torch.tensor(ys_test)

num_dev = len(xs_dev)
num_test = len(xs_test)

In [11]:
def compute_bigram_loss(xs, ys, W):
    xenc = F.one_hot(xs, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(len(xs)), ys].log().mean()
    return loss

In [12]:
dev_loss = compute_bigram_loss(xs_dev, ys_dev, W_bigram)
test_loss = compute_bigram_loss(xs_dev, ys_dev, W_bigram)

In [13]:
print(dev_loss.item(), test_loss.item())

2.4807989597320557 2.4807989597320557


### Trigrams: training

In [14]:
xs_train, ys_train = [], []
for name in train_list:
    name = ['.'] + list(name) + ['.']
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        xs_train.append([stoi[ch1], stoi[ch2]])
        ys_train.append(stoi[ch3])

xs_train = torch.tensor(xs_train)
ys_train = torch.tensor(ys_train)

In [15]:
# One hot encoding of the inputs
import torch.nn.functional as F
xenc_train = F.one_hot(xs_train, num_classes=27).float() # casting to float32 to match dtype of Weight
xenc_train = xenc_train.view(xenc_train.shape[0], 2*27) 
yenc_train = F.one_hot(ys_train, num_classes=27).float()

num_train = ys_train.nelement()

In [16]:
g = torch.Generator().manual_seed(2147483647)
W_tri = torch.randn(27*2, 27, generator=g, requires_grad=True)

In [36]:
for _ in range(10):
    # Forward pass
    logits = (xenc_train @ W_tri)
    counts = logits.exp()
    # Normalize the count to get probability distribution
    probs = counts / counts.sum(1, keepdims=True)

    # print(torch.arange(num))
    # print(ys.shape)
    # print(logits.shape, probs.shape)
    loss = -probs[torch.arange(num_train), ys_train].log().mean()
    print(loss.item())
    
    # Backward pass
    W_tri.grad = None
    loss.backward()
    
    # Update the weights
    W_tri.data += -10 * W_tri.grad

2.368177652359009
2.3670051097869873
2.365851879119873
2.364717483520508
2.363600730895996
2.362502098083496
2.3614211082458496
2.360356569290161
2.3593087196350098
2.3582775592803955


### Trigram: loss on dev and test sets

In [18]:
xs_dev_tri, xs_test_tri, ys_dev_tri, ys_test_tri = [], [], [], []
for name in dev_list:
    name = ['.'] + list(name) + ['.']
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        xs_dev_tri.append([stoi[ch1], stoi[ch2]])
        ys_dev_tri.append(stoi[ch3])

for name in test_list:
    name = ['.'] + list(name) + ['.']
    for ch1, ch2, ch3 in zip(name, name[1:], name[2:]):
        xs_test_tri.append([stoi[ch1], stoi[ch2]])
        ys_test_tri.append(stoi[ch3])

xs_dev_tri = torch.tensor(xs_dev_tri)
ys_dev_tri = torch.tensor(ys_dev_tri)
xs_test_tri = torch.tensor(xs_test_tri)
ys_test_tri = torch.tensor(ys_test_tri)

num_dev_tri = len(ys_dev_tri)
num_test_tri = len(ys_test_tri)

In [19]:
def compute_trigram_loss(xs, ys, W):
    xenc = F.one_hot(xs, num_classes=27).float()
    xenc = xenc.view(xenc.shape[0], 2*27) 
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(len(xs)), ys].log().mean()
    return loss

In [37]:
dev_loss_tri = compute_trigram_loss(xs_dev_tri, ys_dev_tri, W_tri)
test_loss_tri = compute_trigram_loss(xs_test_tri, ys_test_tri, W_tri)

In [38]:
print('Dev loss on trigrams:', dev_loss_tri.item())
print('Test loss on trigrams:', test_loss_tri.item())

Dev loss on trigrams: 2.366079807281494
Test loss on trigrams: 2.3610594272613525
