In [126]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext
from torch.utils.data import DataLoader, TensorDataset
from datetime import datetime
import torchvision
from torchvision import transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch.nn.functional as F
import numpy as np
from collections import Counter
import os
import re

[nltk_data] Downloading package punkt to C:\Users\Magnus
[nltk_data]     Lohne\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Task 2.1 - Word embedding

## Task 2.1.1 - Tokenizing
We start by defining a function that takes a file path, reads the file and returns the text. Then, we tokenize it using the nltk package. The we define the lists of txt files, and run them through the function to define the lists of tokens. We use the functions defined in the tutorial for handling text data and adjust them.

In [46]:
# tokenizer will split a long text into a list of english words
TOKENIZER_EN = get_tokenizer('basic_english')
# Where we will store / load all our models, datasets, vocabulary, etc.
PATH_GENERATED = '../Project 3/'
# Minimum number of occurence of a word in the text to add it to the vocabulary
MIN_FREQ = 100

def read_files(datapath='./data_train/'):
    """
    Return a list of strings, one for each line in each .txt files in 'datapath'
    """
    # Find all txt files in directory 
    files = os.listdir(datapath)
    files = [datapath + f for f in files if f.endswith(".txt")]
    
    # Stores each line of each book in a list
    lines = []
    for f_name in files:
        with open(f_name, encoding="utf8") as f:
            lines += f.readlines()
    return lines

def tokenize(lines, tokenizer=TOKENIZER_EN):
    """
    Tokenize the list of lines
    """
    list_text = []
    for line in lines:
        list_text += tokenizer(line)
    return list_text

def yield_tokens(lines, tokenizer=TOKENIZER_EN):
    """
    Yield tokens, ignoring names and digits to build vocabulary
    """
    # Match any word containing digit
    no_digits = '\w*[0-9]+\w*'
    # Match word containing a uppercase 
    no_names = '\w*[A-Z]+\w*'
    # Match any sequence containing more than one space
    no_spaces = '\s+'
    
    for line in lines:
        line = re.sub(no_digits, ' ', line)
        line = re.sub(no_names, ' ', line)
        line = re.sub(no_spaces, ' ', line)
        yield tokenizer(line)
        
# ----------------------- Tokenize texts -------------------------------
# Load tokenized versions of texts if you have already generated it
# Otherwise, create it and save it
if os.path.isfile(PATH_GENERATED + "words_train.pt"):
    words_train = torch.load(PATH_GENERATED + "words_train.pt")
    words_val = torch.load(PATH_GENERATED + "words_val.pt")
    words_test = torch.load(PATH_GENERATED + "words_test.pt")
else:
    # Get lists of strings, one for each line in each .txt files in 'datapath' 
    lines_books_train = read_files('./data_train/')
    lines_books_val = read_files('./data_val/')
    lines_books_test = read_files('./data_test/')

    # List of words contained in the dataset
    words_train = tokenize(lines_books_train)
    words_val = tokenize(lines_books_val)
    words_test = tokenize(lines_books_test)
    
    torch.save(words_train , PATH_GENERATED + "words_train.pt")
    torch.save(words_val , PATH_GENERATED + "words_val.pt")
    torch.save(words_test , PATH_GENERATED + "words_test.pt")

## Task 2.1.2 - Vocabulary
We define a vocabulary and print some information about it. We use the functions defined in the tutorial for handling text data and adjust them.

In [47]:
# ----------------------- Create vocabulary ----------------------------

def count_freqs(words, vocab):
    """
    Count occurrences of each word in vocabulary in the data
    
    Useful to get some insight on the data and to compute loss weights
    """
    freqs = torch.zeros(len(vocab), dtype=torch.int)
    for w in words:
        freqs[vocab[w]] += 1
    return freqs

def create_vocabulary(lines, min_freq=MIN_FREQ):
    """
    Create a vocabulary (list of known tokens) from a list of strings
    """
    # vocab contains the vocabulary found in the data, associating an index to each word
    vocab = build_vocab_from_iterator(yield_tokens(lines), min_freq=min_freq, specials=["<unk>"])
    # Since we removed all words with an uppercase when building the vocabulary, we skipped the word "I"
    vocab.append_token("i")
    # Value of default index. This index will be returned when OOV (Out Of Vocabulary) token is queried.
    vocab.set_default_index(vocab["<unk>"])
    return vocab

VOCAB_FNAME = "vocabulary.pt"
# Load vocabulary if you have already generated it
# Otherwise, create it and save it
if os.path.isfile(PATH_GENERATED + VOCAB_FNAME):
    vocab = torch.load(PATH_GENERATED + VOCAB_FNAME)
else:
    # Create vocabulary based on the words in the training dataset
    vocab = create_vocabulary(lines_books_train, min_freq=MIN_FREQ)
    torch.save(vocab, PATH_GENERATED + VOCAB_FNAME)
    


# ------------------------ Quick analysis ------------------------------
VOCAB_SIZE = len(vocab)
print("Total number of words in the training dataset:     ", len(words_train))
print("Total number of words in the validation dataset:   ", len(words_val))
print("Total number of words in the test dataset:         ", len(words_test))
print("Number of distinct words in the training dataset:  ", len(set(words_train)))
print("Number of distinct words kept (vocabulary size):   ", VOCAB_SIZE)

Total number of words in the training dataset:      2684706
Total number of words in the validation dataset:    49526
Total number of words in the test dataset:          124152
Number of distinct words in the training dataset:   52105
Number of distinct words kept (vocabulary size):    1880


Given the size of the training dataset, the vocabulary is quite small. 

## Task 2.1.3 - CBOW Model
We start by defining the training dataset with context and target words. Then we define a simple architecture. We use the functions defined in the tutorial for handling text data and adjust them.

In [48]:
# ------------------------ Define targets ------------------------------
def compute_label(w):
    """
    helper function to define MAP_TARGET
    
    - 0 = 'unknown word'
    - 1 = 'punctuation' (i.e. the '<unk>' token)
    - 2 = 'is an actual word'
    """
    if w in ['<unk>']:
        return 0
    elif w in [',', '.', '(', ')', '?', '!']:
        return 1
    else:
        return 2

# true labels for this task:
MAP_TARGET = {
    vocab[w]:compute_label(w) for w in vocab.lookup_tokens(range(VOCAB_SIZE))
}

# context size for this task 
CONTEXT_SIZE = 3


# ---------------- Define context / target pairs -----------------------
def create_dataset(
    text, vocab, 
    context_size=CONTEXT_SIZE, map_target=MAP_TARGET
):
    """
    Create a pytorch dataset of context / target pairs from a text
    """
    
    n_text = len(text)
    n_vocab = len(vocab)
    
    # Change labels if only a few target are kept, otherwise, each word is
    # associated with its index in the vocabulary
    if map_target is None:
        map_target = {i:i for i in range(n_vocab)}
    
    # Transform the text as a list of integers.
    txt = [vocab[w] for w in text]

    # Start constructing the context / target pairs...
    contexts = []
    targets = []
    for i in range(n_text - context_size):
        
        # Word used to define target
        t = txt[i + context_size]
        
        # Context before the target
        c = txt[i:i + context_size]
        
        targets.append(map_target[t])
        contexts.append(torch.tensor(c))
            
    # contexts of shape (N_dataset, context_size)
    # targets of shape  (N_dataset)
    contexts = torch.stack(contexts)
    targets = torch.tensor(targets)
    # Create a pytorch dataset out of these context / target pairs
    return TensorDataset(contexts, targets)

In [49]:
def load_dataset(words, vocab, fname):
    """
    Load dataset if already generated, otherwise, create it and save it
    """
    # If already generated
    if os.path.isfile(PATH_GENERATED + fname):
        dataset = torch.load(PATH_GENERATED + fname)
    else:
        # Create context / target dataset based on the list of strings
        dataset = create_dataset(words, vocab)
        torch.save(dataset, PATH_GENERATED + fname)
    return dataset

data_train = load_dataset(words_train, vocab, "data_train.pt")
data_val = load_dataset(words_val, vocab, "data_val.pt")
data_test = load_dataset(words_test, vocab, "data_test.pt")

In [50]:
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
    
    def forward(self, context_idxs):
        embeds = self.embeddings(context_idxs)
        out = torch.mean(embeds, dim=1)
        log_probs = self.linear(out)
        return log_probs

## Task 2.1.4 - Training several models
We define a training loop and a function to compute accuracy. Then we train models with the architecture above with different hyperparameters, i.e. embedding dimension, batch size and learning rate. Then we choose the best model, based on validation accuracy.

In [202]:
def train(model, loss_fn, optimizer, data_loader, n_epochs):

    model.train()
    
    for epoch in range(n_epochs):
        total_loss = 0
        for context_idxs, target_idxs in data_loader:
            
            optimizer.zero_grad()
            log_probs = model(context_idxs)
            
            loss = loss_fn(log_probs, target_idxs)
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        print('{}  |  Epoch {}  |  Training loss {:.3f}'.format(
            datetime.now().time(), epoch+1, total_loss / len(data_loader)))

In [203]:
def compute_accuracy(model, loader):
    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for contexts, targets in loader:

            outputs = model(contexts)
            _, predicted = torch.max(outputs, dim=1)
            total += len(targets)
            correct += int((predicted == targets).sum())

    acc =  correct / total
    return acc

In [96]:
torch.manual_seed(13)

vocab_size = VOCAB_SIZE
embedding_dim = 10

cbow_model1 = CBOWModel(vocab_size, embedding_dim)
optimizer = optim.Adam(cbow_model1.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train, batch_size=128, shuffle=True)

train(
    model = cbow_model1, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

Epoch 1/21, Loss: 0.7568479743020895
Epoch 2/21, Loss: 0.7391480775697865
Epoch 3/21, Loss: 0.7386429603932441
Epoch 4/21, Loss: 0.7383203918089883
Epoch 5/21, Loss: 0.7380729230083924
Epoch 6/21, Loss: 0.7379986406026211
Epoch 7/21, Loss: 0.7380109585937072
Epoch 8/21, Loss: 0.7379625817561463
Epoch 9/21, Loss: 0.7379501215919977
Epoch 10/21, Loss: 0.7380440340650238
Epoch 11/21, Loss: 0.7380130876346765
Epoch 12/21, Loss: 0.7380772951959285
Epoch 13/21, Loss: 0.7381345853078067
Epoch 14/21, Loss: 0.7381831162450424
Epoch 15/21, Loss: 0.7382165207467858
Epoch 16/21, Loss: 0.7382367236196498
Epoch 17/21, Loss: 0.7382709037177753
Epoch 18/21, Loss: 0.7383780183183991
Epoch 19/21, Loss: 0.7382899303913685
Epoch 20/21, Loss: 0.7383282051296711
Epoch 21/21, Loss: 0.7382786875544061


In [98]:
val_loader = torch.utils.data.DataLoader(data_val, batch_size=128, shuffle=False)

acc_train = compute_accuracy(cbow_model1, train_loader)
acc_val = compute_accuracy(cbow_model1, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val)

Training Accuracy:     0.7129
Validation Accuracy:   0.7303


In [99]:
torch.manual_seed(13)

vocab_size = VOCAB_SIZE
embedding_dim = 12

cbow_model2 = CBOWModel(vocab_size, embedding_dim)
optimizer = optim.Adam(cbow_model2.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train, batch_size=128, shuffle=True)

train(
    model = cbow_model2, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

Epoch 1/21, Loss: 0.8596063743046839
Epoch 2/21, Loss: 0.7370489141341472
Epoch 3/21, Loss: 0.7358415521086328
Epoch 4/21, Loss: 0.7356351911183335
Epoch 5/21, Loss: 0.7355914771400561
Epoch 6/21, Loss: 0.7355422079378431
Epoch 7/21, Loss: 0.7355288976707959
Epoch 8/21, Loss: 0.73546467530429
Epoch 9/21, Loss: 0.7354956221182668
Epoch 10/21, Loss: 0.7354541537147881
Epoch 11/21, Loss: 0.7354502648420641
Epoch 12/21, Loss: 0.7354375941435117
Epoch 13/21, Loss: 0.735459428430314
Epoch 14/21, Loss: 0.7354125745418671
Epoch 15/21, Loss: 0.7354130342208444
Epoch 16/21, Loss: 0.7354271591320651
Epoch 17/21, Loss: 0.7354087704270901
Epoch 18/21, Loss: 0.7354136936215593
Epoch 19/21, Loss: 0.7353675464197051
Epoch 20/21, Loss: 0.7353538356378622
Epoch 21/21, Loss: 0.7353685641601344


In [267]:
val_loader = torch.utils.data.DataLoader(data_val, batch_size=128, shuffle=False)

acc_train = compute_accuracy(cbow_model2, train_loader)
acc_val2 = compute_accuracy(cbow_model2, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val2)

Training Accuracy:     0.0490
Validation Accuracy:   0.1214


In [101]:
torch.manual_seed(13)

vocab_size = VOCAB_SIZE
embedding_dim = 16

cbow_model3 = CBOWModel(vocab_size, embedding_dim)
optimizer = optim.Adam(cbow_model3.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train, batch_size=512, shuffle=True)

train(
    model = cbow_model3, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

Epoch 1/21, Loss: 1.0898744947100123
Epoch 2/21, Loss: 0.7428367885694896
Epoch 3/21, Loss: 0.736730985064983
Epoch 4/21, Loss: 0.7355983536789928
Epoch 5/21, Loss: 0.7353192499855108
Epoch 6/21, Loss: 0.7352139426496173
Epoch 7/21, Loss: 0.7351928712369649
Epoch 8/21, Loss: 0.7351608302395913
Epoch 9/21, Loss: 0.7351475127615336
Epoch 10/21, Loss: 0.7351378691755661
Epoch 11/21, Loss: 0.7351301855980941
Epoch 12/21, Loss: 0.7351212146785039
Epoch 13/21, Loss: 0.7351147522233443
Epoch 14/21, Loss: 0.7351035327060271
Epoch 15/21, Loss: 0.7351067304202021
Epoch 16/21, Loss: 0.7351044692172976
Epoch 17/21, Loss: 0.7351011715937352
Epoch 18/21, Loss: 0.7350970284116805
Epoch 19/21, Loss: 0.7350812663816478
Epoch 20/21, Loss: 0.7350749991760283
Epoch 21/21, Loss: 0.7350564083918463


In [103]:
val_loader = torch.utils.data.DataLoader(data_val, batch_size=128, shuffle=False)

acc_train = compute_accuracy(cbow_model3, train_loader)
acc_val3 = compute_accuracy(cbow_model3, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val3)

Training Accuracy:     0.7138
Validation Accuracy:   0.7307


Choosing the best model, using a model selection function.

In [108]:
val_accs = [acc_val, acc_val2, acc_val3]

def model_selection(val_accs):
    return f"Model {np.argmax(val_accs)+1} had the highest validation accuracy\n\nValidation accuracy: {max(val_accs)}"

print(model_selection(val_accs))

Model 2 had the highest validation accuracy

Validation accuracy: 0.7308725238777941


In [176]:
test_loader = torch.utils.data.DataLoader(data_test, batch_size=128, shuffle=False)

print(f'Test accuracy = {compute_accuracy(cbow_model2, test_loader)}')

Test accuracy = 0.2562485400607335


## Task 2.1.5 - Cosine similarity
We compute the cosine similarity matrix for the vocabulary, given a the embedding above. Then we print the most similar words to some example words and their similarities.

In [313]:
trained_embeddings = cbow_model2.embeddings.weight.data
normed_embeddings = F.normalize(trained_embeddings, p=2, dim=1)
cosine_similarity_matrix = torch.mm(normed_embeddings, normed_embeddings.t())

In [322]:
word_to_index = {word: index for index, word in enumerate(vocab.get_itos())}
index_to_word = {index: word for word, index in word_to_index.items()}

def k_most_similar_words(word, vocab, k):
    word_idx = word_to_index[word]
    sim_scores = cosine_similarity_matrix[word_idx]
    top_scores, top_indices = torch.topk(sim_scores, k+1)
    
    similar_words = [(index_to_word[idx.item()], score.item()) for idx, score in zip(top_indices[1:], top_scores[1:])]
    
    result = f"Most similar words to '{word}':\n"
    result += "\n".join([f"{word_sim[0]} (similarity: {word_sim[1]:.4f})" for word_sim in similar_words])
    return result + "\n\n"

words = ["me", "white", "man", "have", "be", "child", "yes", "what"]

for word in words:
    print(k_most_similar_words(word, vocab, k=10))

Most similar words to 'me':
yourself (similarity: 0.9330)
thee (similarity: 0.9247)
myself (similarity: 0.8752)
indeed (similarity: 0.8520)
mine (similarity: 0.8328)
us (similarity: 0.8162)
yours (similarity: 0.7865)
i (similarity: 0.7827)
certainly (similarity: 0.7681)
done (similarity: 0.7424)


Most similar words to 'white':
red (similarity: 0.9831)
blue (similarity: 0.9700)
heavy (similarity: 0.9562)
deep (similarity: 0.9554)
black (similarity: 0.9486)
sharp (similarity: 0.9402)
large (similarity: 0.9353)
soft (similarity: 0.9331)
yellow (similarity: 0.9319)
thick (similarity: 0.9156)


Most similar words to 'man':
woman (similarity: 0.9433)
person (similarity: 0.9169)
dogs (similarity: 0.9166)
soldier (similarity: 0.9162)
creature (similarity: 0.9106)
lady (similarity: 0.9105)
boy (similarity: 0.8961)
servant (similarity: 0.8941)
maid (similarity: 0.8936)
husband (similarity: 0.8936)


Most similar words to 'have':
has (similarity: 0.9597)
having (similarity: 0.9417)
had (similari

We can see that the ten most similar words actually are quite similar to the example words, at least a great part of them. With the word 'yes', there were mostly similar words that did not make sense, however.   

# Task 2.2 - Conjugating *be* and *have*

## Task 2.2.1 - Defining RNN and MLP architecture
First of all, we need to define a dataset where the targets are the particular words *be, am, are, is, was, were, been, being, have, has, had, having*. After this, we define a simple MLP first, then the RNN architecture.

In [194]:
def create_be_have_dataset(text, vocab, context_size=3, specific_targets=None):

    target_to_index = {target: i for i, target in enumerate(specific_targets)}
    n_text = len(text)
    txt = [vocab[w] for w in text]
    contexts = []
    targets = [] 
    for i in range(n_text - context_size):
        target_word = text[i + context_size]
        
        if target_word not in specific_targets:
            continue
        
        target_index = target_to_index[target_word]
        context = txt[i:i + context_size]

        targets.append(target_index)
        contexts.append(torch.tensor(context, dtype=torch.long))

    contexts = torch.stack(contexts)
    targets = torch.tensor(targets, dtype=torch.long)

    return TensorDataset(contexts, targets)

In [204]:
def load_be_have_dataset(words, vocab, fname):

    # If already generated
    if os.path.isfile(PATH_GENERATED + fname):
        dataset = torch.load(PATH_GENERATED + fname)
    else:
        # Create context / target dataset based on the list of strings
        targets = ["be", "am", "are", "is", "was", "were", "been", "being", "have", "has", "had", "having"]
        dataset = create_be_have_dataset(words, vocab, specific_targets=targets)
        torch.save(dataset, PATH_GENERATED + fname)
    return dataset

data_train_conjugate = load_be_have_dataset(words_train, vocab, "data_train_conjugation.pt")
data_val_conjugate = load_be_have_dataset(words_val, vocab, "data_val_conjugation.pt")
data_test_conjugate = load_be_have_dataset(words_test, vocab, "data_test_conjugation.pt")

In [205]:
class SimpleMLP(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_size, num_classes):
        super(SimpleMLP, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        input_features = 3 * 12 
        self.fc1 = nn.Linear(input_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [206]:
class ConjugationRNN(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_size, num_classes):
        super(ConjugationRNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.lstm = nn.LSTM(pretrained_embeddings.size(1), hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        x = lstm_out[:, -1, :]
        x = self.fc(x)
        return x

## Task 2.2.2 - Training several models
We train different models and then compute their training and validation accuracy.

In [207]:
torch.manual_seed(13)

simple_mlp1 = SimpleMLP(pretrained_embeddings = trained_embeddings, 
                        hidden_size = 10, num_classes = 12)

optimizer = optim.Adam(simple_mlp1.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train_conjugate, batch_size=512, shuffle=True)

train(
    model = simple_mlp1, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

14:54:07.539031  |  Epoch 1  |  Training loss 1.495
14:54:10.423324  |  Epoch 2  |  Training loss 1.339
14:54:12.628337  |  Epoch 3  |  Training loss 1.307
14:54:14.820748  |  Epoch 4  |  Training loss 1.294
14:54:17.058169  |  Epoch 5  |  Training loss 1.288
14:54:19.244533  |  Epoch 6  |  Training loss 1.280
14:54:21.788528  |  Epoch 7  |  Training loss 1.276
14:54:24.151099  |  Epoch 8  |  Training loss 1.271
14:54:26.598739  |  Epoch 9  |  Training loss 1.266
14:54:28.777532  |  Epoch 10  |  Training loss 1.264
14:54:30.976935  |  Epoch 11  |  Training loss 1.260
14:54:33.146028  |  Epoch 12  |  Training loss 1.258
14:54:35.657199  |  Epoch 13  |  Training loss 1.254
14:54:37.913783  |  Epoch 14  |  Training loss 1.252
14:54:40.653162  |  Epoch 15  |  Training loss 1.250
14:54:42.998909  |  Epoch 16  |  Training loss 1.248
14:54:45.240181  |  Epoch 17  |  Training loss 1.246
14:54:47.379212  |  Epoch 18  |  Training loss 1.245
14:54:49.629362  |  Epoch 19  |  Training loss 1.243
14

In [218]:
val_loader = torch.utils.data.DataLoader(data_val_conjugate, batch_size=128, shuffle=False)

acc_train = compute_accuracy(simple_mlp1, train_loader)
acc_val = compute_accuracy(simple_mlp1, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val3)

Training Accuracy:     0.4637
Validation Accuracy:   0.4579


In [210]:
torch.manual_seed(13)

trained_embeddings = cbow_model2.embeddings.weight.data

conjugation_rnn = ConjugationRNN(pretrained_embeddings = trained_embeddings,
                                 hidden_size = 10, num_classes = 12)

optimizer = optim.Adam(conjugation_rnn.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train_conjugate, batch_size=512, shuffle=True)

train(
    model = conjugation_rnn, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

14:56:27.127544  |  Epoch 1  |  Training loss 1.527
14:56:29.796404  |  Epoch 2  |  Training loss 1.324
14:56:33.053674  |  Epoch 3  |  Training loss 1.298
14:56:35.795347  |  Epoch 4  |  Training loss 1.285
14:56:38.511040  |  Epoch 5  |  Training loss 1.275
14:56:41.515107  |  Epoch 6  |  Training loss 1.269
14:56:44.246470  |  Epoch 7  |  Training loss 1.263
14:56:46.896097  |  Epoch 8  |  Training loss 1.257
14:56:49.827285  |  Epoch 9  |  Training loss 1.256
14:56:52.551694  |  Epoch 10  |  Training loss 1.250
14:56:55.573496  |  Epoch 11  |  Training loss 1.248
14:56:58.244525  |  Epoch 12  |  Training loss 1.246
14:57:00.884368  |  Epoch 13  |  Training loss 1.242
14:57:03.978466  |  Epoch 14  |  Training loss 1.240
14:57:06.893126  |  Epoch 15  |  Training loss 1.238
14:57:09.557850  |  Epoch 16  |  Training loss 1.234
14:57:12.272973  |  Epoch 17  |  Training loss 1.233
14:57:14.960042  |  Epoch 18  |  Training loss 1.231
14:57:18.111286  |  Epoch 19  |  Training loss 1.230
14

In [219]:
val_loader = torch.utils.data.DataLoader(data_val_conjugate, batch_size=128, shuffle=False)

acc_train = compute_accuracy(conjugation_rnn, train_loader)
acc_val2 = compute_accuracy(conjugation_rnn, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val3)

Training Accuracy:     0.4713
Validation Accuracy:   0.4579


In [212]:
torch.manual_seed(13)

trained_embeddings = cbow_model2.embeddings.weight.data
conjugation_rnn2 = ConjugationRNN(pretrained_embeddings = trained_embeddings, 
                                  hidden_size = 16, num_classes = 12)

optimizer = optim.Adam(conjugation_rnn2.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train_conjugate, batch_size=512, shuffle=True)

train(
    model = conjugation_rnn2, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

14:58:02.133004  |  Epoch 1  |  Training loss 1.433
14:58:04.985147  |  Epoch 2  |  Training loss 1.272
14:58:08.255441  |  Epoch 3  |  Training loss 1.255
14:58:11.518217  |  Epoch 4  |  Training loss 1.245
14:58:14.280389  |  Epoch 5  |  Training loss 1.237
14:58:17.397564  |  Epoch 6  |  Training loss 1.231
14:58:20.266093  |  Epoch 7  |  Training loss 1.228
14:58:23.141125  |  Epoch 8  |  Training loss 1.222
14:58:26.120047  |  Epoch 9  |  Training loss 1.218
14:58:28.765073  |  Epoch 10  |  Training loss 1.215
14:58:31.792394  |  Epoch 11  |  Training loss 1.212
14:58:34.415940  |  Epoch 12  |  Training loss 1.209
14:58:37.080469  |  Epoch 13  |  Training loss 1.207
14:58:40.227737  |  Epoch 14  |  Training loss 1.204
14:58:43.022375  |  Epoch 15  |  Training loss 1.202
14:58:45.710324  |  Epoch 16  |  Training loss 1.200
14:58:48.347113  |  Epoch 17  |  Training loss 1.197
14:58:51.025813  |  Epoch 18  |  Training loss 1.196
14:58:53.976760  |  Epoch 19  |  Training loss 1.194
14

In [220]:
val_loader = torch.utils.data.DataLoader(data_val_conjugate, batch_size=128, shuffle=False)

acc_train = compute_accuracy(conjugation_rnn2, train_loader)
acc_val3 = compute_accuracy(conjugation_rnn2, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val3)

Training Accuracy:     0.5211
Validation Accuracy:   0.4579


In [221]:
val_accs = [acc_val, acc_val2, acc_val3]
print(model_selection(val_accs))

Model 3 had the highest validation accuracy

Validation accuracy: 0.45791505791505793


We can see that the RNN architectures performed a little better than the simple MLP in terms of accuracy. They also use approximately the same time training.

In [222]:
test_loader = torch.utils.data.DataLoader(data_test_conjugate, batch_size=128, shuffle=False)

print(f'Test accuracy = {compute_accuracy(conjugation_rnn2, test_loader)}')

Test accuracy = 0.40650577124868836


# Task 2.3 - Text generation

## Task 2.3.1 - Predicting the next word
We define an RNN architecture that based on a context can predict the next word of the sequence. Before this, however, we need to create a new dataset for the task.

In [230]:
def create_text_generation_dataset(text, vocab, context_size=2):

    n_text = len(text)
    txt = [vocab[w] for w in text] 

    contexts = []
    targets = []
    for i in range(context_size, n_text):
        context = txt[i-context_size:i] 
        target = txt[i] 
        contexts.append(torch.tensor(context, dtype=torch.long))
        targets.append(target)

    contexts = torch.stack(contexts)
    targets = torch.tensor(targets, dtype=torch.long)
    return TensorDataset(contexts, targets)

data_train_generate = create_text_generation_dataset(words_train, vocab, CONTEXT_SIZE)
data_val_generate = create_text_generation_dataset(words_val, vocab, CONTEXT_SIZE)
data_test_generate = create_text_generation_dataset(words_test, vocab, CONTEXT_SIZE)

In [231]:
class RNNPredicting(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_size, output_size):
        super(RNNPredicting, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        embedding_dim = pretrained_embeddings.size(1)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        x, hidden = self.rnn(x, hidden)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

## Task 2.3.2 - Training several models

In [233]:
torch.manual_seed(13)

trained_embeddings = cbow_model2.embeddings.weight.data

predicting_rnn = RNNPredicting(pretrained_embeddings = trained_embeddings, 
                               hidden_size = 16, output_size = VOCAB_SIZE)

optimizer = optim.Adam(predicting_rnn.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train_generate, batch_size=512, shuffle=True)

train(
    model = predicting_rnn, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

15:56:02.959698  |  Epoch 1  |  Training loss 4.385
15:58:52.781815  |  Epoch 2  |  Training loss 4.273
16:01:34.001148  |  Epoch 3  |  Training loss 4.242
16:04:15.600314  |  Epoch 4  |  Training loss 4.226
16:07:04.520755  |  Epoch 5  |  Training loss 4.214
16:09:55.754391  |  Epoch 6  |  Training loss 4.209
16:12:45.212371  |  Epoch 7  |  Training loss 4.205
16:15:29.895858  |  Epoch 8  |  Training loss 4.201
16:18:16.003278  |  Epoch 9  |  Training loss 4.197
16:20:54.136697  |  Epoch 10  |  Training loss 4.194
16:23:31.764411  |  Epoch 11  |  Training loss 4.192
16:26:12.557297  |  Epoch 12  |  Training loss 4.190
16:28:53.708228  |  Epoch 13  |  Training loss 4.188
16:31:34.710182  |  Epoch 14  |  Training loss 4.187
16:34:24.042504  |  Epoch 15  |  Training loss 4.185
16:37:25.692957  |  Epoch 16  |  Training loss 4.183
16:40:26.423674  |  Epoch 17  |  Training loss 4.180
16:43:27.517089  |  Epoch 18  |  Training loss 4.178
16:46:13.274395  |  Epoch 19  |  Training loss 4.176
16

In [268]:
val_loader = torch.utils.data.DataLoader(data_val_generate, batch_size=128, shuffle=False)

acc_train = compute_accuracy(predicting_rnn, train_loader)
acc_val = compute_accuracy(predicting_rnn, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val)

Training Accuracy:     0.1980
Validation Accuracy:   0.1929


In [249]:
torch.manual_seed(13)

trained_embeddings = cbow_model2.embeddings.weight.data

predicting_rnn2 = RNNPredicting(pretrained_embeddings = trained_embeddings, 
                               hidden_size = 32, output_size = VOCAB_SIZE)

optimizer = optim.Adam(predicting_rnn2.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(data_train_generate, batch_size=512, shuffle=True)

train(
    model = predicting_rnn2, 
    loss_fn = loss_fn, 
    optimizer = optimizer,
    data_loader = train_loader, 
    n_epochs = 21
)

17:04:33.715233  |  Epoch 1  |  Training loss 4.158
17:07:31.976969  |  Epoch 2  |  Training loss 4.082
17:10:31.303135  |  Epoch 3  |  Training loss 4.066
18:21:14.832738  |  Epoch 4  |  Training loss 4.059
18:23:46.185132  |  Epoch 5  |  Training loss 4.054
18:26:25.773285  |  Epoch 6  |  Training loss 4.050
18:29:04.194332  |  Epoch 7  |  Training loss 4.047
18:32:04.116195  |  Epoch 8  |  Training loss 4.045
18:34:51.720145  |  Epoch 9  |  Training loss 4.044
18:37:36.027472  |  Epoch 10  |  Training loss 4.042
18:40:27.191111  |  Epoch 11  |  Training loss 4.041
18:43:06.258644  |  Epoch 12  |  Training loss 4.040
18:45:37.763255  |  Epoch 13  |  Training loss 4.039
18:48:08.862609  |  Epoch 14  |  Training loss 4.038
18:50:41.502874  |  Epoch 15  |  Training loss 4.038
18:53:14.726007  |  Epoch 16  |  Training loss 4.037
18:56:02.570742  |  Epoch 17  |  Training loss 4.037
18:58:45.260631  |  Epoch 18  |  Training loss 4.036
19:01:29.817532  |  Epoch 19  |  Training loss 4.036
19

In [269]:
val_loader = torch.utils.data.DataLoader(data_val_generate, batch_size=128, shuffle=False)

acc_train = compute_accuracy(predicting_rnn2, train_loader)
acc_val2 = compute_accuracy(predicting_rnn2, val_loader)
print("Training Accuracy:     %.4f" %acc_train)
print("Validation Accuracy:   %.4f" %acc_val2)

Training Accuracy:     0.2322
Validation Accuracy:   0.2182


In [270]:
val_accs = [acc_val, acc_val2]
print(model_selection(val_accs))

Model 2 had the highest validation accuracy

Validation accuracy: 0.21824202895624256


In [271]:
test_loader = torch.utils.data.DataLoader(data_test_generate, batch_size=128, shuffle=False)

print(f'Test accuracy = {compute_accuracy(predicting_rnn2, test_loader)}')

Test accuracy = 0.262756848625442


## Task 2.3.3 - Beam search algorithm

In [272]:
def beam_search(model, initial_context, vocab, beam_width=3, max_len=10):

    model.eval()
    initial_context = torch.tensor(initial_context, dtype=torch.long).unsqueeze(0)
    sequences = [(initial_context, 0)]

    for _ in range(max_len):
        all_candidates = []
        for seq, score in sequences:
            with torch.no_grad():
                output = model(seq)
                probabilities = F.softmax(output[-1], dim=0)
                probabilities[vocab['<unk>']] = 0

            top_probs, top_indices = torch.topk(probabilities, beam_width)
            for i in range(beam_width):
                next_word_idx = top_indices[i].unsqueeze(0)
                next_word_log_prob = top_probs[i].item()
                new_seq = torch.cat((seq, next_word_idx.unsqueeze(0)), dim=1)
                new_score = score + next_word_log_prob
                all_candidates.append((new_seq, new_score))

        all_candidates.sort(key=lambda tup: tup[1], reverse=True)
        sequences = all_candidates[:beam_width]

    best_sequence = sequences[0][0]
    best_sequence = best_sequence.squeeze().tolist()
    generated_words = [vocab.lookup_token(idx) if idx < len(vocab) else "<unk>" for idx in best_sequence]

    return generated_words

## Task 2.1.4 - Playing with the model

In [323]:
def playing_with_beam(phrases, max_len, beam_width):
    for phrase in range(len(phrases)):
        phrase_context = [vocab.get_stoi()[word] for word in phrases[phrase].split()][:CONTEXT_SIZE]
        generated_text = beam_search(predicting_rnn2, phrase_context, vocab, beam_width=beam_width, max_len=max_len)
        print(f"Starting with phrase '{phrases[phrase]}': {' '.join(generated_text)}\n\n")

In [324]:
phrases = ["i want", "he needed", "the man"]
playing_with_beam(phrases, max_len=4, beam_width=3)

Starting with phrase 'i want': i want to be done ,


Starting with phrase 'he needed': he needed , and i have


Starting with phrase 'the man': the man , and i have




In [325]:
phrases = ["i want", "he needed", "the man"]
playing_with_beam(phrases, max_len=10, beam_width=5)

Starting with phrase 'i want': i want to be able to be seen , and i have


Starting with phrase 'he needed': he needed , and i have no reason . it is a


Starting with phrase 'the man': the man s son , said the other , and i have




In [326]:
phrases = ["i want", "he needed", "the man"]
playing_with_beam(phrases, max_len=10, beam_width=10)

Starting with phrase 'i want': i want to be able to be able to be able to


Starting with phrase 'he needed': he needed , and it was not to be done , and


Starting with phrase 'the man': the man in the middle of the night , and it was




Some of the sentences made okay sense, maybe better with a higher beam width. After this last cell, the model just generated the same words over and over. Thus, we decided to remove it.