In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np

from attention import MultiHeadAttention
from utils import model_selection, model_evaluation, set_device
from cbow import create_dataset, CBoW
from embedding_utils import similarity_matrix, find_N_closest

# Fixed seed, reused further down when building the data loaders and selecting models,
# so that runs are repeatable.
seed = 265
torch.manual_seed(seed)
# set_device comes from the project's utils module; the recorded run printed
# "On device cuda." at this point (presumably it selects the GPU when one is available).
device = set_device()

On device cuda.


In [2]:
# Word sequences of the three splits, as stored under generated_path
generated_path = '../generated/'
list_words_train = torch.load(generated_path + 'books_train.pt')
list_words_val = torch.load(generated_path + 'books_val.pt')
list_words_test = torch.load(generated_path + 'books_test.pt')

# vocab contains the vocabulary found in the data, associating an index to each word
vocab = torch.load(generated_path + 'vocab.pt')
# NOTE(review): `weight` is loaded but not used in the cells visible here
weight = torch.load(generated_path + 'weight.pt')

vocab_size = len(vocab)

# Each printed size is labelled with its split (previously both lines carried the
# same "dataset" label, which made the training and validation counts ambiguous).
print("Number of words in the training set:    ", len(list_words_train))
print("Number of words in the validation set:  ", len(list_words_val))
print("Number of distinct words kept:          ", vocab_size)

Total number of words in the dataset:    2684706
Total number of words in the dataset:    49526
Number of distinct words kept:           1879


In [3]:
# Number of context tokens taken on each side of the target word
context_size = 4

# Pretrained CBoW model; its embedding table is reused by the classifiers below
model_cbow = torch.load(generated_path + 'CBoW.pt')
embedding_dim = model_cbow.embedding_dim
print(model_cbow)


CBoW(
  (embeddings): Embedding(1879, 16)
  (fc1): Linear(in_features=64, out_features=1879, bias=True)
)


In [4]:
# Target classes: conjugated forms of "be" and "have"
be = ['was', 'were', 'be', 'is', 'are', 'am', 'been', 'being']
have = ['have', 'has', 'had', 'having']

white_list = [*be, *have]
n_out = len(white_list)
# vocabulary index of each target word -> class label (its position in white_list)
map_tokens = {vocab[word]: label for label, word in enumerate(white_list)}

# Build the three splits with identical settings (train, then validation, then test)
data_train, data_val, data_test = (
    create_dataset(
        words, vocab, context_size,
        white_list=white_list, occ_max=np.inf,
        map_target=map_tokens, bidirectional=True,
    )
    for words in (list_words_train, list_words_val, list_words_test)
)

# One size per line: train, validation, test
print(len(data_train), len(data_val), len(data_test), sep="\n")

# Training settings read as globals by pipeline()
n_epochs = 50
batch_size = 1024

loss_fn = nn.CrossEntropyLoss()

123651
2584
4735


In [5]:
class MyNet(nn.Module):
    """
    Two-layer MLP classifier on top of a frozen copy of a pretrained embedding.

    The input is a batch of token indices of shape (N, 2*context_size); the
    embedded tokens are flattened and fed to a 128-unit hidden layer followed
    by the output layer.

    Parameters
    ----------
    out_size : number of output classes (size of the returned logits).
    embedding : nn.Embedding whose weights are copied and frozen.
    context_size : number of context tokens on each side of the target.
    """

    def __init__(self, out_size, embedding, context_size=10):
        super().__init__()

        n_tokens, emb_dim = embedding.weight.shape

        # Private copy of the pretrained table, excluded from training
        self.embedding = nn.Embedding(n_tokens, emb_dim)
        self.embedding.load_state_dict(embedding.state_dict())
        self.embedding.requires_grad_(False)

        flat_features = emb_dim*context_size*2
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, out_size)

    def forward(self, x):
        """Return the logits of shape (N, out_size) for the index batch ``x``."""
        # The embedded batch is kept on the instance (self.emb) after each call
        with torch.no_grad():
            self.emb = self.embedding(x)
        flat = torch.flatten(self.emb, start_dim=1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)
    
class MyRNN(nn.Module):
    """
    Classifier that encodes the left and right context with two separate LSTMs.

    The input is a batch of token indices of shape (N, 2*context_size). The first
    ``context_size`` tokens go through ``lstm1`` and the remaining ones through
    ``lstm2``; the last-layer final hidden states of both are concatenated and
    passed to a 512-unit hidden layer followed by the output layer.

    Parameters
    ----------
    out_size : number of output classes (size of the returned logits).
    embedding : nn.Embedding whose weights are copied and frozen.
    context_size : number of tokens routed to ``lstm1`` (the rest go to ``lstm2``).
    L : number of stacked layers of ``lstm1``.
        NOTE(review): ``lstm2`` always uses a single layer; confirm this asymmetry
        is intended before changing it (it would alter saved state dicts).
    hidden_size : LSTM hidden size; defaults to twice the embedding dimension.
    """

    def __init__(self, out_size, embedding, context_size=10, L=1, hidden_size=None):
        super().__init__()

        (vocab_size, embedding_dim) = embedding.weight.shape
        if hidden_size is None:
            self.hidden_size = embedding_dim*2
        else:
            self.hidden_size = hidden_size

        self.context_size = context_size

        # Private copy of the pretrained table, excluded from training
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.load_state_dict(embedding.state_dict())
        for p in self.embedding.parameters():
            p.requires_grad = False
        self.lstm1 = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_size, num_layers=L, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(self.hidden_size*2, 512)
        self.fc2 = nn.Linear(512, out_size)

    def forward(self, x):
        """Return the logits of shape (N, out_size) for the index batch ``x``."""
        # Shape: (N, L, embedding_dim); kept on the instance after each call
        with torch.no_grad():
            self.emb = self.embedding(x)

        # LSTM outputs: (out, (h, c)) with h of shape (num_layer, N, H_out) and we want h[-1,:,:]
        _, (h_left, _) = self.lstm1(self.emb[:, :self.context_size])
        _, (h_right, _) = self.lstm2(self.emb[:, self.context_size:])

        # Concatenate instead of writing into a buffer allocated on a global device:
        # the result follows the device and dtype of the model itself, so the module
        # no longer depends on a notebook-level `device` variable.
        out = F.relu(torch.cat((h_left[-1], h_right[-1]), dim=1))
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        return out
    
class MyAttentionNN(nn.Module):
    """
    Classifier applying a MultiHeadAttention block to the frozen context embeddings.

    The input is a batch of token indices of shape (N, 2*context_size). The embedded
    tokens go through the attention block, are flattened, and are fed to a 128-unit
    hidden layer followed by the output layer.

    Parameters
    ----------
    out_size : number of output classes (size of the returned logits).
    embedding : nn.Embedding whose weights are copied and frozen.
    context_size : number of context tokens on each side of the target.
    h, p : forwarded as the first two arguments of MultiHeadAttention
        (presumably the number of heads and the per-head projection size; confirm
        against the attention module).
    max_len : forwarded to MultiHeadAttention as ``max_len``. Defaults to 100, the
        value that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(
        self, out_size, embedding,
        context_size=10, h=1, p=None, max_len=100):
        super().__init__()

        (vocab_size, embedding_dim) = embedding.weight.shape
        # Total window length (both sides of the target)
        self.context_size = context_size*2

        # Private copy of the pretrained table, excluded from training
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.load_state_dict(embedding.state_dict())
        for param in self.embedding.parameters():
            param.requires_grad = False

        self.attention = MultiHeadAttention(h, p, embedding_dim, max_len=max_len)
        self.fc1 = nn.Linear(self.context_size*embedding_dim, 128)
        self.fc2 = nn.Linear(128, out_size)

    def forward(self, x):
        """Return the logits of shape (N, out_size) for the index batch ``x``."""
        # Shape: (N, L, embedding_dim); kept on the instance after each call
        with torch.no_grad():
            self.emb = self.embedding(x)
        # out is of shape (N, actual_len, embedding dim)
        out = self.attention(self.emb)
        out = torch.flatten(out, 1)
        # out is of shape (N, actual_len*embedding dim)
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        return out
    

In [6]:
def pipeline(
    data_train, data_val, data_test,
    context_size, model_class, model_params, model_name,
    use_unk_limit=True,
    generated_path='../generated/'
):
    """
    Train, select and evaluate one model class on the given datasets.

    Warning: this function relies heavily on global variables and default parameters
    (``seed``, ``batch_size``, ``n_epochs`` and ``embedding_dim`` are read from the
    notebook's global scope).

    Parameters
    ----------
    data_train, data_val, data_test : datasets wrapped in DataLoaders here.
    context_size : only used for the printed header and the saved model name.
    model_class, model_params : forwarded to ``model_selection``.
    model_name : prefix; "context=<..>_emb_dim=<..>.pt" is appended to it.
    use_unk_limit : only echoed in the printed header.
    generated_path : currently unused; kept for interface compatibility.

    Returns
    -------
    The best model as returned by ``model_selection``.
    """
    device = set_device()

    print("="*59)
    print(
        "Context size  %d  |  use_unk_limit %s"
        %(context_size,  str(use_unk_limit) )
    )

    # -------------- Datasets -------------
    torch.manual_seed(seed)
    train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
    # NOTE(review): shuffling is not needed for evaluation; it is kept so that the
    # random number stream, and therefore the recorded results, stay unchanged.
    val_loader = DataLoader(data_val, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(data_test, batch_size=batch_size, shuffle=True)

    # ------- Loss function parameters -------
    loss_fn = nn.CrossEntropyLoss()

    # ---------- Optimizer parameters --------
    # One (optimizer class, keyword arguments) pair per candidate learning rate
    list_lr = [0.001]
    optimizers = [optim.Adam] * len(list_lr)
    optim_params = [{"lr": lr} for lr in list_lr]

    # ----------- Model name -----------------
    hyperparams = {
        "context": context_size,
        "emb_dim": embedding_dim,
    }
    model_name += "_".join(['%s=%s' %(k, v) for (k, v) in hyperparams.items()]) + '.pt'

    # ----------- Model selection -----------
    # Use the same global seed as above rather than a second hard-coded copy of it,
    # so that changing `seed` in one place affects the whole pipeline consistently.
    best_model, i_best_model = model_selection(
        model_class, model_params, optimizers, optim_params,
        n_epochs, loss_fn,
        train_loader, val_loader,
        seed=seed, model_name=model_name, device=device
    )

    # ----------- Model evaluation -----------
    # Called for its reporting; the returned value is not used here.
    model_evaluation(best_model, train_loader, val_loader, test_loader, device=device)

    return best_model

In [7]:
# Feed-forward baseline on the frozen CBoW embeddings
model_class, model_name = MyNet, 'MLP_'
model_params = (n_out, model_cbow.embeddings, context_size)

model_MLP = pipeline(
    data_train, data_val, data_test,
    context_size=context_size,
    model_class=model_class,
    model_params=model_params,
    model_name=model_name,
)

On device cuda.
Context size  4  |  use_unk_limit True
   Current parameters: 
lr = 0.001

On device cuda.
18:02:50.477670  |  Epoch 1  |  Training loss 1.66432
18:02:55.513069  |  Epoch 5  |  Training loss 1.24367
18:03:01.769582  |  Epoch 10  |  Training loss 1.15639
18:03:07.513124  |  Epoch 15  |  Training loss 1.11465
18:03:13.738080  |  Epoch 20  |  Training loss 1.09111
18:03:19.951691  |  Epoch 25  |  Training loss 1.07481
18:03:26.134262  |  Epoch 30  |  Training loss 1.06341
18:03:32.006061  |  Epoch 35  |  Training loss 1.05535
18:03:38.150290  |  Epoch 40  |  Training loss 1.04812
18:03:44.391204  |  Epoch 45  |  Training loss 1.04247
18:03:50.613600  |  Epoch 50  |  Training loss 1.03680
Training Accuracy:     0.6223
Validation Accuracy:   0.5418
Training Accuracy:     0.6223
Validation Accuracy:   0.5418
Test Accuracy:         0.5056


In [8]:
# LSTM-based model; the trailing 1 in model_params is MyRNN's L argument
model_class, model_name = MyRNN, 'RNN_'
model_params = (n_out, model_cbow.embeddings, context_size, 1)

model_RNN = pipeline(
    data_train, data_val, data_test,
    context_size=context_size,
    model_class=model_class,
    model_params=model_params,
    model_name=model_name,
)

On device cuda.
Context size  4  |  use_unk_limit True
   Current parameters: 
lr = 0.001

On device cuda.
18:03:56.519836  |  Epoch 1  |  Training loss 1.64106
18:04:03.360536  |  Epoch 5  |  Training loss 1.13505
18:04:12.442212  |  Epoch 10  |  Training loss 1.04699
18:04:21.453917  |  Epoch 15  |  Training loss 1.00097
18:04:30.113286  |  Epoch 20  |  Training loss 0.96872
18:04:39.211178  |  Epoch 25  |  Training loss 0.94214
18:04:48.248879  |  Epoch 30  |  Training loss 0.92061
18:04:56.839077  |  Epoch 35  |  Training loss 0.90116
18:05:05.829705  |  Epoch 40  |  Training loss 0.88295
18:05:14.838065  |  Epoch 45  |  Training loss 0.86607
18:05:23.442554  |  Epoch 50  |  Training loss 0.85084
Training Accuracy:     0.6897
Validation Accuracy:   0.6161
Training Accuracy:     0.6897
Validation Accuracy:   0.6161
Test Accuracy:         0.5614


In [9]:
# Attention model with h=1, p=32
# (max_len is set here but is not part of model_params, so it does not reach the model)
model_class, model_name = MyAttentionNN, 'AttentionSingle_'
h, p, max_len = 1, 32, 20

model_params = (n_out, model_cbow.embeddings, context_size, h, p)

model_AttentionSingle = pipeline(
    data_train, data_val, data_test,
    context_size=context_size,
    model_class=model_class,
    model_params=model_params,
    model_name=model_name,
)

On device cuda.
Context size  4  |  use_unk_limit True
   Current parameters: 
lr = 0.001

On device cuda.
18:05:27.850773  |  Epoch 1  |  Training loss 1.80227
18:05:35.649431  |  Epoch 5  |  Training loss 1.39124
18:05:45.508090  |  Epoch 10  |  Training loss 1.31015
18:05:55.385518  |  Epoch 15  |  Training loss 1.26078
18:06:05.364456  |  Epoch 20  |  Training loss 1.22599
18:06:15.008399  |  Epoch 25  |  Training loss 1.20039
18:06:24.342419  |  Epoch 30  |  Training loss 1.18202
18:06:34.071710  |  Epoch 35  |  Training loss 1.16748
18:06:43.453614  |  Epoch 40  |  Training loss 1.15717
18:06:53.306697  |  Epoch 45  |  Training loss 1.14934
18:07:03.208472  |  Epoch 50  |  Training loss 1.14253
Training Accuracy:     0.5779
Validation Accuracy:   0.5290
Training Accuracy:     0.5779
Validation Accuracy:   0.5290
Test Accuracy:         0.4929


In [10]:
# Attention model with h=4, p=6
# (max_len is set here but is not part of model_params, so it does not reach the model)
model_class, model_name = MyAttentionNN, 'AttentionMulti01_'
h, p, max_len = 4, 6, 20

model_params = (n_out, model_cbow.embeddings, context_size, h, p)

model_AttentionMulti01 = pipeline(
    data_train, data_val, data_test,
    context_size=context_size,
    model_class=model_class,
    model_params=model_params,
    model_name=model_name,
)

On device cuda.
Context size  4  |  use_unk_limit True
   Current parameters: 
lr = 0.001

On device cuda.
18:07:08.732527  |  Epoch 1  |  Training loss 1.82552
18:07:18.284512  |  Epoch 5  |  Training loss 1.33305
18:07:30.765595  |  Epoch 10  |  Training loss 1.24623
18:07:42.985415  |  Epoch 15  |  Training loss 1.19817
18:07:56.530104  |  Epoch 20  |  Training loss 1.16555
18:08:10.430550  |  Epoch 25  |  Training loss 1.14245
18:08:25.413769  |  Epoch 30  |  Training loss 1.12143
18:08:39.228472  |  Epoch 35  |  Training loss 1.10688
18:08:52.509785  |  Epoch 40  |  Training loss 1.09494
18:09:05.756401  |  Epoch 45  |  Training loss 1.08419
18:09:18.637869  |  Epoch 50  |  Training loss 1.07549
Training Accuracy:     0.6041
Validation Accuracy:   0.5430
Training Accuracy:     0.6041
Validation Accuracy:   0.5430
Test Accuracy:         0.5132


In [11]:
# Attention model with h=4, p=32
# (max_len is set here but is not part of model_params, so it does not reach the model)
model_class, model_name = MyAttentionNN, 'AttentionMulti02_'
h, p, max_len = 4, 32, 20

model_params = (n_out, model_cbow.embeddings, context_size, h, p)

model_AttentionMulti02 = pipeline(
    data_train, data_val, data_test,
    context_size=context_size,
    model_class=model_class,
    model_params=model_params,
    model_name=model_name,
)

On device cuda.
Context size  4  |  use_unk_limit True
   Current parameters: 
lr = 0.001

On device cuda.
18:09:24.682991  |  Epoch 1  |  Training loss 1.67113
18:09:35.185680  |  Epoch 5  |  Training loss 1.22359
18:09:48.852845  |  Epoch 10  |  Training loss 1.14638
18:10:02.569807  |  Epoch 15  |  Training loss 1.10283
18:10:16.722914  |  Epoch 20  |  Training loss 1.07517
18:10:31.063254  |  Epoch 25  |  Training loss 1.05488
18:10:45.440966  |  Epoch 30  |  Training loss 1.04098
18:10:59.559197  |  Epoch 35  |  Training loss 1.02967
18:11:13.092930  |  Epoch 40  |  Training loss 1.02217
18:11:26.780427  |  Epoch 45  |  Training loss 1.01308
18:11:39.994776  |  Epoch 50  |  Training loss 1.00817
Training Accuracy:     0.6315
Validation Accuracy:   0.5445
Training Accuracy:     0.6315
Validation Accuracy:   0.5445
Test Accuracy:         0.5016


In [12]:
# Attention model with h=4, p=128
# (max_len is set here but is not part of model_params, so it does not reach the model)
model_class = MyAttentionNN
h=4
p=128
max_len=20


model_params = (
    n_out, model_cbow.embeddings, context_size,
    h, p,
)
# Distinct name prefix and result variable: this configuration previously reused
# 'AttentionMulti02_' / model_AttentionMulti02, clobbering the h=4, p=32 run.
model_name = 'AttentionMulti03_'

model_AttentionMulti03 = pipeline(
    data_train, data_val, data_test,
    context_size, model_class, model_params, model_name
)

On device cuda.
Context size  4  |  use_unk_limit True
   Current parameters: 
lr = 0.001

On device cuda.
18:24:05.887995  |  Epoch 1  |  Training loss 1.60508
18:24:15.567411  |  Epoch 5  |  Training loss 1.22025
18:24:27.666381  |  Epoch 10  |  Training loss 1.14532
18:24:39.769178  |  Epoch 15  |  Training loss 1.10464
18:24:52.035523  |  Epoch 20  |  Training loss 1.07825
18:25:04.700185  |  Epoch 25  |  Training loss 1.06181
18:25:18.158391  |  Epoch 30  |  Training loss 1.05027
18:25:32.029670  |  Epoch 35  |  Training loss 1.03938
18:25:46.230678  |  Epoch 40  |  Training loss 1.03162
18:25:59.695014  |  Epoch 45  |  Training loss 1.02462
18:26:11.944832  |  Epoch 50  |  Training loss 1.02007
Training Accuracy:     0.6264
Validation Accuracy:   0.5402
Training Accuracy:     0.6264
Validation Accuracy:   0.5402
Test Accuracy:         0.5090


In [13]:
# Attention model with h=10, p=16
# (max_len is set here but is not part of model_params, so it does not reach the model)
model_class = MyAttentionNN
h=10
p=16
max_len=20


model_params = (
    n_out, model_cbow.embeddings, context_size,
    h, p,
)
# Distinct name prefix and result variable: this configuration previously reused
# 'AttentionMulti02_' / model_AttentionMulti02, clobbering the h=4, p=32 run.
model_name = 'AttentionMulti04_'

model_AttentionMulti04 = pipeline(
    data_train, data_val, data_test,
    context_size, model_class, model_params, model_name
)

On device cuda.
Context size  4  |  use_unk_limit True
   Current parameters: 
lr = 0.001

On device cuda.
18:26:18.627220  |  Epoch 1  |  Training loss 1.65583
18:26:31.681392  |  Epoch 5  |  Training loss 1.20988
18:26:47.868401  |  Epoch 10  |  Training loss 1.12296
18:27:03.797991  |  Epoch 15  |  Training loss 1.07308
18:27:20.106841  |  Epoch 20  |  Training loss 1.04006
18:27:36.271818  |  Epoch 25  |  Training loss 1.01482
18:27:52.457905  |  Epoch 30  |  Training loss 0.99828
18:28:08.518587  |  Epoch 35  |  Training loss 0.98324
18:28:24.739358  |  Epoch 40  |  Training loss 0.97096
18:28:40.954383  |  Epoch 45  |  Training loss 0.96157
18:28:57.467884  |  Epoch 50  |  Training loss 0.95463
Training Accuracy:     0.6530
Validation Accuracy:   0.5542
Training Accuracy:     0.6530
Validation Accuracy:   0.5542
Test Accuracy:         0.5191
