In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd

from utils import model_selection, model_evaluation, set_device
from cbow import create_dataset, CBoW
from embedding_utils import similarity_matrix, find_N_closest

# Seed every random number generator this notebook has imported so that a
# fresh "Restart & Run All" is as reproducible as possible.
seed = 265
torch.manual_seed(seed)
np.random.seed(seed)  # NumPy keeps its own global RNG, separate from torch's
# set_device() reports the selected device (see the cell output)
device = set_device()

On device cuda.


In [2]:
# Folder containing the pre-generated .pt files read by this notebook
generated_path = '../generated/'

# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# files that were generated locally by this project.
# List of words contained in each split of the dataset
list_words_train = torch.load(generated_path + 'books_train.pt')
list_words_val = torch.load(generated_path + 'books_val.pt')
list_words_test = torch.load(generated_path + 'books_test.pt')

# vocab contains the vocabulary found in the data, associating an index to each word
vocab = torch.load(generated_path + 'vocab.pt')
# Passed later as the `weight` argument of nn.CrossEntropyLoss
weight = torch.load(generated_path + 'weight.pt')

vocab_size = len(vocab)

# Each split gets its own label (both lines used to say "in the dataset")
print("Total number of words in the training set:  ", len(list_words_train))
print("Total number of words in the validation set:", len(list_words_val))
print("Number of distinct words kept:              ", vocab_size)

Total number of words in the dataset:    2684706
Total number of words in the dataset:    49526
Number of distinct words kept:           1879


In [3]:
def pipeline(
    context_size, embedding_dim, occ_max=np.inf, use_weight=True, use_unk_limit=True,
    black_list=None,
    generated_path='../generated/'
):
    """
    Train a CBoW model for one hyperparameter configuration, evaluate it and
    print the closest words of a few probe words.

    Warning: this function relies heavily on global variables and default
    parameters. It reads the notebook-level names ``list_words_train``,
    ``list_words_val``, ``list_words_test``, ``vocab``, ``vocab_size``,
    ``weight``, ``batch_size`` and ``n_epochs``; all of them must be defined
    before the function is called.

    Parameters
    ----------
    context_size : int
        Forwarded to ``create_dataset`` and to the ``CBoW`` constructor.
    embedding_dim : int
        Forwarded to the ``CBoW`` constructor.
    occ_max : number, optional
        Forwarded to ``create_dataset`` (default ``np.inf``).
    use_weight : bool, optional
        If true, the cross-entropy loss uses the global ``weight`` tensor
        as class weights; otherwise the loss is unweighted.
    use_unk_limit : bool, optional
        Forwarded to ``create_dataset``.
    black_list : list of str, optional
        Forwarded to ``create_dataset``. When ``None`` (the default) the
        list ``["<unk>", ",", ".", "!", "?", '"']`` is used.
    generated_path : str, optional
        Prefix of the ``model_name`` handed to ``model_selection``.

    Returns
    -------
    tuple
        ``(model_cbow, embs, sim)`` where ``model_cbow`` is the first value
        returned by ``model_selection`` and ``sim, embs`` are the values
        returned by ``similarity_matrix(vocab, model_cbow)``.
    """
    # A list literal as default argument would be shared between calls;
    # build a fresh one per call instead.
    if black_list is None:
        black_list = ["<unk>", ",", ".", "!", "?", '"']

    device = set_device()

    print("="*59)
    print(
        "Context size  %d  |  Embedding dim  %d  |  occ_max  %s  |  weights %s"
        %(context_size, embedding_dim, str(occ_max), str(use_weight) )
    )
    print(
        "use_unk_limit %s " %(str(use_unk_limit))
    )
    print("Black_list: %s" %" | ".join(black_list))

    # -------------- Datasets -------------
    # Identical preprocessing options for the three splits
    dataset_options = {
        "black_list": black_list,
        "occ_max": occ_max,
        "use_unk_limit": use_unk_limit,
    }
    data_train_ngram = create_dataset(list_words_train, vocab, context_size, **dataset_options)
    data_val_ngram = create_dataset(list_words_val, vocab, context_size, **dataset_options)
    data_test_ngram = create_dataset(list_words_test, vocab, context_size, **dataset_options)

    print(len(data_train_ngram))
    print(len(data_val_ngram))
    print(len(data_test_ngram))

    # batch_size is a notebook-level global.
    # Only the training data needs shuffling; the evaluation loaders keep
    # their natural order.
    # NOTE(review): assumes model_evaluation computes order-independent
    # metrics (e.g. accuracy) -- confirm in utils.
    train_loader = DataLoader(data_train_ngram, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(data_val_ngram, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(data_test_ngram, batch_size=batch_size, shuffle=False)

    # ------- Loss function parameters -------
    if use_weight:
        # `weight` is the notebook-level tensor loaded from weight.pt
        loss_fn = nn.CrossEntropyLoss(weight=weight.to(device=device))
    else:
        loss_fn = nn.CrossEntropyLoss()

    # ---------- Optimizer parameters --------
    # One (optimizer class, keyword arguments) pair per learning rate
    list_lr = [0.001]
    optimizers = [optim.Adam for _ in list_lr]
    optim_params = [{"lr": lr} for lr in list_lr]

    # -------- Model class parameters --------
    model_class = CBoW
    model_params = (vocab_size, embedding_dim, context_size)

    # ----------- Model name -----------------
    # The hyperparameters are encoded in the file name
    model_name = generated_path +'CBoW_'
    hyperparams = {
        "context": context_size,
        "emb_dim": embedding_dim,
        "weights": use_weight,
        "unk_limit": use_unk_limit,
        "occ_max": occ_max,
    }
    model_name += "_".join(['%s=%s' %(k, v) for (k, v) in hyperparams.items()]) + '.pt'

    # ----------- Model selection -----------
    # n_epochs is a notebook-level global; the second return value (named
    # i_best_model in the original code) is not needed here.
    model_cbow, _ = model_selection(
        model_class, model_params, optimizers, optim_params,
        n_epochs, loss_fn,
        train_loader, val_loader,
        seed=265, model_name=model_name, device=device
    )

    # ----------- Model evaluation -----------
    # Called for its printed report; the return value is not used.
    model_evaluation(model_cbow, train_loader, val_loader, test_loader, device=device)

    # ----------- Embedding analysis -----------
    sim, embs = similarity_matrix(vocab, model_cbow)
    words = [
        'the', 'table', "man", 'little', 'big', 'always', 'mind', 'black', 'white', 'child', 'children',
        'yes', 'out', "me", "have", "be"
    ]
    for w in words:
        print('-'*59)
        find_N_closest(sim, w, vocab)

    return model_cbow, embs, sim

In [4]:
# ----------------------------- Training budget ----------------------------
# Both names are read as globals inside pipeline()
n_epochs = 30
batch_size = 2048

# ----------------------------- Hyperparameters ----------------------------
# These values were decided after analysing the output of bigger experiments
context_size = 2
embedding_dim = 12
occ_max = np.inf
use_weight = True
use_unk_limit = True
black_list = ["<unk>"]
# --------------------------------------------------------------------------

# Train, evaluate and inspect the embeddings for this single configuration
model_cbow, embs, sim = pipeline(
    context_size=context_size,
    embedding_dim=embedding_dim,
    occ_max=occ_max,
    use_weight=use_weight,
    use_unk_limit=use_unk_limit,
    black_list=black_list,
)

On device cuda.
Context size  2  |  Embedding dim  12  |  occ_max  inf  |  weights True
use_unk_limit True 
Black_list: <unk>
2189034
40333
94891
   Current parameters: 
lr = 0.001

On device cuda.
12:57:54.175866  |  Epoch 1  |  Training loss 5.26034
12:59:13.015910  |  Epoch 5  |  Training loss 4.18400
13:00:48.817708  |  Epoch 10  |  Training loss 4.08659
13:02:25.125085  |  Epoch 15  |  Training loss 4.05612
13:04:00.358892  |  Epoch 20  |  Training loss 4.03864
13:05:36.383461  |  Epoch 25  |  Training loss 4.02694
13:07:12.073661  |  Epoch 30  |  Training loss 4.01881
Accuracy: 0.23610
Accuracy: 0.23423
Training Accuracy:     0.2361
Validation Accuracy:   0.2342
Accuracy: 0.23610
Accuracy: 0.23423
Accuracy: 0.24609
Training Accuracy:     0.2361
Validation Accuracy:   0.2342
Validation Accuracy:   0.2461
On device cuda.
-----------------------------------------------------------
the
0  |   similitude: 1.000000   |   the 
1  |   similitude: 0.690791   |   a 
2  |   similitude: 0.65

In [5]:
# Options shared by both exports: tab-separated, no header row, no index column
tsv_options = dict(sep="\t", header=False, index=False)

# Embeddings: moved to the CPU and converted to a NumPy array before export
embs_np = embs.cpu().numpy()
embs_df = pd.DataFrame(embs_np)
embs_df.to_csv(generated_path + 'embeddings.tsv', **tsv_options)

# Tokens looked up for the indices 0 .. vocab_size - 1
words_np = np.array(vocab.lookup_tokens(range(vocab_size)))
words_df = pd.DataFrame(words_np)
words_df.to_csv(generated_path + 'vocabulary.tsv', **tsv_options)