# To generate the trained embedding, just use the functions from the project 3, so if those functions got improved in the meantime, just copy paste again from there.

In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd

# Copy paste from project 3
from cbow import CBoW, create_dataset
from train_embedding_utils import similarity_matrix, find_N_closest, model_selection, model_evaluation, set_device

seed = 265
torch.manual_seed(seed)
device = set_device()

On device cpu.


In [2]:
# List of words contained in the dataset
generated_path = '../generated/'
list_words_train = torch.load(generated_path + 'words_train.pt')
list_words_val = torch.load(  generated_path + 'words_val.pt')
list_words_test = torch.load( generated_path + 'words_test.pt')

# vocab contains the vocabulary found in the data, associating an index to each word
vocab = torch.load( generated_path + 'vocabulary.pt')

vocab_size = len(vocab)

print("Total number of words in the dataset:   ", len(list_words_train))
print("Total number of words in the dataset:   ", len(list_words_val))
print("Number of distinct words kept:          ", vocab_size)

Total number of words in the dataset:    347870
Total number of words in the dataset:    49526
Number of distinct words kept:           324


In [3]:
def pipeline(
    context_size, embedding_dim, occ_max=np.inf, use_weight=True, use_unk_limit=True,
    black_list=["<unk>", ",", ".", "!", "?", '"'],
    generated_path='../generated/'
):
    """
    Warning: this function relies heavily on global variables and default parameters
    """
    device = set_device()
    
    print("="*59)
    print(
        "Context size  %d  |  Embedding dim  %d  |  occ_max  %s  |  weights %s"
        %(context_size, embedding_dim, str(occ_max), str(use_weight) )
    )
    print(
        "use_unk_limit %s " %(str(use_unk_limit))
    )
    print("Black_list: %s" %" | ".join(black_list))

    # -------------- Datasets -------------
    data_train_ngram = create_dataset(list_words_train, vocab, context_size, black_list=black_list, occ_max=occ_max, use_unk_limit=use_unk_limit)
    data_val_ngram = create_dataset(list_words_val,     vocab, context_size, black_list=black_list, occ_max=occ_max, use_unk_limit=use_unk_limit)
    data_test_ngram = create_dataset(list_words_test,   vocab, context_size, black_list=black_list, occ_max=occ_max, use_unk_limit=use_unk_limit)

    print(len(data_train_ngram))
    print(len(data_val_ngram))
    print(len(data_test_ngram))

    batch_size = 512
    n_epochs = 30
    train_loader = DataLoader(data_train_ngram, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(data_val_ngram, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(data_test_ngram, batch_size=batch_size, shuffle=True)

    # ------- Loss function parameters -------
    loss_fn = nn.CrossEntropyLoss()

    # ---------- Optimizer parameters --------
    list_lr = [0.001]
    optimizers = [optim.Adam for _ in range(len(list_lr))]
    optim_params = [{
            "lr" : list_lr[i],
        } for i in range(len(list_lr))]

    # -------- Model class parameters --------
    model_class = CBoW
    model_params = (vocab_size, embedding_dim, context_size)
    
    # ----------- Model name -----------------
    model_name = generated_path +'CBoW_'
    hyperparams = {
        "context": context_size,
        "emb_dim": embedding_dim,
        "weights": use_weight,
        "unk_limit": use_unk_limit,
        "occ_max": occ_max, 
    }
    model_name += "_".join(['%s=%s' %(k, v) for (k, v) in hyperparams.items()]) + '.pt'

    # ----------- Model selection -----------
    model_cbow, i_best_model = model_selection(
        model_class, model_params, optimizers, optim_params,
        n_epochs, loss_fn,
        train_loader, val_loader,
        seed=265, model_name=model_name, device=device
    )

    # ----------- Model evaluation -----------
    test_acc = model_evaluation(model_cbow, train_loader, val_loader, test_loader, device=device)

    # ----------- Embedding analysis -----------
    sim, embs = similarity_matrix(vocab, model_cbow)
    words = [
        'the', 'table', "man", 'little', 'big', 'always', 'mind', 'black', 'white', 'child', 'children', 
        'yes', 'out', "me", "have", "be"
    ]
    for w in words:
        print('-'*59)
        find_N_closest(sim, w, vocab)
        
    return model_cbow, embs, sim

In [4]:
n_epochs = 30
batch_size = 2048

# These hyperparameters were decided after analysing the output of bigger experiments
# ------------------------------------------------------------------------
context_size = 2
embedding_dim = 4
occ_max = np.inf
use_weight = False
use_unk_limit=True
black_list = ["<unk>"]
# ------------------------------------------------------------------------

model_cbow, embs, sim = pipeline(
    context_size, embedding_dim, 
    occ_max=occ_max, use_weight=use_weight, use_unk_limit=use_unk_limit,
    black_list=black_list, generated_path=generated_path
)

On device cpu.
Context size  2  |  Embedding dim  4  |  occ_max  inf  |  weights False
use_unk_limit True 
Black_list: <unk>
247766
30660
67934
   Current parameters: 
lr = 0.001

On device cpu.
08:54:55.914320  |  Epoch 1  |  Training loss 4.91033
08:55:06.726443  |  Epoch 5  |  Training loss 3.96708
08:55:17.629282  |  Epoch 10  |  Training loss 3.81031
08:55:30.194513  |  Epoch 15  |  Training loss 3.76172
08:55:42.116142  |  Epoch 20  |  Training loss 3.73408
08:55:55.544088  |  Epoch 25  |  Training loss 3.71714
08:56:07.389205  |  Epoch 30  |  Training loss 3.70545
Training Accuracy:     0.2336
Validation Accuracy:   0.1862
Training Accuracy:     0.2336
Validation Accuracy:   0.1862
Test Accuracy:         0.2188
On device cpu.
-----------------------------------------------------------
the
0  |   similitude: 1.000000   |   the 
1  |   similitude: 0.982028   |   any 
2  |   similitude: 0.930834   |   my 
3  |   similitude: 0.922812   |   every 
4  |   similitude: 0.917259   |   an

In [5]:
embs_np = embs.cpu().numpy()
embs_df = pd.DataFrame(embs_np)
embs_df.to_csv(generated_path + 'embeddings.tsv', sep="\t", header=False, index=False)
words_np = np.array(vocab.lookup_tokens(range(vocab_size)))
words_df = pd.DataFrame(words_np)
words_df.to_csv(generated_path + 'vocabulary.tsv', sep="\t", header=False, index=False)

In [6]:
embedding = model_cbow.embeddings.cpu()
torch.save(model_cbow.embeddings, "embedding.pt")