In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
import numpy as np

from cbow import CBoW, create_dataset
from utils import model_selection, model_evaluation, set_device

seed = 265
torch.manual_seed(seed)
device = set_device()

On device cuda.


### Loading 

In [2]:
# List of words contained in the dataset
generated_path = '../generated/'
list_words_train = torch.load(generated_path + 'books_train.pt')
list_words_val = torch.load(  generated_path + 'books_val.pt')
list_words_test = torch.load( generated_path + 'books_test.pt')

# vocab contains the vocabulary found in the data, associating an index to each word
vocab = torch.load( generated_path + 'vocab.pt')
weight = torch.load(generated_path + 'weight.pt')

vocab_size = len(vocab)

print("Total number of words in the dataset:   ", len(list_words_train))
print("Total number of words in the dataset:   ", len(list_words_val))
print("Number of distinct words kept:          ", vocab_size)

# tokenizer will split a long text into a list of english words
tokenizer = get_tokenizer('basic_english')

model_cbow = torch.load(generated_path + 'CBoW.pt')
context_size = 4
embedding_dim = model_cbow.embedding_dim
print(model_cbow)

Total number of words in the dataset:    2684706
Total number of words in the dataset:    49526
Number of distinct words kept:           1879
CBoW(
  (embeddings): Embedding(1879, 16)
  (fc1): Linear(in_features=64, out_features=1879, bias=True)
)


### Beam search

In [35]:
def top_k_candidates(batch, model, k, device=None):
    """
    Find top k candidates and log probabilities for each sequences in a batch

    `batch` of shape (N, seq_len)
    `prob` and `cand` of shape (N, k)
    """
    if device is None:
        device = set_device()
    with torch.no_grad():
        # Predict word probabilities `out` using model
        # Shape (N, n_out)
        batch = batch.to(device=device)
        outputs = model(batch)
        # Get log probabilities
        log_probs = F.log_softmax(outputs, dim=-1)
        # Find top k candidates and probabilities for each sequence individually
        # prob and cand of shape (N, k)
        (prob, cand) = torch.topk(log_probs, k, dim=1, largest=True, sorted=True)
    return prob, cand

def find_top_ij(probs):
    """
    Find (i,j) indices of global top k candidates.

    Assume that probs[i, :] are sorted for each i (see top_k_candidates)
    `top_ij` integer tensor of shape (k, 2)
    """
    device = probs.device
    (N, k) = probs.shape
    # For each sequence, index of candidate currently considered
    # Shape (N)
    curr_ind = torch.zeros(N, dtype=int).to(device=device)
    # For each sequence candidate currently considered
    # Shape (N)
    curr_cand = probs[:, 0]
    top_ij = torch.zeros((k, 2), dtype=int).to(device=device)
    for k_curr in range(k):
        # Find best among considered candidates
        i_best = torch.argmax(curr_cand)
        j_best = curr_ind[i_best]
        top_ij[k_curr, :] = torch.tensor([i_best, j_best])

        # Update currently considered candidates
        curr_ind[i_best] += 1
        curr_cand[i_best] = probs[i_best, j_best+1]
    return top_ij

def global_top_k_candidates(probs, cands, prev_seqs, prev_probs):
    """
    Find global top k candidates.

    Assume that probs[i, :] are sorted for each i
    `probs` and `cands` of shape (N, k) (with N=1 or k)
    `seqs` of shape (N, seq_len)        (with N=1 or k)
    `kept_probs` of shape (k)
    kept_seqs of shape (k, seq_len+1)
    """
    device = probs.device
    (N, k) = probs.shape
    (N, seq_len) = prev_seqs.shape

    kept_seqs = torch.zeros((k, seq_len+1), dtype=int).to(device=device)
    kept_probs = torch.zeros(k).to(device=device)

    # indices of global top_k candidates
    top_ij = find_top_ij(probs)
    for k_curr in range(k):
        i, j = top_ij[k_curr]

        # Update kept sequences
        kept_seqs[k_curr, :seq_len] = prev_seqs[i]
        kept_seqs[k_curr, -1] = cands[i, j]

        # Update log probabilities
        kept_probs[k_curr] = prev_probs[i] + probs[i, j]
    return kept_probs, kept_seqs

def beam_search(model, seq, n_preds=5, k=3):
    """
    Return the `n_preds` next word after `seq` according to the beam search algo

    `seq` of shape(1, seq_len)
    `completion` of shape (1, seq_len+k)
    `prob_completion` being a float
    """
    device = seq.device
    # Initialisation with N=1
    start_seq_len = seq.shape[-1]
    # Find top k candidates and log prob for the initial sequence
    # prob and cand of shape (1, k)
    prob, cand = top_k_candidates(seq, model, k, device=device)

    # kept_seqs of shape (k, seq_len) (with seq_len being incremented)
    kept_seqs = torch.zeros((k, start_seq_len+1), dtype=int)
    kept_seqs[:, :start_seq_len] = start_seq_len
    kept_seqs[:, -1] = cand
    
    # kept_probs of shape (k) (won't change)
    kept_probs = prob.squeeze()

    for i in range(n_preds):
        # Find top k candidates and log prob for each sequences in batch
        # probs and cands of shape (N, k) with N=k
        probs, cands = top_k_candidates(kept_seqs, model, k, device=device)

        # Keep only the global k top candidates and log probs
        kept_probs, kept_seqs = global_top_k_candidates(
            probs, cands, kept_seqs, kept_probs
        )

    # Keep only the best completion among the top k
    completion = kept_seqs[0, start_seq_len:]
    prob_completion = kept_probs[0].item()
    return completion, prob_completion


### Generators

In [4]:
class RNN_generator(nn.Module):

    def __init__(self, embedding, L=1, hidden_size=None):
        super().__init__()

        (vocab_size, embedding_dim) = embedding.weight.shape
        if hidden_size is None:
            self.hidden_size = embedding_dim*2
        else:
            self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.load_state_dict(embedding.state_dict())
        for p in self.embedding.parameters():
            p.requires_grad = False
            
        self.lstm1 = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_size, num_layers=L, batch_first=True)
        self.fc1 = nn.Linear(self.hidden_size, vocab_size)
        

    def forward(self, x):
        # Shape: (N, L, embedding_dim)
        self.emb = self.embedding(x)
        # LSTM outputs: (out, (h, c)) with h of shape (num_layer, N, H_out) and we want h[-1,:,:]
        out = F.relu(self.lstm1(self.emb)[1][0][-1])
        out = self.fc1(out)
        
        return out

In [5]:
context_size = 6
n_beam = 3
data_train_gen = create_dataset(list_words_train, vocab, context_size, bidirectional=False, occ_max=np.inf)
data_val_gen = create_dataset(list_words_val, vocab, context_size, bidirectional=False, occ_max=np.inf)
data_test_gen = create_dataset(list_words_test, vocab, context_size, bidirectional=False, occ_max=np.inf)

print(len(data_train_gen))
print(len(data_val_gen))
print(len(data_test_gen))

1888297
35124
80751


In [6]:
n_epochs = 5
batch_size = 1024

loss_fn = nn.CrossEntropyLoss()

train_loader = DataLoader(data_train_gen, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(data_val_gen, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(data_test_gen, batch_size=batch_size, shuffle=True)

list_lr = [0.001]
optimizers = [optim.Adam for _ in range(len(list_lr))]
optim_params = [{
        "lr" : list_lr[i],
    } for i in range(len(list_lr))]


model_class = RNN_generator
model_params = (model_cbow.embeddings, 1)
model_name = 'generator_'

model_generator, i_best_model = model_selection(
    model_class, model_params, optimizers, optim_params,
    n_epochs, loss_fn,
    train_loader, val_loader,
    seed=265, model_name=model_name, device=device
)


   Current parameters: 
lr = 0.001

On device cuda.
17:04:15.073012  |  Epoch 1  |  Training loss 5.41481
17:05:44.560565  |  Epoch 5  |  Training loss 4.78494
Training Accuracy:     0.1585
Validation Accuracy:   0.1550


In [7]:
test_acc = model_evaluation(model_generator, train_loader, val_loader, test_loader, device=device)

Training Accuracy:     0.1585
Validation Accuracy:   0.1550
Test Accuracy:         0.1789


In [37]:
def predict(model, vocab, sentences, n_preds, k):
    """
    Complete a given sequence using the beam search algorithm

    sentences are list strings, each element representing a sequence
    """
    for s in sentences:
        # From a long string to a list of words
        start_seq = tokenizer(s)
        # After that start_seq is a int tensor of shape (1, seq_len)
        start_seq = torch.tensor([vocab[w] for w in start_seq]).unsqueeze(0)
        start_seq = start_seq.to(device=device)

        # Find most likely completion according to beam search
        completion, prob_completion = beam_search( model, start_seq, n_preds, k)

        # From tensor of integer to long string
        completion =" ".join(vocab.lookup_tokens(list(completion.squeeze())))
        print("{:.3f} | {} {} \n".format(prob_completion, s, completion))

sentences = [
    "Did you know that I am a very", "Sometimes, I wish I were", "Do you think that", 
    "I am not angry, I just think it is strange that", "What would happen if", "I am not sure, but perhaps I",
    "I am so happy that i", "The most important thing in life is to",
]

n_preds = 10
k = 20
predict(model_generator, vocab, sentences, n_preds, k)
    

-35.525 | Did you know that I am a very as he could not see him in his eyes to the 

-33.735 | Sometimes, I wish I were that he said to me that she had been of the 

-36.171 | Do you think that that the king was not in a man who had been 

-33.664 | I am not angry, I just think it is strange that in his head in his eyes in his head in the 

-36.414 | What would happen if he had not been to be a great and in the 

-32.965 | I am not sure, but perhaps I in his head in his head in his head in the 

-34.930 | I am so happy that i am to go in his eyes to his head of the 

-33.136 | The most important thing in life is to say to him in his head in his head of the 

