In [1]:
import torch
import torch.nn as nn

### Tools for data processing 

In [2]:
import os
import time
import math
from collections import Counter
import pprint
pp = pprint.PrettyPrinter(indent=1)

In [3]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = {}
        self.total = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
            self.counter.setdefault(word, 0)
        self.counter[word] += 1
        self.total += 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [4]:
class Corpus(object):
    def __init__(self, path):
        # We create an object Dictionary associated to Corpus
        self.dictionary = Dictionary()
        # We go through all files, adding all words to the dictionary
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
        
    def tokenize(self, path):
        """Tokenizes a text file, knowing the dictionary, in order to tranform it into a list of indexes"""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word)
                tokens += len(words)
        
        # Once done, go through the file a second time and fill a Torch Tensor with the associated indexes 
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1
        return ids

In [5]:
###############################################################################
# Load data
###############################################################################

data = './corpus/'
corpus = Corpus(data)

In [6]:
print(corpus.dictionary.total)
print(len(corpus.dictionary.idx2word))
print(len(corpus.dictionary.word2idx))

print(corpus.train.shape)
print(corpus.train[0:7])
print([corpus.dictionary.idx2word[corpus.train[i]] for i in range(7)])

print(corpus.valid.shape)
print(corpus.valid[0:7])
print([corpus.dictionary.idx2word[corpus.valid[i]] for i in range(7)])

1076937
23244
23244
torch.Size([773840])
tensor([0, 1, 2, 3, 4, 5, 6])
['Oui', ',', 'je', 'viens', 'dans', 'son', 'Temple']
torch.Size([135984])
tensor([  33,    2, 1176,  104,   21, 1218,   41])
['Que', 'je', 'sens', 'à', 'la', 'fois', 'de']


In [7]:
# We now have data under a very long list of indexes: the text is as one sequence.
# The idea now is to create batches from this. Note that this is absolutely not the best
# way to proceed with large quantities of data (where we'll try not to store huge tensors
# in memory but read them from file as we go) !
# Here, we are looking for simplicity and efficiency with regards to computation time.
# That is why we will ignore sentence separations and treat the data as one long stream that
# we will cut arbitrarily as we need.
# With the alphabet being our data, we currently have the sequence:
# [a b c d e f g h i j k l m n o p q r s t u v w x y z]
# We want to reorganize it as independant batches that will be processed independantly by the model !
# For instance, with the alphabet as the sequence and batch size 4, we'd get the 4 following sequences:
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘
# with the last two elements being lost.
# Again, these columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient processing.

def batchify(data, batch_size, cuda = False):
    # Cut the elements that are unnecessary
    nbatch = data.size(0) // batch_size
    data = data.narrow(0, 0, nbatch * batch_size)
    # Reorganize the data
    data = data.view(batch_size, -1).t().contiguous()
    # If we can use a GPU, let's tranfer the tensor to it
    return data.to(device)

# get_batch subdivides the source data into chunks of the appropriate length.
# If source is equal to the example output of the batchify function, with
# a sequence length (seq_len) of 3, we'd get the following two variables:
# ┌ a g m s ┐ ┌ b h n t ┐
# | b h n t | | c i o u │
# └ c i o u ┘ └ d j p v ┘
# The first variable contains the letters input to the network, while the second
# contains the one we want the network to predict (b for a, h for g, v for u, etc..)
# Note that despite the name of the function, we are cutting the data in the
# temporal dimension, since we already divided data into batches in the previous
# function. 

def get_batch(source, i, seq_len, evaluation=False):
    # Deal with the possibility that there's not enough data left for a full sequence
    seq_len = min(seq_len, len(source) - 1 - i)
    # Take the input data
    data = source[i:i+seq_len]
    # Shift by one for the target data
    target = source[i+1:i+1+seq_len]
    return data, target

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
batch_size = 100
eval_batch_size = 4
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

print(train_data.shape)
print(val_data.shape)

torch.Size([7738, 100])
torch.Size([33996, 4])


### LSTM Cells in pytorch

### Creating our own LSTM Model

In [10]:
# Models are usually implemented as custom nn.Module subclass
# We need to redefine the __init__ method, which creates the object
# We also need to redefine the forward method, which transform the input into outputs
# We can also add any method that we need: here, in order to initiate weights in the model

class LSTMModel(nn.Module):
    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(LSTMModel, self).__init__()
        # Create a dropout object to use on layers for regularization
        self.drop = nn.Dropout(dropout)
        # Create an encoder - which is an embedding layer
        self.encoder = nn.Embedding(ntoken, ninp)
        # Create the LSTM layers - find out how to stack them !
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        # Create what we call the decoder: a linear transformation to map the hidden state into scores for all words in the vocabulary
        # (Note that the softmax application function will be applied out of the model)
        self.decoder = nn.Linear(nhid, ntoken)
        
        # Initialize non-reccurent weights 
        self.init_weights()

        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        
    def init_weights(self):
        # Initialize the encoder and decoder weights with the uniform distribution
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
    def init_hidden(self, batch_size):
        # Initialize the hidden state and cell state to zero, with the right sizes
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, batch_size, self.nhid),
                weight.new_zeros(self.nlayers, batch_size, self.nhid))    

    def forward(self, input, hidden, return_h=False):
        # Process the input
        emb = self.drop(self.encoder(input))   
        
        # Apply the LSTMs
        output, hidden = self.rnn(emb, hidden)
        
        # Decode into scores
        output = self.drop(output)      
        decoded = self.decoder(output)
        return decoded, hidden

### Building the Model

In [11]:
# Set the random seed manually for reproducibility.
torch.manual_seed(1)

<torch._C.Generator at 0x7f05f4e37180>

In [12]:
embedding_size = 200
hidden_size = 200
layers = 2
dropout = 0.5

###############################################################################
# Build the model
###############################################################################

vocab_size = len(corpus.dictionary)
model = LSTMModel(vocab_size, embedding_size, hidden_size, layers, dropout).to(device)
params = list(model.parameters())
criterion = nn.CrossEntropyLoss()

In [13]:
lr = 10.0
optimizer = 'sgd'
wdecay = 1.2e-6
# For gradient clipping
clip = 0.25

if optimizer == 'sgd':
    optim = torch.optim.SGD(params, lr=lr, weight_decay=wdecay)
if optimizer == 'adam':
    optim = torch.optim.Adam(params, lr=lr, weight_decay=wdecay)

In [14]:
# Let's think about gradient propagation:
# We plan to keep the second ouput of the LSTM layer (the hidden/cell states) to initialize
# the next call to LSTM. In this way, we can back-propagate the gradient for as long as we want.
# However, this put a huge strain on the memory used by the model, since it implies retaining
# a always-growing number of tensors of gradients in the cache.
# We decide to not backpropagate through time beyond the current sequence ! 
# We use a specific function to cut the 'hidden/state cell' states from their previous dependencies
# before using them to initialize the next call to the LSTM.
# This is done with the .detach() function.

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [15]:
# Other global parameters
epochs = 100
seq_len = 30
log_interval = 10
save = 'model.pt'

In [16]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch(data_source, i, seq_len)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            total_loss += len(data) * criterion(output.view(-1, vocab_size), targets.view(-1)).item()
    return total_loss / (len(data_source) - 1)

In [17]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_len)):
        data, targets = get_batch(train_data, i, seq_len)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()
        
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(params, clip)
        optim.step()
        
        total_loss += loss.data

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // seq_len, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [18]:
# Loop over epochs.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |    10/  257 batches | lr 10.00 | ms/batch 69.94 | loss 10.01 | ppl 22163.15
| epoch   1 |    20/  257 batches | lr 10.00 | ms/batch 47.47 | loss  7.99 | ppl  2936.72
| epoch   1 |    30/  257 batches | lr 10.00 | ms/batch 47.66 | loss  7.54 | ppl  1874.41
| epoch   1 |    40/  257 batches | lr 10.00 | ms/batch 47.60 | loss  7.33 | ppl  1526.34
| epoch   1 |    50/  257 batches | lr 10.00 | ms/batch 47.49 | loss  7.26 | ppl  1428.74
| epoch   1 |    60/  257 batches | lr 10.00 | ms/batch 47.31 | loss  7.12 | ppl  1235.36
| epoch   1 |    70/  257 batches | lr 10.00 | ms/batch 47.75 | loss  7.12 | ppl  1240.68
| epoch   1 |    80/  257 batches | lr 10.00 | ms/batch 47.42 | loss  7.09 | ppl  1199.11
| epoch   1 |    90/  257 batches | lr 10.00 | ms/batch 47.55 | loss  7.06 | ppl  1159.92
| epoch   1 |   100/  257 batches | lr 10.00 | ms/batch 47.56 | loss  6.94 | ppl  1029.84
| epoch   1 |   110/  257 batches | lr 10.00 | ms/batch 47.42 | loss  6.87 | ppl   959.19
| epoch   

In [244]:
def beam_search_decode(device, net, words, vocab_to_int, int_to_vocab, top_k):
  net.eval()
  words = words.split(' ')
  state_h, state_c = net.init_hidden(len(words))
  state_h = state_h.to(device)
  state_c = state_c.to(device)
  ix = torch.tensor([[vocab_to_int[w] for w in words]]).to(device)
  output, (state_h, state_c) = net(ix, (state_h, state_c))
  prob, top_ix = torch.topk(nn.functional.softmax(output[0][-1], dim=0), k=top_k)
  #print(top_ix)
  list_ids = [[vocab_to_int[w] for w in words] + [id] for id in top_ix.tolist()]
  outputs = [output for _ in range(top_k)]
  hiddens = [(state_h, state_c) for _ in range(top_k)] 
  #print("avant beam search, top indices : ",top_ix.tolist(), "de proba", prob.tolist())
  #print(list_ids)
  for i in range(50):
    probas = torch.tensor([[0 for i in range(top_k)] for k in range(top_k)]).float().to(device)
    indxes = torch.tensor([[0 for i in range(top_k)] for k in range(top_k)]).to(device)
    for k in range(top_k):
      ix = torch.tensor([list_ids[k][i+1:]]).to(device)
      output, hiddens[k] = net(ix, hiddens[k])
      pro, indxes[k] = torch.topk(nn.functional.softmax(output[0][-1], dim=0), k=top_k)
      #print("probas du choix ", k+1," : ", pro.tolist())
      probas[k] = pro * prob[k]
    #print(indxes.tolist())
    #print(list_ids)
    prob, indices = torch.topk(probas.flatten(), top_k)
    for k in range(top_k):
      top_ix[k] = indxes.flatten()[indices[k]]
    indices = indices // top_k
    temp1 = []
    temp2 = []
    for k in range(top_k):
      temp1.append(hiddens[indices.tolist()[k]])
      temp2.append(list_ids[indices.tolist()[k]] + [top_ix.tolist()[k]])
    hiddens = temp1
    list_ids = temp2
    print("top indices : ",top_ix.tolist(), "de proba", prob.tolist())
  best_branch = list_ids[torch.argmax(prob)]
  words = []
  for id in best_branch:
    words.append(int_to_vocab[id])
  print(' '.join(words))

In [195]:
def predict(device, net, words, vocab_to_int, int_to_vocab, top_k):
  net.eval()
  words = words.split(' ')
  state_h, state_c = net.init_hidden(len(words))
  state_h = state_h.to(device)
  state_c = state_c.to(device)
  for i in range(50):
      ix = torch.tensor([[vocab_to_int[w] for w in words[i:]]]).to(device)
      output, (state_h, state_c) = net(ix, (state_h, state_c))
      _, top_ix = torch.topk(nn.functional.softmax(output[0][-1], dim=0), k=1)
      words.append(int_to_vocab[top_ix[0]])
  print(' '.join(words))

In [245]:
predict(device, model, 'Ce grand jour où l’ hymen étouffant la vengeance <eos> Entre le Parthe et nous remet l’ intelligence , <eos> Affranchit', corpus.dictionary.word2idx, corpus.dictionary.idx2word,10)
beam_search_decode(device, model, 'Ce grand jour où l’ hymen étouffant la vengeance <eos> Entre le Parthe et nous remet l’ intelligence , <eos> Affranchit', corpus.dictionary.word2idx, corpus.dictionary.idx2word,10)

Ce grand jour où l’ hymen étouffant la vengeance <eos> Entre le Parthe et nous remet l’ intelligence , <eos> Affranchit , <eos> Et je vous ai promis , et je vous ai promis . <eos> Je vous ai vu , Madame , et je vous ai promis . <eos> Je vous ai vu , Madame , et je vous ai promis . <eos> Je vous ai vu , Madame ,
top indices :  [11, 11, 11, 11, 10, 11, 11, 16, 33, 2] de proba [0.31770649552345276, 0.2779606282711029, 0.07023846358060837, 0.03300094231963158, 0.02745000645518303, 0.024644888937473297, 0.012532630935311317, 0.009786729700863361, 0.005823318846523762, 0.003604043507948518]
top indices :  [61, 10, 12, 33, 141, 12, 61, 141, 245, 245] de proba [0.07251192629337311, 0.027372043579816818, 0.026348581537604332, 0.01595715805888176, 0.015921423211693764, 0.015702245756983757, 0.012015080079436302, 0.009843970648944378, 0.00969643983989954, 0.00937218964099884]
top indices :  [11, 2, 20, 20, 41, 288, 40, 115, 82, 299] de proba [0.005344506818801165, 0.004971526097506285, 0.00390555

In [222]:
a = [[1],[2]]
b = []
b.append(a[1])
b.append(a[1])
print(b)
print(a)
a = b


[[2], [2]]
[[1], [2]]


Je veux , et l’ en a vu . . Mais il faut à l’ entendre : . . ; <eos> 

Je vous dois voir à vous . Et que j’ ai su l’ attendre <eos> 

Ce que j’ avais pu voir , et l’ autre à l’ honneur ; <eos> 

Il n’ a rien dit qu’ en ce mot je n’ en dois point de rien