In [1]:
import torch
import torch.nn as nn

### Tools for data processing 

In [2]:
import os
import time
import math
from collections import Counter
import pprint
pp = pprint.PrettyPrinter(indent=1)

In [3]:
with open('mots_rimes_final.txt', encoding='utf-8') as f:
  categories, eos_tokens = [], []
  incr = 0
  for line in f:
    if incr % 2 == 1:
      categories.append(line.split())
      eos_tokens.append('<eos'+str(incr //2)+'>')
    incr +=1
print(categories[1])
print(eos_tokens[12])

['Silence', 'Balance', 'convalescence', 'instance', 'Ordonnance', 'surséance', 'Vengeance', 'Clémence', 'prééminence', 'condescendance', 'Innocence', 'Puissance', 'Danse', 'Dépense', 'indécence', 'Térence', 'Maxence', 'Byzance', 'Constance', 'Florence', 'France', 'abondance', 'absence', 'alliance', 'allégeance', 'apparence', 'arrogance', 'assistance', 'assurance', 'audience', 'avance', 'balance', 'bienséance', 'bienveillance', 'circonstance', 'clémence', 'commence', 'complaisance', 'concurrence', 'confiance', 'confidence', 'conférence', 'connaissance', 'conscience', 'constance', 'conséquence', 'croyance', 'créance', 'danse', 'devance', 'différence', 'diligence', 'dispense', 'distance', 'défense', 'défiance', 'déférence', 'délivrance', 'dépendance', 'désobéissance', 'enfance', 'espérance', 'excellence', 'expérience', 'extravagance', 'ignorance', 'immense', 'impatience', 'importance', 'imprudence', 'impudence', 'impuissance', 'inclémence', 'inconstance', 'indifférence', 'indulgence', 'in

In [4]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = {}
        self.total = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
            self.counter.setdefault(word, 0)
        self.counter[word] += 1
        self.total += 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [5]:
class Corpus(object):
    def __init__(self, path):
        # We create an object Dictionary associated to Corpus
        self.dictionary = Dictionary()
        # We go through all files, adding all words to the dictionary
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
        
    def tokenize(self, path):
        """Tokenizes a text file, knowing the dictionary, in order to tranform it into a list of indexes"""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            eos_seq = []
            for line in f:
              #words = ['<sos>'] + line.split() + ['<eos>']
              words = ['<sos>'] + line.split()
              rime, inc = words[-1], len(words)-1
              while(rime in ['"', '.', ',', ';', ':', '?', '!', ' ', ')', '»', '-', '\xa0', '\n']):
                inc -= 1
                rime = words[inc]
              isInNoCat = True
              for i in range(70):
                if rime in categories[i]:
                  isInNoCat = False
                  words.append(eos_tokens[i])
                  eos_seq.append(eos_tokens[i])
                  break
              if isInNoCat :
                words.append('<eos_>')
                eos_seq.append('<eos_>')
              else:
                for word in words:
                  self.dictionary.add_word(word)
                tokens += len(words)
        
        # Once done, go through the file a second time and fill a Torch Tensor with the associated indexes 
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token,incr = 0, 0
            for line in f:
                
                #words = ['<sos>'] + line.split() + ['<eos>']
                if eos_seq[incr] != '<eos_>':
                  words = ['<sos>'] + line.split() + [eos_seq[incr]]
                  for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1
                incr +=1
        return ids

In [6]:
###############################################################################
# Load data
###############################################################################

data = './corpus/'
corpus = Corpus(data)

In [7]:
print(corpus.dictionary.total)
print(len(corpus.dictionary.idx2word))
print(len(corpus.dictionary.word2idx))

print(corpus.train.shape)
print(corpus.train[0:7])
print([corpus.dictionary.idx2word[corpus.train[i]] for i in range(40)])

print(corpus.valid.shape)
print(corpus.valid[0:7])
print([corpus.dictionary.idx2word[corpus.valid[i]] for i in range(7)])

938548
19215
19215
torch.Size([565726])
tensor([0, 1, 2, 3, 4, 5, 6])
['<sos>', 'Impatients', 'désirs', 'd’', 'une', 'illustre', 'vengeance', '<eos1>', '<sos>', 'Dont', 'la', 'mort', 'de', 'mon', 'père', 'a', 'formé', 'la', 'naissance', ',', '<eos1>', '<sos>', 'Enfants', 'impétueux', 'de', 'mon', 'ressentiment', '<eos9>', '<sos>', 'Que', 'ma', 'douleur', 'séduite', 'embrasse', 'aveuglément', ',', '<eos9>', '<sos>', 'Vous', 'régnez']
torch.Size([189741])
tensor([    0,   252,   214, 11853,     9,   976,    42])
['<sos>', 'Je', 'lui', 'prescris', 'la', 'loi', 'que']


In [8]:
# We now have data under a very long list of indexes: the text is as one sequence.
# The idea now is to create batches from this. Note that this is absolutely not the best
# way to proceed with large quantities of data (where we'll try not to store huge tensors
# in memory but read them from file as we go) !
# Here, we are looking for simplicity and efficiency with regards to computation time.
# That is why we will ignore sentence separations and treat the data as one long stream that
# we will cut arbitrarily as we need.
# With the alphabet being our data, we currently have the sequence:
# [a b c d e f g h i j k l m n o p q r s t u v w x y z]
# We want to reorganize it as independant batches that will be processed independantly by the model !
# For instance, with the alphabet as the sequence and batch size 4, we'd get the 4 following sequences:
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘
# with the last two elements being lost.
# Again, these columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient processing.

def batchify(data, batch_size, cuda = False):
    # Cut the elements that are unnecessary
    nbatch = data.size(0) // batch_size
    data = data.narrow(0, 0, nbatch * batch_size)
    # Reorganize the data
    data = data.view(batch_size, -1).t().contiguous()
    # If we can use a GPU, let's tranfer the tensor to it
    return data.to(device)

# get_batch subdivides the source data into chunks of the appropriate length.
# If source is equal to the example output of the batchify function, with
# a sequence length (seq_len) of 3, we'd get the following two variables:
# ┌ a g m s ┐ ┌ b h n t ┐
# | b h n t | | c i o u │
# └ c i o u ┘ └ d j p v ┘
# The first variable contains the letters input to the network, while the second
# contains the one we want the network to predict (b for a, h for g, v for u, etc..)
# Note that despite the name of the function, we are cutting the data in the
# temporal dimension, since we already divided data into batches in the previous
# function. 

def get_batch(source, i, seq_len, evaluation=False):
    # Deal with the possibility that there's not enough data left for a full sequence
    seq_len = min(seq_len, len(source) - 1 - i)
    # Take the input data
    data = source[i:i+seq_len]
    # Shift by one for the target data
    target = source[i+1:i+1+seq_len]
    return data, target

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
batch_size = 100
eval_batch_size = 4
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

print(train_data.shape)
print(val_data.shape)

torch.Size([5657, 100])
torch.Size([47435, 4])


### LSTM Cells in pytorch

### Creating our own LSTM Model

In [11]:
# Models are usually implemented as custom nn.Module subclass
# We need to redefine the __init__ method, which creates the object
# We also need to redefine the forward method, which transform the input into outputs
# We can also add any method that we need: here, in order to initiate weights in the model

class LSTMModel(nn.Module):
    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(LSTMModel, self).__init__()
        # Create a dropout object to use on layers for regularization
        self.drop = nn.Dropout(dropout)
        # Create an encoder - which is an embedding layer
        self.encoder = nn.Embedding(ntoken, ninp)
        # Create the LSTM layers - find out how to stack them !
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        # Create what we call the decoder: a linear transformation to map the hidden state into scores for all words in the vocabulary
        # (Note that the softmax application function will be applied out of the model)
        self.decoder = nn.Linear(nhid, ntoken)
        
        # Initialize non-reccurent weights 
        self.init_weights()

        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        
    def init_weights(self):
        # Initialize the encoder and decoder weights with the uniform distribution
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
    def init_hidden(self, batch_size):
        # Initialize the hidden state and cell state to zero, with the right sizes
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, batch_size, self.nhid),
                weight.new_zeros(self.nlayers, batch_size, self.nhid))    

    def forward(self, input, hidden, return_h=False):
        # Process the input
        emb = self.drop(self.encoder(input))   
        
        # Apply the LSTMs
        output, hidden = self.rnn(emb, hidden)
        
        # Decode into scores
        output = self.drop(output)      
        decoded = self.decoder(output)
        return decoded, hidden

### Building the Model

In [12]:
# Set the random seed manually for reproducibility.
torch.manual_seed(1)

<torch._C.Generator at 0x7f77696f7b58>

In [13]:
embedding_size = 500
hidden_size = 500
layers = 2
dropout = 0.2

###############################################################################
# Build the model
###############################################################################

vocab_size = len(corpus.dictionary)
model = LSTMModel(vocab_size, embedding_size, hidden_size, layers, dropout).to(device)
params = list(model.parameters())
criterion = nn.CrossEntropyLoss()

In [14]:
lr = 10.0
optimizer = 'sgd'
wdecay = 1.2e-6
# For gradient clipping
clip = 0.25

if optimizer == 'sgd':
    optim = torch.optim.SGD(params, lr=lr, weight_decay=wdecay)
if optimizer == 'adam':
    optim = torch.optim.Adam(params, lr=lr, weight_decay=wdecay)

In [15]:
# Let's think about gradient propagation:
# We plan to keep the second ouput of the LSTM layer (the hidden/cell states) to initialize
# the next call to LSTM. In this way, we can back-propagate the gradient for as long as we want.
# However, this put a huge strain on the memory used by the model, since it implies retaining
# a always-growing number of tensors of gradients in the cache.
# We decide to not backpropagate through time beyond the current sequence ! 
# We use a specific function to cut the 'hidden/state cell' states from their previous dependencies
# before using them to initialize the next call to the LSTM.
# This is done with the .detach() function.

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [16]:
# Other global parameters
epochs = 50
seq_len = 40
log_interval = 10
save = 'model.pt'

In [17]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch(data_source, i, seq_len)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            total_loss += len(data) * criterion(output.view(-1, vocab_size), targets.view(-1)).item()
    return total_loss / (len(data_source) - 1)

In [18]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_len)):
        data, targets = get_batch(train_data, i, seq_len)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()
        
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(params, clip)
        optim.step()
        
        total_loss += loss.data

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // seq_len, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [19]:
# Loop over epochs.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |    10/  141 batches | lr 10.00 | ms/batch 271.42 | loss 10.20 | ppl 27016.21
| epoch   1 |    20/  141 batches | lr 10.00 | ms/batch 234.10 | loss  8.06 | ppl  3174.22
| epoch   1 |    30/  141 batches | lr 10.00 | ms/batch 234.15 | loss  8.04 | ppl  3109.75
| epoch   1 |    40/  141 batches | lr 10.00 | ms/batch 234.22 | loss  7.33 | ppl  1529.60
| epoch   1 |    50/  141 batches | lr 10.00 | ms/batch 236.15 | loss  7.39 | ppl  1615.96
| epoch   1 |    60/  141 batches | lr 10.00 | ms/batch 234.04 | loss  7.24 | ppl  1393.04
| epoch   1 |    70/  141 batches | lr 10.00 | ms/batch 236.02 | loss  7.01 | ppl  1108.71
| epoch   1 |    80/  141 batches | lr 10.00 | ms/batch 235.93 | loss  6.99 | ppl  1083.34
| epoch   1 |    90/  141 batches | lr 10.00 | ms/batch 236.21 | loss  6.95 | ppl  1041.86
| epoch   1 |   100/  141 batches | lr 10.00 | ms/batch 237.95 | loss  6.94 | ppl  1036.89
| epoch   1 |   110/  141 batches | lr 10.00 | ms/batch 234.22 | loss  6.82 | ppl   916.05

In [20]:
def pretty_print(words):
  pretty_mat = []
  i,last_eos = 0, 0
  for j in range(len(words)-1,-1,-1):
    if words[j][:4] == '<eos':
      last_eos = j
      break
  while i<last_eos:
    line = []
    while (words[i][:4] != '<eos'):
      line.append(words[i])
      i +=1
    line.append(words[i])
    i +=1
    pretty_mat.append(line[1:])
  for line in pretty_mat :
    print(' '.join(line))
  print('\n')

In [21]:
def beam_search_decode(device, net, words, vocab_to_int, int_to_vocab, top_k, temperature):
  net.eval()
  softmax = nn.Softmax(dim = 1)
  words = words.split(' ')
  words.append('<sos>')
  hidden = net.init_hidden(1)
  for v in hidden:
    v = v.to(device)
  for w in words:
    ix = torch.tensor([[vocab_to_int[w]]]).to(device)
    output, hidden = net(ix, hidden)
  output = output / temperature
  prob, top_ix = torch.topk(softmax(output[0]), k=top_k)
  prob = torch.log(prob)
  #print(top_ix)
  list_ids = [[id] for id in top_ix[0].tolist()]
  outputs = [output for _ in range(top_k)]
  hiddens = [hidden for _ in range(top_k)] 
  #print("avant beam search, top indices : ",top_ix.tolist(), "de proba", prob.tolist())
  #print(list_ids)
  for i in range(100):
    probas = torch.zeros(top_k, top_k).float().to(device)
    indxes = torch.zeros(top_k, top_k).to(device)
    for k in range(top_k):
      ix = torch.tensor([[top_ix[0][k]]]).to(device)
      output, hiddens[k] = net(ix, hiddens[k])
      output = output / temperature
      pro, indxes[k] = torch.topk(softmax(output[0]), k=top_k)
      pro = torch.log(pro)
      #print("probas du choix ", k+1," : ", pro.tolist())
      probas[k] = torch.add(pro[0], prob[0][k])
    #print(indxes.tolist())
    #print(list_ids)
    prob, indices = torch.topk(probas.flatten(), top_k)
    prob = torch.unsqueeze(prob, 0)
    for k in range(top_k):
      top_ix[0][k] = indxes.flatten()[indices[k]]
    indices = indices // top_k
    temp1 = []
    temp2 = []
    for k in range(top_k):
      temp1.append(hiddens[indices.tolist()[k]])
      temp2.append(list_ids[indices.tolist()[k]] + [top_ix[0].tolist()[k]])
    hiddens = temp1
    list_ids = temp2
    #print("top indices : ",top_ix.tolist(), "de proba", prob.tolist())
  best_branch = list_ids[torch.argmax(prob)]
  words = []
  for id in best_branch:
    words.append(int_to_vocab[id])
  pretty_print(words)

In [22]:
def predict(device, net, words, vocab_to_int, int_to_vocab, temperature):
  net.eval()
  softmax = nn.Softmax(dim=1)
  words = words.split(' ')
  hidden = net.init_hidden(1)
  for v in hidden:
    v = v.to(device)
  for w in words:
    ix = torch.tensor([[vocab_to_int[w]]]).to(device)
    output, hidden = net(ix, hidden)
  output = output / temperature
  idx_max = torch.argmax(softmax(output[0]))
  words = []
  words.append(int_to_vocab[idx_max])
  for i in range(0, 100):
      ix = torch.tensor([[idx_max]]).to(device)
      output, hidden = net(ix, hidden)
      output = output / temperature
      idx_max = torch.argmax(softmax(output[0]))
      words.append(int_to_vocab[idx_max])
  pretty_print(words)

In [23]:
def top_k_sampling(device, net, words, vocab_to_int, int_to_vocab, top_k, temperature):
  net.eval()
  softmax = nn.Softmax(dim=-1)
  words = words.split(' ')
  hidden = net.init_hidden(1)
  for v in hidden:
    v = v.to(device)
  for w in words:
    ix = torch.tensor([[vocab_to_int[w]]]).to(device)
    output, hidden = net(ix, hidden)
  output = output / temperature
  indices_to_remove = output[0] < torch.topk(output[0], top_k)[0][..., -1, None]
  output[0][indices_to_remove] = -float('Inf')
  prob = softmax(output[0])
  idx_max = torch.multinomial(prob, 1)
  words = []
  words.append(int_to_vocab[idx_max])
  for i in range(0, 100):
      ix = torch.tensor([[idx_max]]).to(device)
      output, hidden = net(ix, hidden)
      output = output[0][0] / temperature
      indices_to_remove = output < torch.topk(output, top_k)[0][..., -1, None]
      output[indices_to_remove] = -float('Inf')
      prob = softmax(output)
      idx_max = torch.multinomial(prob, 1)
      words.append(int_to_vocab[idx_max])
  pretty_print(words)

In [25]:
words = '<sos> Je ne t’ en parle plus , va , sers la tyrannie , <eos3> <sos> Abandonne ton âme à son lâche génie ; <eos3>'
predict(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word,1)
beam_search_decode(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word,10,0.2)
beam_search_decode(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word,10,0.5)
beam_search_decode(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word,10,1)
top_k_sampling(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word, 10, 1)
top_k_sampling(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word, 50, 1)
top_k_sampling(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word, 50, 0.5)
top_k_sampling(device, model, words, corpus.dictionary.word2idx, corpus.dictionary.idx2word, 250, 0.7)

Mais si de ce grand rang le ciel est de l’ empire , <eos13>
Et que de sa vertu les plaisirs de l’ empire , <eos13>
Ne me donne point à vous , et ne puis rien de vous . <eos4>
Je ne sais point d’ état , et ne m’ en peut défendre ; <eos16>
Et je ne puis souffrir qu’ un si grand changement <eos9>
Ne peut être à l’ amour de l’ amour de l’ amour . <eos26>


si de ce grand rang le ciel m’ a fait naître , <eos18>
Je ne sais qu’ un rival , et le Roi m’ a fait naître . <eos18>
Et si l’ on m’ en veut croire , il faut qu’ il s’ en souvienne . <eos17>
Et bien , pour le tirer , il est toujours aimable , <eos32>
Et je ne puis souffrir qu’ un si grand changement <eos9>
Ne peut être à l’ amour d’ un pouvoir absolu . <eos46>


si de ce grand rang la fortune est suivie , <eos3>
Je ne m’ en défends point , et mon cœur est à moi , <eos6>
Et je ne puis souffrir qu’ un autre que je suis . <eos12>
Je ne sais si je suis , et ce que je puis croire <eos33>
À ce que je puis voir , et ce n’ est qu’ un faux crime 

Je veux , et l’ en a vu . . Mais il faut à l’ entendre : . . ; <eos> 

Je vous dois voir à vous . Et que j’ ai su l’ attendre <eos> 

Ce que j’ avais pu voir , et l’ autre à l’ honneur ; <eos> 

Il n’ a rien dit qu’ en ce mot je n’ en dois point de rien

**Ajout du SoS**

. . . Ah ! Seigneur , c’ est moi - même . <eos> 

<sos> Ah ! Madame , il est vrai , je n’ en veux point douter , <eos>

<sos> Et je n’ ai pas besoin de m’ en faire haïr .