In [None]:
!pip install fast_bleu
!pip3 install haspirater frhyme plint
!pip install kora
import torch
import torch.nn as nn

Collecting fast_bleu
  Downloading https://files.pythonhosted.org/packages/50/9d/82d4dec5947242dad485af5d95c62d2735f19f4dde828905d62d20ccc2f5/fast-bleu-0.0.86.tar.gz
Building wheels for collected packages: fast-bleu
  Building wheel for fast-bleu (setup.py) ... [?25l[?25hdone
  Created wheel for fast-bleu: filename=fast_bleu-0.0.86-cp36-cp36m-linux_x86_64.whl size=581605 sha256=5184106897c2119feb1eb10d8e125addee13f53ad71528ed42d07e969fe88d0a
  Stored in directory: /root/.cache/pip/wheels/e9/69/bb/3ca889cdb275ea238498844f2a65e839105db499c404f544d7
Successfully built fast-bleu
Installing collected packages: fast-bleu
Successfully installed fast-bleu-0.0.86
Collecting haspirater
  Downloading https://files.pythonhosted.org/packages/4f/18/f451ccd267b3d0a97bd46e8854b3226164219b24afb6c669d89626df5653/haspirater-0.2-py3-none-any.whl
Collecting frhyme
[?25l  Downloading https://files.pythonhosted.org/packages/a5/ff/85a73b2672fe8d40b870c92aa0908419c03025b27c4b30ee44eaf4218fdd/frhyme-0.3-py3-

### Tools for data processing 

In [None]:
import os
import time
import math
from collections import Counter
import pprint
pp = pprint.PrettyPrinter(indent=1)

In [None]:
# mots_rimes.txt alternates two kinds of lines: even lines (0, 2, ...) start
# with a rhyme marker, odd lines hold the whitespace-separated words of the
# corresponding rhyme category.
with open('mots_rimes.txt', encoding='utf-8') as f:
  categories = []
  eos_tokens = []
  incr = 0
  for line in f:
    fields = line.split()
    if incr % 2 == 0:
      # Only the first field of a marker line is kept.
      eos_tokens.append(fields[0])
    else:
      categories.append(fields)
    incr += 1
print(categories[1])
print(eos_tokens[0])

['Silence', 'Balance', 'convalescence', 'instance', 'Ordonnance', 'surséance', 'Vengeance', 'Clémence', 'prééminence', 'condescendance', 'Innocence', 'Puissance', 'Danse', 'Dépense', 'Innocence', 'indécence', 'Térence', 'Maxence', 'Byzance', 'Constance', 'Florence', 'France', 'abondance', 'absence', 'alliance', 'allégeance', 'apparence', 'arrogance', 'assistance', 'assurance', 'audience', 'avance', 'balance', 'bienséance', 'bienveillance', 'circonstance', 'clémence', 'commence', 'complaisance', 'concurrence', 'confiance', 'confidence', 'conférence', 'connaissance', 'conscience', 'constance', 'conséquence', 'croyance', 'créance', 'danse', 'devance', 'différence', 'diligence', 'dispense', 'distance', 'défense', 'défiance', 'déférence', 'délivrance', 'dépendance', 'désobéissance', 'enfance', 'espérance', 'excellence', 'expérience', 'extravagance', 'ignorance', 'immense', 'impatience', 'importance', 'imprudence', 'impudence', 'impuissance', 'inclémence', 'inconstance', 'indifférence', 'ind

In [None]:
class Dictionary(object):
    """Two-way word/index vocabulary that also counts word occurrences."""

    def __init__(self):
        # word -> index, index -> word, word -> number of add_word calls,
        # and the overall number of add_word calls.
        self.word2idx = dict()
        self.idx2word = list()
        self.counter = dict()
        self.total = 0

    def add_word(self, word):
        """Register one occurrence of `word` and return its index.

        Unseen words receive the next free index.
        """
        is_new = word not in self.word2idx
        if is_new:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
            self.counter.setdefault(word, 0)
        self.counter[word] = self.counter[word] + 1
        self.total = self.total + 1
        return self.word2idx[word]

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)

In [None]:
class Corpus(object):
    """Tokenized train/valid/test splits sharing a single Dictionary.

    Only lines whose last real word belongs to a known rhyme category are
    kept; for those lines the category marker is inserted right before the
    rhyme word. The rhyme data comes from the module-level `categories` and
    `eos_tokens` lists.
    """

    # Trailing tokens that are skipped when looking for the rhyme word of a line.
    # '<sos>' is deliberately absent, so the backward scan always stops at index 0.
    _NON_RHYME_TOKENS = frozenset(['"', '.', ',', ';', ':', '?', '!', ' ', ')', '»',
                                   '-', '\xa0', '\n', '<eos>'])

    def __init__(self, path):
        # We create an object Dictionary associated to Corpus
        self.dictionary = Dictionary()
        # We go through all files, adding all words to the dictionary
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenize a text file into a 1-D LongTensor of dictionary indexes.

        Each line becomes '<sos>' + words + '<eos>'. The last token that is not
        punctuation is taken as the rhyme word; if it belongs to a rhyme
        category, that category's marker is inserted just before it and the
        whole line is added to the dictionary and to the output. Lines whose
        rhyme word is in no category are dropped.

        Args:
            path: path of a UTF-8 text file with one verse per line.

        Returns:
            torch.LongTensor holding the indexes of all kept tokens, in order.
        """
        assert os.path.exists(path), path

        # word -> marker of the first category containing it. Built once so each
        # line costs one dict lookup, and so any number of categories is
        # supported (the previous code assumed exactly 100).
        rhyme_marker = {}
        for cat_words, marker in zip(categories, eos_tokens):
            for cat_word in cat_words:
                rhyme_marker.setdefault(cat_word, marker)

        ids = []
        # Same encoding as the rhyme file, independent of the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                words = ['<sos>'] + line.split() + ['<eos>']
                # Walk back from '<eos>' over trailing punctuation.
                inc = len(words) - 1
                while words[inc] in self._NON_RHYME_TOKENS:
                    inc -= 1
                marker = rhyme_marker.get(words[inc])
                if marker is None:
                    continue
                words.insert(inc, marker)
                # add_word returns the index, so a second pass over the file
                # is not needed.
                ids.extend(self.dictionary.add_word(word) for word in words)
        return torch.tensor(ids, dtype=torch.long)

In [None]:
###############################################################################
# Load data
###############################################################################

# Directory that must contain train.txt, valid.txt and test.txt (see Corpus.__init__).
data = './corpus/'
# Building the Corpus tokenizes the three files and fills corpus.dictionary.
corpus = Corpus(data)

In [None]:
# Vocabulary statistics: total token count, then the two views of the vocabulary.
print(corpus.dictionary.total)
print(len(corpus.dictionary.idx2word))
print(len(corpus.dictionary.word2idx))

# For each split: tensor shape, first indexes, and the first tokens decoded back to words.
for split_ids, n_preview in ((corpus.train, 40), (corpus.valid, 7)):
    print(split_ids.shape)
    print(split_ids[0:7])
    print([corpus.dictionary.idx2word[split_ids[i]] for i in range(n_preview)])

2000441
28918
28918
torch.Size([1502851])
tensor([0, 1, 2, 3, 4, 5, 6])
['<sos>', 'Impatients', 'désirs', 'd’', 'une', 'illustre', 'vengeance', '<eos>', '<sos>', 'Dont', 'la', 'mort', 'de', 'mon', 'père', 'a', 'formé', 'la', 'naissance', ',', '<eos>', '<sos>', 'Enfants', 'impétueux', 'de', 'mon', 'ressentiment', '<eos>', '<sos>', 'Que', 'ma', 'douleur', 'séduite', 'embrasse', 'aveuglément', ',', '<eos>', '<sos>', 'Vous', 'régnez']
torch.Size([252869])
tensor([    0,    87,    96, 14596,     9,  1267,    40])
['<sos>', 'Je', 'lui', 'prescris', 'la', 'loi', 'que']


In [None]:
# We now have data under a very long list of indexes: the text is as one sequence.
# The idea now is to create batches from this. Note that this is absolutely not the best
# way to proceed with large quantities of data (where we'll try not to store huge tensors
# in memory but read them from file as we go) !
# Here, we are looking for simplicity and efficiency with regards to computation time.
# That is why we will ignore sentence separations and treat the data as one long stream that
# we will cut arbitrarily as we need.
# With the alphabet being our data, we currently have the sequence:
# [a b c d e f g h i j k l m n o p q r s t u v w x y z]
# We want to reorganize it as independent batches that will be processed independently by the model!
# For instance, with the alphabet as the sequence and batch size 4, we'd get the 4 following sequences:
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘
# with the last two elements being lost.
# Again, these columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient processing.

def batchify(data, batch_size, cuda = False):
    """Reshape a 1-D token stream into `batch_size` parallel columns.

    Trailing tokens that do not fill a complete row are dropped. The result has
    one column per stream and is moved to the module-level `device`.
    Note: the `cuda` argument is accepted but not read by this function.
    """
    # Largest prefix whose length is a multiple of batch_size
    usable = (data.size(0) // batch_size) * batch_size
    # Consecutive tokens run down each column after the transpose
    columns = data[:usable].view(batch_size, -1).t().contiguous()
    return columns.to(device)

# get_batch subdivides the source data into chunks of the appropriate length.
# If source is equal to the example output of the batchify function, with
# a sequence length (seq_len) of 3, we'd get the following two variables:
# ┌ a g m s ┐ ┌ b h n t ┐
# | b h n t | | c i o u │
# └ c i o u ┘ └ d j p v ┘
# The first variable contains the letters input to the network, while the second
# contains the one we want the network to predict (b for a, h for g, v for u, etc..)
# Note that despite the name of the function, we are cutting the data in the
# temporal dimension, since we already divided data into batches in the previous
# function. 

def get_batch(source, i, seq_len, evaluation=False):
    """Return (input, target) chunks of `source` starting at row `i`.

    The target is the input shifted by one row. The chunk is shortened near the
    end of `source` so the target never runs past the last row.
    Note: the `evaluation` argument is accepted but not read by this function.
    """
    # Rows still available once one is reserved for the shifted target
    span = min(seq_len, len(source) - 1 - i)
    return source[i:i + span], source[i + 1:i + 1 + span]

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# batchify and the model construction both read this module-level value.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Number of parallel streams for training and for evaluation.
batch_size, eval_batch_size = 100, 4
train_data = batchify(corpus.train, batch_size)
# Validation and test data share the evaluation batch size.
val_data, test_data = [batchify(split_ids, eval_batch_size)
                       for split_ids in (corpus.valid, corpus.test)]

for batched in (train_data, val_data):
    print(batched.shape)

torch.Size([15028, 100])
torch.Size([63217, 4])


### LSTM Cells in pytorch

### Creating our own LSTM Model

In [None]:
# Models are usually implemented as custom nn.Module subclass
# We need to redefine the __init__ method, which creates the object
# We also need to redefine the forward method, which transform the input into outputs
# We can also add any method that we need: here, in order to initiate weights in the model

class LSTMModel(nn.Module):
    """Language model: embedding -> stacked LSTM -> linear projection onto the vocabulary."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.ninp, self.nhid, self.nlayers = ninp, nhid, nlayers
        # Dropout applied to the embeddings and to the LSTM output
        self.drop = nn.Dropout(dropout)
        # Encoder: token index -> embedding vector
        self.encoder = nn.Embedding(ntoken, ninp)
        # nlayers stacked LSTM layers, with dropout passed through to nn.LSTM
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        # Decoder: hidden state -> one score per vocabulary entry
        # (the softmax is applied outside of the model)
        self.decoder = nn.Linear(nhid, ntoken)

        # Initialize the non-recurrent weights
        self.init_weights()

    def init_weights(self):
        """Uniform(-0.1, 0.1) encoder/decoder weights, zero decoder bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.constant_(self.decoder.bias, 0)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def init_hidden(self, batch_size):
        """Return zeroed (hidden, cell) states matching the parameters' dtype and device."""
        reference = next(self.parameters())
        state_shape = (self.nlayers, batch_size, self.nhid)
        return reference.new_zeros(state_shape), reference.new_zeros(state_shape)

    def forward(self, input, hidden, return_h=False):
        """Score every vocabulary entry at each position; also return the new LSTM state.

        Note: `return_h` is accepted but not read.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        scores = self.decoder(self.drop(rnn_out))
        return scores, hidden

### Building the Model

In [None]:
# Set the random seed manually for reproducibility.
# This seeds PyTorch's generator only; it is called before the model is built.
torch.manual_seed(1)

<torch._C.Generator at 0x7f20af16fba0>

In [None]:
# Model hyper-parameters, passed positionally to LSTMModel below
# (ninp, nhid, nlayers, dropout).
embedding_size = 500
hidden_size = 1024
layers = 2
dropout = 0.2

###############################################################################
# Build the model
###############################################################################

# One output score per dictionary entry.
vocab_size = len(corpus.dictionary)
model = LSTMModel(vocab_size, embedding_size, hidden_size, layers, dropout).to(device)
# Materialized once: the same list is handed to the optimizer and to gradient clipping.
params = list(model.parameters())
criterion = nn.CrossEntropyLoss()

In [None]:
# Optimization hyper-parameters.
lr = 10.0
optimizer = 'sgd'
wdecay = 1.2e-6
# For gradient clipping
clip = 0.25

# Exactly one branch must define `optim`; an unknown name fails here instead of
# surfacing later as a NameError inside train().
if optimizer == 'sgd':
    optim = torch.optim.SGD(params, lr=lr, weight_decay=wdecay)
elif optimizer == 'adam':
    optim = torch.optim.Adam(params, lr=lr, weight_decay=wdecay)
else:
    raise ValueError("optimizer must be 'sgd' or 'adam', got {!r}".format(optimizer))

In [None]:
# Let's think about gradient propagation:
# We plan to keep the second output of the LSTM layer (the hidden/cell states) to initialize
# the next call to LSTM. In this way, we can back-propagate the gradient for as long as we want.
# However, this puts a huge strain on the memory used by the model, since it implies retaining
# an ever-growing number of gradient tensors in the cache.
# We decide to not backpropagate through time beyond the current sequence ! 
# We use a specific function to cut the 'hidden/state cell' states from their previous dependencies
# before using them to initialize the next call to the LSTM.
# This is done with the .detach() function.

def repackage_hidden(h):
    """Return `h` with every tensor detached from its autograd history.

    A tensor is detached directly; any other iterable is rebuilt recursively
    as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [None]:
# Other global parameters
epochs = 30  # number of iterations of the epoch loop
seq_len = 40  # chunk length handed to get_batch by train() and evaluate()
log_interval = 10  # train() prints a progress line every log_interval batches
save = 'model.pt'  # file the best model is written to and reloaded from

In [None]:
def evaluate(data_source):
    """Return the average per-position loss of the global `model` on `data_source`.

    Args:
        data_source: batchified tensor (rows are time steps, columns are streams).

    Returns:
        Sum of the chunk losses weighted by chunk length, divided by the
        number of predicted rows (len(data_source) - 1).
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    # Size the hidden state from the data itself, so any batch width works and
    # not only the global eval_batch_size.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch(data_source, i, seq_len)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            # criterion averages over the chunk; re-weight by its length.
            total_loss += len(data) * criterion(output.view(-1, vocab_size), targets.view(-1)).item()
    return total_loss / (len(data_source) - 1)

In [None]:
def train():
    """Run one pass over the global `train_data`, updating `model` through `optim`.

    Reads the module-level model, optimizer, data and hyper-parameters, plus
    the current `epoch` number for logging. Prints a progress line every
    `log_interval` batches.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_len)):
        data, targets = get_batch(train_data, i, seq_len)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(params, clip)
        optim.step()

        # Accumulate a plain Python float rather than a tensor.
        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # Report the rate the optimizer really applies, not the global `lr`.
            current_lr = optim.param_groups[0]['lr']
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // seq_len, current_lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0.
            start_time = time.time()

In [None]:
# Loop over epochs.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # `is None` (not truthiness) so that a best loss of 0.0 is not mistaken for "unset".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
            # The optimizer keeps its own copy of the rate: rebinding the global
            # alone would leave training unaffected.
            for param_group in optim.param_groups:
                param_group['lr'] = lr
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
# NOTE(review): this pickles/unpickles the whole module; recent PyTorch releases
# may require torch.load(..., weights_only=False) for that — confirm against the
# installed version.
with open(save, 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |    10/  375 batches | lr 10.00 | ms/batch 337.83 | loss 10.38 | ppl 32151.94
| epoch   1 |    20/  375 batches | lr 10.00 | ms/batch 298.88 | loss  8.27 | ppl  3906.01
| epoch   1 |    30/  375 batches | lr 10.00 | ms/batch 300.37 | loss  7.74 | ppl  2302.01
| epoch   1 |    40/  375 batches | lr 10.00 | ms/batch 303.33 | loss  7.61 | ppl  2020.72
| epoch   1 |    50/  375 batches | lr 10.00 | ms/batch 304.50 | loss  7.25 | ppl  1407.82
| epoch   1 |    60/  375 batches | lr 10.00 | ms/batch 304.01 | loss  7.05 | ppl  1152.44
| epoch   1 |    70/  375 batches | lr 10.00 | ms/batch 305.77 | loss  7.03 | ppl  1128.31
| epoch   1 |    80/  375 batches | lr 10.00 | ms/batch 306.60 | loss  6.93 | ppl  1019.55
| epoch   1 |    90/  375 batches | lr 10.00 | ms/batch 310.56 | loss  6.84 | ppl   938.45
| epoch   1 |   100/  375 batches | lr 10.00 | ms/batch 312.45 | loss  6.74 | ppl   845.07
| epoch   1 |   110/  375 batches | lr 10.00 | ms/batch 315.43 | loss  6.46 | ppl   640.72

In [None]:
#from nltk.translate.bleu_score import sentence_bleu
#from nltk.translate.bleu_score import SmoothingFunction
#smoothie = SmoothingFunction().method4
#reference = train_data.tolist()
# Number of decoding steps run by every generation function after the first predicted token.
n_words = 200
def pretty_print(words, beam=False, filename=''):
  """Render generated tokens as verse lines, printed or appended to a file.

  Only the text up to the last '<eos>' is rendered. Rhyme markers (members of
  the module-level `eos_tokens`) are not shown, and a token following one that
  contains an apostrophe ('’') is glued to it. The first token of every line
  is dropped, except for the first line when `beam` is true.

  Side effect: `words` is modified in place (each rhyme marker is overwritten
  by the token before it); pass a copy to keep the original sequence intact.

  Args:
    words: list of token strings.
    beam: keep the first token of the first line.
    filename: '' to print to stdout, otherwise a file appended to as UTF-8.
  """
  # Position of the last '<eos>': anything after it is an unfinished line.
  last_eos = 0
  for j in range(len(words) - 1, -1, -1):
    if words[j] == '<eos>':
      last_eos = j
      break
  pretty_mat = []
  i = 0
  while i < last_eos:
    line = []
    while words[i] != '<eos>':
      if words[i] in eos_tokens:
        # Overwrite the marker with the previous token so the apostrophe test
        # on the next token still sees the word that precedes the marker.
        words[i] = words[i-1]
      elif i > 0 and line and '’' in words[i-1]:
        line[-1] += words[i]
      else:
        line.append(words[i])
      i += 1
    i += 1
    if not pretty_mat and beam:
      pretty_mat.append(line)
    else:
      pretty_mat.append(line[1:])
  if filename == '':
    for line in pretty_mat:
      print(' '.join(line))
    print('\n')
  else:
    # Explicit encoding: the text is French and must not depend on the locale.
    with open(filename, 'a', encoding='utf-8') as f:
      f.writelines(' '.join(line) + '\n' for line in pretty_mat)

In [None]:
def beam_search_decode(device, net, words, vocab_to_int, int_to_vocab, top_k, temperature, pprint=True):
  """Continue a prompt with beam search of width `top_k`.

  The prompt (space-separated tokens, with '<sos>' appended) primes the
  network; `n_words` expansion steps follow the first prediction. Branch scores
  are sums of log-probabilities of the temperature-scaled distribution.

  Args:
    device: device on which prompt tensors are created.
    net: model exposing eval(), init_hidden(batch) and net(input, hidden).
    words: prompt string; every token must be a key of `vocab_to_int`.
    vocab_to_int / int_to_vocab: vocabulary lookups in both directions.
    top_k: beam width, also the number of continuations kept per branch.
    temperature: divisor applied to the scores before normalization.
    pprint: print the best branch when true, return it otherwise.

  Returns:
    The token list of the best branch when `pprint` is false, else None.
  """
  net.eval()
  prompt = words.split(' ')
  prompt.append('<sos>')
  # Inference only: without no_grad every step would extend the autograd graph.
  with torch.no_grad():
    hidden = net.init_hidden(1)
    for w in prompt:
      ix = torch.tensor([[vocab_to_int[w]]]).to(device)
      output, hidden = net(ix, hidden)
    # log_softmax avoids computing log(0) for tokens whose probability underflows.
    first_log_probs = torch.log_softmax(output[0][0] / temperature, dim=-1)
    beam_scores, beam_tokens = torch.topk(first_log_probs, k=top_k)
    list_ids = [[tok] for tok in beam_tokens.tolist()]
    hiddens = [hidden for _ in range(top_k)]
    for _ in range(n_words):
      cand_scores, cand_tokens, stepped = [], [], []
      for k in range(top_k):
        ix = beam_tokens[k].view(1, 1)
        output, h_k = net(ix, hiddens[k])
        log_probs = torch.log_softmax(output[0][0] / temperature, dim=-1)
        best_lp, best_tok = torch.topk(log_probs, k=top_k)
        cand_scores.append(best_lp + beam_scores[k])
        cand_tokens.append(best_tok)
        stepped.append(h_k)
      # Keep the top_k best of the top_k * top_k candidates (flattened branch-major).
      flat_tokens = torch.cat(cand_tokens)
      beam_scores, picked = torch.topk(torch.cat(cand_scores), top_k)
      beam_tokens = flat_tokens[picked]
      parents = [flat_pos // top_k for flat_pos in picked.tolist()]
      chosen = beam_tokens.tolist()
      hiddens = [stepped[parent] for parent in parents]
      list_ids = [list_ids[parent] + [tok] for parent, tok in zip(parents, chosen)]
  best_branch = list_ids[int(torch.argmax(beam_scores))]
  generated = [int_to_vocab[tok] for tok in best_branch]
  if pprint:
    print('beam', top_k, temperature)
    pretty_print(generated, beam=True)
  else:
    return generated
In [None]:
def predict(device, net, words, vocab_to_int, int_to_vocab, temperature, pprint=True):
  """Continue a prompt greedily: always emit the highest-scoring next token.

  The prompt (space-separated tokens) primes the network; one token is read
  from its last output, then `n_words` further tokens are generated.

  Args:
    device: device on which input tensors are created.
    net: model exposing eval(), init_hidden(batch) and net(input, hidden).
    words: prompt string; every token must be a key of `vocab_to_int`.
    vocab_to_int / int_to_vocab: vocabulary lookups in both directions.
    temperature: divisor applied to the scores before taking the argmax.
    pprint: print the result when true, return it otherwise.

  Returns:
    The list of n_words + 1 generated tokens when `pprint` is false, else None.
  """
  net.eval()
  generated = []
  # Inference only: without no_grad every step would extend the autograd graph.
  with torch.no_grad():
    hidden = net.init_hidden(1)
    for w in words.split(' '):
      ix = torch.tensor([[vocab_to_int[w]]]).to(device)
      output, hidden = net(ix, hidden)
    # Softmax is monotonic, so the argmax can be taken on the scaled scores directly.
    idx_max = torch.argmax(output[0] / temperature).item()
    generated.append(int_to_vocab[idx_max])
    for _ in range(n_words):
      ix = torch.tensor([[idx_max]]).to(device)
      output, hidden = net(ix, hidden)
      idx_max = torch.argmax(output[0] / temperature).item()
      generated.append(int_to_vocab[idx_max])
  if pprint:
    print('greedy')
    pretty_print(generated)
  else:
    return generated

In [None]:
def top_k_sampling(device, net, words, vocab_to_int, int_to_vocab, top_k, temperature, pprint=True):
  """Continue a prompt by sampling each token among the `top_k` best-scored ones.

  The prompt (space-separated tokens) primes the network; one token is sampled
  from its last output, then `n_words` further tokens are generated.

  Args:
    device: device on which input tensors are created.
    net: model exposing eval(), init_hidden(batch) and net(input, hidden).
    words: prompt string; every token must be a key of `vocab_to_int`.
    vocab_to_int / int_to_vocab: vocabulary lookups in both directions.
    top_k: number of candidates kept before sampling.
    temperature: divisor applied to the scores before normalization.
    pprint: print the result when true, return it otherwise.

  Returns:
    The list of n_words + 1 generated tokens when `pprint` is false, else None.
  """
  net.eval()

  def sample(step_output):
    # Mask every score below the k-th best one, renormalize, then draw one token.
    logits = step_output[0] / temperature
    kth_best = torch.topk(logits, top_k)[0][..., -1, None]
    logits = logits.masked_fill(logits < kth_best, -float('Inf'))
    return torch.multinomial(torch.softmax(logits, dim=-1), 1).item()

  generated = []
  # Inference only: without no_grad every step would extend the autograd graph.
  with torch.no_grad():
    hidden = net.init_hidden(1)
    for w in words.split(' '):
      ix = torch.tensor([[vocab_to_int[w]]]).to(device)
      output, hidden = net(ix, hidden)
    idx_max = sample(output)
    generated.append(int_to_vocab[idx_max])
    for _ in range(n_words):
      ix = torch.tensor([[idx_max]]).to(device)
      output, hidden = net(ix, hidden)
      idx_max = sample(output)
      generated.append(int_to_vocab[idx_max])
  if pprint:
    print('top-k', top_k, temperature)
    pretty_print(generated)
  else:
    return generated

In [None]:
def top_p_sampling(device, net, words, vocab_to_int, int_to_vocab, top_p, temperature, pprint=True):
  """Continue a prompt with nucleus sampling.

  At each step the smallest set of most probable tokens whose cumulative
  probability reaches `top_p` is kept, and the next token is sampled from it.
  The prompt (space-separated tokens) primes the network; one token is sampled
  from its last output, then `n_words` further tokens are generated.

  Args:
    device: device on which input tensors are created.
    net: model exposing eval(), init_hidden(batch) and net(input, hidden).
    words: prompt string; every token must be a key of `vocab_to_int`.
    vocab_to_int / int_to_vocab: vocabulary lookups in both directions.
    top_p: cumulative probability mass to keep.
    temperature: divisor applied to the scores before normalization.
    pprint: print the result when true, return it otherwise.

  Returns:
    The list of n_words + 1 generated tokens when `pprint` is false, else None.
  """
  net.eval()

  def sample(step_output):
    logits = step_output[0][0] / temperature
    probs, indices = torch.sort(torch.softmax(logits, dim=-1), descending=True)
    # Tokens needed for the running sum to reach top_p. Always at least one,
    # and never more than the vocabulary even if rounding keeps the total
    # just below top_p.
    cumulative = torch.cumsum(probs, dim=0)
    n_keep = min(int((cumulative < top_p).sum().item()) + 1, probs.numel())
    logits[indices[n_keep:]] = -float('Inf')
    return torch.multinomial(torch.softmax(logits, dim=-1), 1).item()

  generated = []
  # Inference only: without no_grad every step would extend the autograd graph.
  with torch.no_grad():
    hidden = net.init_hidden(1)
    for w in words.split(' '):
      ix = torch.tensor([[vocab_to_int[w]]]).to(device)
      output, hidden = net(ix, hidden)
    idx_max = sample(output)
    generated.append(int_to_vocab[idx_max])
    for _ in range(n_words):
      ix = torch.tensor([[idx_max]]).to(device)
      output, hidden = net(ix, hidden)
      idx_max = sample(output)
      generated.append(int_to_vocab[idx_max])
  if pprint:
    print('top-p', top_p, temperature)
    pretty_print(generated)
  else:
    return generated

In [None]:
# Two-line prompt shared by every decoding strategy below.
words = '<sos> Je ne t’ en parle plus , va , sers la tyrannie , <eos> <sos> Abandonne ton âme à son lâche génie ; <eos>'
#words = '<sos> Je ne t’ en parle plus , va , sers la -i-[e] tyrannie , <eos> <sos> Abandonne ton âme à son lâche -i-[e] génie ; <eos>'
w2i, i2w = corpus.dictionary.word2idx, corpus.dictionary.idx2word
predict(device, model, words, w2i, i2w, 1)
beam_search_decode(device, model, words, w2i, i2w, 20, 1)
# Top-k sampling, as (top_k, temperature)
for k_value, temp in ((50, 1), (100, 0.7)):
    top_k_sampling(device, model, words, w2i, i2w, k_value, temp)
# Nucleus sampling at two probability masses, temperature 1
for p_value in (0.9, 0.7):
    top_p_sampling(device, model, words, w2i, i2w, p_value, 1)

greedy
Et si tu m’as tantôt la vie pour le crime ,
Je me suis fait à toi , si tu n’es pas le cœur ,
Et ne veux pas servir de mon sang et de toi .
Je n’en veux point douter , et je n’en puis douter ,
Et je n’en puis douter que pour te faire aimer .
Je n’en veux pas douter , et je n’en puis douter ,
Et je ne puis souffrir que mon cœur s’en irrite .
Je n’en veux point douter , et je n’en puis douter ,
Et je n’en puis douter que pour te faire aimer .
Je n’en veux pas douter , et je n’en puis douter ,
Et je ne puis souffrir que mon père m’accuse .
Je n’en veux point douter , mais je n’en puis douter .
Je n’en veux pas douter , et je n’en puis douter .


beam 20 1
Je n’en veux point douter , mais je n’en puis douter .
Je sais ce que je dois , et ce que je dois faire . . .
C’en est fait , je l’avoue , et je n’en puis douter .
Je sais ce que je dois , et ce que je dois faire . . .
C’est ce que je demande , et ce que je vous dois . . .
C’en est fait , je l’avoue , et je n’en suis pas moins .
Je

In [None]:
from fast_bleu import SelfBLEU
# One generated continuation per prompt, for each decoding strategy.
list_greedy, list_beam = [], []
list_topk, list_topp = [], []
incr,nb_line = 0, 0
nb_gen = 10
# Hoisted out of the loops: converting the whole test tensor for every token
# read made prompt extraction quadratic.
test_ids = corpus.test.tolist()
eos_id = corpus.dictionary.word2idx['<eos>']
w2i, i2w = corpus.dictionary.word2idx, corpus.dictionary.idx2word
for k in range(nb_gen):
  # nb_line moves from one multiple of 3 to the next: every prompt is made of
  # the next two lines of the test stream.
  nb_line +=1
  start_seq = []
  while (nb_line % 3 != 0):
    cur_word = test_ids[incr]
    start_seq.append(cur_word)
    incr +=1
    if cur_word == eos_id:
      nb_line +=1
  start_seq = ' '.join(i2w[idx] for idx in start_seq)
  list_greedy.append(predict(device, model, start_seq, w2i, i2w, 1, pprint = False))
  list_beam.append(beam_search_decode(device, model, start_seq, w2i, i2w, 20, 1, pprint = False))
  list_topk.append(top_k_sampling(device, model, start_seq, w2i, i2w, 100, 0.7, pprint = False))
  list_topp.append(top_p_sampling(device, model, start_seq, w2i, i2w, 0.7, 1, pprint = False))

In [None]:
rhyme_markers = set(eos_tokens)
strategies = (list_greedy, list_beam, list_topk, list_topp)

# Rhyme markers produced by each strategy, in generation order (the last token
# of every sequence is ignored).
rimes_greedy, rimes_beam, rimes_topk, rimes_topp = (
    [tok for tokens in generated[:nb_gen] for tok in tokens[:-1] if tok in rhyme_markers]
    for generated in strategies)

# pretty_print overwrites rhyme markers in the list it receives; hand it copies
# so the statistics below are computed on the sequences as generated.
for i in range(nb_gen):
  pretty_print(list(list_greedy[i]), filename='greedy.txt')
  pretty_print(list(list_beam[i]), beam= True, filename='beam.txt')
  pretty_print(list(list_topk[i]), filename='topk.txt')
  pretty_print(list(list_topp[i]), filename='topp.txt')

# Self-BLEU is measured on words only: drop sentence delimiters and rhyme markers.
for generated in strategies:
  for tokens in generated[:nb_gen]:
    tokens[:] = [tok for tok in tokens
                 if tok not in ('<eos>', '<sos>') and tok not in rhyme_markers]

weights = {'quadrigram': (1/4., 1/4., 1/4., 1/4.)}
self_bleu_greedy = SelfBLEU(list_greedy, weights)
self_bleu_beam = SelfBLEU(list_beam, weights)
self_bleu_topk = SelfBLEU(list_topk, weights)
self_bleu_topp = SelfBLEU(list_topp, weights)
# These hold the SUM of the per-sequence scores; the division by nb_gen happens when printing.
mean_greedy = sum(self_bleu_greedy.get_score()['quadrigram'])
mean_beam = sum(self_bleu_beam.get_score()['quadrigram'])
mean_topk = sum(self_bleu_topk.get_score()['quadrigram'])
mean_topp = sum(self_bleu_topp.get_score()['quadrigram'])
print('selfBLEU greedy: ', mean_greedy/nb_gen,'selfBLEU beam :', mean_beam/nb_gen)
print('selfBLEU top k: ',mean_topk/nb_gen, 'selfBLEU top p: ',mean_topp/nb_gen)

def _adjacent_matches(rimes):
  """Number of positions where two consecutive rhyme markers are identical."""
  return sum(1 for current, following in zip(rimes, rimes[1:]) if current == following)

nb_rimes_greedy = _adjacent_matches(rimes_greedy)
nb_rimes_beam = _adjacent_matches(rimes_beam)
nb_rimes_topk = _adjacent_matches(rimes_topk)
nb_rimes_topp = _adjacent_matches(rimes_topp)


selfBLEU greedy:  0.8276116899690857 selfBLEU beam : 0.8706516458826468
selfBLEU top k:  0.13457289036683154 selfBLEU top p:  0.1262996272491695


In [None]:
def _rhyme_rate(nb_pairs, rimes):
  """Matching-pair count scaled by 200 / number of markers; NaN when no marker was generated."""
  if not rimes:
    return float('nan')
  return nb_pairs / len(rimes) * 200

print('rimes greedy :', _rhyme_rate(nb_rimes_greedy, rimes_greedy), 'rimes beam :', _rhyme_rate(nb_rimes_beam, rimes_beam))
print('rimes top k :', _rhyme_rate(nb_rimes_topk, rimes_topk), 'rimes top p :', _rhyme_rate(nb_rimes_topp, rimes_topp))

ZeroDivisionError: ignored

In [None]:
# Show the raw self-BLEU scores behind the top-p figure (a dict keyed by weight name, e.g. 'quadrigram').
print(self_bleu_topp.get_score())

Et je n’ ai point de part à qui je dois parler , <eos15>

Que le cœur à mes yeux n’ a point lieu de douter . <eos27>

Je ne sais si je puis , et je veux bien juger <eos53>

Que je n’ ai pu douter que pour vous le venger . <eos53>

Mais je ne sais pas bien que l’ on me considère , <eos0>

Que vous n’ avez rien fait de votre gloire entière , <eos0>

top p sampling 0.2 : les distributions sont assez plates : première proba environ 0.15, puis beaucoup de 0.02, sauf pour estimer sos, proba proche de 1. Avec un p de 0.9 on a des fois un sampling sur des milliers de mots, avec 0.7 on ne passe pas la centaine.

In [None]:
# Opens a bash session from the notebook. The poemlint commands below are
# commented out, so running this cell does not execute them by itself;
# presumably they are entered by hand in that session. Each one reads one of
# the generated files, passes the template '6/6 x x' through process
# substitution, and redirects stderr to the matching erreurs_*.txt file that
# the next cell reads.
!bash 
#/usr/local/bin/poemlint <(echo '6/6 x x') <greedy.txt 2> erreurs_greedy.txt
#/usr/local/bin/poemlint <(echo '6/6 x x') <beam.txt 2> erreurs_beam.txt
#/usr/local/bin/poemlint <(echo '6/6 x x') <topk.txt 2> erreurs_topk.txt
#/usr/local/bin/poemlint <(echo '6/6 x x') <topp.txt 2> erreurs_topp.txt

In [None]:
def _count_lines(path):
  """Number of lines in `path`, read as bytes so the count does not depend on any text encoding."""
  with open(path, 'rb') as f:
    return sum(1 for _ in f)

# Number of verse lines written for each strategy.
nb_greedy = _count_lines('greedy.txt')
nb_beam = _count_lines('beam.txt')
nb_topk = _count_lines('topk.txt')
nb_topp = _count_lines('topp.txt')

for label, nb_lines in (('greedy', nb_greedy), ('beam', nb_beam),
                        ('topk', nb_topk), ('topp', nb_topp)):
  # Each line of the error report counts for a quarter of an error.
  nb_err = 0.25 * _count_lines('erreurs_' + label + '.txt')
  # An empty output file yields NaN instead of a ZeroDivisionError.
  error_rate = nb_err / nb_lines * 100 if nb_lines else float('nan')
  print(label + ' error : ', error_rate, ' %')