In [24]:
import numpy as np
import tensorflow as tf


In [25]:
# Fail early if the installed TensorFlow does not expose `tf.function`,
# which later cells use as a decorator on the training/prediction steps.
assert hasattr(tf, "function")

**Open and process data**

In [26]:
# Read the corpus explicitly as UTF-8: the text contains accented French
# characters, and the platform default encoding is not guaranteed to decode them.
with open("data/victorhugo.txt", "r", encoding="utf-8") as f:
  text = f.read()

# Quick look at the corpus size (in characters) and its first 500 characters.
print(len(text))
print(text[:500])

127286
Parce que, jargonnant vêpres, jeûne et vigile,
Exploitant Dieu qui rêve au fond du firmament,
Vous avez, au milieu du divin évangile,
Ouvert boutique effrontément ;

Parce que vous feriez prendre à Jésus la verge,
Cyniques brocanteurs sortis on ne sait d'où ;
Parce que vous allez vendant la sainte vierge
Dix sous avec miracle, et sans miracle un sou ;

Parce que vous contez d'effroyables sornettes
Qui font des temples saints trembler les vieux piliers ;
Parce que votre style éblouit les lunettes


On peut avoir deux approches : soit on génère le texte mot par mot, soit lettre par lettre. Dans notre cas, on va générer le texte lettre par lettre car le modèle est assez simple. En utilisant les mots, la complexité serait plus grande car il y a beaucoup plus de mots que de lettres. En effet, avec les lettres, on n'aura que 51 caractères à traiter.

In [27]:

# Normalise the corpus: lower-case it, drop the stray digits and punctuation
# marks in a single translation pass, then trim surrounding whitespace.
removed_chars = "2185><!;?$-"
text = text.lower().translate(str.maketrans("", "", removed_chars)).strip()

# Sorted list of the distinct characters left in the corpus.
vocab = sorted(set(text))
print(len(vocab), vocab)
print(text[:10])

51 ['\n', ' ', '"', "'", ',', '.', ':', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '«', '»', 'à', 'â', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ù', 'û', 'œ', '–', '—']
parce que,


In [28]:
vocab_size = len(vocab)

vocab_to_int = {l:i for i,l in enumerate(vocab)}
int_to_vocab = {i:l for l,i in enumerate(vocab)}


In [29]:
print('{')
print("vocab_to_int")
for char,_ in zip(vocab_to_int, range(10)):
    print('  {:4s}: {:3d},'.format(repr(char), vocab_to_int[char]))
print('  ...\n}')
print()
print('{')
print("int_to_vocab")
for char,_ in zip(int_to_vocab, range(10)):
    print('  {:4s}: {:3d},'.format(repr(char), int_to_vocab[char]))
print('  ...\n}')

{
vocab_to_int
  '\n':   0,
  ' ' :   1,
  '"' :   2,
  "'" :   3,
  ',' :   4,
  '.' :   5,
  ':' :   6,
  'a' :   7,
  'b' :   8,
  'c' :   9,
  ...
}

{
int_to_vocab
  '\n':   0,
  ' ' :   1,
  '"' :   2,
  "'" :   3,
  ',' :   4,
  '.' :   5,
  ':' :   6,
  'a' :   7,
  'b' :   8,
  'c' :   9,
  ...
}


In [30]:
# Encode the whole corpus as integer ids, one id per character.
encoded = list(map(vocab_to_int.__getitem__, text))
# Short prefix kept for display.
encoded_sentence = encoded[:50]

print("Texte non encodé :", text[:50])
print("Texte encodé :", encoded_sentence)

Texte non encodé : parce que, jargonnant vêpres, jeûne et vigile,
exp
Texte encodé : [22, 7, 24, 9, 11, 1, 23, 27, 11, 4, 1, 16, 7, 24, 13, 21, 20, 20, 7, 20, 26, 1, 28, 41, 22, 24, 11, 25, 4, 1, 16, 11, 47, 20, 11, 1, 11, 26, 1, 28, 15, 13, 15, 18, 11, 4, 0, 11, 30, 22]


Génération des batchs (lots) pour l'entraînement

In [31]:
# Next-character prediction: the targets are the inputs shifted by one position.
inputs = encoded
targets = encoded[1:]

print("Inputs", inputs[:10])
print("Targets", targets[:10])

Inputs [22, 7, 24, 9, 11, 1, 23, 27, 11, 4]
Targets [7, 24, 9, 11, 1, 23, 27, 11, 4, 1]


In [32]:
# Generator used to produce training batches

def gen_batch(inputs, targets, seq_length, batch_size, noise=0):
  """Yield (batch_inputs, batch_targets) pairs of fixed shape.

  The input sequence is cut into `batch_size` contiguous chunks; row `b` of
  every batch walks through chunk `b`, `seq_length` items at a time, so that
  consecutive batches continue each row where the previous batch stopped.

  Args:
    inputs: indexable sequence of integer ids.
    targets: accepted for interface compatibility but not read; the targets
      are taken from `inputs` shifted by one position.
    seq_length: number of items per row.
    batch_size: number of rows per batch.
    noise: number of positions per row (drawn with replacement) whose input
      value is overwritten with a random id in [0, vocab_size). `vocab_size`
      is read from module scope and is only needed when noise > 0.

  Yields:
    Two float arrays of shape (batch_size, seq_length): inputs and targets.
  """
  # One item is held back so that the shifted target slice never runs out.
  chunk_size = (len(inputs) - 1) // batch_size

  # Number of full sequences that fit in one chunk
  sequences_per_chunk = chunk_size // seq_length

  for s in range(sequences_per_chunk):
    batch_inputs = np.zeros((batch_size, seq_length))
    batch_targets = np.zeros((batch_size, seq_length))
    for b in range(batch_size):
      fr = (b * chunk_size) + (s * seq_length)
      to = fr + seq_length
      batch_inputs[b] = inputs[fr:to]
      batch_targets[b] = inputs[fr + 1:to + 1]

      if noise > 0:
        noise_indices = np.random.choice(seq_length, noise)
        # Draw one random id per noisy position rather than a single shared value.
        batch_inputs[b][noise_indices] = np.random.randint(0, vocab_size, size=noise)

    # Emit the batch only once every row has been filled.
    yield batch_inputs, batch_targets

# Peek at the first batch only and show its first row (inputs, then targets).
first_batch = next(gen_batch(inputs, targets, 5, 32, noise=0), None)
if first_batch is not None:
  batch_inputs, batch_targets = first_batch
  print(batch_inputs[0], batch_targets[0])

"""for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 32, noise=30):
  print(batch_inputs[0], batch_targets[0])
  break"""

[22.  7. 24.  9. 11.] [ 7. 24.  9. 11.  1.]


'for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 32, noise=30):\n  print(batch_inputs[0], batch_targets[0])\n  break'

We can also use TensorFlow to generate batches.

Pour ce faire, utilisez d'abord la fonction tf.data.Dataset.from_tensor_slices pour convertir le vecteur de texte en un flux d'indices de caractères.




In [35]:
# Maximum length, in characters, of a single input sequence
seq_len = 100
examples_per_epoch = len(text) // (seq_len + 1)

# Lookup tables between characters and integer ids
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# The whole corpus as a vector of integer ids
text_as_int = np.array([char2idx[symbol] for symbol in text])

# Dataset that streams the corpus one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode the first five elements back to characters as a sanity check
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

p
a
r
c
e


In [36]:
# Group the character stream into chunks of seq_len + 1 ids; the trailing
# incomplete chunk is dropped.
sequences = char_dataset.batch(seq_len + 1, drop_remainder=True)

# Decode and show the first five chunks.
for item in sequences.take(5):
    decoded = ''.join(idx2char[item.numpy()])
    print(repr(decoded))

'parce que, jargonnant vêpres, jeûne et vigile,\nexploitant dieu qui rêve au fond du firmament,\nvous av'
'ez, au milieu du divin évangile,\nouvert boutique effrontément \n\nparce que vous feriez prendre à jésus'
" la verge,\ncyniques brocanteurs sortis on ne sait d'où \nparce que vous allez vendant la sainte vierge"
"\ndix sous avec miracle, et sans miracle un sou \n\nparce que vous contez d'effroyables sornettes\nqui fo"
'nt des temples saints trembler les vieux piliers \nparce que votre style éblouit les lunettes\ndes duèg'


In [37]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element; the target is the chunk
    without its first element.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Decode the first pair to check that the target is the input shifted by one.
for input_example, target_example in dataset.take(1):
    for label, example in (('Input data: ', input_example), ('Target data:', target_example)):
        print(label, repr(''.join(idx2char[example.numpy()])))

Input data:  'parce que, jargonnant vêpres, jeûne et vigile,\nexploitant dieu qui rêve au fond du firmament,\nvous a'
Target data: 'arce que, jargonnant vêpres, jeûne et vigile,\nexploitant dieu qui rêve au fond du firmament,\nvous av'



One Hot encoding

In [None]:
# Custom Keras layer that one-hot encodes integer ids.
# depth: number of distinct characters (51 for this corpus).
class OneHot(tf.keras.layers.Layer):
  """Layer mapping each integer id to a one-hot vector of length `depth`."""

  def __init__(self, depth, **kwargs):
    super(OneHot, self).__init__(**kwargs)
    self.depth = depth

  def call(self, x, mask=None):
    # Inputs may arrive as floats, so cast to int32 before encoding.
    return tf.one_hot(tf.cast(x, tf.int32), self.depth)

  def get_config(self):
    # Expose the constructor argument so the layer can be rebuilt from its
    # config when a model containing it is saved or cloned.
    config = super(OneHot, self).get_config()
    config.update({"depth": self.depth})
    return config

In [None]:
class RnnModel(tf.keras.Model):
  """Minimal model that only one-hot encodes its inputs.

  Args:
    vocab_size: depth of the one-hot encoding (number of distinct ids).
  """

  def __init__(self, vocab_size):
    super(RnnModel, self).__init__()
    # Use the constructor argument rather than the module-level vocabulary.
    self.one_hot = OneHot(vocab_size)

  def call(self, inputs):
    return self.one_hot(inputs)

# Take the first batch produced by gen_batch (sequence length 50, batch size 32).
batch_inputs, batch_targets = next(gen_batch(inputs, targets, 50, 32))

print(batch_inputs.shape)
# Model whose only layer is the one-hot encoding.
model = RnnModel(len(vocab))

# Run the batch through the model; every id becomes a one-hot vector.
output = model.predict(batch_inputs)

print(output.shape)


# Compare the first id of the first row with its one-hot encoding.
print("input letter : ", batch_inputs[0][0])
print(output[0][0])

(32, 50)
(32, 50, 51)
input letter :  22.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]


In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` instead of re-assigning `dataset` from
# itself, so that re-running this cell does not batch already-batched data.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

Construisons notre modèle

In [None]:
vocab_size = len(vocab)

# Create the layers

# Model input: sequences of ids of any length, with a fixed batch size of 64
tf_inputs = tf.keras.Input(shape=(None,), batch_size=64)
# Convert each value of the input into a one-hot encoded vector

one_hot = OneHot(len(vocab))(tf_inputs)

# Stack two LSTM layers
rnn_layer1 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(one_hot)
# return_sequences=True returns the output of every time step; with False only the last output is returned
# stateful=True keeps the state reached at the end of one batch as the initial state of the next one instead of resetting it to zero
rnn_layer2 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(rnn_layer1)

# Create the outputs of the model
hiden_layer = tf.keras.layers.Dense(128, activation="relu")(rnn_layer2)
outputs = tf.keras.layers.Dense(vocab_size, activation="softmax")(hiden_layer) 
# Softmax gives a probability distribution over all the characters of the vocabulary

# Set up the model
model = tf.keras.Model(inputs=tf_inputs, outputs=outputs)

Set the loss and the optimizer

In [None]:
# Loss for integer targets compared against predicted class probabilities.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by recent Keras versions.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

Set metrics to track the progress of the training

In [None]:
# Loss: mean of the loss values fed to this metric by train_step
train_loss = tf.keras.metrics.Mean(name='train_loss')
# Accuracy: computed from the integer targets and the predictions fed by train_step
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

Set the train method and the predict method 

In [None]:
@tf.function
def train_step(inputs, targets):
  """Run one optimisation step on a batch and update the tracking metrics.

  Args:
    inputs: batch of input ids fed to the module-level `model`.
    targets: batch of target ids compared against the model predictions.

  Uses the module-level `model`, `loss_object`, `optimizer`, `train_loss` and
  `train_accuracy`; returns nothing.
  """
  with tf.GradientTape() as tape:
    # Forward pass on the whole batch, explicitly in training mode so that
    # any layer behaving differently at train time is handled correctly.
    predictions = model(inputs, training=True)
    # Loss of these predictions against the targets
    loss = loss_object(targets, predictions)
  # Gradient of the loss with respect to the trainable weights
  gradients = tape.gradient(loss, model.trainable_variables)
  # Update the weights of the model
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  # The metrics accumulate over successive calls; no manual averaging needed.
  train_loss(loss)
  train_accuracy(targets, predictions)

@tf.function
def predict(inputs):
  """Return the predictions of the module-level `model` for a whole batch."""
  return model(inputs)




Train the model

In [None]:
# Start from a clean recurrent state.
model.reset_states()

for epoch in range(5000):
  # Start each epoch with fresh metrics so the printed values describe this
  # epoch only instead of an average over every epoch run so far.
  train_loss.reset_states()
  train_accuracy.reset_states()
  for batch_inputs, batch_targets in gen_batch(inputs, targets, 100, 64, noise=13):
    train_step(batch_inputs, batch_targets)
  template = '\r, Epoch {}, Train Loss: {}, Train Accuracy: {}'
  # `end=""` belongs to print(), not to format(): together with the leading
  # "\r" it rewrites the same output line at every epoch.
  print(template.format(epoch, train_loss.result(), train_accuracy.result()*100), end="")
  # The next epoch restarts from the beginning of the text, so the state
  # carried by the stateful LSTM layers is cleared.
  model.reset_states()
