# Recursive Neural Network(RNN) for generating the Divina Commedia of Dante Alighieri.

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

from keras.callbacks import ModelCheckpoint,  CSVLogger
from keras.layers import Add, Dense, Input, LSTM, Layer
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from matplotlib import pyplot as plt

2.3.0


In [4]:
# Settings

# Percent of samples to use for training, might be necessary if you're running out of memory
sample_size = 1

# The latent dimension of the LSTM
latent_dim = 2048

# Number of epochs to train for
epochs = 20

# path of the data
data_path = 'DivinaCommedia.csv'

name = 'all_data_test_2'
output_dir = Path('output_%s' % name)
try:
  output_dir.mkdir()
except FileExistsError:
  pass

# Data import and preprocessing

In [5]:
df = pd.read_csv(str(data_path))
df = df.sample(frac=sample_size)


max_line_length = int(max([df['%s' % i].astype(str).str.len().quantile(.99) for i in range(3)]))

df = df[
    (df['0'].astype(str).str.len() <= max_line_length) & 
    (df['1'].astype(str).str.len() <= max_line_length) & 
    (df['2'].astype(str).str.len() <= max_line_length)
].copy()

# preprocessing data
# Pad the lines to the max line length with new lines
for i in range(3):
    # For input, duplicate the first character
    # TODO - Why?
    df['%s_in' % i] = (df[str(i)].str[0] + df[str(i)]).str.pad(max_line_length+2, 'right', '\n')
    
    # 
    #df['%s_out' % i] = df[str(i)].str.pad(max_line_len, 'right', '\n') + ('\n' if i == 2 else df[str(i+1)].str[0])
    
    # TODO - trying to add the next line's first character before the line breaks
    if i == 2: # If it's the last line
        df['%s_out' % i] = df[str(i)].str.pad(max_line_length+2, 'right', '\n')
    else: 
        # If it's the first or second line, add the first character of the next line to the end of this line.
        # This helps with training so that the next RNN has a better chance of getting the first character right.
        df['%s_out' % i] = (df[str(i)] + '\n' + df[str(i+1)].str[0]).str.pad(max_line_length+2, 'right', '\n')
    
max_line_length += 2

inputs = df[['0_in', '1_in', '2_in']].values


tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(inputs.flatten())
n_tokens = len(tokenizer.word_counts) + 1


print(df)

# X is the input for each line in sequences of one-hot-encoded values
X = np_utils.to_categorical([
  tokenizer.texts_to_sequences(inputs[:,i]) for i in range(3)
  ], num_classes=n_tokens)

outputs = df[['0_out', '1_out', '2_out']].values

# Y is the output for each line in sequences of one-hot-encoded values
Y = np_utils.to_categorical([
    tokenizer.texts_to_sequences(outputs[:,i]) for i in range(3)
], num_classes=n_tokens)



# X_syllables is the count of syllables for each line
X_syllables = df[['0_syllables', '1_syllables', '2_syllables']].values




      Unnamed: 0  ...                                              2_out
21            21  ...  "qual che tu sii, od ombra od omo certo!". \n\n\n
4383        4383  ...  onde mei che dinanzi vidi poi;\n\n\n\n\n\n\n\n...
1303        1303  ...  fosse in Egina il popol tutto infermo,\n\n\n\n...
531          531  ...  che da quest'altra a più a più giù prema\n\n\n...
1347        1347  ...  La grave idropesì, che sì dispaia\n\n\n\n\n\n\...
...          ...  ...                                                ...
593          593  ...  e quella men che giacèa al tormento,\n\n\n\n\n...
1547        1547  ...   che 'l capo ha dentro e fuor le gambe mena. \n\n
1545        1545  ...  sì che tre ne facea così dolenti. \n\n\n\n\n\n...
4735        4735  ...  che quinci e quindi igualmente si spiri. \n\n\...
3485        3485  ...  di complession potenziata tira\n\n\n\n\n\n\n\n...

[4628 rows x 13 columns]


In [4]:
print(tokenizer.texts_to_sequences(inputs[:, 1]))

[[4, 4, 7, 12, 6, 8, 2, 13, 5, 2, 13, 14, 25, 5, 11, 4, 8, 2, 11, 5, 2, 13, 34, 2, 12, 4, 20, 5, 6, 7, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [3, 3, 2, 9, 34, 2, 15, 19, 4, 16, 16, 4, 8, 17, 3, 18, 2, 10, 30, 2, 12, 6, 15, 19, 3, 9, 9, 5, 2, 4, 16, 16, 4, 8, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [13, 13, 3, 2, 9, 4, 2, 15, 5, 4, 2, 13, 6, 7, 7, 4, 18, 2, 3, 2, 9, 19, 4, 7, 5, 15, 6, 2, 12, 6, 7, 2, 3, 10, 10, 5, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1], [16, 16, 3, 8, 2, 12, 14, 5, 2, 10, 12, 6, 10, 10, 3, 2, 13, 5, 4, 7, 28, 5, 2, 6, 20, 7, 3, 2, 16, 3, 7, 13, 5, 12, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [15, 15, 4, 18, 2, 16, 3, 8, 2, 22, 4, 8, 3, 2, 3, 10, 10, 3, 8, 2, 25, 3, 7, 2, 9, 4, 2, 17, 6, 20, 9, 5, 4, 2, 4, 12, 14, 11, 4, 18, 1, 1, 1, 1, 1, 1], [4, 4, 16, 16, 8, 3, 10, 10, 6, 2, 5, 9, 2, 22, 5, 7, 3, 2, 12, 21, 19, 4, 2, 23, 14, 3, 9, 9, 19, 5, 7, 7, 6, 2, 22, 4, 10, 10, 5, 18, 1, 1, 1, 1, 1, 1], [5, 5, 19, 2, 10, 6, 7, 2, 22, 4, 11, 11, 4, 2, 13, 4, 2, 13, 5, 6, 18, 2, 10, 1

# Model

In [6]:
class BasicDanteRNN(Model):
    def __init__(self, latent_dim, n_tokens, generative=False):
        super(BasicDanteRNN, self).__init__()
        self.n_tokens = n_tokens
        self.latent_dim = latent_dim
        self.generative = generative
        self.lstm = LSTM(latent_dim, return_state=True, return_sequences=True, name='lstm')
        self.tl1 = BasicTrainingLine(self.lstm, self.latent_dim, self.n_tokens)
        self.tl2 = BasicTrainingLine(self.lstm, self.latent_dim, self.n_tokens)
        self.tl3 = BasicTrainingLine(self.lstm, self.latent_dim, self.n_tokens)
        
    def call(self, inputs, training=None):
        #print(inputs) # (<tf.Tensor 'IteratorGetNext:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'IteratorGetNext:2' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'IteratorGetNext:4' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=int64>)
        outputs = []

        if self.generative:
          #syl1, syl2, syl3 = (11, 11, 11)
          #syl1 = tf.expand_dims((syl1), 0)
          #syl2 = tf.expand_dims((syl2), 0)
          #syl3 = tf.expand_dims((syl3), 0)


          # TODO: there is a better method for create a tensor with shape (None, 1) or at least with 2 dim
          input_eval, syl = inputs
          syl = tf.expand_dims(tf.expand_dims(syl, 0), 1)

          if input_eval is None:
            # using random start
            first_char = chr(int(np.random.randint(ord('a'), ord('z')+1)))
            #print(tokenizer.texts_to_sequences(first_char))
            # Converting start string to numbers (vectorizing)
            input_eval = tokenizer.texts_to_sequences(first_char)[0]
            input_eval = tf.expand_dims(input_eval, 0)

          # print((input_eval, syl))

          x1 = self.tl1((input_eval, syl), training=False, previous_line=None)
          outputs.append(x1)

          #print(x1)

          x2 = self.tl2((x1, syl), training=False, previous_line=self.tl1)
          outputs.append(x2)
          #print(x2)

          x3 = self.tl3((x2, syl), training=False, previous_line=self.tl2)
          outputs.append(x3)
        else:

          char1, syl1, char2, syl2, char3, syl3 = inputs
          outputs.append(self.tl1((char1, syl1), training=training, previous_line=None))
          outputs.append(self.tl2((char2, syl2), training=training, previous_line=self.tl1))
          outputs.append(self.tl3((char3, syl3), training=training, previous_line=self.tl2))

        #input_layer = [self.inp1, self.inp2, self.inp3]


        #print(outputs)

        return outputs


class BasicTrainingLine(Layer):
    def __init__(self, lstm, latent_dim, n_tokens):
        super(BasicTrainingLine, self).__init__()
        self.lstm = lstm
        self.n_tokens = n_tokens
        self.dense_in = Dense(latent_dim, activation='relu')
        self.dense_out = Dense(self.n_tokens, activation='softmax')
        self.lstm_h = None
        self.lstm_c = None

    def call(self, inputs, training=None, previous_line=None, **kwargs):
        #print("BasicTrainingLine Start")
        # print(inputs)
        # x = self.syllable_input(inputs) NON SI FA PERCHé é UN INPUT LAYER
        # INPUTS: ListWrapper([<tf.Tensor 'input_151:0' shape=(None, None, 41) dtype=float32>, UN CARATTERE TOKENIZZATO
        #       <tf.Tensor 'input_152:0' shape=(None, 1) dtype=float32>]) NUMERO SILLABE

        chars, syllable = inputs
        #print(chars)
        #print(syllable)

        x = self.dense_in(syllable, training=training)
        #print(self.n_tokens)

        #print(x)

        if previous_line:
            # WHAT ARE THESE ADD? Simply layer that do an addition maybe
            initial_state = [
                Add()([
                    previous_line.lstm_h,
                    x
                ]),
                Add()([
                    previous_line.lstm_c,
                    x
                ])
            ]

            #print(previous_line.lstm_c)
        else:
            initial_state = [x, x]

        #print(initial_state)

        lstm_out, self.lstm_h, self.lstm_c = self.lstm(chars, initial_state=initial_state, training=training)
        outputs = self.dense_out(lstm_out, training=training)
        #print(lstm_out)
        #print(outputs)

        #print("BasicTrainingLine End")

        return outputs


In [6]:
# The latent dimension of the LSTM
latent_dim = 2048
model = BasicDanteRNN(latent_dim, n_tokens)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

filepath = str(output_dir / ("%s-{epoch:02d}-{loss:.2f}.ckpt" % latent_dim))
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min', save_freq=10)

csv_logger = CSVLogger(str(output_dir / 'training_log.csv'), append=True, separator=',')

callbacks_list = [checkpoint, csv_logger]

# Save the weights using the `checkpoint_path` format
# model.save_weights(checkpoint_path.format(epoch=0))


#model.build(((None, 2048)))
#model.summary()

#print(model.output)

# l'input X[0] contiene 46 vettori con ognuno 41 valori
# 46 è il numero di caratteri massimo per riga
# 41 è il numero di caratteri possibili per ogni carattare

model.fit([
    X[0], X_syllables[:,0],
    X[1], X_syllables[:,1], 
    X[2], X_syllables[:,2]
], [Y[0], Y[1], Y[2]], batch_size=64, epochs=epochs, validation_split=.1, callbacks=callbacks_list)


Epoch 1/20
BasicTrainingLine Start


KeyboardInterrupt: ignored

# Generation

In [1]:
def generate_text(model):
  # Evaluation step (generating text using the learned model)

  # Number of Canti to generate
  num_canti = 33

  first_char = chr(int(np.random.randint(ord('a'), ord('z')+1)))
  #print(tokenizer.texts_to_sequences(first_char))
  #print(tokenizer.texts_to_sequences(first_char)[0])
  # Converting our start string to numbers (vectorizing)
  #input_eval = tokenizer.texts_to_sequences(first_char)[0]
  #input_eval = tf.expand_dims(input_eval, 1)

  input_eval = np_utils.to_categorical([
    tokenizer.texts_to_sequences(first_char)
    ], num_classes=n_tokens)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # per ogni canto devo chiamare la rete e farmi restituire una terzina, 
  # dopo devo passare come input alla seconda chiamata la fine della terzina appena creata

  end = False
  generative_model.reset_states()
  for _ in range(num_canti):
    line_output = [[], [], []]
    for _ in range(max_line_length):
      predictions = generative_model((input_eval, X_syllables[0,0]), training=False)
      #print("GENERATEEEEE {}".format(tf.squeeze(predictions)))

      cont = 0

      # prediction is an array that contains 3 array, each one is a new char
      for pred in predictions:
        char = sample(pred[0,0], temperature)
        if char == 1 and not end:
            end = True
        if char != 1 and end:
            next_char = char
            char = 1

        line_output[cont].append(char)
        cont += 1
      
    terzina = []
    #print(line_output)
    for i in range(3):
      #print(line_output[i])
      cleaned_text = tokenizer.sequences_to_texts([
                    line_output[i]
                ])[0].strip()[1:].replace(
                    '   ', '\n'
                ).replace(' ', '').replace('\n', ' ')

      terzina.append(cleaned_text)

    #print(terzina)

    text_generated.append(terzina)    

  return text_generated

In [7]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    # From https://github.com/llSourcell/keras_explained/blob/master/gentext.py
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [8]:
generative_model = BasicDanteRNN(latent_dim, n_tokens, generative=True)

generative_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

latest = tf.train.latest_checkpoint("output_all_data_test_2/")

print(latest)

generative_model.load_weights(latest)


output_all_data_test_2/2048-20-1.47.ckpt


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f09b4aa5e80>

In [11]:
for [x,y,z] in generate_text(generative_model):
  print(x + "\n" + y + "\n" + z + "\n\n")



aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
araa adva lppbplcaaabaaieaaicpe,pàaàuipappddq
iccqqcccqcqqqccvqcqcqcqoclccccqcqccqpccocccqc


aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 caa cca'aprppaacaaàamèapapdap'ahnaàpluulpap
ccqcccqccciccccqicuqccqcccqcqcqccpcoccccccqcc


aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
cpap'aevtdacrpaaaa'pèaaahèpaaàrpacppaaccamcdi
qciqcccpqccqccccpccccqpqqqcqcccoccpgccccbcqqc


aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
caqppàmaaaanchpaacbppappaaaaàaaàcaaraavaèaap
qccqccqccqoqpqqccqqccccqqqqcccqcccuqqqpcqccci


aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaraceabpadcaaèpcaalaeaaa'a aqpapàa hà'caeaa
cciccuqpccqvqcqqcqcqcqqqccqccccccczqcquuqccaq


aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
pecaaàcaaaèqamècccèaaaaipacaacparaaaaaacat i
qccccccpcqqqcqpqirqqcuccoqqcqcqqccqccqcqqpccu


aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
piaaahaacpdpc aa a caac'pnacèèc a paàrduacaoh
ccqqcipqccqqcpcecuqqcciqcccqcpsqcqqcqcqqccccq


aaaaaaaaaaaaaaaaaaaaaaa