# Recursive Neural Network(RNN) for generating the Divina Commedia of Dante Alighieri.

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

from keras.callbacks import ModelCheckpoint,  CSVLogger
from keras.layers import Add, Dense, Input, LSTM, Layer
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from matplotlib import pyplot as plt

2.3.0


In [3]:
# Settings

# Percent of samples to use for training, might be necessary if you're running out of memory
sample_size = 1

# The latent dimension of the LSTM
latent_dim = 2048

# Number of epochs to train for
epochs = 20

# path of the data
data_path = 'DivinaCommedia.csv'

name = 'all_data_test_2'
output_dir = Path('output_%s' % name)
try:
  output_dir.mkdir()
except FileExistsError:
  pass

# Data import and preprocessing

In [4]:
df = pd.read_csv(str(data_path))
df = df.sample(frac=sample_size)


max_line_length = int(max([df['%s' % i].astype(str).str.len().quantile(.99) for i in range(3)]))

df = df[
    (df['0'].astype(str).str.len() <= max_line_length) & 
    (df['1'].astype(str).str.len() <= max_line_length) & 
    (df['2'].astype(str).str.len() <= max_line_length)
].copy()

# preprocessing data
# Pad the lines to the max line length with new lines
for i in range(3):
    # For input, duplicate the first character
    # TODO - Why?
    df['%s_in' % i] = (df[str(i)].str[0] + df[str(i)]).str.pad(max_line_length+2, 'right', '\n')
    
    # 
    #df['%s_out' % i] = df[str(i)].str.pad(max_line_len, 'right', '\n') + ('\n' if i == 2 else df[str(i+1)].str[0])
    
    # TODO - trying to add the next line's first character before the line breaks
    if i == 2: # If it's the last line
        df['%s_out' % i] = df[str(i)].str.pad(max_line_length+2, 'right', '\n')
    else: 
        # If it's the first or second line, add the first character of the next line to the end of this line.
        # This helps with training so that the next RNN has a better chance of getting the first character right.
        df['%s_out' % i] = (df[str(i)] + '\n' + df[str(i+1)].str[0]).str.pad(max_line_length+2, 'right', '\n')
    
max_line_length += 2

inputs = df[['0_in', '1_in', '2_in']].values


tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(inputs.flatten())
n_tokens = len(tokenizer.word_counts) + 1


print(df)

# X is the input for each line in sequences of one-hot-encoded values
X = np_utils.to_categorical([
  tokenizer.texts_to_sequences(inputs[:,i]) for i in range(3)
  ], num_classes=n_tokens)

outputs = df[['0_out', '1_out', '2_out']].values

# Y is the output for each line in sequences of one-hot-encoded values
Y = np_utils.to_categorical([
    tokenizer.texts_to_sequences(outputs[:,i]) for i in range(3)
], num_classes=n_tokens)



# X_syllables is the count of syllables for each line
X_syllables = df[['0_syllables', '1_syllables', '2_syllables']].values




      Unnamed: 0  ...                                              2_out
1043        1043  ...  In quella parte del giovanetto anno\n\n\n\n\n\...
2006        2006  ...  perchè iv'era imaginata quella\n\n\n\n\n\n\n\n...
606          606  ...  nullo martiro, fuor che la tua rabbia,\n\n\n\n...
3867        3867  ...  insieme fui cristiano e Cacciaguida. \n\n\n\n\...
1069        1069  ...  Noi discendemmo il ponte da la testa\n\n\n\n\n...
...          ...  ...                                                ...
2155        2155  ...  e cusce sì, come a sparvier selvaggio\n\n\n\n\...
849          849  ...  forte spingava con ambo le piote. \n\n\n\n\n\n...
2836        2836  ...  la possa del salir più e 'l diletto. \n\n\n\n\...
520          520  ...  Chiròn si volse in su la destra poppa,\n\n\n\n...
1105        1105  ...  de' quai nè io nè 'l duca mio s'accorse, \n\n\...

[4628 rows x 13 columns]


In [None]:
print(max_line_length)

46


# Model

In [7]:
class BasicDanteRNN(Model):
    def __init__(self, latent_dim, n_tokens, generative=False):
        super(BasicDanteRNN, self).__init__()
        self.n_tokens = n_tokens
        self.latent_dim = latent_dim
        self.generative = generative
        self.lstm = LSTM(latent_dim, return_state=True, return_sequences=True, name='lstm')
        self.tl1 = BasicTrainingLine(self.lstm, self.latent_dim, self.n_tokens)
        self.tl2 = BasicTrainingLine(self.lstm, self.latent_dim, self.n_tokens)
        self.tl3 = BasicTrainingLine(self.lstm, self.latent_dim, self.n_tokens)
        
    def call(self, inputs, training=None):
        #print(inputs) # (<tf.Tensor 'IteratorGetNext:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'IteratorGetNext:2' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'ExpandDims_1:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'IteratorGetNext:4' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'ExpandDims_2:0' shape=(None, 1) dtype=int64>)
        outputs = []

        if self.generative:
          syl1, syl2, syl3 = (11, 11, 11)

          # using random start
          first_char = chr(int(np.random.randint(ord('a'), ord('z')+1)))
          print(tokenizer.texts_to_sequences(first_char))
          print(tokenizer.texts_to_sequences(first_char)[0])
          # Converting start string to numbers (vectorizing)
          input_eval = tokenizer.texts_to_sequences(first_char)[0]
          input_eval = tf.expand_dims(input_eval, 0)

          x1 = self.tl1((first_char, syl1), training=False, previous_line=None)
          outputs.append(x1)

          x2 = self.tl2((x, syl2), training=False, previous_line=self.tl1)
          outputs.append(x2)

          x3 = self.tl3((x2, syl3), training=False, previous_line=self.tl2)
          outputs.append(x3)
        else:

          char1, syl1, char2, syl2, char3, syl3 = inputs
          outputs.append(self.tl1((char1, syl1), training=training, previous_line=None))
          outputs.append(self.tl2((char2, syl2), training=training, previous_line=self.tl1))
          outputs.append(self.tl3((char3, syl3), training=training, previous_line=self.tl2))

        #input_layer = [self.inp1, self.inp2, self.inp3]


        print(outputs)

        return outputs


class BasicTrainingLine(Model):
    def __init__(self, lstm, latent_dim, n_tokens):
        super(BasicTrainingLine, self).__init__()
        self.lstm = lstm
        self.n_tokens = n_tokens
        self.dense_in = Dense(latent_dim, activation='relu')
        self.dense_out = Dense(self.n_tokens, activation='softmax')
        self.lstm_h = None
        self.lstm_c = None

    def call(self, inputs, training=None, previous_line=None, **kwargs):
        #print("BasicTrainingLine Start")
        #print(inputs)
        # x = self.syllable_input(inputs) NON SI FA PERCHé é UN INPUT LAYER
        # INPUTS: ListWrapper([<tf.Tensor 'input_151:0' shape=(None, None, 41) dtype=float32>, UN CARATTERE TOKENIZZATO
        #       <tf.Tensor 'input_152:0' shape=(None, 1) dtype=float32>]) NUMERO SILLABE

        chars, syllable = inputs
        x = self.dense_in(syllable, training=training)
        #print(self.n_tokens)

        #print(x)

        if previous_line:
            # WHAT ARE THESE ADD? Simply layer that do an addition maybe
            initial_state = [
                Add()([
                    previous_line.lstm_h,
                    x
                ]),
                Add()([
                    previous_line.lstm_c,
                    x
                ])
            ]

            #print(previous_line.lstm_c)
        else:
            initial_state = [x, x]

        #print(initial_state)

        lstm_out, self.lstm_h, self.lstm_c = self.lstm(chars, initial_state=initial_state, training=training)
        outputs = self.dense_out(lstm_out, training=training)
        #print(lstm_out)
        #print(outputs)

        #print("BasicTrainingLine End")

        return outputs


In [8]:
# The latent dimension of the LSTM
latent_dim = 2048
model = BasicDanteRNN(latent_dim, n_tokens)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

filepath = str(output_dir / ("%s-{epoch:02d}-{loss:.2f}-{val_loss:.2f}.hdf5" % latent_dim))
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

csv_logger = CSVLogger(str(output_dir / 'training_log.csv'), append=True, separator=',')

callbacks_list = [checkpoint, csv_logger]


#model.build(((None, 2048)))
#model.summary()

#print(model.output)

# l'input X[0] contiene 46 vettori con ognuno 41 valori
# 46 è il numero di caratteri massimo per riga
# 41 è il numero di caratteri possibili per ogni carattare

model.fit([
    X[0], X_syllables[:,0],
    X[1], X_syllables[:,1], 
    X[2], X_syllables[:,2]
], [Y[0], Y[1], Y[2]], batch_size=64, epochs=epochs, validation_split=.1, callbacks=callbacks_list)


Epoch 1/20
True
[<tf.Tensor 'basic_dante_rnn_1/basic_training_line_3/dense_7/truediv:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'basic_dante_rnn_1/basic_training_line_4/dense_9/truediv:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'basic_dante_rnn_1/basic_training_line_5/dense_11/truediv:0' shape=(None, 46, 41) dtype=float32>]
True
[<tf.Tensor 'basic_dante_rnn_1/basic_training_line_3/dense_7/truediv:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'basic_dante_rnn_1/basic_training_line_4/dense_9/truediv:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'basic_dante_rnn_1/basic_training_line_5/dense_11/truediv:0' shape=(None, 46, 41) dtype=float32>]
[<tf.Tensor 'basic_dante_rnn_1/basic_training_line_3/dense_7/truediv:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'basic_dante_rnn_1/basic_training_line_4/dense_9/truediv:0' shape=(None, 46, 41) dtype=float32>, <tf.Tensor 'basic_dante_rnn_1/basic_training_line_5/dense_11/truediv:0' shape=(None, 46, 41) dtype=float32>]



<tensorflow.python.keras.callbacks.History at 0x7f19beca6898>

# Generation

In [9]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 1000

  first_char = chr(int(np.random.randint(ord('a'), ord('z')+1)))
  print(tokenizer.texts_to_sequences(first_char))
  print(tokenizer.texts_to_sequences(first_char)[0])
  # Converting our start string to numbers (vectorizing)
  input_eval = tokenizer.texts_to_sequences(first_char)[0]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
    predictions = model(input_eval, training=False)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(tokenizer.sequences_to_texts([
                predicted_id
            ])[0].strip()[1:].replace(
                '   ', '\n'
            ).replace(' ', '').replace('\n', ' '))

  return (start_string + ''.join(text_generated))

In [14]:
generative_model = BasicDanteRNN(latent_dim, n_tokens, generative=True)

generative_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

generative_model.load_weights(output_dir / '2048-20-1.34-4.87.hdf5')


print(generate_text(generative_model))

ValueError: ignored