# A small project about Poetry generation

## Import the needed packages

In [215]:
import tensorflow as tf
import string
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pathlib

## Load data and prepare Training data

In [278]:
folder = pathlib.Path('data/poetry_data/')
data= [ ]
ite = 0
for item in folder.iterdir():
    if item.suffix == '.txt':
        with open(item, 'r') as file:
             lines = file.read().splitlines()
             lines = list(filter(None, lines))
             data.append(lines)

        ite += 1
        if ite==1:
            break



In [279]:
# Transform list of list to flat list
data = [item1 for sublist in data for item1 in sublist]

In [280]:
token = Tokenizer()
token.fit_on_texts(data)

In [281]:
encoded_text = token.texts_to_sequences(data)
vocab_size = len(token.word_counts) + 1

In [282]:
vocab_size

4081

In [283]:
# Prepare the dataset in order to predict the next word based at least on the previous word
datalist = []
for d in encoded_text:
    if len(d)>1:
        for i in range(2, len(d)):
            datalist.append(d[:i])

In [284]:
# padding to make the elements of datalist to have the same length
max_length = 20
sequences = pad_sequences(datalist, maxlen=max_length, padding='pre')

In [285]:
# Take the last sequence as target
X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)
lenght_seq = X.shape[1]

## Model Training

In [286]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=lenght_seq)) 
model.add(LSTM(100, return_sequences=True)) #
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

In [287]:
model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 19, 50)            204050    
                                                                 
 lstm_22 (LSTM)              (None, 19, 100)           60400     
                                                                 
 lstm_23 (LSTM)              (None, 100)               80400     
                                                                 
 dense_23 (Dense)            (None, 100)               10100     
                                                                 
 dense_24 (Dense)            (None, 4081)              412181    
                                                                 
Total params: 767,131
Trainable params: 767,131
Non-trainable params: 0
_________________________________________________________________


In [288]:
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [291]:
X.shape

(44382, 19)

In [290]:
model.fit(X, y, batch_size=32, epochs=10)

Epoch 1/10


2023-04-05 22:13:43.881174: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-05 22:13:44.211196: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-05 22:13:44.419684: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-05 22:13:44.915573: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-05 22:13:45.274786: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x322a2f100>

## Poetry Generation

In [292]:
# Number of words per verses
poetry_length = 10

def generate_poetry(init_text, nlines):
    """
    Take an initial sentence and a number of verses
    :params
    init_text: Initial sentence
    nlines: Number of lines
    """
    for i in range(nlines):
        text = []
        for _ in range(poetry_length):
            encoded = token.texts_to_sequences([init_text])
            encoded = pad_sequences(encoded, maxlen=lenght_seq, padding='pre')

            y_pred = np.argmax(model.predict(encoded, verbose=0), axis=-1)

            predicted_word = ""
            for word, index in token.word_index.items():
                if index == y_pred:
                    predicted_word = word
                    break
            init_text = init_text + ' ' + predicted_word
            text.append(predicted_word)
        init_text = text[-1]
        text = ' '.join(text)
        print(text)

In [293]:
init_text = 'Let me go'
generate_poetry(init_text, 5)

2023-04-05 22:25:11.155236: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-05 22:25:11.272953: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-05 22:25:11.434140: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


4 the name is prince the critics love u in
the body u can be a beautiful of the critics
love u in the world of a beautiful of the
max u can be a new power generation u can
u have 2 the max u can be a new
