### Importing Libraries

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import re

from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

### **1. Dataset Preparation**

#### Loading Dataset

In [3]:
# Read the Poetry Foundation CSV and keep a single row per distinct poem text.
dataset = (
    pd.read_csv('PoetryFoundationData.csv')
    .drop_duplicates(subset=['Poem'])
)
dataset.head()

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,


#### Dataset Information

In [4]:
# Summarise the columns, non-null counts and dtypes of the de-duplicated frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13754 entries, 0 to 13833
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13754 non-null  int64 
 1   Title       13754 non-null  object
 2   Poem        13754 non-null  object
 3   Poet        13754 non-null  object
 4   Tags        12854 non-null  object
dtypes: int64(1), object(4)
memory usage: 644.7+ KB


#### Concatenate the first 100 tagged poems into a single text corpus, separating them by newline characters for clarity

In [5]:
# Keep only poems that carry tags, with one row per distinct poem text.
df_cleaned = (
    dataset
    .dropna(subset=['Tags'])
    .drop_duplicates(subset=['Poem'])
)

# The corpus is the first 100 remaining poems, joined with a single newline between poems.
first_poems = df_cleaned['Poem'].iloc[:100]
corpus = "\n".join(first_poems.tolist())

### **2. Data Preprocessing**

In [6]:
# Lowercase the corpus, strip unwanted characters, then normalise line breaks and spacing.
cleaned_text = corpus.lower()

# Keep only letters, whitespace and the punctuation , ' " . -
# The hyphen sits last in the class so it is a literal. Written as `"-.` it formed a
# range from '"' to '.', which also let # $ % & ( ) * + through.
cleaned_text = re.sub(r'[^a-zA-Z\s,\'".-]', '', cleaned_text)

# Collapse the doubled "\r\r\n" break sequence first, then drop every remaining carriage return.
cleaned_text = re.sub(r'\r\r\n\n\r\r\n', '\n', cleaned_text)
cleaned_text = re.sub(r'\r', '', cleaned_text)
# A line holding only a single space counts as a blank line.
cleaned_text = re.sub(r'\n \n', '\n\n', cleaned_text)

# At most one blank line in a row, at most one space in a row, no leading/trailing whitespace.
cleaned_text = re.sub(r'\n{2,}', '\n\n', cleaned_text)
cleaned_text = re.sub(r' +', ' ', cleaned_text)
cleaned_text = cleaned_text.strip()

# Preview the result both rendered and as a repr (the repr makes the line breaks visible).
print(cleaned_text[:5000])
print("============")
print(repr(cleaned_text[:5000]))

# Later cells read the cleaned text through the name `corpus`.
corpus = cleaned_text


invisible fish swim this ghost ocean now described by waves of sand, by water-worn rock. soon the fish will learn to walk. then humans will come ashore and paint dreams on the dying stone. then later, much later, the ocean floor will be punctuated by chevy trucks, carrying the dreamers decendants, who are going to the store.
dont bother the earth spirit who lives here. she is working on a story. it is the oldest story in the world and it is delicate, changing. if she sees you watching she will invite you in for coffee, give you warm bread, and you will be obligated to stay and listen. but this is no ordinary story. you will have to endure earthquakes, lightning, the deaths of all those you love, the most blinding beauty. its a story so compelling you may never want to leave this is how she traps you. see that stone finger over there that is the only one who ever escaped.
hour in which i consider hydrangea, a salt or sand plant, varietal, the question of varietals, the diet of every mot

In [19]:
# A blank line separates paragraphs; a single newline separates the lines inside a paragraph.
paragraphs = corpus.split("\n\n")
lines_in_paragraphs = [block.split("\n") for block in paragraphs]

# Strip every line, discard the empty ones, and skip paragraphs that end up with no text.
temp_lines_in_paragraphs = []
for raw_lines in lines_in_paragraphs:
    kept = [text.strip() for text in raw_lines if text.strip() != ""]
    if kept:
        temp_lines_in_paragraphs.append(kept)

# The total is taken over the unfiltered split, so lines that were discarded above still count.
lines = sum(len(raw_lines) for raw_lines in lines_in_paragraphs)

print(f"Paragraphs: {len(paragraphs)}")
print(f"Lines: {lines}")

Paragraphs: 612
Lines: 2790


#### Tokenizing the text (e.g., convert each word to a unique integer).  

In [8]:
# Flatten the per-paragraph structure into one list of lines.
flattened_lines = []
for paragraph in temp_lines_in_paragraphs:
    flattened_lines.extend(paragraph)

# Fit the word-level tokenizer on those lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(flattened_lines)

# One slot more than the word index holds: the extra one is for the padding token.
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary size:", vocab_size)

# Encode every line as a list of integer word ids and preview the first 20 encoded lines.
sequences = tokenizer.texts_to_sequences(flattened_lines)
print(sequences[:20])

Vocabulary size: 4685
[[528, 370, 529, 24, 701, 163, 62, 1013, 36, 530, 3, 371, 36, 121, 1014, 702, 703, 1, 370, 25, 704, 4, 222, 82, 705, 25, 122, 1702, 6, 531, 284, 12, 1, 317, 706, 82, 223, 176, 223, 1, 163, 199, 25, 32, 1015, 36, 1703, 1016, 1704, 1, 1705, 1706, 47, 50, 106, 4, 1, 532], [107, 1707, 1, 135, 1708, 47, 247, 98, 33, 10, 1017, 12, 2, 177, 11, 10, 1, 1709, 177, 5, 1, 99, 6, 11, 10, 1710, 1711, 45, 33, 1712, 9, 533, 33, 25, 1713, 9, 5, 17, 318, 319, 9, 372, 1018, 6, 9, 25, 32, 1714, 4, 430, 6, 373, 30, 24, 10, 46, 1019, 177, 9, 25, 40, 4, 1715, 1716, 534, 1, 707, 3, 37, 224, 9, 123, 1, 374, 1717, 285, 35, 2, 177, 48, 1718, 9, 248, 83, 124, 4, 286, 24, 10, 53, 33, 1719, 9, 85, 13, 706, 708, 90, 56, 13, 10, 1, 94, 31, 47, 200, 1020], [535, 5, 125, 7, 1021, 709, 2, 710, 28, 371, 711, 1720, 1, 536, 3, 1721, 1, 1722, 3, 136, 80, 7, 100, 1022, 537, 14, 7, 40, 154, 7, 40, 154, 201, 2, 431, 3, 8, 137, 712, 285, 538, 1723, 7, 1724, 285, 285, 1023, 12, 1, 713, 3, 1725, 1, 249, 1726

#### Using a sliding window to create sequences of words for the LSTM model. For example, if n=5, create sequences of 5 words with the 6th word as the target.

In [9]:
sequence_length = 5  # number of context words that precede each target word

# Slide a window over each encoded line: every run of `sequence_length` ids is an input and
# the id right after it is the target. Lines with no id beyond the window contribute nothing.
windows = [
    (encoded[end - sequence_length:end], encoded[end])
    for encoded in sequences
    for end in range(sequence_length, len(encoded))
]
X = [context for context, _ in windows]
y = [target for _, target in windows]

print("X shape:", len(X))
print("y shape:", len(y))

X shape: 7098
y shape: 7098


#### Pad the sequences so that they all have the same length.

In [10]:
# Left-pad the inputs to `sequence_length` columns and turn the targets into an array.
X = pad_sequences(X, padding='pre', maxlen=sequence_length)
y = np.asarray(y)
print("Padded X shape:", X.shape)

Padded X shape: (7098, 5)


### **3. LSTM Model Development**

Defining an LSTM model with the following structure:  
* An embedding layer with an appropriate input dimension (based on vocabulary size) and output dimension (e.g., 100).  
* One or two LSTM layers with 100 units each.  
* A dropout layer with a rate of 0.2 to prevent overfitting.  
* A dense output layer with softmax activation for word prediction.

In [18]:
# Word embedding -> two stacked LSTMs -> dropout -> softmax over the vocabulary.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100),
    LSTM(100, return_sequences=True),  # return the full sequence for the next LSTM
    LSTM(100),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax'),
])

# Categorical cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Build with the input width taken from X before printing the summary.
model.build(input_shape=(None, X.shape[1]))
model.summary()

### **4. Training the Model**

#### One-hot encoding the target words so they match the categorical cross-entropy loss the model was compiled with (accuracy is the tracked metric).

In [12]:
# Imported from tensorflow.keras, like every other Keras name this notebook uses.
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer target ids for the categorical cross-entropy loss.
# The ndim guard keeps this cell safe to re-run: encoding an already one-hot `y` a second
# time would append another axis of size `vocab_size`.
if np.ndim(y) == 1:
    y = to_categorical(y, num_classes=vocab_size)
print("y shape after one-hot encoding:", y.shape)

y shape after one-hot encoding: (7098, 4685)


#### Training the model on the sequences for 10-20 epochs (or until it achieves satisfactory performance).

In [13]:
# Fit the first model on the windowed data: 10 epochs, mini-batches of 64.
model.fit(x=X, y=y, batch_size=64, epochs=10)

Epoch 1/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0553 - loss: 7.9905
Epoch 2/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.0657 - loss: 6.6735
Epoch 3/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0629 - loss: 6.5408
Epoch 4/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0601 - loss: 6.4177
Epoch 5/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.0653 - loss: 6.3050
Epoch 6/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.0665 - loss: 6.2180
Epoch 7/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.0725 - loss: 6.0895
Epoch 8/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.0753 - loss: 5.9723
Epoch 9/10
[1m111/111[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x7958a649a830>

### **5. Text Generation:**

In [14]:
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len, temperature=1.0, words_per_set=5):
    """Extend ``seed_text`` with words sampled from ``model``.

    Before every prediction the whole text so far is re-tokenized with ``tokenizer`` and
    passed through ``pad_sequences`` with ``maxlen=max_sequence_len``.

    Args:
        seed_text: Text to continue. A separating space is inserted automatically when the
            text does not already end in whitespace.
        next_words: Number of words to generate.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of word
            probabilities per input row.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        max_sequence_len: Width of the padded model input.
        temperature: Positive divisor applied to the log-probabilities before sampling.
        words_per_set: After this many words a comma and a line break are appended.

    Returns:
        The seed text followed by the generated words.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    previous_idx = None  # index of the last emitted word, used to forbid immediate repeats
    word_count = 0  # words emitted in the current set
    for _ in range(next_words):
        # Tokenize the text so far and pad it to the model's input width
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        model_input = pad_sequences([token_ids], maxlen=max_sequence_len, padding='pre')

        # Predict the next-word probabilities
        probabilities = model.predict(model_input, verbose=0)[0]

        # Temperature-scaled log-probabilities, in float64 so the normalised weights sum to 1
        scores = np.log(np.asarray(probabilities, dtype=np.float64) + 1e-7) / temperature

        # Index 0 is the padding slot, not a word: never sample it
        scores[0] = -np.inf

        # Rule out an immediate repeat by masking the previous word instead of discarding
        # the draw afterwards, so every iteration still yields a word
        if previous_idx is not None:
            saved_score = scores[previous_idx]
            scores[previous_idx] = -np.inf
            if not np.isfinite(scores).any():
                # The previous word is the only candidate left: allow the repeat
                scores[previous_idx] = saved_score
        if not np.isfinite(scores).any():
            break  # nothing but the padding slot to choose from

        # Softmax over the remaining candidates (shifted by the max for numerical stability)
        weights = np.exp(scores - scores.max())
        weights /= weights.sum()

        predicted_idx = int(np.random.choice(len(weights), p=weights))
        predicted_word = tokenizer.index_word.get(predicted_idx, '')
        if not predicted_word:
            continue  # index unknown to the tokenizer: nothing to append

        # Separate from the preceding text unless it already ends in whitespace
        # (a seed with a trailing space, or the line break that closes a set)
        if seed_text and not seed_text[-1].isspace():
            seed_text += " "
        seed_text += predicted_word

        previous_idx = predicted_idx
        word_count += 1

        # Close the set with a comma and a line break once it is full
        if word_count >= words_per_set:
            seed_text += ",\n"
            word_count = 0

    return seed_text

# Sample a 50-word continuation from the first model, five words per line.
seed = "once upon a time "
generated_poem = generate_text(
    seed_text=seed,
    next_words=50,
    model=model,
    tokenizer=tokenizer,
    max_sequence_len=X.shape[1],
    temperature=0.7,
    words_per_set=5,
)
print(generated_poem)


once upon a time things more body because we,
today to the painting so,
yes on the fingers she,
outline am window blasting with,
a weapon it like burst,
alone not where an police,
of a breathless child still,
a nose of a great,
river on tear in the,
am thou wearing of stars,



### **6. Evaluation and Experimentation:**

In [15]:
# Experiment variant: same layout as the first model, but 50-unit LSTMs and a dropout rate of 0.4.
model2 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100),
    LSTM(50, return_sequences=True),  # return the full sequence for the next LSTM
    LSTM(50),
    Dropout(0.4),
    Dense(vocab_size, activation='softmax'),
])

# Categorical cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Build with the input width taken from X before printing the summary.
model2.build(input_shape=(None, X.shape[1]))
model2.summary()


In [16]:
# Fit the second model on the same data: 30 epochs, mini-batches of 64.
model2.fit(x=X, y=y, batch_size=64, epochs=30)

Epoch 1/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.0487 - loss: 8.1124
Epoch 2/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0651 - loss: 6.7377
Epoch 3/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0578 - loss: 6.6605
Epoch 4/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0572 - loss: 6.6303
Epoch 5/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0597 - loss: 6.4502
Epoch 6/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0604 - loss: 6.3906
Epoch 7/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0590 - loss: 6.3170
Epoch 8/30
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0654 - loss: 6.1961
Epoch 9/30
[1m111/111[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x795890287d60>

In [17]:
# Sample a 50-word continuation from the second model, seven words per line.
seed = "In a Faraway Land, "
generated_poem = generate_text(
    seed_text=seed,
    next_words=50,
    model=model2,
    tokenizer=tokenizer,
    max_sequence_len=X.shape[1],
    temperature=0.7,
    words_per_set=7,
)
print(generated_poem)

In a Faraway Land, soon down we walked on my universe,
is the woman not you way down,
a history eyes the kind of most,
varietals of with because a car a,
well a lie to heart my plates,
to an woman in with where no,
by as fresh become try the myself,
with
