<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Poet%20Generator%20-%20Shakespeare's%20Sonnets/Poem_Generator_Shakespeare's_Sonnets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries & Set Up Environment

In [None]:
# Libraries
import os
import numpy as np
import tensorflow as tf
import requests
import re

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import TimeDistributed


# Logging
# Configure the root logger once for the whole notebook: records at INFO level
# and above go to two handlers, a file handler writing 'poetry_log.log' and a
# stream handler.
import logging
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.FileHandler('poetry_log.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by every function defined below.
logger = logging.getLogger(__name__)

# Warnings
# NOTE: the "ignore" filter suppresses every warning category, including
# deprecation warnings raised by libraries; remove it when debugging.
import warnings
warnings.filterwarnings("ignore")

# Define Functions

*Download Dataset*

In [None]:
def download_poetry_dataset():
    """Download the Shakespeare text once and cache a trimmed copy on disk.

    The Project Gutenberg file is fetched, everything up to and including the
    first line containing "SONNETS" (case-insensitive) is dropped, and the
    remainder is stored as ``shakespeare_sonnets.txt`` in the working
    directory. If that file already exists, nothing is downloaded.

    Returns:
        The path of the cached dataset file.

    Raises:
        requests.RequestException: The download failed or returned an HTTP
            error status.
        ValueError: No line containing "SONNETS" was found in the download.
        OSError: The dataset file could not be written.
    """
    dataset_path = "shakespeare_sonnets.txt"
    url = "https://www.gutenberg.org/cache/epub/100/pg100.txt"

    if os.path.exists(dataset_path):
        logger.info("Poetry dataset already exists.")
        return dataset_path

    try:
        # A timeout keeps the notebook from hanging forever on a stalled
        # connection; raise_for_status stops an error page being saved as data.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        logger.info("Dataset downloaded")

        # Normalise line endings the same way text-mode file reading would.
        text = response.content.decode("utf-8")
        lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

        # Remove metadata: keep only what follows the first "SONNETS" line.
        marker_idx = next(
            (i for i, line in enumerate(lines) if "SONNETS" in line.upper()),
            None,
        )
        if marker_idx is None:
            raise ValueError("No 'SONNETS' marker found in downloaded text")

        # Write to a temporary name and rename at the end, so a failure never
        # leaves a half-processed file that later runs would mistake for a
        # valid cached dataset.
        tmp_path = dataset_path + ".part"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines[marker_idx + 1:]))
        os.replace(tmp_path, dataset_path)
        logger.info("Poetry dataset saved as shakespeare_sonnets.txt")
    except Exception as e:
        logger.error(f"Failed to download or process poetry dataset: {e}")
        raise
    return dataset_path

*Preprocess*

In [None]:
def preprocess_poetry(dataset_path, max_quatrain_length=40, max_words=5000):
    """Turn the raw poetry file into next-word training arrays.

    The text is lower-cased, stripped of punctuation, split into non-empty
    lines and grouped four lines at a time into quatrains. Quatrains with
    more than ``max_quatrain_length`` words are discarded. The remainder is
    tokenised, post-padded to ``max_quatrain_length`` and split into inputs
    and targets shifted by one position.

    Args:
        dataset_path: Path of the UTF-8 text file to read.
        max_quatrain_length: Maximum words per quatrain and the padded length.
        max_words: ``num_words`` passed to the Keras ``Tokenizer``.

    Returns:
        A tuple ``(X, y, tokenizer, sequence_length)`` where ``X`` is every
        padded column except the last, ``y`` is every padded column except
        the first, and ``sequence_length`` is ``max_quatrain_length - 1``.

    Raises:
        Exception: Any error raised while reading or processing is logged
            and re-raised unchanged.
    """
    logger.info("Preprocessing poetry data...")
    try:
        with open(dataset_path, 'r', encoding='utf-8') as source:
            raw_text = source.read().lower()

        # Strip punctuation first, then collapse runs of newlines into one.
        without_noise = re.sub(r'[^\w\s\b]', '', raw_text)
        collapsed = re.sub(r'\n+', '\n', without_noise)

        # Keep only lines that still hold text after trimming whitespace.
        verses = [verse for verse in map(str.strip, collapsed.split('\n')) if verse]

        # Group consecutive, non-overlapping runs of four lines into one
        # string each; a trailing group with fewer than four lines is dropped.
        candidates = (
            ' '.join(verses[start:start + 4])
            for start in range(0, len(verses) - 3, 4)
        )
        quatrains = [
            group for group in candidates
            if len(group.split()) <= max_quatrain_length
        ]

        # Tokenization
        tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
        tokenizer.fit_on_texts(quatrains)
        encoded = tokenizer.texts_to_sequences(quatrains)
        padded = pad_sequences(encoded, maxlen=max_quatrain_length, padding='post')

        # Splitting into X (inputs) and y (targets shifted one step ahead).
        X, y = padded[:, :-1], padded[:, 1:]

        logger.info(f"Preprocessed {len(quatrains)} quatrains. X shape: {X.shape}, y shape: {y.shape}")
        return X, y, tokenizer, max_quatrain_length - 1

    except Exception as e:
        logger.error(f"Error in preprocessing poetry data: {e}")
        raise

*Build GRU*

In [None]:
def build_gru_model(vocab_size, sequence_length):
    """Build and compile the stacked-GRU next-word model.

    The network is an embedding (128 dims), two GRU layers (256 then 128
    units, both returning full sequences with dropout 0.2) and a
    time-distributed softmax over ``vocab_size`` classes.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        sequence_length: Value passed to the embedding as ``input_length``.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        Exception: Any error raised while building is logged and re-raised.
    """
    logger.info("Building GRU Model...")
    try:
        # NOTE(review): newer Keras releases reportedly deprecate the
        # `input_length` argument of Embedding — confirm against the
        # installed version.
        stack = [
            Embedding(vocab_size, 128, input_length=sequence_length),
            GRU(256, return_sequences=True, dropout=0.2),
            GRU(128, return_sequences=True, dropout=0.2),
            TimeDistributed(Dense(vocab_size, activation='softmax')),
        ]
        network = Sequential(stack)

        # Targets are integer word indices, hence the sparse loss.
        network.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )
        logger.info("GRU model built successfully.")
        return network
    except Exception as e:
        logger.error(f"Error building model: {str(e)}")
        raise

*Generate Poem*

In [None]:
def generate_poetry(model, tokenizer, sequence_length, max_length=40, temperature=1.0):
    """Generate a four-line poem that continues a fixed seed line.

    Starting from the seed "shall i compare thee to a summer's day", words
    are generated one at a time until ``max_length`` words are reached or the
    model emits the padding/OOV token. The words are then laid out on four
    lines of (nearly) equal word count.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns an array of
            per-position word probabilities indexed as
            ``[batch, position, word_index]``.
        tokenizer: Object exposing ``word_index`` (word -> index) and
            ``index_word`` (index -> word) dictionaries.
        sequence_length: Length of the model's input sequences.
        max_length: Maximum total number of words, seed included.
        temperature: Sampling temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it; a value <= 0 selects
            the most probable word deterministically.

    Returns:
        A string of exactly four lines joined by newlines; trailing lines
        are empty when too few words are available.

    Raises:
        Exception: Any error raised during generation is logged and
            re-raised unchanged.
    """
    logger.info("Generating poetry...")
    try:
        seed_text = "shall i compare thee to a summer's day"
        generated = seed_text.lower().split()

        # The training text has punctuation stripped during preprocessing, so
        # look each seed word up in the same normalised form ("summer's" ->
        # "summers"). Index 1 is the OOV token.
        tokens = [
            tokenizer.word_index.get(re.sub(r"[^\w\s]", "", word), 1)
            for word in generated
        ]

        for _ in range(max_length - len(generated)):
            # Lay the context out exactly like the training data: real tokens
            # first, zero padding after them.
            window = tokens[-sequence_length:]
            model_input = np.zeros((1, sequence_length))
            model_input[0, :len(window)] = window

            pred = model.predict(model_input, verbose=0)
            # Read the distribution at the last *real* token, not at the
            # padded tail. float64 keeps np.random.choice's sum-to-one check
            # from tripping on float32 rounding.
            probs = np.asarray(pred[0, len(window) - 1, :], dtype=np.float64)

            if temperature > 0:
                # Temperature only matters when sampling; argmax would be
                # unchanged by the rescaling.
                scaled = np.log(probs + 1e-10) / temperature
                weights = np.exp(scaled - scaled.max())
                next_word_idx = int(
                    np.random.choice(len(weights), p=weights / weights.sum())
                )
            else:
                next_word_idx = int(np.argmax(probs))

            # Index 0 (padding) has no entry in index_word and therefore maps
            # to '<OOV>' here, which ends generation.
            next_word = tokenizer.index_word.get(next_word_idx, '<OOV>')
            if next_word == '<OOV>' or not next_word:
                break
            generated.append(next_word)
            tokens.append(next_word_idx)

        # Convert to quatrains: split by words (not characters) into four
        # lines, rounding the per-line count up so no word is dropped.
        words_per_line = max(1, (len(generated) + 3) // 4)
        quatrain_lines = [
            ' '.join(generated[i:i + words_per_line])
            for i in range(0, len(generated), words_per_line)
        ]
        while len(quatrain_lines) < 4:
            quatrain_lines.append("")
        return '\n'.join(quatrain_lines[:4])
    except Exception as e:
        logger.error(f"Error generating poetry: {e}")
        raise

# Run Functions and Train Model

In [None]:
# End-to-end driver: download the corpus, preprocess it, train the GRU model,
# generate one poem and save the trained model to disk.
logger.info("Starting poetry generation project...")

# Download dataset
dataset_path = download_poetry_dataset()

# Preprocessing
X, y, tokenizer, sequence_length = preprocess_poetry(dataset_path)

# Build and train the model
# +1 because Keras tokenizer indices start at 1; index 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1
model = build_gru_model(vocab_size, sequence_length)
model.summary()

logger.info("Training GRU Model...")
# Add a trailing axis so each target timestep is a single integer label.
y = np.expand_dims(y, -1)
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=5,
                               restore_best_weights=True)
model.fit(X, y, epochs=10,
          batch_size=32,
          validation_split=0.2,
          callbacks=[early_stopping])
logger.info("Training completed.")

# Generate poem
poetry = generate_poetry(model, tokenizer, sequence_length, temperature=0.7)
logger.info("Generated poetry:\n" + poetry)

# Save the model
# The log message is built from the same path that is saved, so the two can
# no longer disagree.
model_path = "quatrains_poetry_gru_model.h5"
model.save(model_path)
logger.info(f"Model saved as {model_path}")

Epoch 1/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2821s[0m 3s/step - accuracy: 0.4067 - loss: 4.9293 - val_accuracy: 0.4229 - val_loss: 4.0504
Epoch 2/100
[1m296/927[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m28:43[0m 3s/step - accuracy: 0.4328 - loss: 3.8353