<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Text-Generation-Edgar-Allan-Poems/text_generation_edgar_allan_poems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import os
import re

# Download Dataset

In [None]:
# Download (or reuse the locally cached copy of) the Project Gutenberg edition
# of Poe's complete poetical works.
path = tf.keras.utils.get_file(
    "allan.tx",
    origin="https://www.gutenberg.org/cache/epub/10031/pg10031.txt"
)

# Load all text. The file is read as bytes and decoded explicitly so that the
# original line endings are preserved; the context manager guarantees the file
# handle is closed instead of being left to the garbage collector.
with open(path, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")

Downloading data from https://www.gutenberg.org/cache/epub/10031/pg10031.txt
[1m408498/408498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Sanity-check the download by printing a 500-character slice of the raw
# (still uncleaned) corpus, taken from character offsets 2000 to 2499.
print(text[2000:2500])

n
many additional pieces and extra stanzas, nowhere else published or
included in Poe's works. Such verses have been gathered from printed or
manuscript sources during a research extending over many years.

In addition to the new poetical matter included in this volume,
attention should, also, be solicited on behalf of the notes, which will
be found to contain much matter, interesting both from biographical and
bibliographical points of view.

JOHN H. INGRAM.




CONTENTS.


ME


# Preprocessing, Model, and Generation Helpers

In [None]:
def clean_text(raw_text):
    """
    Cleans the raw text data by removing project-specific headers/footers,
    non-essential characters, and excessive whitespace.

    Processing steps, in order:
        1. Keep only the span from the first "EDGAR ALLAN POE" up to the first
           (case-insensitive) "End of the Project Gutenberg" that follows it.
           Each boundary is applied independently: a missing start marker
           means "start at the beginning", a missing end marker means "run to
           the end of the text".
        2. Drop every line mentioning GUTENBERG, LICENSE, FOUNDATION or
           TRADEMARK (case-insensitive).
        3. Strip carriage returns, single-line ``[...]`` spans and licence
           section numbers such as ``1.D.6``.
        4. Replace hyphens/dashes with a space so the words they join are not
           fused together when punctuation is filtered out.
        5. Keep only ASCII letters, the punctuation ``! , . ? ; : '``, spaces
           and newlines. Digits are dropped as well.
        6. Collapse runs of blank or whitespace-only lines to one blank line.

    Args:
        raw_text (str): The raw text loaded from the source.

    Returns:
        str: The cleaned text.
    """
    start_marker = "EDGAR ALLAN POE"
    end_marker = "End of the Project Gutenberg"

    # Locate the boundaries independently so that one missing marker does not
    # disable trimming at the other end.
    start_idx = max(raw_text.find(start_marker), 0)

    # The footer is matched case-insensitively (it may be written in upper
    # case) and only searched for after the start position, so the resulting
    # slice can never be inverted/empty because of marker ordering.
    end_match = re.compile(re.escape(end_marker), re.IGNORECASE).search(raw_text, start_idx)
    end_idx = end_match.start() if end_match else len(raw_text)

    text = raw_text[start_idx:end_idx]

    # Remove lines containing Project Gutenberg related words
    boilerplate_words = ("GUTENBERG", "LICENSE", "FOUNDATION", "TRADEMARK")
    clean_lines = [
        line for line in text.split('\n')
        if not any(word in line.upper() for word in boilerplate_words)
    ]
    text = '\n'.join(clean_lines)

    text = text.replace('\r', '') # Remove carriage return characters
    text = re.sub(r'\[.*?\]', '', text) # Remove text within square brackets (e.g., [Illustration])
    text = re.sub(r'[0-9]+\.[A-Z]\.[0-9]+', '', text) # Remove license numbers like 1.D.6

    # Hyphens and dashes are not part of the vocabulary; turn each run into a
    # single space so "forty-first" or "flash--and" do not become one word.
    text = re.sub(r'[-\u2010-\u2015]+', ' ', text)

    # Limit characters to ASCII letters and essential punctuation (digits are
    # intentionally excluded). This keeps the vocabulary small and focused.
    allowed_chars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!,.?;: \n'")
    text = ''.join(c for c in text if c in allowed_chars)

    # Collapse any run of blank / whitespace-only lines into one blank line
    text = re.sub(r'\n\s*\n', '\n\n', text)

    return text

def create_vocab(text):
    """
    Builds the character vocabulary for a cleaned corpus and encodes the corpus
    with it.

    Args:
        text (str): The cleaned text.

    Returns:
        tuple: ``(vocab, char2idx, idx2char, text_as_int)`` where
            - vocab (list): The distinct characters of ``text`` in sorted order.
            - char2idx (dict): Character -> position of that character in ``vocab``.
            - idx2char (dict): Position in ``vocab`` -> character.
            - text_as_int (np.array): ``text`` encoded character by character
              through ``char2idx``.
    """
    vocab = sorted(set(text))

    # The index -> character table is simply the enumeration of the sorted
    # vocabulary; the reverse table is obtained by flipping it.
    idx2char = dict(enumerate(vocab))
    char2idx = {symbol: position for position, symbol in idx2char.items()}

    encoded = list(map(char2idx.__getitem__, text))
    text_as_int = np.array(encoded)

    return vocab, char2idx, idx2char, text_as_int

def prepare_dataset(text, batch_size, seq_length=100):
    """
    Turns the integer-encoded corpus into a tf.data.Dataset of
    (input, target) sequence pairs for next-character training.

    The corpus is cut into consecutive chunks of ``seq_length + 1`` elements
    (an incomplete trailing chunk is dropped). Each chunk yields an input made
    of all but its last element and a target made of all but its first, i.e.
    the target is the input shifted by one position.

    Args:
        text (np.array): Numerical representation of the text.
        batch_size (int): Number of sequences per batch.
        seq_length (int, optional): Length of each input sequence. Defaults to 100.

    Returns:
        tf.data.Dataset: Shuffled (buffer of 10,000) input/target pairs grouped
        into batches of ``batch_size``; an incomplete final batch is dropped.
    """
    def split_input_target(chunk):
        # Input drops the final element, target drops the first one.
        return chunk[:-1], chunk[1:]

    chunk_size = seq_length + 1
    chunks = tf.data.Dataset.from_tensor_slices(text).batch(
        chunk_size, drop_remainder=True
    )
    pairs = chunks.map(split_input_target)
    shuffled_pairs = pairs.shuffle(10_000)
    return shuffled_pairs.batch(batch_size, drop_remainder=True)

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Assembles the character-level language model as a Keras Sequential stack:
    an embedding, three stateful LSTM layers (the first two each followed by
    Dropout(0.2)) and a Dense projection of size ``vocab_size``.

    Args:
        vocab_size (int): The size of the vocabulary (number of unique characters).
        embedding_dim (int): The dimension of the character embedding layer.
        rnn_units (int): The number of units in each LSTM layer.
        batch_size (int): Fixed batch dimension of the model input; the
            sequence dimension is left unspecified.

    Returns:
        tf.keras.Model: The assembled model. It is NOT compiled here; the
        caller is responsible for calling ``compile``.
    """
    def stateful_lstm():
        # Every recurrent layer shares the same configuration.
        return tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True)

    layers = [
        tf.keras.Input(batch_shape=(batch_size, None)),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
    ]

    # First two recurrent layers are each followed by dropout ...
    for _ in range(2):
        layers.append(stateful_lstm())
        layers.append(tf.keras.layers.Dropout(0.2))

    # ... the third one feeds the output projection directly.
    layers.append(stateful_lstm())
    layers.append(tf.keras.layers.Dense(vocab_size))

    return tf.keras.Sequential(layers)

def inference(weights, vocab_size, embedding_dim, rnn_units, batch_size=1):
    """
    Rebuilds the model architecture for inference and loads trained weights
    into it.

    The architecture hyper-parameters must match the ones used when the
    weights were saved. The model is rebuilt with the requested ``batch_size``
    (previously this argument was accepted but ignored and 1 was always used).

    Args:
        weights (str): Path to the model weights file.
        vocab_size (int): The size of the vocabulary.
        embedding_dim (int): The dimension of the embedding layer.
        rnn_units (int): The number of units in the LSTM layers.
        batch_size (int, optional): Batch size for inference. Defaults to 1.

    Returns:
        tf.keras.Model: The loaded model configured for inference.
    """
    model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=batch_size)
    model.load_weights(weights)
    # Keep the build shape in sync with the batch shape the model was created with.
    model.build(tf.TensorShape([batch_size, None]))
    return model

def generate(model, start_string, len, temp=0.7, char_to_idx=None, idx_to_char=None):
    """
    Generates text using the trained character-level LSTM model.

    Args:
        model (tf.keras.Model): The trained Keras model.
        start_string (str): The initial string to start text generation from.
            Must be non-empty and consist only of characters in the vocabulary.
        len (int): The number of characters to generate.
        temp (float, optional): Controls the randomness of prediction. Higher values
                                 result in more unpredictable text. Must be > 0.
                                 Defaults to 0.7.
        char_to_idx (dict, optional): Character -> index mapping. Defaults to the
            notebook-level ``char2idx``.
        idx_to_char (dict, optional): Index -> character mapping. Defaults to the
            notebook-level ``idx2char``.

    Returns:
        str: The generated text, prefixed with the start_string.

    Raises:
        ValueError: If ``start_string`` is empty or ``temp`` is not positive.
        KeyError: If ``start_string`` contains a character that is not in the
            vocabulary.
    """
    # NOTE: the `len` parameter shadows the builtin inside this function, so
    # the builtin len() must not be called anywhere below.
    num_generation = len

    # Fall back to the mappings created at notebook level when none are given.
    if char_to_idx is None:
        char_to_idx = char2idx
    if idx_to_char is None:
        idx_to_char = idx2char

    # Validate up front so failures are explicit instead of surfacing as an
    # opaque model error or as inf/NaN logits from dividing by zero.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temp <= 0:
        raise ValueError("temp must be a positive number")
    unknown_chars = sorted({ch for ch in start_string if ch not in char_to_idx})
    if unknown_chars:
        raise KeyError(
            f"start_string contains characters missing from the vocabulary: {unknown_chars}"
        )

    # Convert start string to numerical representation
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0) # Add batch dimension (1, seq_length)

    text_generated = []

    # Reset the LSTM states before each generation to ensure independence
    for layer in model.layers:
        if hasattr(layer, "reset_states"):
            layer.reset_states()

    # Generate characters one by one. The layers are stateful, so after the
    # prompt has been consumed only the newest character needs to be fed in.
    for _ in range(num_generation):
        predictions = model(input_eval, training=False)

        # Only the logits of the final timestep decide the next character:
        # shape (1, vocab_size). Temperature rescales them before sampling.
        last_step_logits = predictions[:, -1, :] / temp

        # Sample a character from the predicted distribution
        predicted_id = int(
            tf.random.categorical(last_step_logits, num_samples=1)[0, 0].numpy()
        )

        # Use the predicted character as the next input to the model
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx_to_char[predicted_id])
    return (start_string + "".join(text_generated))

In [None]:
# Batch size shared by the dataset and the model. The LSTM layers are stateful,
# so the model is built with a fixed batch dimension that must match the
# dataset's batch size; defining it once keeps the two in sync.
BATCH_SIZE = 64

# The raw text data is cleaned to remove irrelevant sections (like Project Gutenberg headers/footers),
# special characters, and excessive whitespace, making it suitable for training.
text = clean_text(text)

# A vocabulary is created from the cleaned text. This involves:
# - Extracting all unique characters to form the vocabulary.
# - Creating mappings from characters to integer indices (char2idx) and vice-versa (idx2char).
# - Converting the entire text into a numerical representation (text_as_int) using these mappings.
vocab, char2idx, idx2char, text_as_int = create_vocab(text)

# The numerical text is transformed into a TensorFlow `tf.data.Dataset`.
# This dataset provides sequences of characters as input and the same sequences shifted by one
# character as the target, shuffled and batched for model training.
dataset = prepare_dataset(text_as_int, BATCH_SIZE)

# Parameters for the LSTM model are defined, and the model architecture is built using these parameters.
# - `vocab_size`: Total number of unique characters in the text, defining the output dimension of the last layer.
# - `embedding_dim`: Dimension for the character embedding layer, converting character indices into dense vectors.
# - `rnn_units`: Number of units in each LSTM layer, influencing the model's capacity to learn sequences.
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE
)

# Display a summary of the model's architecture, including layer types, output shapes, and parameter counts.
model.summary()

# The model is compiled with a loss function and an optimizer, preparing it for the training process.
# - `loss`: SparseCategoricalCrossentropy is used as it's suitable for integer-encoded targets and multi-class
#   classification (predicting the next character). `from_logits=True` because the final Dense layer has no activation.
# - `optimizer`: Adam with an initial learning rate of 0.001.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss=loss)

# Callbacks are configured to enhance the training process by saving model checkpoints, stopping early if performance plateaus,
# and dynamically adjusting the learning rate.
checkpoint_path = "training_checkpoints/ckpt_{epoch}.weights.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Make sure the checkpoint directory exists before training starts.
os.makedirs(checkpoint_dir, exist_ok=True)

# 1. ModelCheckpoint:
# Saves the model's weights during training. It monitors the 'loss' and saves only the best performing weights.
# This ensures that if training improves, the best model state is preserved.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    monitor='loss', # Monitor the training loss (since no validation data is used)
    save_best_only=True # Save weights only when the monitored quantity improves
)

# 2. EarlyStopping:
# Stops training if the model's performance (monitored via 'loss') does not improve for a specified number of epochs (patience).
# The best weights from the epoch with the lowest loss will be restored.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=5, # Stop if loss doesn't improve for 5 consecutive epochs
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)

# 3. ReduceLROnPlateau:
# Reduces the learning rate when the model's performance has stopped improving.
# This helps the model to converge more precisely once it's close to an optimal solution.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.2, # Multiply the learning rate by 0.2 (i.e. divide by 5)
    patience=3, # Wait for 3 epochs of no improvement before reducing LR
    min_lr=0.00001 # Set a lower bound for the learning rate
)

# Combine all defined callbacks into a list to be passed to the `fit` method.
callbacks_list = [checkpoint_callback, early_stopping, reduce_lr]

# The model is trained using the prepared dataset for a specified number of epochs, utilizing the defined callbacks.
model.fit(dataset, epochs=50, callbacks=callbacks_list)

# After training is complete, the final model weights are saved to a file.
# This allows for later inference without needing to retrain the model.
model.save_weights("edgar_allan.weights.h5")

Epoch 1/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 290ms/step - loss: 3.4055 - learning_rate: 0.0010
Epoch 2/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 297ms/step - loss: 2.8157 - learning_rate: 0.0010
Epoch 3/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 309ms/step - loss: 2.2737 - learning_rate: 0.0010
Epoch 4/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 324ms/step - loss: 2.0465 - learning_rate: 0.0010
Epoch 5/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 325ms/step - loss: 1.8853 - learning_rate: 0.0010
Epoch 6/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 314ms/step - loss: 1.7473 - learning_rate: 0.0010
Epoch 7/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 314ms/step - loss: 1.6425 - learning_rate: 0.0010
Epoch 8/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 316ms/step - loss: 1.5603 - learning_rate:

In [None]:
# Path to the saved model weights file. This is the same relative path the
# training cell passes to `model.save_weights`, so it resolves against the
# current working directory instead of a hard-coded Colab location.
weights = "edgar_allan.weights.h5"

# Load the pre-trained model for inference.
# The 'inference' function rebuilds the model with a batch size of 1 for single-prediction tasks,
# and then loads the weights from the specified path.
# It uses the previously defined vocabulary size, embedding dimension, and RNN units to reconstruct the model architecture.
model_inference = inference(weights, vocab_size=vocab_size, embedding_dim=embedding_dim,
                            rnn_units=rnn_units, batch_size=1)


In [None]:
# Sample 800 characters from the inference model, seeded with the prompt
# "Alice: "; the temperature is left at generate()'s default.
generate(model_inference, start_string="Alice: ", len=800)

"Alice: where he expired on the th of\nOctober, , in the fortyfirst year of his age.\n\nEdgar Poe was editor of the 'Broadway Journal' for May .\n\n.   THE SLEEPER ERIST OF ANTHUN I\nLCEIET DAR of The Pan, if you pale, est in the 'Se upon this new course of life with his\nusual enthusiasm, and for a time to have borne the rigid violet beauty\n              The unembodied essence, and no more\n  Thy memory no more! Accursed ground\n    Henceforward I hold thy flowerenamelled shadows so her anything it where the pieces referred to hinf\n                 Fly that it must:\n  Heaven shall chances apelyand the lightning did not flashand\nthe cloud of the river, and the corrosive hours, comalted in my very hair!\n\n.\n\nTO HELEN.\n\n  Helen, thy beauty is to me\n    Like those Nicean barks of yore,\n  That gentle ways, and "