<!-- #Recurrent Neural Network (RNN) for Text Generation
# Description:
An RNN model, specifically using Long Short-Term Memory (LSTM) layers, that generates text based on input sequences. This model is useful for applications such as writing assistance, creative content generation, and automated storytelling.

# Dataset:
For this example, we'll use a dataset containing the works of William Shakespeare. The dataset can be downloaded from various online sources or obtained from public datasets repositories like Kaggle.

# Architecture:
# Embedding Layer: Converts input text into dense vectors of fixed size.
# LSTM Layers: Processes the sequences of embeddings and captures long-range dependencies.
# Dense Layers: Maps the LSTM outputs to the vocabulary size for generating predictions.
# Tools Used:
# TensorFlow
# Keras
# PyTorch (alternative) -->

# Recurrent Neural Network (RNN) for Text Generation

## Description
An RNN model, specifically using Long Short-Term Memory (LSTM) layers, that generates text based on input sequences. This model is useful for applications such as writing assistance, creative content generation, and automated storytelling.

## Dataset
For this example, we'll use a dataset containing the works of William Shakespeare. The dataset can be downloaded from various online sources or obtained from public datasets repositories like Kaggle.

## Tools Used
- TensorFlow
- Keras
- PyTorch (alternative)

## Architecture
1. **Embedding Layer**: Converts input text into dense vectors of fixed size.
2. **LSTM Layers**: Processes the sequences of embeddings and captures long-range dependencies.
3. **Dense Layers**: Maps the LSTM outputs to the vocabulary size for generating predictions.

## Implementation




# 1. Import Libraries

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# For reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# 2. Load and Prepare the Dataset

In [6]:
# Load the dataset
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Create sequences of words
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences and create predictors and label
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

# Convert labels to categorical
label = tf.keras.utils.to_categorical(label, num_classes=total_words)


Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


# 3. Build the Model

In [9]:
model = Sequential()
model.add(Embedding(total_words, 100))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


# 4. Train the Model

In [10]:
history = model.fit(predictors, label, epochs=20, verbose=1)


Epoch 1/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 26ms/step - accuracy: 0.0343 - loss: 7.0411
Epoch 2/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 25ms/step - accuracy: 0.0678 - loss: 6.2374
Epoch 3/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 27ms/step - accuracy: 0.0862 - loss: 5.8901
Epoch 4/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 20ms/step - accuracy: 0.0946 - loss: 5.6057
Epoch 5/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 28ms/step - accuracy: 0.1009 - loss: 5.3768
Epoch 6/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 20ms/step - accuracy: 0.1058 - loss: 5.1665
Epoch 7/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 36ms/step - accuracy: 0.1138 - loss: 4.9770
Epoch 8/20
[1m5354/5354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 37ms/step - accuracy: 0.1216 - loss: 4.8264


# 5. Generate Text

In [11]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Generate new text
seed_text = "Shall I compare thee to a summer's day"
next_words = 20
print(generate_text(seed_text, next_words, model, max_sequence_len))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [12]:
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()


In [13]:
history = model.fit(predictors, label, epochs=50, batch_size=128, validation_split=0.2, verbose=1)


Epoch 1/50
[1m1071/1071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 82ms/step - accuracy: 0.0334 - loss: 7.1425 - val_accuracy: 0.0486 - val_loss: 6.7022
Epoch 2/50
[1m 302/1071[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m59s[0m 78ms/step - accuracy: 0.0555 - loss: 6.4415

KeyboardInterrupt: 

In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
import pickle
import os

# For reproducibility
np.random.seed(42)
tf.random.set_seed(42)


In [20]:
# Load the dataset
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Save the tokenizer for future use
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Create sequences of words
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad sequences and create predictors and label
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

# Convert labels to categorical
label = tf.keras.utils.to_categorical(label, num_classes=total_words)


In [21]:
# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim=100):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, embeddings_index, embedding_dim=100):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Update the path to your GloVe file
glove_file_path = 'glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embeddings_index)

# Verify the shape of the embedding matrix
print("Embedding matrix shape:", embedding_matrix.shape)


Embedding matrix shape: (12633, 100)


In [22]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=total_words, 
                    output_dim=100, 
                    input_length=max_sequence_len-1, 
                    weights=[embedding_matrix], 
                    trainable=True))  # Set trainable=True if you want to further train the embeddings
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()


In [23]:
history = model.fit(predictors, label, epochs=20, batch_size=128, validation_split=0.2, verbose=1)

Epoch 1/20
[1m1071/1071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 78ms/step - accuracy: 0.0349 - loss: 7.1152 - val_accuracy: 0.0564 - val_loss: 6.6423
Epoch 2/20
[1m1071/1071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 92ms/step - accuracy: 0.0663 - loss: 6.3340 - val_accuracy: 0.0780 - val_loss: 6.5108
Epoch 3/20
[1m1071/1071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 105ms/step - accuracy: 0.0872 - loss: 6.0489 - val_accuracy: 0.0813 - val_loss: 6.5010
Epoch 4/20
[1m1071/1071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 81ms/step - accuracy: 0.0948 - loss: 5.8460 - val_accuracy: 0.0855 - val_loss: 6.5202
Epoch 5/20
[1m1071/1071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 99ms/step - accuracy: 0.1008 - loss: 5.6673 - val_accuracy: 0.0878 - val_loss: 6.5718
Epoch 6/20
[1m1071/1071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 80ms/step - accuracy: 0.1069 - loss: 5.5132 - val_accuracy: 0.0875 - val_loss: 6.6556
E

In [25]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Generate new text
seed_text = "Shall I compare thee to a summer's day"
next_words = 50
print(generate_text(seed_text, next_words, model, max_sequence_len))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20m