<a href="https://colab.research.google.com/github/nexion11/Text-Generator-Shakespeare/blob/main/Poetic_power.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import numpy as np
import random

In [3]:

# Load and preprocess the text data
# Character offsets (into the lowercased corpus) of the slice used for training.
TEXT_START = 100000
TEXT_END = 800000

filepath = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Read through a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(filepath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8').lower()
text = text[TEXT_START:TEXT_END]
# Flatten line breaks and tabs, then drop any remaining non-printable characters.
text = text.replace('\n', ' ').replace('\r', '').replace('\t', ' ')
text = ''.join(c for c in text if c.isprintable())

# Initialize and fit a character-level tokenizer, then encode the text as integer ids
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0]

SEQ_LENGTH = 40  # number of character ids in each input window
STEP_SIZE = 3    # stride between the starts of consecutive windows

# Slide a window over the encoded text: each sample is SEQ_LENGTH ids and its
# target is the id that immediately follows the window.
window_starts = range(0, len(sequences) - SEQ_LENGTH, STEP_SIZE)
input_sequences = np.array([sequences[i: i + SEQ_LENGTH] for i in window_starts])
next_characters = np.array([sequences[i + SEQ_LENGTH] for i in window_starts])

# Every input window must have exactly one target character.
assert len(input_sequences) == len(next_characters), "inputs and targets are misaligned"

print("Number of input sequences:", len(input_sequences))
print("Number of next characters:", len(next_characters))

# One-hot encode the next characters.
# The +1 leaves room for index 0, which the Keras Tokenizer never assigns to a character.
vocab_size = len(tokenizer.word_index) + 1
y = to_categorical(next_characters, num_classes=vocab_size)

print("Shape of input_sequences:", input_sequences.shape)
print("Shape of y:", y.shape)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Number of input sequences: 233320
Number of next characters: 233320
Shape of input_sequences: (233320, 40)
Shape of y: (233320, 39)


In [4]:
# Build the model
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling start from the same state on every Restart & Run All.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

embedding_dim = 512  # size of each character embedding vector
rnn_units = 512      # hidden units in every LSTM layer
dropout_rate = 0.3   # fraction of activations dropped between LSTM layers

model = Sequential([
    # Declare the input shape explicitly; the Embedding `input_length`
    # argument is deprecated in current Keras releases.
    tf.keras.Input(shape=(SEQ_LENGTH,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(rnn_units, return_sequences=True),
    Dropout(dropout_rate),
    LSTM(rnn_units, return_sequences=True),
    Dropout(dropout_rate),
    # Last recurrent layer returns only its final state, which feeds the classifier.
    LSTM(rnn_units),
    # One probability per vocabulary index (including the unused index 0).
    Dense(vocab_size, activation='softmax')
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 512)           19968     
                                                                 
 lstm (LSTM)                 (None, 40, 512)           2099200   
                                                                 
 dropout (Dropout)           (None, 40, 512)           0         
                                                                 
 lstm_1 (LSTM)               (None, 40, 512)           2099200   
                                                                 
 dropout_1 (Dropout)         (None, 40, 512)           0         
                                                                 
 lstm_2 (LSTM)               (None, 512)               2099200   
                                                                 
 dense (Dense)               (None, 39)                2

In [5]:
# Training hyperparameters
epochs = 20       # full passes over the training windows
batch_size = 64   # samples per gradient update

# Left as the last expression so the returned History object is displayed.
model.fit(
    x=input_sequences,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7d1bb3ac31f0>

In [None]:
# Text generation functions
def sample(preds, temperature=0.2):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative weights/probabilities.
        temperature: Scaling applied to the log-probabilities. Values below 1
            concentrate mass on the most likely entries, values above 1
            flatten the distribution, and 0 picks the most likely index
            deterministically.

    Returns:
        int: The selected index into ``preds``.

    Raises:
        ValueError: If ``preds`` is empty or ``temperature`` is negative.
    """
    preds = np.asarray(preds).astype('float64')
    if preds.size == 0:
        raise ValueError("preds must contain at least one probability")
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    if temperature == 0:
        # Dividing by zero is undefined; the limit of the scaled softmax is argmax.
        return int(np.argmax(preds))

    scaled = np.log(preds + 1e-8) / temperature  # Add small value to avoid log(0)
    # Shift so the largest entry is 0: without this, a tiny temperature makes
    # every exp() underflow to 0 and the normalisation below yields NaN.
    scaled -= np.max(scaled)
    exp_preds = np.exp(scaled)
    probabilities = exp_preds / np.sum(exp_preds)
    # A single multinomial draw yields a one-hot row; its argmax is the sampled index.
    probas = np.random.multinomial(1, probabilities, 1)
    return int(np.argmax(probas))

def generate_text(model, tokenizer, seed_text, num_generate=100, temperature=0.2):
    """Extend ``seed_text`` one character at a time using the trained model.

    Args:
        model: Object whose ``predict`` takes a ``(1, SEQ_LENGTH)`` array of
            token ids and returns one row of per-index probabilities.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` mapping from id to character.
        seed_text: Starting text; it is returned unchanged as the prefix of
            the result.
        num_generate: Number of characters to append.
        temperature: Passed through to ``sample``.

    Returns:
        str: ``seed_text`` followed by ``num_generate`` generated characters.
    """
    # Encode the seed once and keep a running list of ids. Re-tokenising the
    # tail of the generated string each step would shorten the context whenever
    # the seed contains characters the tokenizer silently drops.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated_chars = []

    for _ in range(num_generate):
        # Left-pad (with 0) when fewer than SEQ_LENGTH ids are available.
        input_eval = pad_sequences([token_ids[-SEQ_LENGTH:]], maxlen=SEQ_LENGTH, padding='pre')

        predictions = model.predict(input_eval, verbose=0)
        predictions = np.array(predictions[0], dtype='float64')
        # Index 0 is the padding id and has no entry in index_word; remove its
        # probability mass so it is not selected.
        predictions[0] = 0.0

        next_index = int(sample(predictions, temperature))
        if next_index not in tokenizer.index_word:
            # Sampling can still land on a masked/unknown id with tiny
            # probability; fall back to the most likely real character.
            next_index = int(np.argmax(predictions[1:])) + 1
        next_char = tokenizer.index_word[next_index]

        generated_chars.append(next_char)
        token_ids.append(next_index)

    # Join once at the end instead of growing a string inside the loop.
    return seed_text + ''.join(generated_chars)

# Example usage: continue a seed line for 500 characters at low temperature
seed_text = "shall i compare thee to a summer's day"
generated_text = generate_text(
    model,
    tokenizer,
    seed_text,
    num_generate=500,
    temperature=0.2,
)
print(generated_text)

In [8]:
# Persist the trained model to an .h5 file; the relative path resolves
# against the current working directory.
model.save(filepath='text_generation_model.h5')


  saving_api.save_model(
