In [1]:

# 1. Imports

import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input


In [4]:
# 2. Load & preprocess dataset

# Download the dataset if it doesn't exist
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

with open(path_to_file, "r", encoding="utf-8") as f:
    text = f.read().lower()

# remove punctuation
text = re.sub(r"[^a-z\s]", "", text)

words = text.split()

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [5]:

# 3. Tokenization

vocab = sorted(set(words))
word_to_idx = {w: i for i, w in enumerate(vocab)}
idx_to_word = {i: w for w, i in word_to_idx.items()}
vocab_size = len(vocab)

encoded = [word_to_idx[word] for word in words]


In [6]:

# 4. Create input-output sequences

seq_length = 10
X, y = [], []

for i in range(len(encoded) - seq_length):
    X.append(encoded[i:i + seq_length])
    y.append(encoded[i + seq_length])

X = np.array(X)
y = np.array(y)


In [7]:



X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:



model = Sequential([
    Input(shape=(seq_length,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(256),
    Dense(vocab_size, activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy"
)

model.summary()


In [9]:

# 7. Train model

early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    callbacks=[early_stop]
)


Epoch 1/20
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m417s[0m 164ms/step - loss: 7.0064 - val_loss: 6.3992
Epoch 2/20
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 166ms/step - loss: 6.1372 - val_loss: 6.2087
Epoch 3/20
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m428s[0m 169ms/step - loss: 5.7040 - val_loss: 6.1891
Epoch 4/20
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m425s[0m 168ms/step - loss: 5.2958 - val_loss: 6.2485
Epoch 5/20
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m423s[0m 160ms/step - loss: 4.8522 - val_loss: 6.3872
Epoch 6/20
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 162ms/step - loss: 4.4080 - val_loss: 6.5443


<keras.src.callbacks.history.History at 0x7b62739c9e80>

In [10]:

# 8. Text generation function

def generate_text(seed_text, next_words=30):
    for _ in range(next_words):
        tokenized = [word_to_idx.get(w, 0) for w in seed_text.split()]
        tokenized = tokenized[-seq_length:]

        if len(tokenized) < seq_length:
            tokenized = [0] * (seq_length - len(tokenized)) + tokenized

        prediction = model.predict(np.array([tokenized]), verbose=0)
        next_word = idx_to_word[np.argmax(prediction)]
        seed_text += " " + next_word

    return seed_text


In [11]:

# 9. Generate sample text

seed = "charles dickens"
print(generate_text(seed, next_words=40))


charles dickens man and a man and a man to be a man to be a man to be a man to be a man to be a man to be a man to be a man to be a man to


In [12]:

# 10. Save model

model.save("lstm_gen.keras")
