In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Step 1: Merge and Preprocess the Text Files
folder_path = '/kaggle/input/poetry'
output_file = '/kaggle/working/merged_poetry.txt'

In [3]:
# Merge all .txt files into one file
with open(output_file, 'w', encoding='utf-8') as outfile:
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line.strip() + r'\n')
            outfile.write(r'\n')

In [4]:
# Load merged file and clean data
with open(output_file, 'r', encoding='utf-8') as file:
    data = file.read()
clean_data = data.replace("\\n", "\n").strip()
corpus = clean_data.split("\n")

del data  # Free memory

In [5]:
# Step 2: Tokenize the Corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [6]:
# Convert text into sequences
sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_seq = token_list[:i + 1]
        sequences.append(n_gram_seq)

del corpus  # Free memory

In [7]:
# Step 3: Save Sequences in Batches
sequences_path = '/kaggle/working/sequences.npy'
xs_batches_path = '/kaggle/working/xs_batches'
labels_batches_path = '/kaggle/working/labels_batches'
os.makedirs(xs_batches_path, exist_ok=True)
os.makedirs(labels_batches_path, exist_ok=True)

In [8]:
# Save sequences to disk and clear memory
np.save(sequences_path, np.array(sequences, dtype=object))
del sequences  # Free memory

In [9]:
# Process and save sequences in batches
sequence_data = np.load(sequences_path, allow_pickle=True)
batch_size = 128
num_batches = len(sequence_data) // batch_size + (len(sequence_data) % batch_size > 0)

In [10]:
for i in range(num_batches):
    batch = sequence_data[i * batch_size:(i + 1) * batch_size]
    xs_batch = np.array([seq[:-1] for seq in batch], dtype=object)  # Input sequences
    labels_batch = np.array([seq[-1] for seq in batch], dtype=np.int32)  # Last word as the label

    np.save(f"{xs_batches_path}/xs_batch_{i}.npy", xs_batch)
    np.save(f"{labels_batches_path}/labels_batch_{i}.npy", labels_batch)

del sequence_data 

In [11]:
# Step 4: Define Data Generator and Dataset
def data_generator(xs_path, labels_path):
    batch_files = sorted(os.listdir(xs_path))
    for i in range(len(batch_files)):
        xs_batch = np.load(f"{xs_path}/xs_batch_{i}.npy", allow_pickle=True)
        labels_batch = np.load(f"{labels_path}/labels_batch_{i}.npy")
        yield tf.keras.preprocessing.sequence.pad_sequences(xs_batch, maxlen=None), labels_batch


In [12]:
def create_tf_dataset(xs_path, labels_path):
    dataset = tf.data.Dataset.from_generator(
        lambda: data_generator(xs_path, labels_path),
        output_signature=(
            tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
        )
    )
    return dataset.prefetch(tf.data.AUTOTUNE)

In [13]:
train_dataset = create_tf_dataset(xs_batches_path, labels_batches_path)

In [14]:
# Step 5: Define and Train the Model
embedding_dim = 100
lstm_units = 128

In [15]:
model = Sequential([
    Embedding(input_dim=total_words, output_dim=embedding_dim),
    Bidirectional(LSTM(lstm_units)),
    Dropout(0.2),
    Dense(total_words, activation='softmax')
])

In [16]:
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [17]:
epochs = 100
history = model.fit(train_dataset, epochs=epochs, verbose=1)

Epoch 1/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 20ms/step - accuracy: 0.0710 - loss: 6.4428
Epoch 2/100
[1m   7/9213[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:02[0m 20ms/step - accuracy: 0.0762 - loss: 6.1767

  self.gen.throw(typ, value, traceback)


[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.1259 - loss: 5.5439
Epoch 3/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.1475 - loss: 5.1884
Epoch 4/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.1654 - loss: 4.9131
Epoch 5/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.1827 - loss: 4.7069
Epoch 6/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.1986 - loss: 4.5476
Epoch 7/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.2134 - loss: 4.4064
Epoch 8/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.2256 - loss: 4.3148
Epoch 9/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 20ms/step - accuracy: 0.2346 - loss: 4.2532
Epoc

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 20ms/step - accuracy: 0.2935 - loss: 3.9333
Epoch 57/100
[1m1161/9213[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2:38[0m 20ms/step - accuracy: 0.3222 - loss: 3.6174

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 20ms/step - accuracy: 0.2995 - loss: 3.8683
Epoch 59/100
[1m3270/9213[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1:58[0m 20ms/step - accuracy: 0.3240 - loss: 3.6509

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.3199 - loss: 3.6386
Epoch 61/100
[1m8828/9213[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m7s[0m 20ms/step - accuracy: 0.3250 - loss: 3.6042

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 20ms/step - accuracy: 0.3245 - loss: 3.6046
Epoch 63/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 21ms/step - accuracy: 0.3276 - loss: 3.5744
Epoch 64/100
[1m8829/9213[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m7s[0m 20ms/step - accuracy: 0.3212 - loss: 3.6343

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 20ms/step - accuracy: 0.3203 - loss: 3.6288
Epoch 66/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 20ms/step - accuracy: 0.3197 - loss: 3.6207
Epoch 67/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 20ms/step - accuracy: 0.3139 - loss: 3.6847
Epoch 68/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 20ms/step - accuracy: 0.3165 - loss: 3.6475
Epoch 69/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 20ms/step - accuracy: 0.3162 - loss: 3.6871
Epoch 70/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.3119 - loss: 3.7017
Epoch 71/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.3177 - loss: 3.6465
Epoch 72/100
[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 20ms/step - accuracy: 0.3214 - loss: 3.61

In [18]:
model.save('/kaggle/working/word_pred.h5')