In [4]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.utils import to_categorical
import numpy as np
import nltk
# Fetch the Punkt models that nltk.word_tokenize needs; nltk reports the
# package as up to date when it is already installed.
nltk.download('punkt')

# Load the raw training text.
# The encoding is spelled out so decoding does not depend on the platform's
# locale default. 'utf-8-sig' reads plain UTF-8 and additionally drops a
# leading byte-order mark if the file starts with one, so it cannot end up
# glued to the first token.
with open('/content/pg5200.txt', 'r', encoding='utf-8-sig') as f:
    corpus = f.read()

# Tokenize the corpus into word and punctuation tokens.
tokens = nltk.word_tokenize(corpus)

# Each training example is a window of `seq_length` consecutive tokens: the
# leading seq_length - 1 tokens are the model input and the final token is
# the word to predict.
seq_length = 3

# Slide the window forward one token at a time. The upper bound is
# len(tokens) + 1 so the last window ends on the final token; stopping at
# len(tokens) silently kept the corpus' last token out of the targets.
# A corpus shorter than seq_length simply yields no windows.
sequences = [
    tokens[end - seq_length:end]
    for end in range(seq_length, len(tokens) + 1)
]

# Separate every window into its context (all tokens except the last) and
# its target (the last token). X and y stay aligned index by index.
X = [window[:-1] for window in sequences]
y = [window[-1] for window in sequences]

# Create the vocabulary: a two-way mapping between tokens and integer ids.
# The distinct tokens are sorted before being numbered. Iterating a bare set
# of strings yields an order that can change from one interpreter run to the
# next (string hashing is randomized), which would give words different ids
# on every run and make results and saved weights irreproducible.
vocabulary = sorted(set(tokens))
word_to_index = {word: index for index, word in enumerate(vocabulary)}
index_to_word = dict(enumerate(vocabulary))
vocab_size = len(word_to_index)

# Convert the input and output data to numerical values.
# Word ids are indices, so they are stored as integers rather than floats;
# this also matches the integer ids fed to the model at generation time.
# The explicit reshape keeps both arrays 2-D even when there are no samples:
#   X_num -> (number of samples, seq_length - 1)
#   y_num -> (number of samples, 1)
X_num = np.array(
    [[word_to_index[word] for word in context] for context in X],
    dtype=np.int64,
).reshape(len(X), seq_length - 1)
y_num = np.array(
    [word_to_index[label] for label in y],
    dtype=np.int64,
).reshape(len(y), 1)

# Next-word model, declared as a single layer stack:
#   Embedding - turns each word id into an `embedding_size`-dimensional vector
#   LSTM      - 100 units, reads the seq_length - 1 context vectors
#   Dense     - softmax giving one probability per vocabulary word
embedding_size = 100
model = Sequential([
    Embedding(vocab_size, embedding_size, input_length=seq_length-1),
    LSTM(100),
    Dense(vocab_size, activation='softmax'),
])

# One-hot encode the target word ids over the whole vocabulary.
y_one_hot = to_categorical(y_num, num_classes=vocab_size)

# Hold out the trailing 20% of the samples for validation. The cut is made
# at a single index, so samples keep their corpus order (no shuffling).
train_size = int(0.8 * len(X_num))
X_train, X_val = np.split(X_num, [train_size])
y_train, y_val = np.split(y_one_hot, [train_size])

# Train the LSTM model.
# Loss is cross-entropy against the one-hot targets, optimized with Adam;
# accuracy is tracked alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# verbose=2 logs one summary line per epoch; the held-out split is scored
# at the end of every epoch.
training_options = {'epochs': 30, 'batch_size': 64, 'verbose': 2}
model.fit(X_train, y_train, validation_data=(X_val, y_val), **training_options)

# Evaluate the LSTM model on the held-out validation samples.
loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
# `accuracy` is a fraction between 0 and 1. It is scaled by 100 (the previous
# factor of 10 produced a number that was neither a fraction nor a
# percentage) and printed with an explicit percent sign.
print(f'Accuracy: {accuracy * 100:.2f}%')

# Use the LSTM model to generate text.
# NOTE(review): the loop previously computed `predicted_probs` and discarded
# it, so no word was ever chosen and the context never advanced. It now picks
# the most probable word each step (greedy decoding) and appends it; confirm
# this matches the intended sampling strategy.
seed_text = "As nobody could"
context_size = seq_length - 1

# Keep the running text as a token list so generated tokens are not
# re-tokenized (and possibly split differently) on every step.
generated_tokens = nltk.word_tokenize(seed_text)

# Only the trailing `context_size` seed tokens are ever fed to the model;
# fail early with a readable message if any of them was not seen in training.
unknown_words = [
    word for word in generated_tokens[-context_size:]
    if word not in word_to_index
]
if unknown_words:
    raise KeyError(f"seed words missing from the vocabulary: {unknown_words}")

for _ in range(10):
    # Create the input sequence: the most recent tokens, as a batch of one
    token_list = generated_tokens[-context_size:]
    input_seq = np.array([word_to_index[word] for word in token_list])[np.newaxis, :]

    # Predict the next word and extend the running text with it
    predicted_probs = model.predict(input_seq)[0]
    next_word = index_to_word[int(np.argmax(predicted_probs))]
    generated_tokens.append(next_word)

generated_text = ' '.join(generated_tokens)
print(generated_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/30
356/356 - 11s - loss: 6.4953 - accuracy: 0.0498 - val_loss: 7.0908 - val_accuracy: 0.0460 - 11s/epoch - 32ms/step
Epoch 2/30
356/356 - 8s - loss: 5.8463 - accuracy: 0.0556 - val_loss: 7.2369 - val_accuracy: 0.0629 - 8s/epoch - 23ms/step
Epoch 3/30
356/356 - 8s - loss: 5.6950 - accuracy: 0.0702 - val_loss: 7.3534 - val_accuracy: 0.0680 - 8s/epoch - 22ms/step
Epoch 4/30
356/356 - 8s - loss: 5.4961 - accuracy: 0.0865 - val_loss: 7.3753 - val_accuracy: 0.0759 - 8s/epoch - 22ms/step
Epoch 5/30
356/356 - 8s - loss: 5.2380 - accuracy: 0.1163 - val_loss: 7.3500 - val_accuracy: 0.0738 - 8s/epoch - 23ms/step
Epoch 6/30
356/356 - 7s - loss: 4.9823 - accuracy: 0.1347 - val_loss: 7.4133 - val_accuracy: 0.0852 - 7s/epoch - 20ms/step
Epoch 7/30
356/356 - 8s - loss: 4.7823 - accuracy: 0.1518 - val_loss: 7.5136 - val_accuracy: 0.0861 - 8s/epoch - 23ms/step
Epoch 8/30
356/356 - 8s - loss: 4.6163 - accuracy: 0.1640 - val_loss: 7.6310 - val_accuracy: 0.0889 - 8s/epoch - 22ms/step
Epoch 9/30
356