In [45]:
import random
import pickle

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation, Input
from tensorflow.keras.optimizers import RMSprop

In [14]:
from pathlib import Path
path = Path('../datasets/LibriSpeech/cleaned/')
files = list(path.glob('*.txt'))
text = []
for file in files:
    with open(file, '+r') as f:
        text.extend(f.readlines())
text = [line.rstrip() for line in text]
text = " ".join(text).lower()

In [30]:

tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(text)
unique_tokens = np.unique(tokens)
unique_token_index = {token: idx for idx, token in enumerate(unique_tokens)}

In [34]:
n_words = 10
input_words = []
next_words = []

for i in range(len(tokens) - n_words):
    input_words.append(tokens[i: i + n_words])
    next_words.append(tokens[i + n_words])


In [36]:
x = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
y = np.zeros((len(next_words), len(unique_tokens)), dtype=bool)

In [43]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        x[i, j, unique_token_index[word]] = 1
    y[i, unique_token_index[next_words[i]]] = 1

In [46]:
model = Sequential()
model.add(Input((n_words, len(unique_tokens))))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

In [48]:
model.compile(loss="categorical_crossentropy", optimizer=RMSprop(learning_rate=0.01), metrics=["accuracy"])
model.fit(x, y, batch_size=128, epochs=10, shuffle=True)

Epoch 1/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 88ms/step - accuracy: 0.0576 - loss: 7.1148
Epoch 2/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 89ms/step - accuracy: 0.0800 - loss: 6.6429
Epoch 3/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 88ms/step - accuracy: 0.0962 - loss: 6.3717
Epoch 4/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 88ms/step - accuracy: 0.1069 - loss: 6.1496
Epoch 5/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 88ms/step - accuracy: 0.1228 - loss: 5.9774
Epoch 6/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 88ms/step - accuracy: 0.1382 - loss: 5.7346
Epoch 7/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 91ms/step - accuracy: 0.1539 - loss: 5.5050
Epoch 8/10
[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 90ms/step - accuracy: 0.1753 - loss: 5.2576
Epoch 9/10
[1m428/428[

<keras.src.callbacks.history.History at 0x30a723d30>

In [50]:
model.save("next_word.keras")

In [54]:
def predict_n_words(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(tokenizer.tokenize(input_text)):
        X[0, i, unique_token_index[word]] = 1
    predictions = model.predict(X)[0]
    best = np.argpartition(predictions, -n_best)[-n_best:]
    return [unique_tokens[idx] for idx in best]

In [59]:
predict_n_words("I was at the store", 5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


['a', 'you', 'my', 'what', 'i']