In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load the whole training corpus from the text file into a single string.
with open("simple_dataset.txt", mode='r', encoding='utf-8') as myfile:
    mytext = myfile.read()

In [2]:
# Display the raw corpus string that was read from simple_dataset.txt.
mytext

'The sun rises in the east every morning.\nShe enjoys painting beautiful landscapes on weekends.\nThey are planning to visit the national park this summer.\nJohn reads a new book every week.\nThe cat sleeps peacefully on the sofa.\nChildren love playing in the park after school.\nHe drinks coffee before starting his work.\nWe walk together in the evening for fresh air.\nThe teacher explains difficult topics with examples.\nMy brother is learning to play the guitar slowly.'

In [4]:
# Build the word -> integer vocabulary from the full corpus.
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])

# word_index starts at 1; index 0 is left free for the padding value, so the
# model needs one more class than there are distinct words.
total_words = 1 + len(mytokenizer.word_index)

In [5]:
# Inspect the word -> integer index mapping produced by fit_on_texts.
mytokenizer.word_index

{'the': 1,
 'in': 2,
 'every': 3,
 'on': 4,
 'to': 5,
 'park': 6,
 'sun': 7,
 'rises': 8,
 'east': 9,
 'morning': 10,
 'she': 11,
 'enjoys': 12,
 'painting': 13,
 'beautiful': 14,
 'landscapes': 15,
 'weekends': 16,
 'they': 17,
 'are': 18,
 'planning': 19,
 'visit': 20,
 'national': 21,
 'this': 22,
 'summer': 23,
 'john': 24,
 'reads': 25,
 'a': 26,
 'new': 27,
 'book': 28,
 'week': 29,
 'cat': 30,
 'sleeps': 31,
 'peacefully': 32,
 'sofa': 33,
 'children': 34,
 'love': 35,
 'playing': 36,
 'after': 37,
 'school': 38,
 'he': 39,
 'drinks': 40,
 'coffee': 41,
 'before': 42,
 'starting': 43,
 'his': 44,
 'work': 45,
 'we': 46,
 'walk': 47,
 'together': 48,
 'evening': 49,
 'for': 50,
 'fresh': 51,
 'air': 52,
 'teacher': 53,
 'explains': 54,
 'difficult': 55,
 'topics': 56,
 'with': 57,
 'examples': 58,
 'my': 59,
 'brother': 60,
 'is': 61,
 'learning': 62,
 'play': 63,
 'guitar': 64,
 'slowly': 65}

In [8]:
# Turn every sentence into its growing token prefixes:
# [w1, w2], [w1, w2, w3], ..., [w1, ..., wn].
# The last token of each prefix later serves as the prediction target.
my_input_sequences = []
for line in mytext.split('\n'):
    token_list = mytokenizer.texts_to_sequences([line])[0]
    # Prefixes start at length 2 so there is always at least one context
    # token in front of the target.
    my_input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [9]:
# Length of the longest n-gram prefix; every sequence is padded up to it.
max_sequence_len = max(len(seq) for seq in my_input_sequences)

# Left-pad with zeros so the target token is always in the last column.
# pad_sequences already returns a NumPy array, so no extra np.array() copy
# is needed.
input_sequences = pad_sequences(my_input_sequences, maxlen=max_sequence_len, padding='pre')

In [10]:
# Show the second padded n-gram sequence (zeros on the left are padding).
input_sequences[1]

array([0, 0, 0, 0, 0, 0, 0, 1, 7, 8], dtype=int32)

In [11]:
# Split each padded sequence: all but the last column are the context tokens
# (X), the last column is the next-word label (y).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [12]:
# Context tokens of the second training sample (its final token was split off).
X[1]

array([0, 0, 0, 0, 0, 0, 0, 1, 7], dtype=int32)

In [14]:
# Display the target labels: the last token id of every padded sequence.
y

array([ 7,  8,  2,  1,  9,  3, 10, 12, 13, 14, 15,  4, 16, 18, 19,  5, 20,
        1, 21,  6, 22, 23, 25, 26, 27, 28,  3, 29, 30, 31, 32,  4,  1, 33,
       35, 36,  2,  1,  6, 37, 38, 40, 41, 42, 43, 44, 45, 47, 48,  2,  1,
       49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62,  5, 63,  1, 64,
       65], dtype=int32)

In [15]:
# One-hot encode the next-word labels for categorical cross-entropy.
# The labels are read from input_sequences rather than from `y` itself, so
# re-running this cell does not one-hot encode an already encoded matrix.
# to_categorical already returns a NumPy array, so no np.array() wrap is needed.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [16]:
# One-hot encoded label of the second training sample.
y[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
# Next-word prediction network: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# Declare the input shape explicitly (context length = padded length minus the
# target token). This replaces the deprecated `input_length` argument of
# Embedding and builds the model up front, so summary() can report output
# shapes and parameter counts.
model.add(tf.keras.Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(total_words, 100))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

None


In [19]:
# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Keep the returned History so the per-epoch loss/accuracy stay available for
# later inspection, and so the cell does not end with a bare History repr.
history = model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.0759 - loss: 4.1896
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1389 - loss: 4.1759 
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.1277 - loss: 4.1618
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.1132 - loss: 4.1449
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0825 - loss: 4.1240
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0786 - loss: 4.0866
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0747 - loss: 4.0312
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0943 - loss: 3.9547
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7bb909e068d0>

In [23]:
# summary() prints the table itself and returns None, so it is called directly
# rather than through print(), which would add a stray "None" line.
model.summary()

None


In [22]:
# Greedy text generation: repeatedly predict the most likely next word and
# append it to the seed text.
input_text = "evening for"
predict_next_words = 6

for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    print(token_list)
    # Pad (or truncate) to the context length the model was trained on.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one probability row per input; take the single row's
    # argmax as a plain int instead of comparing against a length-1 array.
    predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
    # index_word is the tokenizer's reverse mapping, so no scan over
    # word_index is needed. Index 0 (padding) has no word and yields "".
    output_word = mytokenizer.index_word.get(predicted, "")
    input_text += " " + output_word

print(input_text)

[49, 50]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[49, 50, 48]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[49, 50, 48, 2]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[49, 50, 48, 2, 1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[49, 50, 48, 2, 1, 49]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[49, 50, 48, 2, 1, 49, 50]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
evening for together in the evening for fresh
