In [1]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import numpy as np 
import regex as re 
import os


In [2]:
def file_to_sentence_list(file_path): 
	with open(file_path, 'r', encoding="UTF-8") as file: 
		text = file.read() 

	# Splitting the text into sentences using 
	# delimiters like '.', '?', and '!' 
	sentences = [sentence.strip() for sentence in re.split( 
		r'(?<=[.!?])\s+', text) if sentence.strip()] 

	return sentences 

file_path = 'combined.txt'
text_data = file_to_sentence_list(file_path) 

# Tokenize the text data 
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text_data) 
total_words = len(tokenizer.word_index) + 1

# Create input sequences 
input_sequences = [] 
for line in text_data: 
	token_list = tokenizer.texts_to_sequences([line])[0] 
	for i in range(1, len(token_list)): 
		n_gram_sequence = token_list[:i+1] 
		input_sequences.append(n_gram_sequence) 

# Pad sequences and split into predictors and label 
max_sequence_len = max([len(seq) for seq in input_sequences]) 
input_sequences = np.array(pad_sequences( 
	input_sequences, maxlen=max_sequence_len, padding='pre')) 
X, y = input_sequences[:, :-1], input_sequences[:, -1] 

print(total_words,y)
# Convert target data to one-hot encoding 
y = tf.keras.utils.to_categorical(y, num_classes=total_words) 


12565 [   7   32  810 ...   16 1771  271]


In [3]:
# Define the model 
model = Sequential() 
print(total_words)
model.add(Embedding(total_words, 10))
model.add(LSTM(128)) 
model.add(Dense(total_words, activation='softmax')) 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 
checkpoint_path = "training_1/cp.keras"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,verbose=1)


12565


In [4]:
# Train the model 
model.fit(X, y, epochs=20, verbose=1,callbacks=[cp_callback]) 

Epoch 1/20
[1m4988/4988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step - accuracy: 0.0534 - loss: 7.3212
Epoch 1: saving model to training_1/cp.keras
[1m4988/4988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m756s[0m 151ms/step - accuracy: 0.0534 - loss: 7.3212
Epoch 2/20
[1m4988/4988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - accuracy: 0.0890 - loss: 6.5100
Epoch 2: saving model to training_1/cp.keras
[1m4988/4988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m768s[0m 154ms/step - accuracy: 0.0890 - loss: 6.5100
Epoch 3/20
[1m4988/4988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step - accuracy: 0.1113 - loss: 6.0795
Epoch 3: saving model to training_1/cp.keras
[1m4988/4988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m774s[0m 155ms/step - accuracy: 0.1113 - loss: 6.0795
Epoch 4/20
[1m4988/4988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step - accuracy: 0.1284 - loss: 5.7312
Epoch 4: saving model to

In [None]:
# Generate next word predictions 
seed_text = "I am writing this email "
next_words = 5

for _ in range(next_words): 
	token_list = tokenizer.texts_to_sequences([seed_text])[0] 
	token_list = pad_sequences( 
		[token_list], maxlen=max_sequence_len-1, padding='pre') 
	predicted_probs = model.predict(token_list) 
	predicted_word = tokenizer.index_word[np.argmax(predicted_probs)] 
	seed_text += " " + predicted_word 

print("Next predicted words:", seed_text) 


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Next predicted words: Pizza is a  thriving sector of a concession


In [None]:
model.save('saved_model/my_model')

In [None]:
model.save('saved_model/my_model')