In [1]:
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras import regularizers
from keras.regularizers import l2
# Read the whole blog corpus into memory as a single UTF-8 decoded string.
with open('en_US.blogs.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [2]:
# Cap the corpus at a fixed number of characters to keep tokenisation and
# training tractable. The cut is by character, so the final word may be
# truncated mid-way.
MAX_CHARS = 2_500_000
text = text[:MAX_CHARS]
len(text)

2500000

In [3]:
# Fit a word-level tokenizer on the (truncated) corpus. Words that are not in
# the fitted vocabulary are mapped to the "<OOV>" token when encoding.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts([text])
# +1 because Keras assigns word indices starting at 1; index 0 is left free
# and is what pad_sequences fills with by default.
total_words = len(tokenizer.word_index) + 1
# Persist the fitted tokenizer so it can be reloaded without refitting.
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)
total_words

34369

In [10]:
# Reload the pickled tokenizer and recompute the vocabulary size from it.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a 'tokenizer.pkl' that this notebook itself produced.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Same +1 convention as when the tokenizer was fitted (index 0 is unused by words).
total_words = len(tokenizer.word_index) + 1
total_words

34369

In [11]:
# Turn every line of the corpus into its growing token prefixes:
# [w1, w2], [w1, w2, w3], ... so that the last token of each prefix can later
# serve as the prediction target for the tokens before it.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text.split('\n')):
    # Prefixes of length 2..len(token_list); lines with fewer than two tokens
    # contribute nothing.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [6]:
# Load a previously cached maximum sequence length from disk.
# NOTE(review): the only code in this notebook that writes
# 'max_sequence_len.pkl' is commented out, so this cell raises
# FileNotFoundError on a machine without that file. The value is also
# recomputed from input_sequences in a later cell — confirm whether this
# cell is still needed.
with open('max_sequence_len.pkl', 'rb') as f:
    max_sequence_len = pickle.load(f)
max_sequence_len

681

In [13]:
# The longest n-gram prefix fixes the padded width of every row.
max_sequence_len = max(len(seq) for seq in input_sequences)
# Left-pad ('pre') so the final column of every row is the real last token.
# pad_sequences already returns a NumPy array, so no extra np.array() copy
# of the padded matrix is needed.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
max_sequence_len

681

In [14]:
# Disabled: writes max_sequence_len to 'max_sequence_len.pkl', the file that
# the earlier loading cell reads. Uncomment to (re)create that file.
# with open('max_sequence_len.pkl', 'wb') as handle:
#     pickle.dump(max_sequence_len, handle)
# max_sequence_len

681

In [9]:
# Every column except the last is the model input (the left-padded context).
X = input_sequences[:, :-1]
# The last column is the token id used as the training target.
y = input_sequences[:, -1]
X.shape

(17897, 680)

In [10]:
# One-hot encode the target token ids for use with categorical_crossentropy.
# Guard on ndim so that re-running this cell is a no-op: without it, a second
# run would one-hot encode the already one-hot matrix and produce a huge
# 3-D array. to_categorical already returns a NumPy array, so no np.array()
# wrapper is needed.
if y.ndim == 1:
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y.shape

(17897, 30393)

In [11]:
# Show the number of input rows and target rows, one per line; the two
# numbers should match.
print(len(X), len(y), sep='\n')

17897
17897


In [12]:
# To resume from a previously saved model instead of building a new one:
#model = load_model("baseModel.h5")

# Next-word classifier: embedding -> LSTM -> dropout -> batch norm -> softmax
# over the whole vocabulary.
model = Sequential()
# Inputs are the left-padded contexts, i.e. one token shorter than the
# padded n-gram (the last token is the target).
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(50, kernel_regularizer=regularizers.l2(0.0001)))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# One output unit per vocabulary index (including the unused index 0).
model.add(Dense(total_words, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 680, 100)          3039300   
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dropout (Dropout)           (None, 150)               0         
                                                                 
 batch_normalization (BatchN  (None, 150)              600       
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 30393)             4589343   
                                                                 
Total params: 7,779,843
Trainable params: 7,779,543
Non-trainable params: 300
____________________________________________

In [13]:
# Hold out 20% of the rows, then cut that hold-out in half: the result is an
# 80% / 10% / 10% train / validation / test split. random_state is fixed so
# the split is repeatable.
X_train, X_temp, Y_train, Y_temp = train_test_split(X,y, test_size=0.2, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp,Y_temp, test_size=0.5, random_state=42)

In [14]:
# Targets are one-hot vectors, hence categorical_crossentropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Keep the returned History object so the per-epoch loss/accuracy curves can
# be inspected afterwards (history.history) instead of being discarded.
history = model.fit(X_train, Y_train, epochs=30, verbose=1, validation_data=(X_val, Y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x20596d7e7f0>

In [17]:
# Greedy next-word generation: repeatedly feed the running text to the model
# and append the single most probable word.
seed_text = "This ensures that the model works with the"
next_words = 3

for _ in range(next_words):
    # Encode the running text and left-pad it to the input width the model
    # was built with.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one probability row per input; there is exactly one
    # row here, so take its arg-max as a plain Python int.
    predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
    # index_word is the tokenizer's id -> word map, so this is a direct
    # lookup instead of scanning word_index for every generated word.
    # An id with no word (e.g. 0) yields "", matching the previous fallback.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word

print(seed_text)

This ensures that the model works with the chicken and fillets


In [19]:
# Persist the trained model to 'baseModel.h5' (the same path referenced by
# the commented-out load_model call in the model-building cell).
model.save("baseModel.h5")