In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np
import re
import tensorflow as tf

In [4]:
# Load the raw text, split it into rough sentences on full stops, normalise
# each sentence, and fit the vocabulary on the result.
tokenizer = Tokenizer()
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open('NLP task.txt') as text_file:
    data = text_file.read()
corpus = data.lower().split(".")
# Replace every non-word character with a space, then collapse runs of spaces.
corpus = [re.sub(' +', ' ', re.sub(r'[^\w]', ' ', sentence)) for sentence in corpus]
print(len(corpus))
tokenizer.fit_on_texts(corpus)
# Keras Tokenizer word indices start at 1, so +1 makes room for index 0.
total_words = len(tokenizer.word_index) + 1

6417


In [5]:
# Turn every sentence into its growing prefixes: [w1, w2], [w1, w2, w3], ...
# Each prefix is one training sample whose last token is the prediction target.
input_sequences = []
for sentence in corpus:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    # Prefixes shorter than two tokens carry no (context, target) pair.
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

In [6]:
# Left-pad every n-gram up to the length of the longest one so they stack
# into a single rectangular array.
max_sequence_len = max(map(len, input_sequences))
padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded)

In [7]:
# Split each padded row into the model input (all tokens but the last)
# and the target (the last token).
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1]
# One-hot encode the target token ids over the whole vocabulary.
label = ku.to_categorical(label, num_classes=total_words)

In [8]:
# Next-word prediction network:
# embedding -> bidirectional LSTM -> dropout -> LSTM -> dense -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
# A layer width must be an integer; true division (/) would hand Dense a float.
model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 104, 100)          816300    
_________________________________________________________________
bidirectional (Bidirectional (None, 104, 300)          301200    
_________________________________________________________________
dropout (Dropout)            (None, 104, 300)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 4081)              412181    
_________________________________________________________________
dense_1 (Dense)              (None, 8163)              33321366  
Total params: 35,011,447
Trainable params: 35,011,447
Non-trainable params: 0
____________________________________________

In [9]:
# Import the callback from tensorflow.keras, the same package the model was
# built with; mixing it with the standalone `keras` package can pair
# incompatible implementations.
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model whenever the training loss reaches a new minimum.
filepath = "model_training.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')
callbacks = [checkpoint]

In [None]:
# Train on the prefix/next-word pairs; the checkpoint callback is invoked
# during training and the per-epoch metrics are kept in `history`.
history = model.fit(
    predictors,
    label,
    epochs=500,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/500

Epoch 00001: loss improved from inf to 6.43854, saving model to model_training.hdf5
Epoch 2/500

Epoch 00002: loss improved from 6.43854 to 5.95787, saving model to model_training.hdf5
Epoch 3/500

Epoch 00003: loss improved from 5.95787 to 5.71359, saving model to model_training.hdf5
Epoch 4/500

Epoch 00004: loss improved from 5.71359 to 5.51309, saving model to model_training.hdf5
Epoch 5/500

Epoch 00005: loss improved from 5.51309 to 5.35200, saving model to model_training.hdf5
Epoch 6/500

Epoch 00006: loss improved from 5.35200 to 5.22507, saving model to model_training.hdf5
Epoch 7/500

Epoch 00007: loss improved from 5.22507 to 5.12018, saving model to model_training.hdf5
Epoch 8/500

Epoch 00008: loss improved from 5.12018 to 5.02884, saving model to model_training.hdf5
Epoch 9/500

Epoch 00009: loss improved from 5.02884 to 4.94923, saving model to model_training.hdf5
Epoch 10/500

Epoch 00010: loss improved from 4.94923 to 4.87624, saving model to model_trainin

In [None]:
# Persist the model as it stands after training to an HDF5 file. This is a
# separate file from the checkpoint written during training.
model.save("model1.h5")

In [None]:
# Greedy text generation: repeatedly predict the most likely next word and
# append it to the running seed text.
seed_text = "sherlock was always"
next_words = 100

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad/truncate to the input length the model was built with.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Sequential.predict_classes was removed from TensorFlow; take the argmax
    # of the softmax output instead.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # index_word is the tokenizer's reverse vocabulary; an index with no word
    # (e.g. 0) yields an empty string, as the original scan did.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)