<a href="https://colab.research.google.com/github/nikewinchester/Text-Generator-LSTM/blob/main/word-level-text-generator-bilstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing required packages

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np
import re
import tensorflow as tf

## Reading the dataset (the given text file) from GitHub

In [17]:
import requests
from bs4 import BeautifulSoup
# Download the raw training text from the project's GitHub repository.
# A timeout keeps the cell from hanging forever on a stalled connection.
gitfile = requests.get('https://raw.githubusercontent.com/nikewinchester/Text-Generator-LSTM/main/NLP task.txt', timeout=30)
# Fail loudly on 4xx/5xx responses; otherwise an error page would silently
# become the training corpus.
gitfile.raise_for_status()
# NOTE(review): the file is plain text, yet it is routed through an HTML parser;
# this may rewrite characters such as '&' or '<' — confirm whether that is intended.
data = str(BeautifulSoup(gitfile.content, "html.parser"))

## Tokenizing the words by making a dictionary (using fit_on_texts) where each unique word is associated with a number

In [18]:
tokenizer = Tokenizer()

# Lower-case the raw text and cut it into rough sentences at every full stop.
# Within each sentence, every non-word character becomes a space and runs of
# spaces are then squeezed down to a single one.
corpus = [
  re.sub(' +', ' ', re.sub(r'[^\w]', ' ', sentence))
  for sentence in data.lower().split(".")
]
print(len(corpus))

# Build the word -> integer index from the cleaned sentences.
tokenizer.fit_on_texts(corpus)
# Keras word indices start at 1; the extra slot accounts for index 0,
# which is the filler value used when sequences are padded.
total_words = len(tokenizer.word_index) + 1

6417


## Converting each sentence to its respective list of integer tokens (using the word index built in the previous step), then creating n-gram prefix sequences from every sentence as training inputs.

In [19]:
input_sequences = []
for sentence in corpus:
    # Integer-encode the sentence with the fitted tokenizer.
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    # Every prefix of at least two tokens becomes one sample: the final token
    # of the prefix is later used as the label, the tokens before it as input.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

## Padding the input sequences so that they all have the same length. This is required to stack them into a single array that can be fed to the model in batches.

In [20]:
# The longest n-gram prefix fixes the common length of every padded sequence.
max_sequence_len = max(map(len, input_sequences))
# 'pre' padding places the filler at the front, so the last column of the
# resulting array always holds a real token.
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

## Creating the input data and output labels from the padded sequences. The output labels are then converted into a one-hot (binary) matrix using keras.utils.to_categorical.

In [21]:
# Everything except the final column is the model input ...
predictors = input_sequences[:, :-1]
# ... and the final column is the word to predict, expanded to one row of
# length `total_words` per sample.
label = ku.to_categorical(input_sequences[:, -1], num_classes=total_words)

## Building model using Bidirectional LSTM. Added dropout to prevent overfitting.

In [22]:
# Next-word classifier: embedding -> bidirectional LSTM -> dropout -> LSTM ->
# regularised hidden layer -> softmax over the whole vocabulary.
model = Sequential()
# Inputs are the padded prefixes without their last token, hence max_sequence_len-1.
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
# return_sequences=True keeps the per-timestep outputs so the next LSTM can consume them.
model.add(Bidirectional(LSTM(64, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(64))
# `units` must be an integer: use floor division instead of `/`, which yields a float.
model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# One output probability per vocabulary entry.
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 104, 100)          816400    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 104, 128)          84480     
_________________________________________________________________
dropout_1 (Dropout)          (None, 104, 128)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_2 (Dense)              (None, 4082)              265330    
_________________________________________________________________
dense_3 (Dense)              (None, 8164)              33333612  
Total params: 34,549,230
Trainable params: 34,549,230
Non-trainable params: 0
__________________________________________

## Checkpointing: saving the model after every epoch in which the training loss improves.

In [23]:
# Mount Google Drive inside the Colab runtime at /content/drive; the checkpoint
# and saved-model paths used in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Import the callback from the Keras bundled with TensorFlow, the same package
# the model and layers come from; mixing it with the standalone `keras`
# package can pair incompatible versions.
from tensorflow.keras.callbacks import ModelCheckpoint

# Checkpoint file on the mounted Drive.
filepath = "/content/drive/MyDrive/Text-Generator/model_training.hdf5"
# Monitor the training loss (no validation data is passed to fit) and overwrite
# the file only when an epoch achieves a lower loss than any previous one.
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')
callbacks = [checkpoint]

## Training the model

In [25]:
# Train on the full set of n-gram samples; the checkpoint callback saves the
# model whenever the training loss improves.
history = model.fit(
    predictors,
    label,
    epochs=125,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/125

Epoch 00001: loss improved from inf to 6.42600, saving model to /content/drive/MyDrive/Text-Generator/model_training.hdf5
Epoch 2/125

Epoch 00002: loss improved from 6.42600 to 5.83888, saving model to /content/drive/MyDrive/Text-Generator/model_training.hdf5
Epoch 3/125

Epoch 00003: loss improved from 5.83888 to 5.54502, saving model to /content/drive/MyDrive/Text-Generator/model_training.hdf5
Epoch 4/125

Epoch 00004: loss improved from 5.54502 to 5.35898, saving model to /content/drive/MyDrive/Text-Generator/model_training.hdf5
Epoch 5/125

Epoch 00005: loss improved from 5.35898 to 5.21864, saving model to /content/drive/MyDrive/Text-Generator/model_training.hdf5
Epoch 6/125

Epoch 00006: loss improved from 5.21864 to 5.09847, saving model to /content/drive/MyDrive/Text-Generator/model_training.hdf5
Epoch 7/125

Epoch 00007: loss improved from 5.09847 to 4.99352, saving model to /content/drive/MyDrive/Text-Generator/model_training.hdf5
Epoch 8/125

Epoch 00008: loss 

## Saving final model

In [26]:
# Persist the model as it stands after training to Drive.
# NOTE(review): this is a different file from the best-loss checkpoint
# (model_training.hdf5) written by the callback during training.
model.save("/content/drive/MyDrive/Text-Generator/model1.hdf5")

## Loading the pre-trained model

In [27]:
# Reload the saved model from Drive; this rebinds `model`, replacing the
# in-memory instance.
# NOTE(review): only the network is restored here — `tokenizer` and
# `max_sequence_len`, which the prediction cell uses, must still exist in the kernel.
from tensorflow import keras
model = keras.models.load_model('/content/drive/MyDrive/Text-Generator/model1.hdf5')

## Example prediction of trained model.

In [28]:
seed_text = "sherlock was indeed"
next_words = 10

# Greedy generation: repeatedly feed the text produced so far back into the
# model and append the single most probable next word.
for _ in range(next_words):
    # Encode the current text and left-pad it to the model's input length.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # The batch holds one sample, so take element 0 to get a plain integer index.
    predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
    # Direct index -> word lookup instead of scanning the whole vocabulary;
    # an index with no word (e.g. the padding index 0) yields an empty string.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)

sherlock was indeed the facts have been in the same source that you
