In [None]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Embedding, GRU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import get_file, to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import numpy as np
import random
import sys
import io
import requests
import re
import string

In [None]:
# Path of the plain-text corpus file that the next cell reads into memory.
path = '/content/drive/MyDrive/language_model/internet_archive_scifi_v3.txt'

In [None]:
# Read the entire corpus into memory as a single string. The context manager
# closes the file handle as soon as the read finishes, and the explicit
# encoding keeps decoding independent of the platform's locale default.
with open(path, 'r', encoding='utf-8') as corpus_file:
  file = corpus_file.read()

In [None]:
# Size of the raw corpus in characters.
len(file)

149326361

In [None]:
# Peek at the first 300 characters to see what the raw text looks like.
print(file[:300])

MARCH # All Stories New and Complete Publisher Editor IF is published bi-monthly by Quinn Publishing Company, Inc., Kingston, New York. Volume #, No. #. Copyright # by Quinn Publishing Company, Inc. Application for Entry' as Second Class matter at Post Office, Buffalo, New York, pending. Subscriptio


In [None]:
def cleaned_text(text):
  """Normalise raw corpus text for word-level tokenisation.

  Drops every non-ASCII character, strips all ASCII punctuation
  (``string.punctuation``) and collapses each run of whitespace into a
  single space.

  Args:
    text: Raw text as a single string.

  Returns:
    The cleaned text as a single string. Leading/trailing whitespace is
    collapsed to at most one space, not stripped.
  """
  # Work on the string directly. Round-tripping through ``str(list)`` would
  # turn control characters into escape sequences (a newline becomes the two
  # characters backslash + n), and the punctuation pass would then leave a
  # stray letter glued into the text.
  text = re.sub(r'[^\x00-\x7f]', '', text) # removing non-ASCII chars...
  text = text.translate(str.maketrans('', '', string.punctuation)) # removing punctuation...
  text = re.sub(r'\s+', ' ', text) # collapsing runs of whitespace...
  return text

In [None]:
# Clean the whole raw corpus in one pass; `text` is the input to the word split below.
text = cleaned_text(file)

In [None]:
# Size of the cleaned corpus in characters (compare with len(file) above).
len(text)

142442755

In [None]:
# Tokenise on single spaces, discarding the empty strings a plain split can
# leave behind, then collect the distinct (case-sensitive) tokens.
text_corpus = [token for token in text.split(" ") if token != ""]
unique_vocab = list(set(text_corpus))

In [None]:
# Total number of word tokens in the cleaned corpus.
len(text_corpus)

26308635

In [None]:
# Number of distinct tokens (case-sensitive, since the cleaning step does not lowercase).
len(unique_vocab)

330125

In [None]:
seq_length = 30  # number of words in each sliding window
step_size = 1    # how far the window advances between consecutive samples

# Slide a window of `seq_length` words over the corpus and store each window as
# one space-joined string. The stride comes from `step_size`, which was defined
# but never applied before; with step_size = 1 the result is unchanged.
all_sentences = [
  ' '.join(text_corpus[i - seq_length: i])
  for i in range(seq_length, len(text_corpus), step_size)
]

In [None]:
# Show the first ten windows; consecutive windows are shifted by one word.
all_sentences[:10]

['MARCH All Stories New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for',
 'All Stories New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry',
 'Stories New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry as',
 'New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry as Second',
 'and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry as Second Class',
 'C

In [None]:
# Number of sliding windows produced from the corpus.
len(all_sentences)

26308605

In [None]:
# Keep only the first 500,000 windows for modelling
# (presumably to bound memory and training time -- confirm).
model_sent  = all_sentences[:500000]

In [None]:
# Number of windows actually used for training.
len(model_sent)

In [None]:
# Sanity check: the training subset starts with the same windows as all_sentences.
model_sent[:10]

['MARCH All Stories New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for',
 'All Stories New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry',
 'Stories New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry as',
 'New and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry as Second',
 'and Complete Publisher Editor IF is published bimonthly by Quinn Publishing Company Inc Kingston New York Volume No Copyright by Quinn Publishing Company Inc Application for Entry as Second Class',
 'C

In [None]:
# tokenizing the words; converting words to numerical values...
# NOTE(review): num_words=10000 caps the indices that texts_to_sequences emits and
# no oov_token is configured, so rarer words are presumably dropped from the
# sequences (making some shorter than the original window) -- confirm against the Keras docs.
tokenizer = Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(model_sent)
seq = tokenizer.texts_to_sequences(model_sent)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to a fixed length of 30.
# Both padding and truncation are applied at the front ('pre'), so the last
# token of every sequence stays in the final column.
max_length = 30
padded_sequences = pad_sequences(seq, maxlen=max_length, padding='pre', truncating='pre')



In [None]:
# Stack the padded rows into one 2-D array.
# NOTE(review): pad_sequences presumably already returns a 2-D ndarray, in which
# case this only makes a copy -- confirm.
data = np.vstack(padded_sequences)

In [None]:
# # Converting one-dimensional list to numpy ndarray...
# data = np.array(seq)

In [None]:
# Each row has max_length (30) columns: the first 29 columns are the features
# and the last (30th) column is the target variable...
X = data[:, :-1]
y = data[:, -1]

In [None]:
# Number of distinct words the tokenizer saw while fitting; this is larger than
# the num_words cap passed to the Tokenizer constructor.
len(tokenizer.word_index)

26014

In [None]:
# Number of feature columns per sample (max_length minus the target column).
X.shape[1]

29

In [None]:
from tensorflow.keras.layers import Dropout

In [None]:
# Sequential LSTM model to predict next word...

# texts_to_sequences only emits indices below tokenizer.num_words, so the
# embedding table and the softmax layer are sized to that cap rather than to the
# full word_index. When no cap is set, fall back to the full vocabulary
# (+1 because index 0 is used for padding).
full_vocab_size = len(tokenizer.word_index) + 1
vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)

model = Sequential()

# input_dim is the vocabulary size, output_dim is 50, and input length is the number of feature columns in X...
model.add(Embedding(vocab_size, 50, input_length = X.shape[1]))

# 64 LSTM units and return_sequences = True to pass the full sequence on to the next LSTM layer...
model.add(LSTM(64, return_sequences=True))

# the second LSTM returns only its final state, which feeds the dense head...
model.add(LSTM(64))

model.add(Dense(128, activation='relu'))
# one softmax output per word index the tokenizer can emit...
model.add(Dense(vocab_size, activation='softmax'))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 29, 50)            1300750   
                                                                 
 lstm (LSTM)                 (None, 29, 64)            29440     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 26015)             3355935   
                                                                 
Total params: 4,727,469
Trainable params: 4,727,469
Non-trainable params: 0
_________________________________________________________________


In [None]:
# # define the checkpoint
# filepath="/content/drive/MyDrive/textgen/weights.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

In [None]:
# Compiling the model with adam optimizer and training it for 100 epochs...

from tensorflow.keras.callbacks import ModelCheckpoint

# Define a filepath template for the checkpoints; {epoch:03d} is filled in with the epoch number
filepath = '/content/drive/MyDrive/language_model/model_weights.{epoch:03d}'

# Define a callback that writes a checkpoint only after epochs in which the
# training loss improved (save_best_only=True, monitor='loss', mode='min').
# NOTE(review): save_weights_only is left at its default, so this presumably
# saves the full model rather than just the weights -- confirm.
checkpoint = ModelCheckpoint(filepath, monitor='loss', save_best_only=True, mode='min', verbose=1)

# Targets are integer word indices, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
lstm_history = model.fit(X, y, batch_size = 32, epochs=100 , callbacks=[checkpoint])

In [None]:
# when input text and number of words to be generated are given... this function will return text...
def text_generator(model, tokenizer, seq_len, feature_text, num_words):
  """Generate `num_words` words that continue `feature_text`.

  Each step tokenises the running text, keeps the last `seq_len` tokens
  (left-padding shorter inputs), takes the model's highest-probability
  index and appends the matching word to the running text.

  Args:
    model: Trained model whose `predict` returns one score per word index.
    tokenizer: Fitted tokenizer providing `texts_to_sequences` and `word_index`.
    seq_len: Number of tokens the model expects as input.
    feature_text: Seed text to continue.
    num_words: Number of words to generate.

  Returns:
    The generated words joined by single spaces (the seed text is not included).
  """
  # Invert the vocabulary once so every prediction is a dictionary lookup
  # instead of a linear scan over word_index.
  index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}

  generated = []
  for _ in range(num_words):
    token = tokenizer.texts_to_sequences([feature_text])[0]
    token = pad_sequences([token], maxlen = seq_len, truncating='pre')
    # verbose=0 stops Keras printing a progress bar for every generated word.
    probabilities = model.predict(token, verbose=0)
    # The batch holds a single row; take its argmax as a plain int so the
    # lookup does not compare against a 1-element array.
    predicted_idx = int(np.argmax(probabilities, axis=1)[0])

    # An index with no entry in word_index (such as the padding value 0)
    # yields an empty word, as before.
    pred_word = index_to_word.get(predicted_idx, '')
    feature_text += " "+ pred_word
    generated.append(pred_word)

  return " ".join(generated)

### **MODEL EVALUATION**

In [None]:
# Persist the trained model to a single .h5 file so it can be reloaded later.
model.save('/content/drive/MyDrive/language_model/lstm.h5') # saving model...

In [None]:
# Reload the model from the file written above; generation below uses this copy.
lstm_model = load_model('/content/drive/MyDrive/language_model/lstm.h5') # loading model...

In [None]:
num_of_words = 50 # number of words to be generated...

# input text...
# Stored under its own name so the cleaned corpus held in `text` is not
# overwritten and the earlier tokenisation cells stay re-runnable.
seed_text = """He moved slowly and with a kind of painful dignity, as a man moves on his way to the firing squad. A rumpled shock of black hair pointed up the extreme pallor of a gaunt face"""

# X.shape[1] is the number of feature columns the model was trained on.
text_generator(lstm_model, tokenizer, X.shape[1], seed_text, num_of_words)



'days on that instantly on the really was item the couldnt or who youll the else you out did he you warily of singhalut the true huge almost left shrugged answer walked the knocked chairman to which against to with had possibility one miles after then from laugh and thousand'