In [1]:
import nltk
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

### DATA COLLECTION

In [2]:
from nltk.corpus import gutenberg
import pandas as pd

data = gutenberg.raw("shakespeare-hamlet.txt")
with open("hamlet.txt","w") as file:
    file.write(data)

### DATA PREPROCESSING

In [5]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## Loading the dataset

with open("hamlet.txt","r") as file:
    text = file.read().lower()

## Tokenizing
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index)+1

In [6]:
total_words

4818

In [9]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'i': 5,
 'you': 6,
 'a': 7,
 'my': 8,
 'it': 9,
 'in': 10,
 'that': 11,
 'ham': 12,
 'is': 13,
 'not': 14,
 'his': 15,
 'this': 16,
 'with': 17,
 'your': 18,
 'but': 19,
 'for': 20,
 'me': 21,
 'lord': 22,
 'as': 23,
 'what': 24,
 'he': 25,
 'be': 26,
 'so': 27,
 'him': 28,
 'haue': 29,
 'king': 30,
 'will': 31,
 'no': 32,
 'our': 33,
 'we': 34,
 'on': 35,
 'are': 36,
 'if': 37,
 'all': 38,
 'then': 39,
 'shall': 40,
 'by': 41,
 'thou': 42,
 'come': 43,
 'or': 44,
 'hamlet': 45,
 'good': 46,
 'do': 47,
 'hor': 48,
 'her': 49,
 'let': 50,
 'now': 51,
 'thy': 52,
 'how': 53,
 'more': 54,
 'they': 55,
 'from': 56,
 'enter': 57,
 'at': 58,
 'was': 59,
 'oh': 60,
 'like': 61,
 'most': 62,
 'there': 63,
 'well': 64,
 'know': 65,
 'selfe': 66,
 'would': 67,
 'them': 68,
 'loue': 69,
 'may': 70,
 "'tis": 71,
 'vs': 72,
 'sir': 73,
 'qu': 74,
 'which': 75,
 'did': 76,
 'why': 77,
 'laer': 78,
 'giue': 79,
 'thee': 80,
 'ile': 81,
 'must': 82,
 'hath': 

In [11]:
## Creating inpput sequence 
# i.e for one word what would the next word predicted

input_sequence = []
for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequence.append(n_gram_sequence)


In [17]:
max_sequence_length = max([len(x) for x in input_sequence])
max_sequence_length

14

In [18]:
input_sequence = np.array(pad_sequences(input_sequence,maxlen=max_sequence_length,padding="pre"))
input_sequence

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [19]:
## Creating Predictors and label
import tensorflow as tf
X,y = input_sequence[:,:-1],input_sequence[:,-1]

In [28]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [29]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dropout,Dense,LSTM
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor="val_loss",patience=3,restore_best_weights=True)
# Defining the model
model = Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_length))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation="softmax"))

# Compiling the Model
model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
model.build(input_shape=(None, max_sequence_length))
model.summary()



In [58]:
#Training the Model
history = model.fit(X_train,y_train,epochs=120,validation_data=(X_test,y_test),verbose=1)

Epoch 1/120
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 24ms/step - accuracy: 0.0364 - loss: 6.4656 - val_accuracy: 0.0422 - val_loss: 6.8575
Epoch 2/120
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.0484 - loss: 6.2792 - val_accuracy: 0.0486 - val_loss: 6.8956
Epoch 3/120
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.0522 - loss: 6.1139 - val_accuracy: 0.0517 - val_loss: 6.9203
Epoch 4/120
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.0547 - loss: 5.9977 - val_accuracy: 0.0544 - val_loss: 6.9822
Epoch 5/120
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.0567 - loss: 5.8730 - val_accuracy: 0.0538 - val_loss: 7.0322
Epoch 6/120
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.0624 - loss: 5.7660 - val_accuracy: 0.0608 - val_loss: 7.0767
Epoch 7/12

In [42]:
## Function to predict the next word
def predict_next_word(model,tokenizer,text,max_sequence_length):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_length:
        #Ensuring the sequence length matches the max_sequbece
        token_list = token_list[-(max_sequence_length-1):]
    token_list = pad_sequences([token_list],maxlen=max_sequence_length-1,padding="pre")
    predicted = model.predict(token_list,verbose=0)
    predict_word_index = np.argmax(predicted,axis=1)
    for word, index in tokenizer.word_index.items():
        if index==predict_word_index:
            return word
    return None


In [50]:
input_text = "government is "
print(f"Input: {input_text}")
max_sequence_len = model.input_shape[1]+1
next_word = predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f"Output: {input_text+" "+next_word}")

Input: government is 
Output: government is  a


In [46]:
model.save("model.h5")



In [60]:
import pickle
with open("new_tokenizer.pkl","wb") as file:
    pickle.dump(tokenizer,file,protocol=pickle.HIGHEST_PROTOCOL)

In [59]:
model.save("new_model.keras")