In [1]:
#import needed libraries
import tensorflow as tf
import string
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
# Read the Shakespeare text file line by line, strip trailing whitespace
# (including the newline) from every line, and join the lines with single spaces.

with open('t8.shakespeare.txt') as corpus_file:
    stripped_lines = (raw_line.rstrip() for raw_line in corpus_file)
    data = " ".join(stripped_lines)

In [3]:
len(data)  # number of characters in the joined corpus string

5437069

In [4]:
# Preprocessing helper: clean the raw text and split it into word tokens.
def preprocess_text(text):
    """Return the lowercase alphabetic word tokens of ``text``.

    The text is split on whitespace, every ``string.punctuation`` character
    is deleted from each piece, pieces that are not purely alphabetic
    afterwards (including pieces that became empty) are dropped, and the
    survivors are lowercased.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_word in text.split():
        word = raw_word.translate(strip_punctuation)
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

In [5]:
# tokenize the whole corpus and preview the first 50 tokens
tokens = preprocess_text(data)
print(tokens[:50])


['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [6]:
len(tokens)                    # total number of tokens in the corpus

898199

In [7]:
len(set(tokens))                # count of unique words

27956

In [8]:
line_size = 5          # words per generated line
last_index = 50001     # highest window end index used; the full corpus is too large to use

lines = []             # generated lines (each is line_size consecutive words)

# Slide a window of line_size words over the tokens, one word at a time.
# The window ending at index `end` covers tokens[end - line_size:end];
# capping the range replaces an explicit break once `end` passes 50000.
for end in range(line_size, min(len(tokens), last_index + 1)):
    lines.append(' '.join(tokens[end - line_size:end]))

print(len(lines))           # print the number of lines generated

49997


In [9]:
# generate the word index and the sequences of words represented as integers

In [10]:
# fit the word -> integer index on the generated lines,
# then encode every line as a list of integer word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines) 

In [11]:
# convert the list of integer sequences to a numpy array
# (this rebinds `sequences` from a list of lists to an ndarray)
sequences = np.array(sequences)

In [12]:
sequences.shape  # (number of lines, words per line)

(49997, 5)

In [13]:
sequences[0:5]  # first five encoded lines; each row is the previous one shifted by one word

array([[  45, 1157, 1408,   52,  385],
       [1157, 1408,   52,  385,  783],
       [1408,   52,  385,  783,    7],
       [  52,  385,  783,    7, 1867],
       [ 385,  783,    7, 1867,  343]])

In [14]:
# define the training data: X holds every word id but the last of each line,
# y holds the last word id (the word to predict)
X, y = sequences[:, :-1], sequences[:,-1]
X[0],y[0]                                                   # show the first input/target pair

(array([  45, 1157, 1408,   52]), 385)

In [15]:
y.shape   # one integer target per line

(49997,)

In [16]:
# vocab size = number of words in the tokenizer's word-to-index mapping, plus 1
# (the Keras Tokenizer numbers words from 1, so index 0 needs a slot as well)

vocab_size = len(tokenizer.word_index) + 1
vocab_size

6029

In [17]:
# one-hot encode the targets: one column per vocabulary index.
# NOTE: this rebinds y, so re-running only this cell would encode the already-encoded array.
y = to_categorical(y, num_classes=vocab_size)           


In [18]:
y.shape  # (number of samples, vocab_size)

(49997, 6029)

In [19]:
seq_length = X.shape[1]
seq_length                            # context length (number of input words per sample)

4

In [20]:
# Build the LSTM model: embedding -> two stacked LSTMs -> dense head
# ending in a softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_length),
    LSTM(100, return_sequences=True),   # emit every timestep so the next LSTM receives a sequence
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 50)             301450    
                                                                 
 lstm (LSTM)                 (None, 4, 100)            60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 6029)              608929    
                                                                 
Total params: 1061279 (4.05 MB)
Trainable params: 1061279 (4.05 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])      # compile the model: cross-entropy on the one-hot targets, Adam optimizer, accuracy metric

In [22]:
model.fit(X, y, batch_size = 256, epochs = 170)     # train the model on all samples (no validation data is passed)

Epoch 1/170
Epoch 2/170
Epoch 3/170
Epoch 4/170
Epoch 5/170
Epoch 6/170
Epoch 7/170
Epoch 8/170
Epoch 9/170
Epoch 10/170
Epoch 11/170
Epoch 12/170
Epoch 13/170
Epoch 14/170
Epoch 15/170
Epoch 16/170
Epoch 17/170
Epoch 18/170
Epoch 19/170
Epoch 20/170
Epoch 21/170
Epoch 22/170
Epoch 23/170
Epoch 24/170
Epoch 25/170
Epoch 26/170
Epoch 27/170
Epoch 28/170
Epoch 29/170
Epoch 30/170
Epoch 31/170
Epoch 32/170
Epoch 33/170
Epoch 34/170
Epoch 35/170
Epoch 36/170
Epoch 37/170
Epoch 38/170
Epoch 39/170
Epoch 40/170
Epoch 41/170
Epoch 42/170
Epoch 43/170
Epoch 44/170
Epoch 45/170
Epoch 46/170
Epoch 47/170
Epoch 48/170
Epoch 49/170
Epoch 50/170
Epoch 51/170
Epoch 52/170
Epoch 53/170
Epoch 54/170
Epoch 55/170
Epoch 56/170
Epoch 57/170
Epoch 58/170
Epoch 59/170
Epoch 60/170
Epoch 61/170
Epoch 62/170
Epoch 63/170
Epoch 64/170
Epoch 65/170
Epoch 66/170
Epoch 67/170
Epoch 68/170
Epoch 69/170
Epoch 70/170
Epoch 71/170
Epoch 72/170
Epoch 73/170
Epoch 74/170
Epoch 75/170
Epoch 76/170
Epoch 77/170
Epoch 78

<keras.src.callbacks.History at 0x1464a25af90>

In [34]:
test=lines[2]   # seed text for the prediction demo (third generated line)
test

'creatures we desire increase that'

In [35]:
# evaluation function that predicts the next word(s)

def generate_text_seq(model, tokenizer, text_seq_length, test, n_words):
  """Generate ``n_words`` words that continue the seed text ``test``.

  Each step encodes the running text with ``tokenizer``, keeps the last
  ``text_seq_length`` word ids (shorter inputs are padded), asks ``model``
  for a prediction, takes the highest-scoring index and appends the word it
  maps to, so that word becomes part of the context for the next step.

  Args:
    model: object whose ``predict`` returns scores over the vocabulary.
    tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
      ``index_word``.
    text_seq_length: number of context word ids fed to the model.
    test: seed text to continue.
    n_words: number of words to generate.

  Returns:
    The generated words joined by single spaces ('' when ``n_words`` is 0).
  """
  text = []

  for _ in range(n_words):
    encoded = tokenizer.texts_to_sequences([test])[0]
    # truncating='pre' drops the oldest ids, keeping the most recent context
    encoded = pad_sequences([encoded], maxlen = text_seq_length, truncating='pre')

    y_predict = model.predict(encoded)

    predicted_index = np.argmax(y_predict)  # index of the word with the highest probability
    predicted_word = tokenizer.index_word[predicted_index]  # corresponding word

    # feed the prediction back so it is part of the context for the next word
    test = test + ' ' + predicted_word
    text.append(predicted_word)

  # Return only after the loop has finished: a return inside the loop body
  # ends generation after the first word regardless of n_words.
  return ' '.join(text)

In [38]:
generate_text_seq(model, tokenizer, seq_length, test, 1)          # predict the single next word for the test line



'thereby'