In [23]:
%tensorflow_version 2.x
import tensorflow as tf#to build the model
import string#to get set of punctuations
import requests#to get the data file in the notebook

In [24]:
# Download the plain-text complete works of Shakespeare.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page flow into the rest of the notebook as if it were the corpus.
response = requests.get('https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt', timeout=60)
response.raise_for_status()

In [25]:
# Show which TensorFlow version the kernel is running.
print(f"tensorflow version{tf.__version__}")

tensorflow version2.3.0


In [26]:
# Peek at the first 1500 characters of the text returned by requests.get().
response.text[:1500]

'This is the 100th Etext file presented by Project Gutenberg, and\nis presented in cooperation with World Library, Inc., from their\nLibrary of the Future and Shakespeare CDROMS.  Project Gutenberg\noften releases Etexts that are NOT placed in the Public Domain!!\n\nShakespeare\n\n*This Etext has certain copyright implications you should read!*\n\n<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM\nSHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS\nPROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE COLLEGE\nWITH PERMISSION.  ELECTRONIC AND MACHINE READABLE COPIES MAY BE\nDISTRIBUTED SO LONG AS SUCH COPIES (1) ARE FOR YOUR OR OTHERS\nPERSONAL USE ONLY, AND (2) ARE NOT DISTRIBUTED OR USED\nCOMMERCIALLY.  PROHIBITED COMMERCIAL DISTRIBUTION INCLUDES BY ANY\nSERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>\n\n*Project Gutenberg is proud to cooperate with The World Library*\nin the presentation of The Complete Works of William Shakespeare\nfor your

In [27]:
# The raw text uses '\n' (newline) between lines; split on it to get a list of lines.
data = response.text.split('\n')
data[0]

'This is the 100th Etext file presented by Project Gutenberg, and'

In [28]:
# The file starts with a header before the actual works of William Shakespeare,
# which begin at line index 253; keep everything from there on.
# Slicing a fresh split of response.text (rather than data = data[253:]) keeps
# this cell idempotent: re-running it no longer drops a further 253 lines.
data = response.text.split('\n')[253:]
data[0]

'  From fairest creatures we desire increase,'

In [29]:
# Number of lines left once the header has been removed (124204 in this run).
len(data)

124204

In [30]:
# Join all the lines into one long continuous string, separated by single spaces.
# The guard makes the cell safe to re-run: joining an already-joined string
# would insert a space between every single character.
if not isinstance(data, str):
  data = " ".join(data)
data[:1000]

"  From fairest creatures we desire increase,   That thereby beauty's rose might never die,   But as the riper should by time decease,   His tender heir might bear his memory:   But thou contracted to thine own bright eyes,   Feed'st thy light's flame with self-substantial fuel,   Making a famine where abundance lies,   Thy self thy foe, to thy sweet self too cruel:   Thou that art now the world's fresh ornament,   And only herald to the gaudy spring,   Within thine own bud buriest thy content,   And tender churl mak'st waste in niggarding:     Pity the world, or else this glutton be,     To eat the world's due, by the grave and thee.                        2   When forty winters shall besiege thy brow,   And dig deep trenches in thy beauty's field,   Thy youth's proud livery so gazed on now,   Will be a tattered weed of small worth held:     Then being asked, where all thy beauty lies,   Where all the treasure of thy lusty days;   To say within thine own deep sunken eyes,   Were an al

In [31]:
def clean_text(doc):
  """Turn raw text into a list of lowercase, purely alphabetic words.

  The text is split on whitespace, every character in string.punctuation is
  deleted from each piece, and any piece that is not entirely alphabetic
  afterwards (numbers, empty leftovers, ...) is discarded.
  """
  strip_punct = str.maketrans('', '', string.punctuation)  # deletion-only table
  words = []
  for raw in doc.split():
    bare = raw.translate(strip_punct)
    if bare.isalpha():  # False for '' and for anything containing a non-letter
      words.append(bare.lower())
  return words

# Clean the whole corpus and print the first 50 words as a sanity check.
tokens = clean_text(data)
print(tokens[:50])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [32]:
# Total number of words in the cleaned corpus.
len(tokens)

898199

In [33]:
# Number of unique words in the cleaned corpus (27956 in this run).
len(set(tokens))

27956

In [34]:
# Build overlapping windows of 51 consecutive words: the first 50 words are
# the model input and the 51st is the word to predict (it is split off later).
length = 50 + 1  # 50 previous words as input + the next 1 as output
# The dataset is huge and RAM would overflow, so only windows ending at or
# before token index 200001 are kept.
last_end = min(len(tokens) - 1, 200000 + 1)
lines = [' '.join(tokens[end - length:end]) for end in range(length, last_end + 1)]

print(len(lines))

199951


In [35]:
# First training line: 51 space-separated words.
lines[0]

'from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self'

In [36]:
# tokens[50] is the 51st word of the corpus, i.e. the last word of lines[0] ('self').
tokens[50]

'self'

In [37]:
# The window slides by one word: the 51st word of this line is 'thy', which will be the output word used for prediction.
lines[1]

'fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self thy'

In [38]:
import numpy as np#arrays
from tensorflow.keras.preprocessing.text import Tokenizer#generating dictionary of word encoding and creating vectors out of sentence
from tensorflow.keras.utils import to_categorical# to_categorical(), a numpy array (or) a vector which has integers that represent different categories, can be converted into a numpy array (or) a matrix which has binary values and has columns equal to the number of categories in the data
from tensorflow.keras.models import Sequential# plain stack of layers where each layer has exactly one input tensor and one output tensor. 
from tensorflow.keras.layers import Dense, LSTM, Embedding#dense for hidden layers,lstm,embedding:Turns positive integers (indexes) into dense vectors of fixed size and used only in first layer
from tensorflow.keras.preprocessing.sequence import pad_sequences#ensure that all sequences in a list have the same length

In [40]:
tokenizer = Tokenizer()
# fit_on_texts() updates the tokenizer's internal vocabulary from a list of texts.
tokenizer.fit_on_texts(lines)
# texts_to_sequences() transforms each text into a sequence of integers.
sequences = tokenizer.texts_to_sequences(lines)

sequences contains a list of integer values created by the tokenizer. Each line in sequences has 51 words. Split each line so that the first 50 words go into X and the last word goes into y.

In [41]:
# Turn the list of integer sequences into a 2-D array (one row per line).
sequences = np.array(sequences)
# Every column except the last is the input X; the last column is the target y.
X, y = sequences[:, :-1], sequences[:,-1]
X[0]

array([   47,  1408,  1264,    37,   451,  1406,     9,  2766,  1158,
        1213,   171,   132,   269,    20,    24,     1,  4782,    87,
          30,    98,  4781,    18,   715,  1263,   171,   211,    18,
         829,    20,    27,  3807,     4,   214,   121,  1212,   153,
       13004,    31,  2765,  1847,    16, 13003, 13002,   754,     7,
        3806,    99,  2430,   466,    31])

vocab_size is the number of unique words in the dataset plus one, because index 0 is reserved by the tokenizer and is never assigned to a word.

tokenizer.word_index gives the mapping of each unique word to its numerical equivalent

In [42]:
# +1 because the tokenizer's word ids start at 1, so the largest id equals len(word_index).
vocab_size = len(tokenizer.word_index) + 1

to_categorical() converts a class vector (integers) to binary class matrix. num_classes is the total number of classes which is vocab_size

In [43]:
# One-hot encode the target word ids (the last column of `sequences`).
# Encoding from `sequences` rather than from `y` itself keeps this cell
# idempotent: re-running `y = to_categorical(y, ...)` would one-hot encode a
# matrix that is already one-hot.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [44]:
# Number of word ids in each input sequence (the second dimension of X).
seq_length = X.shape[1]
seq_length

50

Embedding layer:
The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset. It requires 3 arguments:

input_dim: This is the size of the vocabulary in the text data which is vocab_size in this case.

output_dim: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word.

input_length: Length of input sequences which is seq_length.

In [45]:
# Next-word prediction network, assembled in a single Sequential([...]) call.
model = Sequential([
  # Embed each word id as a vector of size 50.
  Embedding(vocab_size, 50, input_length=seq_length),
  # return_sequences=True returns the full sequence as the output for the next LSTM.
  LSTM(100, return_sequences=True),
  LSTM(100),
  Dense(100, activation='relu'),
  # One softmax output per vocabulary entry.
  Dense(vocab_size, activation='softmax'),
])

In [46]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            650450    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 13009)             1313909   
Total params: 2,115,259
Trainable params: 2,115,259
Non-trainable params: 0
_________________________________________________________________


In [50]:
# categorical_crossentropy is used for multi-class targets that are one-hot encoded;
# for a two-class (binary) output, binary_crossentropy would be used instead.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [52]:
# Train the model. batch_size is 256, so the weights are updated after every 256 training examples.
# An epoch is one full pass over the entire training dataset. Only 5 are run here to keep it short;
# the original author suggests about 100 for good accuracy. More epochs generally improve the fit
# to the training data, which is not the same as generalizing better.
model.fit(X, y, batch_size = 256, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f910119afd0>

In [53]:
# Now generate words.
# Take one training line as the seed text; the index is hard-coded (arbitrary, not randomly drawn).
new_text=lines[12343]
new_text

'home of love if i have ranged like him that travels i return again just to the time not with the time exchanged so that my self bring water for my stain never believe though in my nature reigned all frailties that besiege all kinds of blood that it could so'

In [54]:
#generate_text_seq() generates n_words number of words after the given new_text
def generate_text_seq(model, tokenizer, text_seq_length, new_text, n_words):
  """Generate n_words words that continue new_text, one prediction at a time.

  Args:
    model: trained model whose predict() returns one row of per-word scores
      for each encoded input sequence.
    tokenizer: the fitted tokenizer that was used to encode the training data.
    text_seq_length: number of word ids the model takes as input.
    new_text: seed text; every predicted word is appended to it before the
      next prediction is made.
    n_words: how many words to generate.

  Returns:
    The generated words joined by single spaces (the seed text is not included).
  """
  text = []

  for _ in range(n_words):
    # Encode the running text with the same encoding used for the training data.
    encoded = tokenizer.texts_to_sequences([new_text])[0]
    # Bring the input to exactly text_seq_length ids; truncating='pre' drops the
    # oldest ids, so the most recent words are the ones kept.
    encoded = pad_sequences([encoded], maxlen = text_seq_length, truncating='pre')
    # predict_classes() is deprecated (and removed in newer TensorFlow releases);
    # the arg-max over the softmax output is the documented replacement.
    # int(...) also gives a plain scalar instead of a one-element array.
    y_predict = int(np.argmax(model.predict(encoded, verbose=0), axis=-1)[0])

    # index_word is the tokenizer's reverse (id -> word) mapping, so no scan of
    # word_index is needed. An id with no word yields '', as it did before.
    predicted_word = tokenizer.index_word.get(y_predict, '')
    # Feed the prediction back in so it becomes context for the next word.
    new_text = new_text + ' ' + predicted_word
    text.append(predicted_word)
  return ' '.join(text)

In [55]:
# Have the model predict the next 100 words following new_text.
generate_text_seq(model, tokenizer, seq_length, new_text, 100)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'i have not so a man and not been a man and not been a man and i have not so a man and not been a man and not been a man and not been a man and i have not so a man and not been a man and not been a man and not been a man and i have not so a man and not been a man and not been a man and not been a man and i have not so a man and not been a man and not been a man and'