# Shakespeare next word prediction
- #### An RNN is used to predict the next word based on the previous word(s).
- #### Parts from a Shakespeare poem (Shakespeare’s first sonnet) are used to train the network
### **Network Architecture**
1. The input layer is an embedding layer.
2. The first hidden layer is an LSTM layer.
3. The second hidden layer is a dropout layer.
4. The output layer is a dense layer.

In [1]:
import numpy as np

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense




In [2]:
# Load the whole training corpus as one string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('Shakespeare_Sonnet_1.txt', encoding='utf-8') as file:
    text = file.read()

In [3]:
# Read the corpus and keep it both as one raw string (`text`) and as a list of
# lower-cased lines (`lines`); the lines are tokenized into sequences later.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('Shakespeare_Sonnet_1.txt', encoding='utf-8') as file:
    text = file.read()

# One entry per line of the file (blank lines become empty strings).
lines = text.lower().split('\n')

#print(lines)   

# By default, the text_to_word_sequence function:
- Removes the characters in its default `filters` set: punctuation (except the apostrophe) plus tabs and newlines.
- Converts all words to lower case.
- Splits the input string into words using space as the separator.

In [4]:
# Split the raw text into a flat list of words, relying on the default
# filtering, lower-casing and space splitting of text_to_word_sequence.
words = text_to_word_sequence(text)
#print(words)

# Use a Tokenizer from the keras.preprocessing.text library to convert the input text into indexed tokens.

In [5]:
# Build the word -> integer index vocabulary from the corpus words.
tokenizer = Tokenizer()
# `words` is a list of single words, so each word is fitted as its own text.
tokenizer.fit_on_texts(words)
# Keep a handle on the word -> index mapping for inspection.
tokens = tokenizer.word_index
#print(tokens)

In [6]:
#tokenizer.word_index
# Sanity check: the number of distinct words in the corpus should equal the
# number of entries in the tokenizer's vocabulary.
distinct_word_count = len(np.unique(words))
print(distinct_word_count)
indexed_word_count = len(tokenizer.word_index)
print(indexed_word_count)

580
580


# Get the Vocabulary Size

In [7]:
# One more than the number of indexed words: index 0 is not assigned to any
# word and is the value the zero padding of the sequences uses.
vocabulary_size = len(tokenizer.word_index) + 1
print(vocabulary_size)

581


# Build Sequences
Use the texts_to_sequences() method to convert lines into sequences of tokens.

In [8]:
# Convert every line into its sequence of token ids, then collect every prefix
# of each line that holds at least two tokens (context word(s) + next word).
sequences = tokenizer.texts_to_sequences(lines)
subsequences = [
    token_ids[:end]
    for token_ids in sequences
    for end in range(2, len(token_ids) + 1)
]

### Use the pad_sequences function from the keras.preprocessing.sequence module to pad the subsequences with zeros so that all sequences have the same length.
### The padding='pre' parameter tells the function to add padding at the beginning of each sequence.

In [9]:
# Tokenize the lines here rather than reading `sequences`: this cell rebinds
# `sequences` to the padded array below, so reading it would make a second run
# of the cell build its prefixes from already-padded rows.
line_sequences = tokenizer.texts_to_sequences(lines)

# Every prefix of a line with at least two tokens: the last token of a prefix
# is the prediction target, the tokens before it are the context.
subsequences = [
    token_ids[:end]
    for token_ids in line_sequences
    for end in range(2, len(token_ids) + 1)
]

# Left-pad with zeros up to the length of the longest line, so the target
# token always sits in the last column.
sequence_length = max(len(token_ids) for token_ids in line_sequences)
sequences = pad_sequences(subsequences, maxlen=sequence_length, padding='pre')

# Build Input and Output
#### The input to the network is all the words of the sequence except the last one, and the output is the last word of the sequence.
#### Then, convert output into categorical data using the to_categorical() method from the keras.utils module.

In [10]:
# Network input: every column of the padded sequences except the last one.
x = sequences[:, :-1]
# Network target: the last column, one-hot encoded over the whole vocabulary.
y = to_categorical(sequences[:, -1], num_classes=vocabulary_size)

In [11]:
# x has one column fewer than the padded sequences (the last column became y);
# y has one column per vocabulary entry after one-hot encoding.
print(x.shape)
print(y.shape)

(1177, 9)
(1177, 581)


# Train an LSTM to perform NLP

In [12]:
# Embedding -> LSTM -> Dropout -> softmax over the vocabulary.
model = Sequential([
    # Each of the sequence_length - 1 context token ids becomes a 100-d vector.
    Embedding(vocabulary_size, 100, input_length=sequence_length - 1),
    LSTM(100),
    Dropout(0.1),
    # One output probability per vocabulary entry.
    Dense(units=vocabulary_size, activation='softmax'),
])




In [13]:
# Categorical cross-entropy pairs with the one-hot targets built by
# to_categorical; accuracy is tracked as an additional metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [None]:
# Train on all of x, y for 100 epochs; verbose=0 silences the per-epoch log.
model.fit(x, y, epochs=100, verbose=0)

In [43]:
# Evaluate model

In [44]:
# Scored on the same x, y the model was fitted on, so the values are the
# training loss and training accuracy, not a held-out estimate.
model.evaluate(x, y)



[0.2621830105781555, 0.9303313493728638]

# Show single next-word predictions for the first 10 sequences

In [46]:
# Predict the next word for the first 10 padded sequences and print both the
# token ids and the decoded text of the input and of the prediction.
for idx, input_sequence in enumerate(sequences[:10]):
    # All tokens but the last are the model input; the last token is the word
    # the model is expected to predict.
    input_words_tokens = input_sequence[:-1].tolist()

    # Feed an explicit batch holding one sample (a 2-D array) instead of a
    # nested Python list, so the input cannot be read as a list of inputs.
    y_pred = model.predict(np.array([input_words_tokens]), verbose=0)
    # Index of the most probable vocabulary entry for the single sample.
    predicted_token = np.argmax(y_pred, axis=1)

    print(f"{idx}.Input Sequence-tokens: ", input_sequence)
    print("\tInput Word-token: ", input_words_tokens)
    print("\t\tPredicted Word-token: ", predicted_token)

    print(f"{idx}.Input Sequence-text: ", tokenizer.sequences_to_texts([input_sequence]))
    print("\tInput Word-text: ", tokenizer.sequences_to_texts([input_words_tokens]))
    print("\t\tPredicted Word-text: ", tokenizer.sequences_to_texts([predicted_token]))

    print("\n")

0.Input Sequence-tokens:  [  0   0   0   0   0   0   0   0  26 189]
	Input Word-token:  [0, 0, 0, 0, 0, 0, 0, 0, 26]
		Predicted Word-token:  [189]
0.Input Sequence-text:  ['from fairest']
	Input Word-text:  ['from']
		Predicted Word-text:  ['fairest']


1.Input Sequence-tokens:  [  0   0   0   0   0   0   0  26 189 190]
	Input Word-token:  [0, 0, 0, 0, 0, 0, 0, 26, 189]
		Predicted Word-token:  [190]
1.Input Sequence-text:  ['from fairest creatures']
	Input Word-text:  ['from fairest']
		Predicted Word-text:  ['creatures']


2.Input Sequence-tokens:  [  0   0   0   0   0   0  26 189 190 191]
	Input Word-token:  [0, 0, 0, 0, 0, 0, 26, 189, 190]
		Predicted Word-token:  [191]
2.Input Sequence-text:  ['from fairest creatures we']
	Input Word-text:  ['from fairest creatures']
		Predicted Word-text:  ['we']


3.Input Sequence-tokens:  [  0   0   0   0   0  26 189 190 191  97]
	Input Word-token:  [0, 0, 0, 0, 0, 26, 189, 190, 191]
		Predicted Word-token:  [97]
3.Input Sequence-text:  ['from