<a href="https://colab.research.google.com/github/mmm84766/Predict-next-possible-word/blob/master/Predict_next_possible_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Library
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [0]:
# Load the dataset: a short English passage about JARVIS, kept inline as a
# single string. Every later cell (cleaning, sequence building, training)
# derives its data from this text.
data_text = """Just A Rather Very Intelligent System a.k.a JARVIS is created by Tony Stark natural-language and a sophisticated artificial intelligence user interface computer system, named after Edwin Jarvis, the butler
who worked for Howard Stark. Though its primary duty is to automate Stark’s Malibu estate, the lifelike program fulfills many other needs for Stark, like being an information source for him, a diagnostic tool, a
consultant and a voice of reason in Stark’s life. It was also responsible to provide security for Tony Stark's Mansion and Stark Tower. After creating the Mark II armor, Stark uploaded JARVIS into all of
the Iron Man Armors, as well as allowing him to interact with the other Avengers, giving them valuable information during combat. JARVIS may be the one intellect Stark feels most comfortable opening up to.
JARVIS can object to Stark’s commands if necessary. JARVIS speaks with a refined British accent, and is capable of back talk, sarcasm and condescension. During the Ultron Offensive, JARVIS was destroyed
by Ultron, although his remaining programming codes unknowingly continued to thwart Ultron's plans of gaining access to nuclear missiles. His remains were found by Stark, who uploaded them into a synthetic
body made of vibranium and, in conjunction with Ultron's personality and an Infinity Stone. JARVIS' duties were then taken over by FRIDAY."""

In [0]:
import re

def text_cleaner(text):
    """Normalize raw text into lowercase words of at least three letters.

    The text is lowercased, possessive ``'s`` suffixes are dropped, every
    non-letter character is replaced by a space, and words shorter than
    three characters are discarded. The surviving words are returned joined
    by single spaces.
    """
    lowered = text.lower()
    # Strip possessive suffixes written with a straight apostrophe.
    without_possessive = re.sub(r"'s\b", "", lowered)
    # Anything that is not an ASCII letter becomes a word separator.
    letters_only = re.sub("[^a-zA-Z]", " ", without_possessive)
    # Keep only words of three or more characters.
    kept_words = [word for word in letters_only.split() if len(word) >= 3]
    return " ".join(kept_words).strip()

# Preprocess the raw passage with text_cleaner; the cleaned string is what
# the sequence builder and the character mapping are derived from.
data_new = text_cleaner(data_text)

In [4]:
def create_seq(text, length=30):
    """Slice ``text`` into overlapping character windows for training.

    Each window holds ``length`` context characters followed by the single
    character to predict, so every returned string is ``length + 1``
    characters long. The number of windows is also printed.

    Args:
        text: Text to slice.
        length: Number of context characters per window. Defaults to 30,
            the value that used to be hard-coded in the body.

    Returns:
        A list of ``len(text) - length`` windows, or an empty list when
        ``text`` has no more than ``length`` characters.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError('length must be a positive integer, got %r' % (length,))
    # Window i ends at index i inclusive, hence the i + 1 slice bound.
    sequences = [text[i - length:i + 1] for i in range(length, len(text))]
    print('Total Sequences: %d' % len(sequences))
    return sequences

# Build the training windows from the cleaned text. At this point
# `sequences` is a list of strings; a later cell rebinds it to integer ids.
sequences = create_seq(data_new)

Total Sequences: 1197


In [0]:
# Build the character vocabulary: every distinct character of the cleaned
# text gets an integer id, assigned in sorted order.
chars = sorted(set(data_new))
mapping = {char: idx for idx, char in enumerate(chars)}

def encode_seq(seq):
    """Integer-encode every string in ``seq`` with the module-level ``mapping``.

    Returns a list containing, for each input line, the list of integer ids
    of its characters. A character missing from ``mapping`` raises KeyError.
    """
    return [[mapping[char] for char in line] for line in seq]

# Encode the windows as integer ids.
# NOTE: this rebinds `sequences` from strings to lists of ints, so the cell
# is not idempotent: running it a second time without re-running
# create_seq raises KeyError, because the ints are not keys of `mapping`.
sequences = encode_seq(sequences)

In [6]:
from sklearn.model_selection import train_test_split

# Number of distinct characters, which is also the number of output classes.
vocab = len(mapping)

# One row per window: all columns but the last are the input context,
# the last column is the character to predict (one-hot encoded).
sequences = np.array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab)

# Hold out 10% of the windows for validation; the fixed seed keeps the
# split repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

print('Train shape:', X_tr.shape, 'Val shape:', X_val.shape)

Train shape: (1077, 30) Val shape: (120, 30)


In [7]:
# Define the model: character embedding -> GRU -> softmax over the vocabulary.
model = Sequential()
# input_length=30 matches the 30-character context of each training window.
model.add(Embedding(vocab, 50, input_length=30, trainable=True))
model.add(GRU(150))
model.add(Dense(vocab, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()




Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 50)            1200      
_________________________________________________________________
gru_1 (GRU)                  (None, 150)               90450     
_________________________________________________________________
dense_1 (Dense)              (None, 24)                3624      
Total params: 95,274
Trainable params: 95,274
Non-trainable params: 0
_________________________________________________________________
None


In [8]:

# Compile for multi-class next-character prediction, tracking accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)





In [9]:
# Train for 100 epochs, logging one line per epoch and scoring the
# held-out validation windows after each one.
model.fit(
    X_tr,
    y_tr,
    validation_data=(X_val, y_val),
    epochs=100,
    verbose=2,
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 1077 samples, validate on 120 samples
Epoch 1/100
 - 2s - loss: 3.0378 - acc: 0.1309 - val_loss: 3.0893 - val_acc: 0.1167
Epoch 2/100
 - 1s - loss: 2.8809 - acc: 0.1495 - val_loss: 3.0586 - val_acc: 0.1167
Epoch 3/100
 - 1s - loss: 2.8296 - acc: 0.1606 - val_loss: 2.9912 - val_acc: 0.1500
Epoch 4/100
 - 1s - loss: 2.7130 - acc: 0.2386 - val_loss: 2.8353 - val_acc: 0.1917
Epoch 5/100
 - 1s - loss: 2.5350 - acc: 0.2721 - val_loss: 2.7461 - val_acc: 0.2250
Epoch 6/100
 - 1s - loss: 2.3900 - acc: 0.2953 - val_loss: 2.5330 - val_acc: 0.2333
Epoch 7/100
 - 1s - loss: 2.2810 - acc: 0.3110 - val_loss: 2.4728 - val_acc: 0.2667
Epoch 8/100
 - 1s - loss: 2.1854 - acc: 0.3408 - val_loss: 2.4464 - val_acc: 0.3000
Epoch 9/100
 - 1s - loss: 2.1065 - acc: 0.3621 - val_loss: 2.3769 - val_acc: 0.2833
Epoch 10/100
 - 1s - loss: 2.0359 - acc: 0.3928 - val_loss: 2.3434 - val_acc: 0.2917
Epoch 11/100
 - 1

<keras.callbacks.History at 0x7fe973328c18>

In [0]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` by ``n_chars`` characters predicted by ``model``.

    Args:
        model: Trained model; its ``predict`` is expected to return one
            score per character id for each input row.
        mapping: Dict from character to integer id.
        seq_length: Number of context ids fed to the model at each step.
        seed_text: Starting text; every character must be a key of
            ``mapping``.
        n_chars: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        KeyError: If the text contains a character missing from ``mapping``.
    """
    # Invert the mapping once instead of scanning it for every new character.
    index_to_char = {index: char for char, index in mapping.items()}
    in_text = seed_text
    for _ in range(n_chars):
        # Encode the current text as integer ids.
        encoded = [mapping[char] for char in in_text]
        # Keep only the most recent seq_length ids; shorter input is padded
        # by pad_sequences.
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # argmax over predict() stands in for Sequential.predict_classes,
        # which newer Keras releases no longer provide.
        scores = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(scores, axis=-1)[0])
        # Append the predicted character. An id absent from the mapping adds
        # nothing; the previous code appended the loop variable instead of
        # the looked-up character, which in that case silently added the
        # last key of the mapping.
        in_text += index_to_char.get(yhat, '')
    return in_text

In [11]:
# Smoke test of the trained model: extend a seed phrase by 8 predicted
# characters. The seed is lowercased before encoding because the character
# mapping was built from lowercased text.
inp = 'JARVIS was destroyed'
print(len(inp))
print(generate_seq(model, mapping, 30, inp.lower(), 8))

20
jarvis was destroyed ultron 


In [14]:
# Second smoke test with a different seed phrase. Note that text_cleaner
# drops words shorter than three letters, so "is" and "by" never appear as
# words in the training text.
inp = "JARVIS is created by"
print(len(inp))
print(generate_seq(model, mapping, 30, inp.lower(), 8))

20
jarvis is created byd stark 
