In [26]:
import nltk
import keras
import string
import pandas as pd
import numpy as np
from keras import backend as K
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import gensim.models.keyedvectors as word2vec1
from scipy import spatial
from gensim.models import FastText

In [None]:
# Load the raw book text and split it into lower-cased, punctuation-free lines.
#
# The path is written as a raw string: the previous plain literal only worked
# because "\d" and "\M" are not recognised escape sequences, which newer Python
# versions warn about. The resulting string value is unchanged.
# NOTE(review): the leading backslash makes this a root-relative, Windows-style
# path - confirm this is intended rather than a path relative to the notebook.
path = r"\data\Metamorphosis_clean.txt"

# Use a context manager so the file handle is closed as soon as it is read.
with open(path, encoding='utf-8') as corpus_file:
  text = corpus_file.read().lower()

# Drop carriage returns and the UTF-8 byte-order mark.
text = text.replace('\r', '').replace('\ufeff', '')

# Map every punctuation character to a single space so words stay separated.
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
text = text.translate(translator)

# One corpus entry per line of the source file.
corpus = text.split('\n')

In [None]:
# Fit a word-level tokenizer on the corpus lines and keep its vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

word_index = tokenizer.word_index  # word -> integer id mapping

# Vocabulary size plus one extra slot (presumably for the id 0 that
# pad_sequences uses as filler - confirm against the Tokenizer docs).
total_unique_words = len(word_index) + 1

In [None]:
# Turn each corpus line into all of its prefixes of length >= 2, so that
# every prefix can later be split into "context tokens" + "next token".
input_sequences = []
for line in corpus:
  token_list = tokenizer.texts_to_sequences([line])[0]
  # Prefix end positions run from 2 up to the full line length (inclusive).
  input_sequences.extend(
    token_list[:end] for end in range(2, len(token_list) + 1)
  )

In [None]:
# The longest n-gram prefix determines the common sequence width.
max_seq_length = max(map(len, input_sequences))

# Left-pad every prefix to that width so the last column is always the
# final token of the prefix.
input_seqs = np.array(
  pad_sequences(
    input_sequences,
    maxlen=max_seq_length,
    padding='pre',
  )
)

In [31]:
# Everything but the last column is the model input; the last column is the
# token id to predict.
x_values = input_seqs[:, :-1]
labels = input_seqs[:, -1]

# One-hot encode the target ids over the whole vocabulary.
y_values = tf.keras.utils.to_categorical(
  labels,
  num_classes=total_unique_words,
)

In [9]:
# Pre-trained word vectors in word2vec text format. Despite the variable
# name, the file is loaded with binary=False (the loader's default).
pathToBinVectors = 'cc.en.300.vec'
embed_map = word2vec1.KeyedVectors.load_word2vec_format(
  pathToBinVectors,
  binary=False,
)

In [None]:
# Sanity check: display the vector stored for a common word.
embed_map['hello']

In [11]:
# Build the (vocabulary size x 300) matrix used to initialise the Embedding
# layer. Row i holds the pre-trained vector of the word whose tokenizer id
# is i; rows for words without a pre-trained vector stay all-zero.
embeddings_matrix = np.zeros((total_unique_words, 300))
wordss = []  # every vocabulary word, in word_index iteration order
for word, i in word_index.items():
  wordss.append(word)
  # get_vector() raises KeyError for out-of-vocabulary words instead of
  # returning None, so test membership first rather than checking for None.
  if word in embed_map:
    embedding_vector = embed_map.get_vector(word)
    embeddings_matrix[i] = embedding_vector

In [14]:
# Next-word classifier: frozen pre-trained embeddings feeding two stacked
# bidirectional LSTMs, followed by a dense head with a softmax over the
# whole vocabulary.
model = Sequential([
  Embedding(
    input_dim=total_unique_words,
    output_dim=300,
    weights=[embeddings_matrix],
    input_length=max_seq_length-1,  # inputs exclude the label column
    trainable=False,                # keep the pre-trained vectors fixed
  ),
  Bidirectional(LSTM(256, return_sequences=True)),
  tf.keras.layers.Dropout(0.2),
  Bidirectional(LSTM(256)),
  tf.keras.layers.Dropout(0.2),
  Dense(128, activation='relu'),
  Dense(total_unique_words, activation='softmax'),
])

model.compile(
  optimizer=Adam(learning_rate=0.001),
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [None]:
# Train the model, holding out 20% of the samples for validation.
history = model.fit(
  x_values,
  y_values,
  batch_size=20,
  epochs=200,
  validation_split=0.2,
  verbose=1,
)

In [None]:
def prediction(seed_text, next_words):
  """Extend ``seed_text`` one predicted word at a time and print each step.

  Args:
    seed_text: Starting text that the model continues.
    next_words: Number of words to generate.

  Returns:
    None. The progressively extended text is printed after every
    generated word.
  """
  for _ in range(next_words):
    # Encode the current text the same way the training inputs were built.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_length-1, padding='pre')

    # argmax over the class axis gives one id per batch row; the batch has a
    # single row, so take its scalar value.
    predicted = np.argmax(model.predict(token_list, verbose=1), axis=-1)
    predicted_id = int(predicted[0])

    # Direct id -> word lookup. Falls back to "" when the id maps to no word,
    # so output_word is always defined and never carries over a stale value
    # from the previous step.
    output_word = tokenizer.index_word.get(predicted_id, "")

    seed_text += ' '+output_word
    print(seed_text)

# Generate a short continuation of the seed phrase.
seed_phrase = "I am"
# Number of words to generate, taken from the word count of a sample string.
next_words = len("one two  three".split())
prediction(seed_text=seed_phrase, next_words=next_words)