# Declaration

In [1]:
import keras
import re
import nltk
import numpy as np
import tensorflow as tf
from pickle import dump
from pickle import load
from keras import preprocessing
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import Model
from keras.models import load_model
from keras.layers import SeparableConv1D, MaxPooling1D
from keras.layers import Input
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Flatten
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint



keras.__version__

Using TensorFlow backend.


'2.3.1'

# Data Loading

In [2]:
trainFromTextFile = "train.FROM"
trainToTextFile   = "train.TO"
trainFromText     = open(trainFromTextFile, 'r', encoding='utf-8').read().lower()
trainToText       = open(trainToTextFile, 'r', encoding='utf-8').read().lower()
trainFromSentence = re.split('\n', trainFromText)
trainToSentence   = re.split('\n', trainToText)
trainFromWords = re.split(' |\n', trainFromText)
trainToWords   = re.split(' |\n', trainToText)

print('Found %s sentences from TrainFrom Text' %len(trainFromSentence))
print('Found %s sentences from TrainTo Text' %len(trainToSentence))
print('Found %s words from TrainFrom Text' %len(trainFromWords))
print('Found %s words from TrainTo Text' %len(trainToWords))

Found 29620 sentences from TrainFrom Text
Found 29620 sentences from TrainTo Text
Found 521666 words from TrainFrom Text
Found 479824 words from TrainTo Text


In [3]:
trainInput = trainFromSentence[2000:3000]
trainTarget = trainToSentence[2000:3000]

In [4]:
max_len = 50    # Cut comments after 100 words
max_words = 10000  # Consider the top 10,000 words in the dataset

tokenizerInput = Tokenizer()
tokenizerInput.fit_on_texts(trainInput)

sequencesInputEncode = tokenizerInput.texts_to_sequences(trainInput)
sequencesInputEncode = pad_sequences(sequencesInputEncode, maxlen=max_len)  #Pad so all the arrays are the same size

Inputindex = tokenizerInput.word_index
Inputcount = tokenizerInput.word_counts
nEncoderToken = len(tokenizerInput.word_index)+1

trainInputEncoded = to_categorical([sequencesInputEncode])
trainInputEncoded = trainInputEncoded.reshape(len(trainInput), max_len, nEncoderToken)

print("Train From File:\n")
print('Found %s sentences.' %len(trainInput))
print('Found %s sequences.' %len(sequencesInputEncode))
print('Found %s unique tokens.' % len(Inputindex))
print('Found %s unique words.' % len(Inputcount))

Train From File:

Found 1000 sentences.
Found 1000 sequences.
Found 4161 unique tokens.
Found 4161 unique words.


In [5]:
tokenizerTarget = Tokenizer()
tokenizerTarget.fit_on_texts(trainTarget)

sequencesInputDecode = tokenizerTarget.texts_to_sequences(trainTarget)
sequencesTargetDecode = tokenizerTarget.texts_to_sequences(trainTarget)

#Pops the First Element in the Sequence (To prepare for Decoder Target)
for seq in sequencesTargetDecode:
    if seq:
        _ = seq.pop(0)
        
sequencesInputDecode = pad_sequences(sequencesInputDecode, maxlen=max_len)  #Pad so all the arrays are the same size
sequencesTargetDecode = pad_sequences(sequencesTargetDecode, maxlen=max_len)  #Pad so all the arrays are the same size

Targetindex = tokenizerTarget.word_index
Targetcount = tokenizerTarget.word_counts
nDecoderToken = len(tokenizerTarget.word_index)+1

trainInputDecoded = to_categorical([sequencesInputDecode], num_classes=nDecoderToken)
trainTargetDecoded = to_categorical([sequencesTargetDecode], num_classes=nDecoderToken)

trainInputDecoded = trainInputDecoded.reshape(len(trainTarget), max_len, nDecoderToken)
trainTargetDecoded = trainTargetDecoded.reshape(len(trainTarget), max_len, nDecoderToken)

print("Train From File:\n")
print('Found %s sentences.' %len(trainTarget))
print('Found %s sequences.' %len(sequencesInputDecode))
print('Found %s unique tokens.' % len(Targetindex))
print('Found %s unique words.' % len(Targetcount))

Train From File:

Found 1000 sentences.
Found 1000 sequences.
Found 4057 unique tokens.
Found 4057 unique words.


In [6]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in Inputindex.items())
reverse_target_char_index = dict((i, char) for char, i in Targetindex.items())

In [7]:
trainInputEncoded = trainInputEncoded.mean(axis=2, keepdims=False)
trainInputDecoded = trainInputDecoded.mean(axis=2, keepdims=False)
trainTargetDecoded = trainTargetDecoded.mean(axis=2, keepdims=False)

In [8]:
print(trainInputEncoded.shape)
print(trainInputDecoded.shape)
print(trainTargetDecoded.shape)

(1000, 50)
(1000, 50)
(1000, 50)


In [9]:
Inputindex['to']

4

In [10]:
sequencesTargetDecode[2,48]

4

In [11]:
trainTargetDecoded

array([[0.00024643, 0.00024643, 0.00024643, ..., 0.00024643, 0.00024643,
        0.00024643],
       [0.00024643, 0.00024643, 0.00024643, ..., 0.00024643, 0.00024643,
        0.00024643],
       [0.00024643, 0.00024643, 0.00024643, ..., 0.00024643, 0.00024643,
        0.00024643],
       ...,
       [0.00024643, 0.00024643, 0.00024643, ..., 0.00024643, 0.00024643,
        0.00024643],
       [0.00024643, 0.00024643, 0.00024643, ..., 0.00024643, 0.00024643,
        0.00024643],
       [0.00024643, 0.00024643, 0.00024643, ..., 0.00024643, 0.00024643,
        0.00024643]], dtype=float32)

# Embedding

In [12]:
glove_dir = r'C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lab Exercises\Machine Learning Projects\glove.6B\glove.6B.300d.txt'

embeddings_index = {}
f = open(glove_dir, encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [13]:
# The Embedding layer takes at least two arguments:
# the number of possible tokens, the maximum word index,
# and the dimensionality of the embeddings, here 300.
embedding_dim = 300

def CreateEmbeddingMatrix(nWords, is_Encoder: bool):
    embedding_matrix = np.zeros((nWords, embedding_dim))
    if is_Encoder:
        for word, i in Inputindex.items():
            embedding_vector = embeddings_index.get(word)
            if i < nWords:
                if embedding_vector is not None:
                    # Words not found in embedding index will be all-zeros.
                    embedding_matrix[i] = embedding_vector
        return embedding_matrix
    else:
        for word, i in Targetindex.items():
            embedding_vector = embeddings_index.get(word)
            if i < nWords:
                if embedding_vector is not None:
                    # Words not found in embedding index will be all-zeros.
                    embedding_matrix[i] = embedding_vector
        return embedding_matrix
        

In [14]:
encoder_embedding_matrix = CreateEmbeddingMatrix(nEncoderToken, True)
decoder_embedding_matrix = CreateEmbeddingMatrix(nDecoderToken, False)

In [15]:
print(encoder_embedding_matrix.shape)
print(decoder_embedding_matrix.shape)

(4162, 300)
(4058, 300)


# Encoder-Decoder

In [16]:
# LSTM takes in only 3 Dimensions.  Embedding adds 1 Dimension, therefore, data has to be 2 Dimensions

In [17]:
# Define the Encoder
encoder_inputs = Input(shape=(None,))
encoder_embed = Embedding(nEncoderToken, embedding_dim, weights=[encoder_embedding_matrix])
encoder_lstm = LSTM(embedding_dim, return_state=True)
encoder_inference_inputs = encoder_embed(encoder_inputs)
encoder = encoder_embed(encoder_inputs)
encoder, state_h, state_c = encoder_lstm(encoder)
encoder_states = [state_h, state_c]

# Define the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embed = Embedding(nDecoderToken, embedding_dim, weights=[decoder_embedding_matrix])
decoder_lstm = LSTM(embedding_dim, return_state=True)
decoder_dense = Dense(max_len, activation='softmax')

decoder = decoder_embed(decoder_inputs)
decoder, _, _ = decoder_lstm(decoder, initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder)

# Define the model that will turn `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    1248600     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    1217400     input_2[0][0]                    
___________

In [33]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
path = r'C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5'
checkpoint = ModelCheckpoint(path, monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit([sequencesInputEncode, sequencesInputDecode], sequencesTargetDecode, epochs=100,verbose=1,callbacks=[checkpoint])
model.save('s9s.h5')
#dump(tokenizer,open('tokenizer_Model9','wb')) 

Epoch 1/100

Epoch 00001: loss improved from inf to 601289.17300, saving model to C:\Users\mosta\Desktop\Deep Learning Projects\Projects\Lib\word_pred_Model4.h5
Epoch 2/100

Epoch 00002: loss did not improve from 601289.17300
Epoch 3/100

Epoch 00003: loss did not improve from 601289.17300
Epoch 4/100

Epoch 00004: loss did not improve from 601289.17300
Epoch 5/100

Epoch 00005: loss did not improve from 601289.17300
Epoch 6/100

Epoch 00006: loss did not improve from 601289.17300
Epoch 7/100

Epoch 00007: loss did not improve from 601289.17300
Epoch 8/100

Epoch 00008: loss did not improve from 601289.17300
Epoch 9/100

Epoch 00009: loss did not improve from 601289.17300
Epoch 10/100

Epoch 00010: loss did not improve from 601289.17300
Epoch 11/100

Epoch 00011: loss did not improve from 601289.17300
Epoch 12/100

Epoch 00012: loss did not improve from 601289.17300
Epoch 13/100

Epoch 00013: loss did not improve from 601289.17300
Epoch 14/100

Epoch 00014: loss did not improve from 60


Epoch 00051: loss did not improve from 601289.17300
Epoch 52/100

Epoch 00052: loss did not improve from 601289.17300
Epoch 53/100

Epoch 00053: loss did not improve from 601289.17300
Epoch 54/100

Epoch 00054: loss did not improve from 601289.17300
Epoch 55/100

Epoch 00055: loss did not improve from 601289.17300
Epoch 56/100

Epoch 00056: loss did not improve from 601289.17300
Epoch 57/100

Epoch 00057: loss did not improve from 601289.17300
Epoch 58/100

Epoch 00058: loss did not improve from 601289.17300
Epoch 59/100

Epoch 00059: loss did not improve from 601289.17300
Epoch 60/100

Epoch 00060: loss did not improve from 601289.17300
Epoch 61/100

Epoch 00061: loss did not improve from 601289.17300
Epoch 62/100

Epoch 00062: loss did not improve from 601289.17300
Epoch 63/100

Epoch 00063: loss did not improve from 601289.17300
Epoch 64/100

Epoch 00064: loss did not improve from 601289.17300
Epoch 65/100

Epoch 00065: loss did not improve from 601289.17300
Epoch 66/100

Epoch 000

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([sequencesInputEncode, sequencesInputDecode], sequencesTargetDecode, epochs=25)
model.save('s2s.h5')

In [27]:
# Define Sampling Models

In [34]:
# Define Encoder Inference Model
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 300)         1248600   
_________________________________________________________________
lstm_1 (LSTM)                [(None, 300), (None, 300) 721200    
Total params: 1,969,800
Trainable params: 1,969,800
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Define Decoder Inference Model
decoder_state_input_h = Input(shape=(None,))
decoder_state_input_c = Input(shape=(None,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_inference = decoder_embed(decoder_inputs)
decoder_inference_outputs, state_h, state_c = decoder_lstm(decoder_inference, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
#decoder_inference_outputs = decoder_dense(decoder_inference_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_inference_outputs] + decoder_states)
decoder_model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    1217400     input_2[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, None)         0                                            
____________________________________________________________________________________________

In [36]:
# Sampling Function

In [37]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    #target_seq[0, 0] = target_token_index['START_']
    # Sampling loop for a batch of sequences (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char
        
        # Exit condition: either hit max length or find stop character.
        #if (sampled_char == '_END' or len(decoded_sentence) > max_len):
            #stop_condition = True
        
        if (sampled_char == '\n' or len(decoded_sentence) > max_len):
            stop_condition = True
            
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        # Update states
        states_value = [h, c]
    return decoded_sentence

In [None]:
def tokenBacktoWords(sequencesInput):
    decoded_sentence = ''
    for i in sequencesInput:
        number = sequencesInput[:,i]
        char = reverse_input_char_index[number]
        decoded_sentence += ' '+char
    return decoded_sentence

In [38]:
for seq_index in range(10):
    input_seq = sequencesInputEncode[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input Sentence: ', trainInput[seq_index])
    print('Decoded Sentence: ', decoded_sentence)

-
Input Sentence:  does he say anywhere on the book that emacs is better than vi?
Decoded Sentence:   1 1 how newlinechar newlinechar newlinechar newlinechar
-
Input Sentence:  i'm just going by what you wrote here, amigo. there is only one rami james, graphic designer, tel aviv resident, who goes by the nickname of lonesmurf. which is it, swiss, israeli, or american? newlinechar  newlinechar http://www.pokermoons.com/
Decoded Sentence:   1 how how newlinechar newlinechar newlinechar newlinechar
-
Input Sentence:  1. vibrations are fun. newlinechar 2. sometimes they want to do it themselves.
Decoded Sentence:   1 how make newlinechar newlinechar newlinechar newlinechar
-
Input Sentence:  obviously. newlinechar  newlinechar chandler's demise is only evidence that the chandler team had no coherent vision or design.
Decoded Sentence:   1 how how newlinechar so newlinechar newlinechar newlinechar
-
Input Sentence:  what about newlinechar  newlinechar *the people should ensure that votes ar

In [24]:
trainInput[1]

"i'm just going by what you wrote here, amigo. there is only one rami james, graphic designer, tel aviv resident, who goes by the nickname of lonesmurf. which is it, swiss, israeli, or american? newlinechar  newlinechar http://www.pokermoons.com/"

In [None]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
encoder = Embedding(nEncoderToken, embedding_dim, weights=[encoder_embedding_matrix], input_length=max_len)(encoder_inputs)
x = encoder
x, state_h, state_c = LSTM(embedding_dim, return_state=True)(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder = Embedding(nDecoderToken, embedding_dim, weights=[decoder_embedding_matrix], input_length=max_len)(decoder_inputs)
y = decoder
y = LSTM(embedding_dim, return_sequences=True)(y, initial_state=encoder_states)
decoder_outputs = Dense(nDecoderToken, activation='softmax')(y)

# Define the model that will turn `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([trainInputEncoded, trainInputDecoded], trainTargetDecoded, epochs=25)
model.save('s2s.h5')

In [None]:
encoder_inputs = Input(shape=(None,))
encoder_embed = Embedding(nEncoderToken, embedding_dim, weights=[encoder_embedding_matrix])(encoder_inputs)

# Encoder lstm
encoder = LSTM(embedding_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embed)
encoder_states = [state_h, state_c]

In [None]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embed = Embedding(nDecoderToken, embedding_dim, weights=[decoder_embedding_matrix])(decoder_inputs)

# decoder lstm
decoder = LSTM(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_embed, initial_state=encoder_states)
decoder_dense = Dense(max_len, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Define the model that will turn `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Define Encoder Inference Model
encoder_model = Model(encoder_inputs, encoder_states)

In [None]:
decoder_state_input_h = Input(shape=(None,))
decoder_state_input_c = Input(shape=(None,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = LSTM(embedding_dim, return_sequences=True)(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = Dense(max_len, activation='softmax')(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)

In [None]:
encoder

# Model

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([sequencesInputEncode, sequencesInputDecode], sequencesTargetDecode, epochs=15)
model.save('s2s.h5')

In [None]:
reverse_input_char_index

In [None]:
# Define Inference Encoder
encoder_inference_inputs = Input(shape=(None, nEncoderToken))
encoder_model = Model(encoder_inference_inputs, encoder_states)

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start character.
    #target_seq[0, 0, Targetindex['\t']] = 1.

    # Sampling loop for a batch of sequences (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1])
        print(output_tokens[0, -1, :])
        print(sampled_token_index)
        # Index start from 1
        if(sampled_token_index == 0):
            sampled_token_index = 1
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += (' ' + sampled_char)

        # Exit condition: either hit max length or find stop character.
        if (sampled_char == '\n' or len(decoded_sentence) > max_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, nDecoderToken))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


for seq_index in range(50):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = sequencesInputEncode[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', trainFromSentence[seq_index])
    print('Decoded sentence:', decoded_sentence)

In [None]:
# Define an Encoder
encoder_inputs = Input(shape=(None, nEncoderToken))
encoder = LSTM(embedding_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Define a Decoder
decoder_inputs = Input(shape=(None, nDecoderToken))
# We set up our decoder to return full output sequences, and to return internal states as well. 
# We don't use the return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(nDecoderToken, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(embedding_dim,))
decoder_state_input_c = Input(shape=(embedding_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)