# Declaration

In [6]:
import keras
import re
import nltk
import numpy as np
from keras import preprocessing
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import Model
from keras.layers import SeparableConv1D, MaxPooling1D
from keras.layers import Input
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Flatten
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint



keras.__version__

'2.2.4'

# Data Loading

In [7]:
trainFromTextFile = "train.FROM"
trainToTextFile   = "train.TO"
trainFromText     = open(trainFromTextFile, 'r', encoding='utf-8').read().lower()
trainToText       = open(trainToTextFile, 'r', encoding='utf-8').read().lower()
trainFromSentence = re.split('\n', trainFromText)
trainToSentence   = re.split('\n', trainToText)
trainFromWords = re.split(' |\n', trainFromText)
trainToWords   = re.split(' |\n', trainToText)

print('Found %s sentences from TrainFrom Text' %len(trainFromSentence))
print('Found %s sentences from TrainTo Text' %len(trainToSentence))
print('Found %s words from TrainFrom Text' %len(trainFromWords))
print('Found %s words from TrainTo Text' %len(trainToWords))

Found 29620 sentences from TrainFrom Text
Found 29620 sentences from TrainTo Text
Found 521666 words from TrainFrom Text
Found 479824 words from TrainTo Text


In [8]:
print(trainFromWords[6])
print(trainToWords[0])
print(trainFromSentence[0])
print(trainToSentence[0])

salad?
jesus,
does this also apply to the salad?
jesus, i hope not.


In [9]:
trainInput = trainFromSentence[0:1000]
trainTarget = trainToSentence[0:1000]

len(trainTarget)

1000

# Tokenization

In [10]:
max_len = 100    # We will cut comments after 100 words
#max_words = 10000  # We will only consider the top 10,000 words in the dataset

tokenizerInput = Tokenizer()
tokenizerInput.fit_on_texts(trainInput)

sequencesInputEncode = tokenizerInput.texts_to_sequences(trainInput)
sequencesInputEncode = pad_sequences(sequencesInputEncode, maxlen=max_len)  #Pad so all the arrays are the same size

Inputindex = tokenizerInput.word_index
Inputcount = tokenizerInput.word_counts
nEncoderToken = len(tokenizerInput.word_index)+1

trainInputEncoded = to_categorical([sequencesInputEncode])

trainInputEncoded = trainInputEncoded.reshape(len(trainInput), max_len, nEncoderToken)

print("Train From File:\n")
print('Found %s sentences.' %len(trainInput))
print('Found %s sequences.' %len(sequencesInputEncode))
print('Found %s unique tokens.' % len(Inputindex))
print('Found %s unique words.' % len(Inputcount))

Train From File:

Found 1000 sentences.
Found 1000 sequences.
Found 4304 unique tokens.
Found 4304 unique words.


In [11]:
trainInputEncoded.shape

(1000, 100, 4305)

In [13]:
tokenizerTarget = Tokenizer()
tokenizerTarget.fit_on_texts(trainTarget)

sequencesInputDecode = tokenizerTarget.texts_to_sequences(trainTarget)
sequencesTargetDecode = tokenizerTarget.texts_to_sequences(trainTarget)

#Pops the First Element in the Sequence (To prepare for Decoder Target)
for seq in sequencesTargetDecode:
    if seq:
        _ = seq.pop(0)

In [15]:
sequencesInputDecode[0]

[576, 5, 272, 15]

In [16]:
sequencesTargetDecode[0]

[5, 272, 15]

In [17]:
sequencesInputDecode = pad_sequences(sequencesInputDecode, maxlen=max_len)  #Pad so all the arrays are the same size
sequencesTargetDecode = pad_sequences(sequencesTargetDecode, maxlen=max_len)  #Pad so all the arrays are the same size
Targetindex = tokenizerTarget.word_index
Targetcount = tokenizerTarget.word_counts
nDecoderToken = len(tokenizerTarget.word_index)+1

In [27]:
Targetindex

{'the': 1,
 'newlinechar': 2,
 'to': 3,
 'a': 4,
 'i': 5,
 'you': 6,
 'and': 7,
 'of': 8,
 'is': 9,
 'that': 10,
 'it': 11,
 'in': 12,
 'for': 13,
 'on': 14,
 'not': 15,
 'they': 16,
 'be': 17,
 'have': 18,
 'was': 19,
 'are': 20,
 'but': 21,
 'if': 22,
 'with': 23,
 "it's": 24,
 'just': 25,
 'my': 26,
 'your': 27,
 'or': 28,
 'like': 29,
 'as': 30,
 'would': 31,
 'all': 32,
 'what': 33,
 'this': 34,
 "don't": 35,
 'so': 36,
 'he': 37,
 'http': 38,
 'think': 39,
 'people': 40,
 'do': 41,
 'me': 42,
 'no': 43,
 'at': 44,
 "'": 45,
 'from': 46,
 'one': 47,
 'can': 48,
 'gt': 49,
 'how': 50,
 'there': 51,
 'about': 52,
 'more': 53,
 'we': 54,
 'get': 55,
 'know': 56,
 'com': 57,
 'some': 58,
 'them': 59,
 'then': 60,
 'an': 61,
 "that's": 62,
 'will': 63,
 'up': 64,
 "i'm": 65,
 'good': 66,
 'well': 67,
 'by': 68,
 'who': 69,
 'because': 70,
 'make': 71,
 'than': 72,
 'their': 73,
 'his': 74,
 'too': 75,
 'when': 76,
 'www': 77,
 'out': 78,
 'why': 79,
 'actually': 80,
 'which': 81,
 'red

In [18]:
trainInputDecoded = to_categorical([sequencesInputDecode], num_classes=nDecoderToken)
trainTargetDecoded = to_categorical([sequencesTargetDecode], num_classes=nDecoderToken)

trainInputDecoded = trainInputDecoded.reshape(len(trainTarget), max_len, nDecoderToken)
trainTargetDecoded = trainTargetDecoded.reshape(len(trainTarget), max_len, nDecoderToken)

print("Train From File:\n")
print('Found %s sentences.' %len(trainTarget))
print('Found %s sequences.' %len(sequencesInputDecode))
print('Found %s unique tokens.' % len(Targetindex))
print('Found %s unique words.' % len(Targetcount))

Train From File:

Found 1000 sentences.
Found 1000 sequences.
Found 4129 unique tokens.
Found 4129 unique words.


In [19]:
trainInputDecoded.shape

(1000, 100, 4130)

In [45]:
nUnits = 300
# Define training encoder
encoder_inputs = Input(shape=(1000, nEncoderToken))
encoder = LSTM(nUnits, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Define training decoder
decoder_inputs = Input(shape=(None, nDecoderToken))
decoder_lstm = LSTM(nUnits, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(nDecoderToken, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [39]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([trainInputEncoded, trainInputDecoded], trainTargetDecoded, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1cfce99b308>

In [40]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in Inputindex.items())
reverse_target_char_index = dict((i, char) for char, i in Targetindex.items())

In [41]:
# Define inference encoder
encoder_model = Model(encoder_inputs, encoder_states)

# Define inference decoder
decoder_state_input_h = Input(shape=(nUnits,))
decoder_state_input_c = Input(shape=(nUnits,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [36]:
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

NameError: name 'encoder_input_data' is not defined

In [33]:
# Encode the input as state vectors.
states_value = encoder_model.predict(input_seq)
# Generate empty target sequence of length 1.
target_seq = np.zeros((1, 1, nDecoderToken))

decoded_sentence = ''
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

# Sample a token
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char



NameError: name 'states_value' is not defined

In [46]:
encoder_inputs

<tf.Tensor 'input_10:0' shape=(?, 1000, 4305) dtype=float32>

In [43]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, nDecoderToken))
    
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, nDecoderToken))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


for seq_index in range(100):
    # Take one sequence (part of the training set) for trying out decoding.
    input_seq = encoder_inputs[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

ValueError: When feeding symbolic tensors to a model, we expect thetensors to have a static batch size. Got tensor with shape: (None, None, 4305)

In [45]:
# Generate empty target sequence of length 1.
target_seq = np.zeros((1, 1, nDecoderToken))

In [62]:
target_seq.shape

(1, 1, 4130)

In [64]:
source = np.zeros((1, 100, nEncoderToken))

In [102]:
source = trainInputEncoded[20:30, :, :]

In [103]:
source.shape

(10, 100, 4305)

In [104]:
source[0]

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [105]:
outputSource = np.zeros((5, 100, nEncoderToken))

In [106]:
outputSource.shape

(5, 100, 4305)

In [107]:
state = encoder_model.predict(source)
yhat, h, c = decoder_model.predict([target_seq] + state)

sampled_token_index = np.argmax(yhat)

print(yhat)
print(sampled_token_index)

[[[8.8839579e-01 9.5118629e-03 6.7120218e-03 ... 3.5038986e-06
   3.5079911e-06 3.5739542e-06]]]
0


In [88]:
nSteps = 5
state = encoder_model.predict(source)
# Collect predictions
output = list()
for t in range(nSteps):
    # Predict next word
    yhat, h, c = decoder_model.predict([target_seq] + state)
    # Store prediction
    outputSource[t, :, :] = yhat[0,0,:]
    output.append(yhat[0,0,:])
    # Update state
    state = [h, c]
    # Update target sequence
    target_seq = yhat

ValueError: could not broadcast input array from shape (4130) into shape (100,4305)

In [79]:
target_seq.shape

(1, 1, 4130)

In [85]:
output.shape

AttributeError: 'list' object has no attribute 'shape'

In [83]:
# Sample a token
sampled_token_index = np.argmax(target_seq[0, -1, :])

In [84]:
sampled_token_index

0

In [None]:
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char

In [82]:
reverse_target_char_index

{1: 'the',
 2: 'newlinechar',
 3: 'to',
 4: 'a',
 5: 'i',
 6: 'you',
 7: 'and',
 8: 'of',
 9: 'is',
 10: 'that',
 11: 'it',
 12: 'in',
 13: 'for',
 14: 'on',
 15: 'not',
 16: 'they',
 17: 'be',
 18: 'have',
 19: 'was',
 20: 'are',
 21: 'but',
 22: 'if',
 23: 'with',
 24: "it's",
 25: 'just',
 26: 'my',
 27: 'your',
 28: 'or',
 29: 'like',
 30: 'as',
 31: 'would',
 32: 'all',
 33: 'what',
 34: 'this',
 35: "don't",
 36: 'so',
 37: 'he',
 38: 'http',
 39: 'think',
 40: 'people',
 41: 'do',
 42: 'me',
 43: 'no',
 44: 'at',
 45: "'",
 46: 'from',
 47: 'one',
 48: 'can',
 49: 'gt',
 50: 'how',
 51: 'there',
 52: 'about',
 53: 'more',
 54: 'we',
 55: 'get',
 56: 'know',
 57: 'com',
 58: 'some',
 59: 'them',
 60: 'then',
 61: 'an',
 62: "that's",
 63: 'will',
 64: 'up',
 65: "i'm",
 66: 'good',
 67: 'well',
 68: 'by',
 69: 'who',
 70: 'because',
 71: 'make',
 72: 'than',
 73: 'their',
 74: 'his',
 75: 'too',
 76: 'when',
 77: 'www',
 78: 'out',
 79: 'why',
 80: 'actually',
 81: 'which',
 82: 

In [58]:
# generate target given source sequence
def predict_sequence(infenc, infdec, source, n_steps, cardinality):
	# encode
	state = infenc.predict(source)
	# start of sequence input
	target_seq = array([0.0 for _ in range(cardinality)]).reshape(1, 1, cardinality)
	# collect predictions
	output = list()
	for t in range(n_steps):
		# predict next char
		yhat, h, c = infdec.predict([target_seq] + state)
		# store prediction
		output.append(yhat[0,0,:])
		# update state
		state = [h, c]
		# update target sequence
		target_seq = yhat
	return array(output)

In [38]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, nDecoderToken))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, Targetindex['\t']] = 1.

    # Sampling loop for a batch of sequences (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length or find stop character.
        if (sampled_char == '\n' or len(decoded_sentence) > max_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, nDecoderToken))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = trainInputEncoded[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

KeyError: '\t'