In [33]:
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt

In [34]:
BATCH_SIZE = 64
EPOCHS = 20
LSTM_NODES =256
NUM_SENTENCES = 20000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

In [35]:
input_sentences = []
output_sentences = []
output_sentences_inputs = []

count = 0
for line in open('data/fra_data.txt', encoding="utf-8"):
    count += 1

    if count > NUM_SENTENCES:
        break

    if '\t' not in line:
        continue

    input_sentence, output = line.rstrip().split('\t')

    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output

    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 20000
num samples output: 20000
num samples output input: 20000


In [36]:
print(input_sentences[173])
print(output_sentences[173])
print(output_sentences_inputs[173])

I'm ill.
Je suis malade. <eos>
<sos> Je suis malade.


In [37]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 3514
Length of longest sentence in input: 6


In [38]:
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 9532
Length of longest sentence in the output: 13


In [43]:
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[173]:", encoder_input_sequences[173])

encoder_input_sequences.shape: (20000, 6)
encoder_input_sequences[173]: [  0   0   0   0   6 540]


In [64]:
encoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len)
print("encoder_output_sequences.shape:", encoder_output_sequences.shape)
print("encoder_output_sequences[173]:", encoder_output_sequences[173])

encoder_output_sequences.shape: (20000, 13)
encoder_output_sequences[173]: [  0   0   0   0   0   0   0   0   0   3   6 188   1]


In [44]:
print(word2idx_inputs["i'm"])
print(word2idx_inputs["ill"])

6
540


In [46]:
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[173]:", decoder_input_sequences[173])

decoder_input_sequences.shape: (20000, 13)
decoder_input_sequences[173]: [  2   3   6 188   0   0   0   0   0   0   0   0   0]


In [76]:
decoder_output_sequences = pad_sequences(output_input_integer_seq, maxlen=max_input_len, padding='post')
print("decoder_output_sequences.shape:", decoder_output_sequences.shape)
print("decoder_output_sequences[173]:", decoder_output_sequences[173])

decoder_output_sequences.shape: (20000, 6)
decoder_output_sequences[173]: [  2   3   6 188   0   0]


In [77]:
print(word2idx_outputs["<sos>"])
print(word2idx_outputs["je"])
print(word2idx_outputs["suis"])
print(word2idx_outputs["malade."])

2
3
6
188


In [49]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open('glove/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [50]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [51]:
print(embeddings_dictionary["ill"])

[ 0.12648    0.1366     0.22192   -0.025204  -0.7197     0.66147
  0.48509    0.057223   0.13829   -0.26375   -0.23647    0.74349
  0.46737   -0.462      0.20031   -0.26302    0.093948  -0.61756
 -0.28213    0.1353     0.28213    0.21813    0.16418    0.22547
 -0.98945    0.29624   -0.62476   -0.29535    0.21534    0.92274
  0.38388    0.55744   -0.14628   -0.15674   -0.51941    0.25629
 -0.0079678  0.12998   -0.029192   0.20868   -0.55127    0.075353
  0.44746   -0.71046    0.75562    0.010378   0.095229   0.16673
  0.22073   -0.46562   -0.10199   -0.80386    0.45162    0.45183
  0.19869   -1.6571     0.7584    -0.40298    0.82426   -0.386
  0.0039546  0.61318    0.02701   -0.3308    -0.095652  -0.082164
  0.7858     0.13394   -0.32715   -0.31371   -0.20247   -0.73001
 -0.49343    0.56445    0.61038    0.36777   -0.070182   0.44859
 -0.61774   -0.18849    0.65592    0.44797   -0.10469    0.62512
 -1.9474    -0.60622    0.073874   0.50013   -1.1278    -0.42066
 -0.37322   -0.50538    0

In [52]:
print(embedding_matrix[540])

[ 0.12648     0.1366      0.22192    -0.025204   -0.71969998  0.66147
  0.48508999  0.057223    0.13829    -0.26374999 -0.23647     0.74348998
  0.46737    -0.46200001  0.20031001 -0.26302001  0.093948   -0.61756003
 -0.28213     0.1353      0.28213     0.21813001  0.16418     0.22547001
 -0.98944998  0.29624    -0.62475997 -0.29534999  0.21534     0.92273998
  0.38387999  0.55743998 -0.14628001 -0.15673999 -0.51941001  0.25628999
 -0.0079678   0.12998    -0.029192    0.20868    -0.55127001  0.075353
  0.44746    -0.71046001  0.75562     0.010378    0.095229    0.16673
  0.22073001 -0.46562001 -0.10199    -0.80386001  0.45162001  0.45183
  0.19869    -1.65709996  0.75840002 -0.40298     0.82426    -0.38600001
  0.0039546   0.61317998  0.02701    -0.3308     -0.095652   -0.082164
  0.78579998  0.13394    -0.32714999 -0.31371    -0.20247    -0.73000997
 -0.49342999  0.56445003  0.61037999  0.36776999 -0.070182    0.44859001
 -0.61773998 -0.18849     0.65592003  0.44797    -0.10469     0.

In [53]:
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

In [54]:
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)

In [55]:
decoder_targets_one_hot.shape

(20000, 13, 9533)

In [78]:
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [57]:
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [58]:
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [59]:
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [60]:
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [61]:
from keras.utils import plot_model
plot_model(model, to_file='model_plot4a.png', show_shapes=True, show_layer_names=True)

In [62]:
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Train on 18000 samples, validate on 2000 samples
Epoch 1/20
 4096/18000 [=====>........................] - ETA: 4:45 - loss: 0.0000e+00 - acc: 0.0000e+00

KeyboardInterrupt: 