In [1]:
import keras
import numpy as np
import pandas as pd

import string

Using TensorFlow backend.


### Data Preprocessing

In [2]:
# Read the sentence pairs (read_table is tab-delimited by default), discard the
# third 'description' column and keep only the first 10 rows.
text_data = (
    pd.read_table('../data/fra.txt', names=['en', 'fr', 'description'])
    .drop(columns='description')
    .iloc[:10]
)
text_data

Unnamed: 0,en,fr
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !
5,Who?,Qui ?
6,Wow!,Ça alors !
7,Fire!,Au feu !
8,Help!,À l'aide !
9,Jump.,Saute.


In [3]:
def normalize_strings(s):
    """Return `s` with ASCII punctuation removed, lowercased and trimmed.

    Only the characters in `string.punctuation` are removed; leading and
    trailing whitespace is stripped after lowercasing.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = s.translate(drop_punctuation)
    return cleaned.lower().strip()

# Normalise both language columns with the same cleaning function.
text_data = text_data.assign(
    en=text_data['en'].map(normalize_strings),
    fr=text_data['fr'].map(normalize_strings),
)

text_data.head()

Unnamed: 0,en,fr
0,go,va
1,hi,salut
2,hi,salut
3,run,cours
4,run,courez


In [4]:
def get_vocab(data):
    """Collect the distinct characters found in an iterable of sentences.

    Parameters
    ----------
    data : iterable of str
        Sentences to scan.

    Returns
    -------
    set
        Every character that occurs in at least one sentence.
    """
    return {char for sentence in data for char in sentence}

total_pairs = len(text_data)

# Adding start ('\t') and ending ('\n') sequence to target.
# Markers that are already present are stripped first, so re-running this cell
# does not wrap the sentences a second time (which would otherwise inflate
# max_fr_sentence and shift every decoder step).
text_data['fr'] = '\t' + text_data['fr'].str.strip('\t\n') + '\n'

en_vocab = get_vocab(text_data['en'])
fr_vocab = get_vocab(text_data['fr'])

# The space character is used as the padding symbol when the one-hot tensors
# are filled, so it must be in both vocabularies even if no sentence has one.
en_vocab.add(' ')
fr_vocab.add(' ')

# Longest sentence plus one step, so every row has room for a padding step.
max_en_sentence = max(text_data['en'].apply(len)) + 1
max_fr_sentence = max(text_data['fr'].apply(len)) + 1

en_vocab_size = len(en_vocab)
fr_vocab_size = len(fr_vocab)

text_data.head()

Unnamed: 0,en,fr
0,go,\tva\n
1,hi,\tsalut\n
2,hi,\tsalut\n
3,run,\tcours\n
4,run,\tcourez\n


In [14]:
# Character <-> integer index lookups. The vocabularies are sets, whose
# iteration order for strings can differ between interpreter sessions, so they
# are sorted to give every character the same index on every run.
en_token_index = {char: i for i, char in enumerate(sorted(en_vocab))}
fr_token_index = {char: i for i, char in enumerate(sorted(fr_vocab))}

reverse_en_token_index = {i: char for char, i in en_token_index.items()}
reverse_fr_token_index = {i: char for char, i in fr_token_index.items()}

# One-hot tensors of shape (pairs, time steps, vocabulary size).
encoder_input_data = np.zeros((total_pairs, max_en_sentence, en_vocab_size), dtype='float32')
decoder_input_data = np.zeros((total_pairs, max_fr_sentence, fr_vocab_size), dtype='float32')
decoder_target_data = np.zeros_like(decoder_input_data)

# Generating one-hot representation

for i, (en_text, fr_text) in enumerate(zip(text_data['en'], text_data['fr'])):
    for t, char in enumerate(en_text):
        encoder_input_data[i, t, en_token_index[char]] = 1
    # Pad every remaining step with the space token. The position is taken from
    # len() rather than the leaked loop variable, so an empty sentence is
    # handled correctly too.
    encoder_input_data[i, len(en_text):, en_token_index[' ']] = 1

    for t, char in enumerate(fr_text):
        decoder_input_data[i, t, fr_token_index[char]] = 1
        if t > 0:
            # The target is the decoder input shifted one step to the left.
            decoder_target_data[i, t - 1, fr_token_index[char]] = 1
    # Pad the whole tail of both decoder tensors so that the padded inputs line
    # up with the padded targets (the target tail starts one step earlier
    # because of the shift).
    decoder_input_data[i, len(fr_text):, fr_token_index[' ']] = 1
    decoder_target_data[i, max(len(fr_text) - 1, 0):, fr_token_index[' ']] = 1

### Encoder

In [6]:
# Number of units in the encoder and decoder LSTM layers.
latent_dim = 10

# Variable-length sequences of one-hot English characters.
encoder_inputs = keras.layers.Input(shape=(None, en_vocab_size))
encoder = keras.layers.LSTM(units=latent_dim, return_state=True)

# With return_state=True the layer yields its output followed by the two
# state tensors; only the states are handed on to the decoder.
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states

### Decoder

In [7]:
# Variable-length sequences of one-hot French characters.
decoder_inputs = keras.layers.Input(shape=(None, fr_vocab_size))

# The decoder LSTM returns the full output sequence (one output per time step)
# and starts from the encoder's final states. Its own states are discarded
# here; they are only needed when the layer is reused for inference.
decoder_lstm = keras.layers.LSTM(
    units=latent_dim,
    return_sequences=True,
    return_state=True,
)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Softmax over the French vocabulary at every time step.
decoder_dense = keras.layers.Dense(units=fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

### Model for Training

In [15]:
# Training model: maps (encoder input, decoder input) to the decoder's
# per-step character distribution.
model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# verbose=0 silences the per-epoch log; the returned History object is the
# cell's displayed value.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=100,
    batch_size=10,
    verbose=0,
)

<keras.callbacks.callbacks.History at 0x28c1332edc8>

### Model for Inference

In [17]:
# Encoder for inference: input sentence -> [state_h, state_c].
encoder_model = keras.models.Model(inputs=encoder_inputs, outputs=encoder_states)

# Placeholders through which the decoder states are fed back in at each step.
decoder_states_inputs = [keras.layers.Input(shape=(latent_dim,)) for _ in range(2)]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

# Reuse the trained decoder layers, this time keeping the returned states.
# NOTE: this rebinds decoder_outputs, state_h and state_c, which earlier cells
# bound to tensors of the training graph.
decoder_outputs, *decoder_states = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
state_h, state_c = decoder_states
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder for inference: (input step, h, c) -> (char distribution, h, c).
decoder_model = keras.models.Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

In [26]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    The input is run through `encoder_model` to obtain the initial decoder
    states. `decoder_model` is then called one step at a time, starting from
    the start character '\\t' and feeding the most probable character back in,
    until the stop character '\\n' is produced or the decoded text becomes
    longer than `max_fr_sentence`.

    Parameters
    ----------
    input_seq
        Batch of size 1 passed unchanged to `encoder_model.predict`.

    Returns
    -------
    str
        The decoded characters, including the final '\\n' when one was sampled.
    """
    def _one_hot_step(token_index):
        # A (1, 1, vocab) target sequence holding a single one-hot character.
        step = np.zeros((1, 1, fr_vocab_size))
        step[0, 0, token_index] = 1.
        return step

    # Encoder states seed the decoder; decoding starts from the start character.
    states_value = encoder_model.predict(input_seq)
    target_seq = _one_hot_step(fr_token_index['\t'])

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable character at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_fr_token_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character or once the length limit is exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > max_fr_sentence:
            break

        # Feed the sampled character and the new states into the next step.
        target_seq = _one_hot_step(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

In [42]:
# Decode the first ten training pairs to eyeball what the model has learned.
for seq_index in range(10):
    # Slicing (rather than indexing) keeps the leading batch dimension of size 1.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence: {}'.format(text_data['en'].iloc[seq_index]))
    print('Decoded sentence: {}'.format(decoded_sentence))

-
Input sentence: go
Decoded sentence: va

-
Input sentence: hi
Decoded sentence: salut

-
Input sentence: hi
Decoded sentence: salut

-
Input sentence: run
Decoded sentence: cours

-
Input sentence: run
Decoded sentence: cours

-
Input sentence: who
Decoded sentence: qui

-
Input sentence: wow
Decoded sentence: ça alors

-
Input sentence: fire
Decoded sentence: au feu

-
Input sentence: help
Decoded sentence: à laide

-
Input sentence: jump
Decoded sentence: saute

