This homework is part of the Deep Learning class

In this notebook, I train an RNN model (LSTM and bi-LSTM models) to translate from French and Italian to English using letter by letter technique.

The dataset is downloaded from: "http://www.manythings.org/anki/" 

1. Data Preparation

1.1. Load and clean text

In [1]:
import re
import string
from unicodedata import normalize
import numpy

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return numpy.array(cleaned)


In [2]:
# e.g., filename = 'Data/deu.txt'
filename_fra = 'Data/fra.txt'
filename_ita = 'Data/ita.txt'

# e.g., n_train = 20000
n_train = 160003


In [3]:
# load dataset
doc_fra = load_doc(filename_fra)
doc_ita = load_doc(filename_ita)

# split into Language1-Language2 pairs
pairs_fra = to_pairs(doc_fra)
pairs_ita = to_pairs(doc_ita)

# clean sentences
clean_pairs_fra = clean_data(pairs_fra)[:n_train, :]
clean_pairs_ita = clean_data(pairs_ita)[:n_train, :]


In [4]:
for i in range(3000, 3010):
    print('[' + clean_pairs_fra[i, 0] + '] => [' + clean_pairs_fra[i, 1] + ']')
    
for i in range(3000, 3010):
    print('[' + clean_pairs_ita[i, 0] + '] => [' + clean_pairs_ita[i, 1] + ']')

[whose is it] => [a qui estce]
[whose is it] => [cest a qui]
[will it fit] => [cela iratil]
[will it fit] => [cela passeratil]
[will it fit] => [cela sadapteratil]
[will you go] => [irezvous]
[will you go] => [irastu]
[will you go] => [ty rendrastu]
[will you go] => [vous y rendrezvous]
[work slowly] => [travaille lentement]
[im flabby] => [io sono fiacco]
[im flabby] => [sono fiacca]
[im flabby] => [io sono fiacca]
[im for it] => [sono a favore]
[im for it] => [io sono a favore]
[im frugal] => [sono parsimonioso]
[im greedy] => [sono avido]
[im greedy] => [io sono avido]
[im greedy] => [sono avida]
[im greedy] => [io sono avida]


In [5]:
input_texts_fra  = clean_pairs_fra[:, 0]
target_texts_fra = ['\t' + text + '\n' for text in clean_pairs_fra[:, 1]]

print('Length of input_texts french :  ' + str(input_texts_fra.shape))
print('Length of target_texts french: ' + str(input_texts_fra.shape))


input_texts_ita  = clean_pairs_ita[:, 0]
target_texts_ita = ['\t' + text + '\n' for text in clean_pairs_ita[:, 1]]

print('Length of input_texts italian :  ' + str(input_texts_ita.shape))
print('Length of target_texts italian: ' + str(input_texts_ita.shape))

Length of input_texts french :  (160003,)
Length of target_texts french: (160003,)
Length of input_texts italian :  (160003,)
Length of target_texts italian: (160003,)


In [6]:
max_encoder_seq_length_fra = max(len(line) for line in input_texts_fra)
max_decoder_seq_length_fra = max(len(line) for line in target_texts_fra)

print('max length of input  sentences french: %d' % (max_encoder_seq_length_fra))
print('max length of target sentences french: %d' % (max_decoder_seq_length_fra))

max_encoder_seq_length_ita = max(len(line) for line in input_texts_ita)
max_decoder_seq_length_ita = max(len(line) for line in target_texts_ita)

print('max length of input  sentences italian: %d' % (max_encoder_seq_length_ita))
print('max length of target sentences italian: %d' % (max_decoder_seq_length_ita))

max length of input  sentences french: 54
max length of target sentences french: 119
max length of input  sentences italian: 25
max length of target sentences italian: 102


2. Text processing

2.1. Convert texts to sequences

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# encode and pad sequences
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index

encoder_input_seq_fra, input_token_index_fra  = text2sequences(max_encoder_seq_length_fra, input_texts_fra)
decoder_input_seq_fra, target_token_index_fra = text2sequences(max_decoder_seq_length_fra, target_texts_fra)

print('shape of encoder_input_seq French : ' + str(encoder_input_seq_fra.shape))
print('shape of input_token_index French : ' + str(len(input_token_index_fra)))
print('shape of decoder_input_seq French : ' + str(decoder_input_seq_fra.shape))
print('shape of target_token_index French: ' + str(len(target_token_index_fra)))

encoder_input_seq_ita, input_token_index_ita  = text2sequences(max_encoder_seq_length_ita, input_texts_ita)
decoder_input_seq_ita, target_token_index_ita = text2sequences(max_decoder_seq_length_ita, target_texts_ita)

print('shape of encoder_input_seq Italian : ' + str(encoder_input_seq_ita.shape))
print('shape of input_token_index Italian : ' + str(len(input_token_index_ita)))
print('shape of decoder_input_seq Italian : ' + str(decoder_input_seq_ita.shape))
print('shape of target_token_index Italian: ' + str(len(target_token_index_ita)))

Using TensorFlow backend.


shape of encoder_input_seq French : (160003, 54)
shape of input_token_index French : 27
shape of decoder_input_seq French : (160003, 119)
shape of target_token_index French: 29
shape of encoder_input_seq Italian : (160003, 25)
shape of input_token_index Italian : 27
shape of decoder_input_seq Italian : (160003, 102)
shape of target_token_index Italian: 29


In [8]:
# the 1 is added to add a class that there are no more letters
num_encoder_tokens_fra = len(input_token_index_fra) + 1
num_decoder_tokens_fra = len(target_token_index_fra) + 1

print('num_encoder_tokens French: ' + str(num_encoder_tokens_fra))
print('num_decoder_tokens French: ' + str(num_decoder_tokens_fra))

# the 1 is added to add a class that there are no more letters
num_encoder_tokens_ita = len(input_token_index_ita) + 1
num_decoder_tokens_ita = len(target_token_index_ita) + 1

print('num_encoder_tokens Italian: ' + str(num_encoder_tokens_ita))
print('num_decoder_tokens Italian: ' + str(num_decoder_tokens_ita))

num_encoder_tokens French: 28
num_decoder_tokens French: 30
num_encoder_tokens Italian: 28
num_decoder_tokens Italian: 30


In [9]:
target_texts_fra[100]

'\tentrez\n'

In [10]:
target_texts_ita[100]

'\tlavoro ai ferri\n'

In [11]:
print(decoder_input_seq_fra[100, :])
print(decoder_input_seq_ita[100, :])

[12  2  8  7  9  2 25 13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[10 12  3 18  2  8  2  1  3  5  1 22  4  8  8  5 11  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


2.2. One-hot encode

In [12]:
from keras.utils import to_categorical

# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences) # number of sentences
    data = numpy.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

encoder_input_data_fra = onehot_encode(encoder_input_seq_fra, max_encoder_seq_length_fra, num_encoder_tokens_fra)
decoder_input_data_fra = onehot_encode(decoder_input_seq_fra, max_decoder_seq_length_fra, num_decoder_tokens_fra)

decoder_target_seq_fra = numpy.zeros(decoder_input_seq_fra.shape)
decoder_target_seq_fra[:, 0:-1] = decoder_input_seq_fra[:, 1:]
decoder_target_data_fra = onehot_encode(decoder_target_seq_fra, 
                                    max_decoder_seq_length_fra, 
                                    num_decoder_tokens_fra)

print('encoder_input_data French: ', encoder_input_data_fra.shape)
print('decoder_input_data French: ', decoder_input_data_fra.shape)


encoder_input_data_ita = onehot_encode(encoder_input_seq_ita, max_encoder_seq_length_ita, num_encoder_tokens_ita)
decoder_input_data_ita = onehot_encode(decoder_input_seq_ita, max_decoder_seq_length_ita, num_decoder_tokens_ita)

decoder_target_seq_ita = numpy.zeros(decoder_input_seq_ita.shape)
decoder_target_seq_ita[:, 0:-1] = decoder_input_seq_ita[:, 1:]
decoder_target_data_ita = onehot_encode(decoder_target_seq_ita, 
                                    max_decoder_seq_length_ita, 
                                    num_decoder_tokens_ita)

print('encoder_input_data Italian: ', encoder_input_data_ita.shape)
print('decoder_input_data Italian: ', decoder_input_data_ita.shape)

encoder_input_data French:  (160003, 54, 28)
decoder_input_data French:  (160003, 119, 30)
encoder_input_data Italian:  (160003, 25, 28)
decoder_input_data Italian:  (160003, 102, 30)


2.3 Divide Data to train and test

In [13]:
train_encoder_input_data_fra  = encoder_input_data_fra[:159000]
train_decoder_input_data_fra  = decoder_input_data_fra[:159000]
train_decoder_target_data_fra = decoder_target_data_fra[:159000]

test_encoder_input_data_fra  = encoder_input_data_fra[159000:]
test_decoder_input_data_fra  = decoder_input_data_fra[159000:]
test_decoder_target_data_fra = decoder_target_data_fra[159000:]

train_encoder_input_data_ita  = encoder_input_data_ita[:159000]
train_decoder_input_data_ita  = decoder_input_data_ita[:159000]
train_decoder_target_data_ita = decoder_target_data_ita[:159000]

test_encoder_input_data_ita  = encoder_input_data_ita[159000:]
test_decoder_input_data_ita  = decoder_input_data_ita[159000:]
test_decoder_target_data_ita = decoder_target_data_ita[159000:]

3. Build the networks (for training)

3.1. Encoder network

In [14]:
from keras.layers import Input, LSTM, Bidirectional, Concatenate
from keras.models import Model

latent_dim = 256

# inputs of the encoder network
encoder_inputs_fra = Input(shape=(None, num_encoder_tokens_fra), name='encoder_inputs_fra')
encoder_inputs_ita = Input(shape=(None, num_encoder_tokens_ita), name='encoder_inputs_ita')

# set the BiLSTM layer
encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, return_sequences=True,
                                  dropout=0.5, name='encoder_bilstm'))
_, forward_h_fra, forward_c_fra, backward_h_fra, backward_c_fra = encoder_bilstm(encoder_inputs_fra)
_, forward_h_ita, forward_c_ita, backward_h_ita, backward_c_ita = encoder_bilstm(encoder_inputs_ita)

state_h_fra = Concatenate()([forward_h_fra, backward_h_fra])
state_c_fra = Concatenate()([forward_c_fra, backward_c_fra])

state_h_ita = Concatenate()([forward_h_ita, backward_h_ita])
state_c_ita = Concatenate()([forward_c_ita, backward_c_ita])

# build the encoder network model
encoder_model = Model(inputs=[encoder_inputs_fra, encoder_inputs_ita], 
                      outputs=[state_h_fra, state_c_fra, state_h_ita, state_c_ita],
                      name='encoder')

In [15]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

encoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs_fra (InputLayer) (None, None, 28)     0                                            
__________________________________________________________________________________________________
encoder_inputs_ita (InputLayer) (None, None, 28)     0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, None, 512),  583680      encoder_inputs_fra[0][0]         
                                                                 encoder_inputs_ita[0][0]         
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 512)          0           bidirectional_1[0][1]            
          

3.2. Decoder network

In [16]:
from keras.layers import Input, LSTM, Dense
from keras.models import Model

# inputs of the decoder network
decoder_input_h_fra = Input(shape=(2*latent_dim,), name='decoder_input_h_fra')
decoder_input_c_fra = Input(shape=(2*latent_dim,), name='decoder_input_c_fra')
decoder_input_x_fra = Input(shape=(None, num_decoder_tokens_fra), name='decoder_input_x_fra')

decoder_input_h_ita = Input(shape=(2*latent_dim,), name='decoder_input_h_ita')
decoder_input_c_ita = Input(shape=(2*latent_dim,), name='decoder_input_c_ita')
decoder_input_x_ita = Input(shape=(None, num_decoder_tokens_fra), name='decoder_input_x_ita')


# set the LSTM layer
decoder_lstm_fra = LSTM(latent_dim*2, return_sequences=True, 
                    return_state=True, dropout=0.25, name='decoder_lstm_fra')
decoder_lstm_outputs_fra, state_h_fra, state_c_fra = decoder_lstm_fra(decoder_input_x_fra, 
                                                      initial_state=[decoder_input_h_fra, decoder_input_c_fra])

decoder_lstm_ita = LSTM(latent_dim*2, return_sequences=True, 
                    return_state=True, dropout=0.25, name='decoder_lstm_ita')
decoder_lstm_outputs_ita, state_h_ita, state_c_ita = decoder_lstm_ita(decoder_input_x_ita, 
                                                      initial_state=[decoder_input_h_ita, decoder_input_c_ita])

# set the dense layer
decoder_dense_fra   = Dense(num_decoder_tokens_fra, activation='softmax', name='decoder_dense_fra')
decoder_outputs_fra = decoder_dense_fra(decoder_lstm_outputs_fra)

decoder_dense_ita   = Dense(num_decoder_tokens_ita, activation='softmax', name='decoder_dense_ita')
decoder_outputs_ita = decoder_dense_ita(decoder_lstm_outputs_ita)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x_fra, decoder_input_h_fra, decoder_input_c_fra, decoder_input_x_ita, decoder_input_h_ita, decoder_input_c_ita],
                      outputs=[decoder_outputs_fra, state_h_fra, state_c_fra, decoder_outputs_ita, state_h_ita, state_c_ita],
                      name='decoder')

In [17]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input_x_fra (InputLayer (None, None, 30)     0                                            
__________________________________________________________________________________________________
decoder_input_h_fra (InputLayer (None, 512)          0                                            
__________________________________________________________________________________________________
decoder_input_c_fra (InputLayer (None, 512)          0                                            
__________________________________________________________________________________________________
decoder_input_x_ita (InputLayer (None, None, 30)     0                                            
__________________________________________________________________________________________________
decoder_in

3.3. Connect the encoder and decoder

In [18]:
# input layers
encoder_input_x_fra = Input(shape=(None, num_encoder_tokens_fra), name='encoder_input_x_fra')
decoder_input_x_fra = Input(shape=(None, num_decoder_tokens_fra), name='decoder_input_x_fra')

encoder_input_x_ita = Input(shape=(None, num_encoder_tokens_ita), name='encoder_input_x_ita')
decoder_input_x_ita = Input(shape=(None, num_decoder_tokens_ita), name='decoder_input_x_ita')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x_fra, encoder_input_x_fra])

In [19]:
# input layers
encoder_input_x_fra = Input(shape=(None, num_encoder_tokens_fra), name='encoder_input_x_fra')
decoder_input_x_fra = Input(shape=(None, num_decoder_tokens_fra), name='decoder_input_x_fra')

encoder_input_x_ita = Input(shape=(None, num_encoder_tokens_ita), name='encoder_input_x_ita')
decoder_input_x_ita = Input(shape=(None, num_decoder_tokens_ita), name='decoder_input_x_ita')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x_fra, encoder_input_x_ita])

decoder_lstm_output_fra, _, _ = decoder_lstm_fra(decoder_input_x_fra, initial_state=encoder_final_states[:2])
decoder_pred_fra = decoder_dense_fra(decoder_lstm_output_fra)

decoder_lstm_output_ita, _, _ = decoder_lstm_ita(decoder_input_x_ita, initial_state=encoder_final_states[2:])
decoder_pred_ita = decoder_dense_ita(decoder_lstm_output_ita)

model = Model(inputs=[encoder_input_x_fra, encoder_input_x_ita, decoder_input_x_fra, decoder_input_x_ita], 
              outputs=[decoder_pred_fra, decoder_pred_ita], 
              name='model_training')

In [20]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_x_fra (InputLayer (None, None, 28)     0                                            
__________________________________________________________________________________________________
encoder_input_x_ita (InputLayer (None, None, 28)     0                                            
__________________________________________________________________________________________________
decoder_input_x_fra (InputLayer (None, None, 30)     0                                            
__________________________________________________________________________________________________
encoder (Model)                 [(None, 512), (None, 583680      encoder_input_x_fra[0][0]        
                                                                 encoder_input_x_ita[0][0]        
__________

3.5. Fit the model on the bilingual dataset

In [21]:
print('shape of encoder_input_data' + str(encoder_input_data_fra.shape))
print('shape of decoder_input_data' + str(decoder_input_data_fra.shape))
print('shape of decoder_target_data' + str(decoder_target_data_fra.shape))

print('shape of encoder_input_data' + str(encoder_input_data_ita.shape))
print('shape of decoder_input_data' + str(decoder_input_data_ita.shape))
print('shape of decoder_target_data' + str(decoder_target_data_ita.shape))

shape of encoder_input_data(160003, 54, 28)
shape of decoder_input_data(160003, 119, 30)
shape of decoder_target_data(160003, 119, 30)
shape of encoder_input_data(160003, 25, 28)
shape of decoder_input_data(160003, 102, 30)
shape of decoder_target_data(160003, 102, 30)


In [22]:
from keras import optimizers

model.compile(optimizer=optimizers.RMSprop(lr=1E-4),
              loss=['categorical_crossentropy', 'categorical_crossentropy'])

model.fit([train_encoder_input_data_fra, train_encoder_input_data_ita, train_decoder_input_data_fra, train_decoder_input_data_ita],  # training data
          [train_decoder_target_data_fra, train_decoder_target_data_ita],                       # labels (left shift of the target sequences)
          batch_size=32, epochs = 20, validation_split=0.1)

model.save('seq2seq.h5')

Train on 143100 samples, validate on 15900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  '. They will not be included '
  '. They will not be included '


4. Make predictions

4.1. Translate English to French

In [23]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index_fra  = dict((i, char) for char, i in input_token_index_fra.items())
reverse_target_char_index_fra = dict((i, char) for char, i in target_token_index_fra.items())

reverse_input_char_index_ita  = dict((i, char) for char, i in input_token_index_ita.items())
reverse_target_char_index_ita = dict((i, char) for char, i in target_token_index_ita.items())

In [24]:
def decode_sequence(input_seq_fra, input_seq_ita):
    states_value = encoder_model.predict([input_seq_fra, input_seq_ita])
        
    target_seq_fra = numpy.zeros((1, 1, num_decoder_tokens_fra))
    target_seq_fra[0, 0, target_token_index_fra['\t']] = 1.

    target_seq_ita = numpy.zeros((1, 1, num_decoder_tokens_ita))
    target_seq_ita[0, 0, target_token_index_ita['\t']] = 1.
        
    stop_condition_fra = False
    decoded_sentence_fra = ''
    
    stop_condition_ita = False
    decoded_sentence_ita = ''
    
    while not stop_condition_fra:

        output_tokens_fra, h_fra, c_fra, output_tokens_ita, h_ita, c_ita = decoder_model.predict([target_seq_fra, states_value[0], states_value[1], target_seq_ita, states_value[2], states_value[3]])
        
        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = numpy.argmax(output_tokens_fra[0, -1, :])
        
        sampled_char = reverse_target_char_index_fra[sampled_token_index]
        decoded_sentence_fra += sampled_char
        
        if (sampled_char == '\n' or
           len(decoded_sentence_fra) > max_decoder_seq_length_fra):
            stop_condition_fra = True

        target_seq_fra = numpy.zeros((1, 1, num_decoder_tokens_fra))
        target_seq_fra[0, 0, sampled_token_index] = 1.

        states_value = [h_fra, c_fra, h_ita, c_ita]
    
    states_value = encoder_model.predict([input_seq_fra, input_seq_ita])
    
    while not stop_condition_ita:
        output_tokens_fra, h_fra, c_fra, output_tokens_ita, h_ita, c_ita = decoder_model.predict([target_seq_fra, states_value[0], states_value[1], target_seq_ita, states_value[2], states_value[3]])
        
        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = numpy.argmax(output_tokens_ita[0, -1, :])
        
        sampled_char = reverse_target_char_index_ita[sampled_token_index]
        decoded_sentence_ita += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence_ita) > max_decoder_seq_length_ita):
            stop_condition_ita = True

        target_seq_ita = numpy.zeros((1, 1, num_decoder_tokens_ita))
        target_seq_ita[0, 0, sampled_token_index] = 1.

        states_value = [h_fra, c_fra, h_ita, c_ita]

    return decoded_sentence_fra, decoded_sentence_ita

4.2 Translate an English sentence to the target language

In [25]:
def token_data_fra(sentence):
    cc = clean_data(sentence)
    input_sequence = numpy.zeros((1, max_encoder_seq_length_fra))
    for n in range(len(cc)):
        if cc[n,0] == '':
            cc[n,0] = ' '
        input_sequence[0,n] = input_token_index_fra[cc[n,0]]
    return input_sequence

def token_data_ita(sentence):
    cc = clean_data(sentence)
    input_sequence = numpy.zeros((1, max_encoder_seq_length_ita))
    for n in range(len(cc)):
        if cc[n,0] == '':
            cc[n,0] = ' '
        input_sequence[0,n] = input_token_index_ita[cc[n,0]]
    return input_sequence

input_sentence = 'why is that'

input_sequence_fra = token_data_fra(input_sentence)
input_sequence_ita = token_data_ita(input_sentence)

input_x_fra = onehot_encode(input_sequence_fra, max_encoder_seq_length_fra, num_encoder_tokens_fra)
input_x_ita = onehot_encode(input_sequence_ita, max_encoder_seq_length_ita, num_encoder_tokens_ita)

translated_sentence_fra, translated_sentence_ita = decode_sequence(input_x_fra, input_x_ita)
# translated_sentence = decode_sequence(input_x_ita)

print('source sentence is            : ' + input_sentence)
print('translated sentence French is : ' + translated_sentence_fra)
print('translated sentence Italian is: ' + translated_sentence_ita)

source sentence is            : why is that
translated sentence French is : pourquoi le change

translated sentence Italian is: perche e la stanza



4.3 Calculate BLUE score

In [26]:
input_textt_fra  = ["" for x in range(len(test_encoder_input_data_fra))]
target_textt_fra = ["" for x in range(len(test_encoder_input_data_fra))]
decode_textt_fra = ["" for x in range(len(test_encoder_input_data_fra))]

input_textt_ita  = ["" for x in range(len(test_encoder_input_data_ita))]
target_textt_ita = ["" for x in range(len(test_encoder_input_data_ita))]
decode_textt_ita = ["" for x in range(len(test_encoder_input_data_ita))]

for seq_index in range(len(test_encoder_input_data_fra)):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq_fra = test_encoder_input_data_fra[seq_index: seq_index + 1]
    input_seq_ita = test_encoder_input_data_ita[seq_index: seq_index + 1]
    
    decoded_sentence_fra, decoded_sentence_ita = decode_sequence(input_seq_fra, input_seq_ita)
    
    input_textt_fra[seq_index]  = input_texts_fra[seq_index+158999]
    target_textt_fra[seq_index] = target_texts_fra[seq_index+158999][1:-1]
    decode_textt_fra[seq_index] = decoded_sentence_fra[0:-1]
    
    input_textt_ita[seq_index]  = input_texts_ita[seq_index+158999]
    target_textt_ita[seq_index] = target_texts_ita[seq_index+158999][1:-1]
    decode_textt_ita[seq_index] = decoded_sentence_ita[0:-1]

In [27]:
for seq_index in range(20):
    print('-')
    print('English:       ', input_textt_fra[seq_index])
    print('French (true): ', target_textt_fra[seq_index])
    print('French (pred): ', decode_textt_fra[seq_index])

-
English:        i can see youre busy so ill get right to the point
French (true):  je peux voir que vous etes occupe donc je vais aller droit au but
French (pred):  je ne peux pas vous rendre a la maison au sujet de la plage
-
English:        i can tell you things you wont hear from anyone else
French (true):  je peux te raconter des choses que tu nentendras de personne dautre
French (pred):  je ne peux pas me preter un peu de sentiment a la maison aujourdhui
-
English:        i cant afford to buy a used car much less a new one
French (true):  je ne peux pas me permettre dacheter une voiture doccasion et encore moins une nouvelle
French (pred):  je narrive pas a croire que je suis un peu de sens de la plage
-
English:        i cant believe we havent run into each other before
French (true):  je narrive pas a croire que nos chemins ne se soient pas croises auparavant
French (pred):  je narrive pas a croire que vous ne soyez pas de conseils de tout ce que tu as dit
-
English:        i 

In [28]:
for seq_index in range(20):
    print('-')
    print('English:       ', input_textt_ita[seq_index])
    print('Italian (true): ', target_textt_ita[seq_index])
    print('Italian (pred): ', decode_textt_ita[seq_index])

-
English:        on sunday i go to church
Italian (true):  io vado in chiesa la domenica
Italian (pred):  si sono perso il tuo numero
-
English:        on your mark get set go
Italian (true):  pronti partenza via
Italian (pred):  il suo distorso e profondo
-
English:        one cup of coffee please
Italian (true):  una tazza di caffe per favore
Italian (pred):  il suo distorso e profondo
-
English:        one cup of coffee please
Italian (true):  una tazza di caffe per piacere
Italian (pred):  il suo distorso e profondo
-
English:        one cup of coffee please
Italian (true):  una tazzina di caffe per favore
Italian (pred):  il suo distorso e profondo
-
English:        one cup of coffee please
Italian (true):  una tazzina di caffe per piacere
Italian (pred):  oggi con mangiate state aspettane
-
English:        one day youll understand
Italian (true):  un giorno capirete
Italian (pred):  oggi con mangiate state aspettane
-
English:        one day youll understand
Italian (true):  un 

In [29]:
def duplicates(lst, item):
    return [i for i, x in enumerate(lst) if x == item]

length_fra = numpy.zeros([780,1])
dupi_fra   = numpy.zeros([780,14], dtype=numpy.int32)

j = 0; k = 0;
for i in range(780):
    dup_fra = duplicates(input_textt_fra, input_textt_fra[k])
    dupi_fra[i,:len(dup_fra)] = dup_fra
    length_fra[i] = len(dup_fra)
    k+=len(dup_fra)


In [31]:
from nltk.translate.bleu_score import sentence_bleu

score_fra = numpy.zeros([780,1])

for i in range(780):
    vv = numpy.nonzero(dupi_fra[i])
    texttt = ["" for x in range(len(vv))]
    for j in range(len(vv)):
        texttt[j] = target_textt_fra[dupi_fra[i, j]]
    score_fra[i] = sentence_bleu(texttt, decode_textt_fra[dupi_fra[i,0]])

numpy.average(score_fra)


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.21310638162912765

In [32]:
def duplicates(lst, item):
    return [i for i, x in enumerate(lst) if x == item]

length_ita = numpy.zeros([457,1])
dupi_ita   = numpy.zeros([457,12], dtype=numpy.int32)

j = 0; k = 0;
for i in range(457):
    dup_ita = duplicates(input_textt_ita, input_textt_ita[k])
    dupi_ita[i,:len(dup_ita)] = dup_ita
    length_ita[i] = len(dup_ita)
    k+=len(dup_ita)


In [34]:
from nltk.translate.bleu_score import sentence_bleu

score_ita = numpy.zeros([457,1])

for i in range(457):
    vv = numpy.nonzero(dupi_ita[i])
    texttt = ["" for x in range(len(vv))]
    for j in range(len(vv)):
        texttt[j] = target_textt_ita[dupi_ita[i, j]]
    score_ita[i] = sentence_bleu(texttt, decode_textt_ita[dupi_ita[i,0]])

numpy.average(score_ita)


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.17261413213035998