In [None]:
from __future__ import division

import os
import re
import math
import time
import nltk
import string
import random
import codecs
import numpy as np
import itertools as it
import tensorflow as tf
import cPickle as pickle
import scipy.sparse as ss
import matplotlib.pyplot as plt
import tensorflow.contrib.seq2seq as seq2seq
import nltk.translate.bleu_score as bleu

from tqdm import tqdm
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from tensorflow.python.layers import core as layers_core
from tensorflow.contrib.layers import safe_embedding_lookup_sparse
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell

from src.helpers import *

In [None]:
CORPUS_A_PATH, CORPUS_B_PATH = './corpora/europarl.de-en.en', './corpora/europarl.de-en.de'
# CORPUS_A_PATH, CORPUS_B_PATH = './corpora/europarl.pl-en.pl', './corpora/europarl.pl-en.en'

# LOAD CORPUS
corpus_a, vocab_cnt_a = load_corpus(CORPUS_A_PATH)
corpus_b, vocab_cnt_b = load_corpus(CORPUS_B_PATH)

raw_corpus_a_size = sum(vocab_cnt_a.itervalues())
raw_vocab_a_size = len(vocab_cnt_a)
raw_corpus_b_size = sum(vocab_cnt_b.itervalues())
raw_vocab_b_size = len(vocab_cnt_b)

print 'Corpus A size (total tokens):', raw_corpus_a_size
print 'Corpus A vocabulary size (distinct tokens):', raw_vocab_a_size
print 'Most popular words (corpus A):', vocab_cnt_a.most_common(5)
print
print 'Corpus B size (total tokens):', raw_corpus_b_size
print 'Corpus B vocabulary size (distinct tokens):', raw_vocab_b_size
print 'Most popular words (corpus B):', vocab_cnt_b.most_common(5)

# visualize distribution
counts_a = sorted(vocab_cnt_a.itervalues(), reverse=True)
counts_b = sorted(vocab_cnt_b.itervalues(), reverse=True)
plt.figure(figsize=(12, 4))
plt.subplot(121)
plt.semilogy(range(len(counts_a)), counts_a)
plt.title('Distribution of token occurences (Corpus SRC)')
plt.xlabel('Token')
plt.ylabel('Occurences')
plt.grid()
plt.subplot(122)
plt.semilogy(range(len(counts_b)), counts_b)
plt.title('Distribution of token occurences (Corpus TRG)')
plt.xlabel('Token')
plt.ylabel('Occurences')
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
# LIMIT VOCABS
LANG_A_TOKEN_LIMIT = 25000
vocab_a, vocab_enc_a, vocab_dec_a = code_tokens(vocab_cnt_a, LANG_A_TOKEN_LIMIT)
# corpus_a_enc = [[vocab_enc_a[word] for word in sentence if word in vocab_enc_a] for sentence in corpus_a]

LANG_B_TOKEN_LIMIT = 35000
vocab_b, vocab_enc_b, vocab_dec_b = code_tokens(vocab_cnt_b, LANG_B_TOKEN_LIMIT)
# corpus_b_enc = [[vocab_enc_b[word] for word in sentence if word in vocab_enc_b] for sentence in corpus_b]
del corpus_a
del corpus_b

# print 'Clean corpus A size (total sentences):', len(corpus_a_enc)
# print 'Clean corpus A vocabulary size (distinct tokens):', len(vocab_a)
# print
# print 'Clean corpus B size (total sentences):', len(corpus_b_enc)
# print 'Clean corpus B vocabulary size (distinct tokens):', len(vocab_b)

In [None]:
corpus_par = zip(corpus_a_enc, corpus_b_enc)
length_diff = Counter([abs(len(a) - len(b)) for a, b in corpus_par])

print 'Max length diff:', max(length_diff.keys())
print 'Avg length diff:', sum(length_diff.keys()) / len(corpus_par)

keys, values = zip(*sorted(length_diff.iteritems(), key=lambda x: x[0]))
plt.figure(figsize=(6, 4))
plt.semilogy(keys, values)
plt.title('Distribution of length differences (Corpus Parallel)')
plt.xlabel('Length Difference')
plt.ylabel('Occurences')
plt.grid()
plt.show()

In [None]:
class AttnSeqToSeq(object):
    """GRU encoder / Luong-attention GRU decoder sequence-to-sequence model (TF 1.x graph mode).

    Constructing an instance adds everything to the current default graph:
    placeholders, embeddings, the encoder, a teacher-forced training decoder,
    a greedy inference decoder, the loss, the Adam update op and the
    TensorBoard summaries.

    Args:
        src_vocab_size: number of rows of the source embedding matrix.
        trg_vocab_size: number of rows of the target embedding matrix and the
            size of the decoder output projection.
        emb_size: embedding dimensionality used for both sides.
        enc_units: number of GRU units in the encoder.
        dec_units: number of GRU units in the decoder; also used as attention
            size. The encoder's final state is used as the decoder cell state,
            so this presumably has to equal ``enc_units`` (not checked here).
        num_layers, attn_span, bi_dir, pad_token: stored on the instance but
            not used by the graph built in this class.
        learning_rate: learning rate of the Adam optimizer.
        eos_token: token id used as start token and as end token of greedy
            decoding.
        max_infer_steps: upper bound on the number of greedy decoding steps.
            ``None`` (default) bounds decoding at twice the longest source
            length in the batch, so inference terminates even when the model
            never emits ``eos_token``.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, emb_size=100, enc_units=100, dec_units=100,
                 num_layers=1, attn_span=25, bi_dir=False, learning_rate=1e-3, pad_token=0, eos_token=2,
                 max_infer_steps=None):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.emb_size = emb_size
        self.enc_units = enc_units
        self.dec_units = dec_units
        self.num_layers = num_layers
        self.attn_span = attn_span
        self.bi_dir = bi_dir
        self.learning_rate = learning_rate
        self.pad_token = pad_token
        self.eos_token = eos_token
        self.max_infer_steps = max_infer_steps

        self._build_model()

    def _init_placeholders(self):
        """Create the input placeholders and the tensors derived directly from them."""
        with tf.variable_scope('placeholders') as scope:
            # Token-id matrices; they are consumed batch-major (time_major=False below).
            self.enc_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
            self.dec_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_inputs')
            self.dec_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

            # Per-example sequence lengths.
            self.enc_inputs_len = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_len')
            self.dec_inputs_len = tf.placeholder(shape=(None,), dtype=tf.int32, name='decoder_inputs_len')
            self.max_dec_inputs_len = tf.reduce_max(self.dec_inputs_len, name='max_decoder_inputs_len')

            self.batch_size = tf.shape(self.enc_inputs)[0]
            # Fed by the caller with an externally averaged loss; only used for a summary.
            self.avg_eval_loss = tf.placeholder_with_default(0.0, shape=None, name='avg_eval_loss')

    def _init_variables(self):
        """Create the global step counter and both embedding matrices."""
        # define global variables
        self.global_step = tf.Variable(
            initial_value=0,
            trainable=False,
            name='global_step')

        # define embeddings and lookup
        with tf.variable_scope('embeddings') as scope:
            self.embeddings_src = tf.Variable(
                tf.random_uniform([self.src_vocab_size, self.emb_size], -0.15, 0.15),
                dtype=tf.float32,
                name='embeddings_src')
            self.embeddings_trg = tf.Variable(
                tf.random_uniform([self.trg_vocab_size, self.emb_size], -0.15, 0.15),
                dtype=tf.float32,
                name='embeddings_trg')

    def _init_encoder(self):
        """Embed the source tokens and run them through a single GRU."""
        with tf.variable_scope('encoder') as scope:
            enc_inputs_emb = tf.nn.embedding_lookup(self.embeddings_src, self.enc_inputs)
            enc_cell = tf.contrib.rnn.GRUCell(self.enc_units)
            self.enc_outputs, self.enc_final_state = tf.nn.dynamic_rnn(
                cell=enc_cell,
                inputs=enc_inputs_emb,
                sequence_length=self.enc_inputs_len,
                time_major=False,
                dtype=tf.float32,
                scope='encoder_cell')

    def _init_decoder(self):
        """Build the training decoder and the greedy inference decoder.

        The same attention-wrapped cell object and the same initial state are
        passed to both decoders.
        """
        with tf.variable_scope('decoder') as scope:
            dec_inputs_emb = tf.nn.embedding_lookup(self.embeddings_trg, self.dec_inputs)

            # Attention over all encoder outputs, masked by the true source lengths.
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units=self.dec_units,
                memory=self.enc_outputs,
                memory_sequence_length=self.enc_inputs_len)

            # alignment_history=True keeps the attention weights of every step
            # so they can be rendered as images.
            dec_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell=tf.contrib.rnn.GRUCell(self.dec_units),
                attention_mechanism=attention_mechanism,
                attention_layer_size=self.dec_units,
                output_attention=True,
                alignment_history=True,
                name='attention_wrapper'
                )

            # Project the cell output to one logit per target-vocabulary entry.
            dec_cell = tf.contrib.rnn.OutputProjectionWrapper(
                dec_cell,
                self.trg_vocab_size
            )

            # The decoder starts from the encoder's final state.
            dec_initial_state = dec_cell.zero_state(
                batch_size=self.batch_size, dtype=tf.float32).clone(cell_state=self.enc_final_state)

            # training decoder: reads the provided decoder inputs (teacher forcing)
            dec_train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=dec_inputs_emb,
                sequence_length=self.dec_inputs_len,
                time_major=False,
                name='training_helper')

            dec_train_decoder = seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=dec_train_helper,
                initial_state=dec_initial_state)

            self.dec_train_outputs, dec_train_final_state, _ = seq2seq.dynamic_decode(
                decoder=dec_train_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.max_dec_inputs_len)

            self.train_attn_history = self._prepare_attention_images(
                dec_train_final_state.alignment_history)

            # inference decoder: feeds back its own argmax prediction, starting from EOS
            eos_slice = tf.fill([self.batch_size], self.eos_token, name='EOS')

            dec_infer_helper = seq2seq.GreedyEmbeddingHelper(
                embedding=self.embeddings_trg,
                start_tokens=eos_slice,
                end_token=self.eos_token)

            dec_infer_decoder = seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=dec_infer_helper,
                initial_state=dec_initial_state)

            # Without an upper bound the decode loop only stops once every
            # sequence has produced eos_token, which an undertrained model may
            # never do; cap the number of steps.
            if self.max_infer_steps is None:
                max_infer_steps = 2 * tf.reduce_max(self.enc_inputs_len)
            else:
                max_infer_steps = self.max_infer_steps

            self.dec_infer_outputs, dec_infer_final_state, _ = seq2seq.dynamic_decode(
                decoder=dec_infer_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=max_infer_steps)

            self.infer_attn_history = self._prepare_attention_images(
                dec_infer_final_state.alignment_history)

    def _init_optimizer(self):
        """Define predictions, the masked sequence loss, the Adam update and the summaries."""
        with tf.variable_scope('optimization') as scope:
            dec_train_logits = tf.identity(self.dec_train_outputs.rnn_output)
            dec_infer_logits = tf.identity(self.dec_infer_outputs.rnn_output)
            self.dec_train_preds = tf.identity(self.dec_train_outputs.sample_id)
            self.dec_infer_preds = tf.identity(self.dec_infer_outputs.sample_id)

            # 1.0 for real target positions, 0.0 for padding.
            masks = tf.sequence_mask(
                lengths=self.dec_inputs_len,
                maxlen=self.max_dec_inputs_len,
                dtype=tf.float32,
                name='masks')

            self.loss = seq2seq.sequence_loss(
                logits=dec_train_logits,
                targets=self.dec_targets,
                weights=masks,
                average_across_timesteps=True,
                average_across_batch=True)

            # setup optimizer and training step
            self.opt = tf.train.AdamOptimizer(self.learning_rate)

            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, trainable_params)
            # NOTE: gradients are applied as computed; no clipping is performed.
            clip_gradients = gradients
            self.updates = self.opt.apply_gradients(
                zip(clip_gradients, trainable_params),
                global_step=self.global_step)

            # summaries (train_loss and valid_loss wrap the same loss tensor under different tags)
            self.train_summary = tf.summary.scalar('train_loss', self.loss)
            self.valid_summary = tf.summary.scalar('valid_loss', self.loss)
            self.train_attention_summary = tf.summary.image('train_attention_history', self.train_attn_history)
            self.infer_attention_summary = tf.summary.image('infer_attention_history', self.infer_attn_history)
            self.avg_valid_summary = tf.summary.scalar('avg_valid_loss', self.avg_eval_loss)

    def _build_model(self):
        """Assemble the full graph; the order matters because later parts read earlier tensors."""
        self._init_placeholders()
        self._init_variables()
        self._init_encoder()
        self._init_decoder()
        self._init_optimizer()

        self.summary_op = tf.summary.merge_all()

    def _get_feed_dict(self, enc_in, enc_in_len, dec_in=None, dec_in_len=None, dec_out=None):
        """Map the given values onto the model placeholders.

        Encoder inputs and lengths are always fed; each decoder-side value is
        fed only when it is not None.

        Returns:
            dict from placeholder to value, suitable for ``Session.run``.
        """
        feed_dict = {self.enc_inputs: enc_in, self.enc_inputs_len: enc_in_len}

        if dec_in is not None:
            feed_dict[self.dec_inputs] = dec_in
        if dec_in_len is not None:
            feed_dict[self.dec_inputs_len] = dec_in_len
        if dec_out is not None:
            feed_dict[self.dec_targets] = dec_out

        return feed_dict

    def _prepare_attention_images(self, alignment_history):
        """Convert a decoder's alignment history into a tensor for ``tf.summary.image``.

        The stacked history is transposed with permutation [1, 2, 0], gets a
        trailing axis of size 1 and is multiplied by 255.
        """
        # NOTE(review): the stacked history is presumably (decoder_time, batch, source_time),
        # which would make the result (batch, source_time, decoder_time, 1) - confirm.
        attention_images = alignment_history.stack()
        attention_images = tf.expand_dims(
            tf.transpose(attention_images, [1, 2, 0]), -1)
        attention_images *= 255
        return attention_images

    def train(self, sess, enc_inputs, enc_inputs_len, dec_inputs, dec_inputs_len, dec_targets):
        """Run one optimization step.

        Returns:
            (loss, serialized train_loss summary, encoder final state).
        """
        fd = self._get_feed_dict(enc_inputs, enc_inputs_len,
                                 dec_inputs, dec_inputs_len,
                                 dec_targets)

        operations = [self.updates, self.loss, self.train_summary, self.enc_final_state]
        _, l, sl, e = sess.run(operations, fd)
        return l, sl, e

    def evaluate(self, sess, enc_inputs, enc_inputs_len, dec_inputs, dec_inputs_len, dec_targets):
        """Compute the loss with the teacher-forced decoder, without updating weights.

        Returns:
            (loss, serialized valid_loss summary, attention images,
            training-decoder predictions, encoder final state).
        """
        fd = self._get_feed_dict(enc_inputs, enc_inputs_len,
                                 dec_inputs, dec_inputs_len,
                                 dec_targets)

        # Report under the 'valid_loss' tag so evaluation points do not end up
        # in the 'train_loss' curve when the summary is written.
        operations = [self.loss, self.valid_summary, self.train_attn_history,
                      self.dec_train_preds, self.enc_final_state]
        l, sl, ah, p, e = sess.run(operations, fd)
        return l, sl, ah, p, e

    def infer(self, sess, enc_inputs, enc_inputs_len):
        """Greedy-decode the given sources.

        Returns:
            list ``[predicted token ids, attention images]``.
        """
        fd = self._get_feed_dict(enc_inputs, enc_inputs_len)
        return sess.run([self.dec_infer_preds, self.infer_attn_history], fd)

    def encode_seq(self, sess, enc_inputs, enc_inputs_len):
        """Return a one-element list holding the encoder's final state for the given sources."""
        fd = self._get_feed_dict(enc_inputs, enc_inputs_len)
        return sess.run([self.enc_final_state], fd)

    def save_model(self, sess, path):
        """Save all variables to ``path``; the current global step is passed to the saver."""
        saver = tf.train.Saver()
        saver.save(sess, save_path=path, global_step=self.global_step)

    def restore_model(self, sess, path):
        """Restore all variables from the checkpoint at ``path`` into ``sess``."""
        saver = tf.train.Saver()
        saver.restore(sess, path)

### Toy Example - Reconstructing Sequence

In [None]:
# EXPERIMENT PARAMETERS (toy task)
# Vocabulary sizes: source and target use the same number of symbols.
VOCAB_SIZE_SRC = 10
VOCAB_SIZE_TRG = VOCAB_SIZE_SRC
BATCH_SIZE = 64
TRAIN_DATA_SIZE = 500000
VAL_DATA_SIZE = 1000

# Model hyper-parameters
NUM_LAYERS = 1
EMB_SIZE = 15
ENC_UNITS = 50
DEC_UNITS = ENC_UNITS
LEARNING_RATE = 0.001
HARD_MAX_LEN = 128  # batches whose padded source is longer than this are skipped

# Logging / evaluation / checkpoint cadence, in global steps
SUMM_INTERVAL = 100
EVAL_INTERVAL = 25
CKPT_INTERVAL = 1000

# Output locations and run label
CKPT_PATH = './models/'
LOG_PATH = './logs/'
EXPERIMENT_NAME = 'toy-example'

# EXPERIMENT DATA
# toy_data_generator is not defined in this notebook (presumably src.helpers).
# The evaluation set is materialised with list() so it can be iterated repeatedly.
SEQ_MAX_LEN = 30
train_data = toy_data_generator(VOCAB_SIZE_SRC, TRAIN_DATA_SIZE, SEQ_MAX_LEN, 3)
eval_data = list(toy_data_generator(VOCAB_SIZE_SRC, VAL_DATA_SIZE, SEQ_MAX_LEN, 3))
# Identity "vocabulary" for printing: every id 0..10 decodes to its own digits.
# Both sides share the very same dict object.
vocab_dec_a = dict((ix, str(ix)) for ix in xrange(11))
vocab_dec_b = vocab_dec_a

### Europarl DE-EN Translation

In [None]:
# EXPERIMENT PARAMETERS (Europarl translation)
# Vocabulary sizes come from the limited vocabularies built earlier in the notebook.
VOCAB_SIZE_SRC, VOCAB_SIZE_TRG = len(vocab_a), len(vocab_b)
BATCH_SIZE = 64

# Model hyper-parameters; encoder and decoder use the same number of units.
NUM_LAYERS = 1
EMB_SIZE = 200
ENC_UNITS = DEC_UNITS = 500
LEARNING_RATE = 0.001
HARD_MAX_LEN = 128  # batches whose padded source is longer than this are skipped

# Logging / evaluation / checkpoint cadence, in global steps
SUMM_INTERVAL = 100
EVAL_INTERVAL = 250
CKPT_INTERVAL = 1000

# Output locations and run label
CKPT_PATH = './models/'
LOG_PATH = './logs/'
EXPERIMENT_NAME = 'ep-de'

# EXPERIMENT DATA
# NOTE(review): the split below is disabled, so this cell does not define
# train_data / eval_data / test_data (nor SEQ_MAX_LEN, which the run cell reads).
# random.seed(1)
# random.shuffle(corpus_par)
# corpus_len = len(corpus_par)
# train_split, eval_split, test_split = int(0.8*corpus_len), int(0.1*corpus_len), int(0.1*corpus_len)
# train_data = corpus_par[:train_split]
# eval_data = corpus_par[train_split:train_split+eval_split]
# test_data = corpus_par[-test_split:]

#### Run Experiments

In [None]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = AttnSeqToSeq(VOCAB_SIZE_SRC, VOCAB_SIZE_TRG, EMB_SIZE, ENC_UNITS, DEC_UNITS, 
                 learning_rate=LEARNING_RATE, attn_span=SEQ_MAX_LEN)

try:
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter(
        os.path.join(LOG_PATH, EXPERIMENT_NAME + time.strftime("%Y-%m-%d-%H-%M-%S")),
        graph=sess.graph)
    
    while True:
        batches_gen = batchify_data(train_data, BATCH_SIZE)
        train_losses = []
        for data_batch in batches_gen: 
            batch_src, batch_trg = zip(*data_batch)
#             batch_trg = [row[::-1] for row in batch_trg]
            
            # prepare batch
            enc_inp, enc_lengths = pad_data(batch_src, append_suf=[2])
            dec_inp, _ = pad_data(batch_trg, append_pre=[2])
            dec_trg, dec_lengths = pad_data(batch_trg, append_suf=[2])
            
            # training step
            if enc_inp.shape[1] > HARD_MAX_LEN: continue
            if enc_inp.shape[0] != BATCH_SIZE: continue
            l, sl, _ = model.train(sess, enc_inp, enc_lengths, dec_inp, dec_lengths, dec_trg)
            train_losses.append(l)
            
            # summarize, eval, etc.
            global_step = model.global_step.eval()
            if global_step % CKPT_INTERVAL == 0:
                ckpt_file = os.path.join(CKPT_PATH, EXPERIMENT_NAME + time.strftime("%Y-%m-%d-%H-%M-%S"))
#                 model.save_model(sess, ckpt_file)
                print 'Saved model...'
                
            if global_step == 1 or global_step % SUMM_INTERVAL == 0:
                summary_writer.add_summary(sl, global_step)

            if global_step == 1 or global_step % EVAL_INTERVAL == 0:
                eval_losses = []
                example_input, example_pred, example_attn = None, None, None
                for batch_data in batchify_data(eval_data, BATCH_SIZE): 
                    batch_src, batch_trg = zip(*batch_data)
#                     batch_trg = [row[::-1] for row in batch_trg]
                    
                    enc_inp, enc_lengths = pad_data(batch_src, append_suf=[2])
                    dec_inp, _ = pad_data(batch_trg, append_pre=[2])
                    dec_trg, dec_lengths = pad_data(batch_trg, append_suf=[2])
                    
                    if enc_inp.shape[1] > HARD_MAX_LEN: continue
                    if enc_inp.shape[0] != BATCH_SIZE: continue
                    l, _a, ah, p, _b = model.evaluate(sess, enc_inp, enc_lengths, dec_inp, dec_lengths, dec_trg)
                    example_input, example_pred, example_attn = enc_inp, p, ah
                    eval_losses.append(l)
                    
#                 plt.imshow(example_attn[0][:,:,0])
#                 plt.savefig('example-attn-step-{}.png'.format(global_step))
                print('batch {}'.format(global_step))
                print('train losses: {} / eval losses: {}'.format(np.mean(train_losses), np.mean(eval_losses)))
                for i, (inp, pred) in enumerate(zip(example_input, example_pred)[:3]):
                    print('sample {}:'.format(i + 1))
                    print('input     >> {}'.format(' '.join([vocab_dec_a[word].encode('ascii', errors='replace') for word in inp])))
                    print('predicted >> {}'.format(' '.join([vocab_dec_b[word].encode('ascii', errors='replace') for word in pred])))
                    
                eval_s = sess.run(model.avg_valid_summary, {model.avg_eval_loss: np.mean(eval_losses)})
                summary_writer.add_summary(eval_s, global_step)
                    
                # clear train losses
                train_losses = []
        summary_writer.flush()
                    
except KeyboardInterrupt:
    ckpt_file = os.path.join(CKPT_PATH, EXPERIMENT_NAME + time.strftime("%Y-%m-%d-%H-%M-%S"))
#     model.save_model(sess, ckpt_file)
    summary_writer.close()
    print 'Training Interrupted'

In [None]:
# One hand-written source sequence, already terminated with token 2.
source_seq = [[3,4,5,6,7,8,9,2]]
# Lengths are derived from the data so they cannot disagree with the actual number of
# tokens (a hard-coded 9 was given for this 8-token sequence).
source_len = [len(seq) for seq in source_seq]
target_seq, attn_img = model.infer(sess, source_seq, source_len)
# Show the attention image of the first (only) example, labelled with source and predicted tokens.
plot_alignment(attn_img[0][:,:,0], map(str, source_seq[0])[:source_len[0]], map(str, target_seq[0]))

## Analyze Results

In [None]:
# Start from an empty graph and a new interactive session before rebuilding the model.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# SeqToSeq is not defined in this notebook (presumably provided by src.helpers).
model_dims = (VOCAB_SIZE_SRC, VOCAB_SIZE_TRG, EMB_SIZE, ENC_UNITS, DEC_UNITS)
model = SeqToSeq(*model_dims, learning_rate=LEARNING_RATE)

# Checkpoint to analyse; swap in the commented line for the alternative checkpoint.
model.restore_model(sess, './models/basic/ep-de2017-10-09-17-21-06-11000')
# model.restore_model(sess, './models/reversed/ep-de2017-10-10-00-01-03-11000')

#### Run inference on test set

In [None]:
test_losses, translations = [], []
test_data = corpus_par_small
data_batches = list(batchify_data(test_data, BATCH_SIZE))
for data_batch in tqdm(data_batches): 
    batch_src, batch_trg = zip(*data_batch)
    # A statement that reversed `enc_inp` used to sit here. It read the value left over
    # from an earlier cell/iteration and was overwritten by the next line, so it had no
    # effect. To feed reversed sources, reverse `batch_src` before padding instead:
    # batch_src = [line[::-1] for line in batch_src]
    enc_inp, enc_lengths = pad_data(batch_src, append_suf=[2])
    dec_inp, _ = pad_data(batch_trg, append_pre=[2])
    dec_trg, dec_lengths = pad_data(batch_trg, append_suf=[2])
    # NOTE(review): a skipped batch produces no translations, so `trans_pred` is no longer
    # index-aligned with `test_data` afterwards.
    if enc_inp.shape[1] > HARD_MAX_LEN: continue
    # Only the loss (first element) is needed; indexing works regardless of how many
    # extra values the model's evaluate() returns.
    l = model.evaluate(sess, enc_inp, enc_lengths, dec_inp, dec_lengths, dec_trg)[0]
    translations.append(model.infer(sess, enc_inp, enc_lengths))
    test_losses.append(l)
    
# flatten and cleanup network output
trans = [row.tolist() for wrap_batch in translations for batch in wrap_batch for row in batch]
# Cut every prediction at its first token 2 (end of sequence); a prediction that never
# produced it is kept whole instead of raising ValueError.
trans_end_ix = [line.index(2) if 2 in line else len(line) for line in trans]
trans_pred = [line[:end_ix] for line, end_ix in zip(trans, trans_end_ix)]

#### Compute BLEU score and print results

In [None]:
# Decode predicted token ids back to words.
pred_words = []
for line in trans_pred:
    pred_words.append([vocab_dec_b[token] for token in line])

# Decode the target side (second element of each pair) of the evaluated corpus.
targ_words = []
for pair in corpus_par_small:
    targ_words.append([vocab_dec_b[token] for token in pair[1]])

# corpus_bleu takes a list of reference sets per hypothesis; each set holds one reference here.
references = [[sentence] for sentence in targ_words]
bleu.corpus_bleu(references, pred_words, weights=(0.4,0.3,0.3))

In [None]:
for ix in xrange(len(test_data)):
    print 'SRC: ', ' '.join([vocab_dec_a[token] for token in test_data[ix][0]])
    print 'TARGET: ', ' '.join([vocab_dec_b[token] for token in test_data[ix][1]])
    print 'PEDICT: ', ' '.join([vocab_dec_b[token] for token in trans_pred[ix]])

#### Save translations to file

In [None]:
# Write sources, references and predictions to three UTF-8 files, one sentence per line,
# with the same line order in all three.
with codecs.open('test-source.en', 'w', encoding='utf-8') as test_src, \
     codecs.open('test-target.de', 'w', encoding='utf-8') as test_trg, \
     codecs.open('test-pred.de', 'w', encoding='utf-8') as test_prd:
    for ix in xrange(len(test_data)):
        src_ids, trg_ids = test_data[ix][0], test_data[ix][1]
        test_src.write(' '.join(vocab_dec_a[token] for token in src_ids) + '\n')
        test_trg.write(' '.join(vocab_dec_b[token] for token in trg_ids) + '\n')
        test_prd.write(' '.join(vocab_dec_b[token] for token in trans_pred[ix]) + '\n')

#### Toy experiments

In [None]:
# Two alternative input sets; only the one assigned to src_sentences below is used.
full_sentence_examples = [
    'the president met with other leaders',
    'italian prime minister berlusconi',
    'merkel spoke to the president',
    'foreign policy was discussed in the meeting',
    'summit touching on international affairs',
    'debate about the foreign affairs',
    'support for developing countries',
    'financial aid for the poorest',
    'social benefits for the needy',
    'handling the immigration crisis',
    'refugees seeking asylum'
]

short_phrase_examples = [
    'angela merkel', 'barack obama', 'vladimir putin',
    'poland', 'hungary', 'slovakia', 'czech',
    'norway', 'sweden', 'finland', 'denmark',
    'foreign affairs', 'foreign policy', 'international affairs'
]

src_sentences = short_phrase_examples

# Tokenise on single spaces and encode with the source vocabulary.
# Every word has to be present in vocab_enc_a, otherwise the lookup raises KeyError.
src_split = []
src_enc = []
for sentence in src_sentences:
    words = sentence.split(' ')
    src_split.append(words)
    src_enc.append([vocab_enc_a[word] for word in words])
enc_inp, enc_len = pad_data(src_enc, append_suf=[2])

# Encoder final states, reduced by get_pca_embeddings (not defined in this notebook);
# the first two values of each row are used as plot coordinates.
encoded = model.encode_seq(sess, enc_inp, enc_len)[0]
pca = get_pca_embeddings(encoded)

plt.figure(figsize=(8,8))
for point, label in zip(pca, src_sentences):
    x = point[0]
    y = point[1]
    # one scatter call per sentence
    plt.scatter(x, y)
    plt.annotate(label, xy = (x, y), ha='center',  va='bottom', xytext=(3,2), textcoords='offset points', fontsize=15)
plt.grid()
plt.savefig('mt-basic-sent-emb.png')
plt.show()
# for ix in xrange(len(src_sentences)):
#     print 'INPUT: ', ' '.join([vocab_dec_a[token] for token in src_enc[ix]])
#     print 'OUTPUT:', ' '.join([vocab_dec_b[token].encode('utf8') for token in model.infer(sess, enc_inp, enc_len)[0][ix].tolist()])

### Visualize results

In [None]:
def load_and_split(file_path):
    """Read a text file and split every line on single space characters.

    Line endings are not stripped, so the last token of a line keeps its
    trailing newline, and consecutive spaces produce empty tokens.

    Args:
        file_path: path of the text file to read.

    Returns:
        A list with one list of tokens per line of the file.
    """
    tokenized_lines = []
    with open(file_path, 'r') as fd:
        for line in fd:
            tokenized_lines.append(line.split(' '))
    return tokenized_lines

# Load the tokenised translation files; every list has one entry per line of its file.
data_src = load_and_split('other/translations/test-source.en.txt')

# Each reference is wrapped in a one-element list: a reference set with a single reference.
data_trg = []
for reference in load_and_split('other/translations/test-target.de.txt'):
    data_trg.append([reference])

data_pred = load_and_split('other/translations/test-pred.de.txt')
data_pred_r = load_and_split('other/translations/test-pred-r.de.txt')
data_gnmt = load_and_split('other/translations/test-gnmt.de.txt')

# One tuple per sentence: (source, target, pred_basic, pred_reverse, pred_gnmt)
data = [row for row in zip(data_src, data_trg, data_pred, data_pred_r, data_gnmt)]

In [None]:
limits = [5, 10, 15, 25, 50, 75, 100]
bleus_pred = []
bleus_gnmt = []

for limit in limits:
    trg, prd, gnmt = zip(*[triple for triple in zip(data_trg, data_pred, data_gnmt) if len(triple[0][0]) <= limit])
    bleu_pred = 100*bleu.corpus_bleu(trg, prd, weights=(0.33,0.33,0.33))
    bleu_gnmt = 100*bleu.corpus_bleu(trg, gnmt, weights=(0.33,0.33,0.33))
    
    bleus_pred.append(bleu_pred)
    bleus_gnmt.append(bleu_gnmt)

    print 'limit {} - bleu pred {} - bleu gnmt {}'.format(limit, bleu_pred, bleu_gnmt)

In [None]:
# BLEU of both systems as a function of the reference-length limit.
fig, ax = plt.subplots()
ax.plot(limits, bleus_pred, label='IC')
ax.plot(limits, bleus_gnmt, label='GT', alpha=0.8, linestyle='--')
ax.set_title('BLEU score vs. length of translated text')
ax.legend(loc='lower left')
ax.grid()
fig.savefig('mt-bleu-len.png')
plt.show()

In [None]:
filtered = [triple for triple in zip(data_src, data_trg, data_pred, data_gnmt) if len(triple[1][0]) <= 75]
random.shuffle(filtered)
src, trg, prd, gnmt = zip(*filtered)

for ix in xrange(15):
    print 'Src:', ' '.join(src[ix]).strip('\n')
    print 'Trg:', ' '.join(trg[ix][0]).strip('\n')
    print 'Prd:', ' '.join(prd[ix]).strip('\n')
    print 'Ggl:', ' '.join(gnmt[ix])