In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
import collections

# Print NumPy arrays with 4 digits of precision and wrap lines at 200 characters.
np.set_printoptions(precision=4, linewidth=200)

# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from utils.reader import europarl_raw_data

In [26]:
from utils.nmt_graph import NMTModel

In [3]:
def show_dict_contents(d):
    """Print a one-line summary for every entry of a two-level nested dict.

    Each printed line has the form ``outer.inner : type=<type>`` followed by
    ``shape=...`` when the value is a ``numpy.ndarray``, or by
    ``len=..., contents type=...`` when the value is a ``list``.

    Args:
        d: Mapping whose values are themselves mappings (``d[k][k2]``).

    Returns:
        None. The summary is written to stdout.
    """
    for k, v in d.items():
        for k2, v2 in v.items():
            shape_info = ''
            list_info = ''
            if isinstance(v2, np.ndarray):
                shape_info = ' shape={0}'.format(v2.shape)
            if isinstance(v2, list):
                # An empty list has no first element to inspect; indexing
                # v2[0] would raise IndexError, so report a placeholder.
                contents_type = type(v2[0]) if v2 else '<empty>'
                list_info = ' len={0}, contents type={1}'.format(
                    len(v2), contents_type)
            print('{0: <30}: type={1: <30}{2}{3}'.format(
                '{0}.{1}'.format(k, k2),
                str(type(v2)),
                shape_info,
                list_info,
            ))

In [4]:
def unvectorize_sentence(sentence, idx2word):
    """Turn a sequence of indices back into a space-separated string.

    Each element of ``sentence`` is looked up in ``idx2word`` and the
    resulting tokens are joined with single spaces.
    """
    tokens = []
    for index in sentence:
        tokens.append(idx2word[index])
    return ' '.join(tokens)

In [5]:
def sample_group(data, group_num, num_samples=10):
    """Print randomly chosen sentence pairs from one training group.

    Draws ``num_samples`` row indices with ``np.random.choice`` from group
    ``group_num`` of ``data['train']['X']`` and, for each one, prints the
    ``X`` row decoded with ``data['vocab']['lang1_idx2word']``, the matching
    ``y`` row decoded with ``data['vocab']['lang2_idx2word']``, and a blank
    line.
    """
    train = data['train']
    picks = np.random.choice(len(train['X'][group_num]), size=num_samples)
    for pick in picks:
        # First the X side with the lang1 vocabulary, then y with lang2.
        for side, vocab_key in (('X', 'lang1_idx2word'), ('y', 'lang2_idx2word')):
            print(unvectorize_sentence(train[side][group_num][pick], data['vocab'][vocab_key]))
        print()

In [6]:
# Load the dataset through the project reader (utils.reader.europarl_raw_data).
data = europarl_raw_data()

In [7]:
# Summarise the type, shape and length of every entry in the loaded data dict.
show_dict_contents(data)

vocab.lang1_idx2word          : type=<class 'list'>                 len=44247, contents type=<class 'str'>
vocab.lang1_word2idx          : type=<class 'dict'>                
vocab.lang2_idx2word          : type=<class 'list'>                 len=93799, contents type=<class 'str'>
vocab.lang2_word2idx          : type=<class 'dict'>                
train.X                       : type=<class 'list'>                 len=31, contents type=<class 'numpy.ndarray'>
train.y                       : type=<class 'list'>                 len=31, contents type=<class 'numpy.ndarray'>
val.X                         : type=<class 'numpy.ndarray'>        shape=(160000, 882)
val.y                         : type=<class 'list'>                 len=160000, contents type=<class 'str'>
test.X                        : type=<class 'numpy.ndarray'>        shape=(160209, 684)
test.y                        : type=<class 'list'>                 len=160209, contents type=<class 'str'>


In [24]:
# Shapes of each (X, y) training group, ordered by the second dimension of X
# (presumably the source sequence length - confirm against the reader) and
# skipping groups whose X has 2 or fewer columns.
[
    (src.shape, tgt.shape)
    for src, tgt in sorted(
        zip(data['train']['X'], data['train']['y']),
        key=lambda pair: pair[0].shape[1],
    )
    if src.shape[1] > 2
]

[((5889, 3), (5889, 30)),
 ((1852, 4), (1852, 25)),
 ((5880, 5), (5880, 26)),
 ((6132, 6), (6132, 32)),
 ((15746, 7), (15746, 32)),
 ((18777, 8), (18777, 30)),
 ((22049, 9), (22049, 31)),
 ((27057, 10), (27057, 32)),
 ((30250, 11), (30250, 32)),
 ((33015, 12), (33015, 31)),
 ((35229, 13), (35229, 32)),
 ((38235, 14), (38235, 32)),
 ((40093, 15), (40093, 32)),
 ((42081, 16), (42081, 32)),
 ((43547, 17), (43547, 32)),
 ((44804, 18), (44804, 32)),
 ((46057, 19), (46057, 32)),
 ((46366, 20), (46366, 32)),
 ((46773, 21), (46773, 32)),
 ((46944, 22), (46944, 32)),
 ((46618, 23), (46618, 32)),
 ((45600, 24), (45600, 32)),
 ((44575, 25), (44575, 32)),
 ((43294, 26), (43294, 32)),
 ((41318, 27), (41318, 32)),
 ((38765, 28), (38765, 32)),
 ((35988, 29), (35988, 32)),
 ((32868, 30), (32868, 32)),
 ((28754, 31), (28754, 32)),
 ((24893, 32), (24893, 32))]

In [18]:
# Print three randomly drawn sentence pairs from training group 0.
sample_group(data, 0, num_samples=3)

<bos> <unk> who ? &apos; <eos>
<bos> &quot; Wer &apos; ich &apos; ? &quot; <eos> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>

<bos> ( Standing ovation ) <eos>
<bos> ( Die Mitglieder des Parlaments erheben sich und spenden Beifall . ) <eos> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>

<bos> - Before the vote <eos>
<bos> - Vor der Abstimmung <eos> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>



In [None]:
# Clear the default graph before building the model.
tf.reset_default_graph()
# NOTE(review): 64 and 256 are passed positionally; presumably model size
# hyperparameters (e.g. embedding / hidden dimensions) - confirm against NMTModel.
model = NMTModel(data['vocab']['lang1_idx2word'], data['vocab']['lang2_idx2word'], 64, 256)
# NOTE(review): 32 is presumably the batch size - confirm against make_all_graphs.
all_graphs = model.make_all_graphs(32, data['train']['X'], data['train']['y'])
# Write the default graph to the 'logs' directory and flush it to disk.
# NOTE(review): the writer is never closed in this cell - confirm it is reused or closed later.
writer = tf.summary.FileWriter(logdir='logs', graph=tf.get_default_graph())
writer.flush()

In [None]:
with tf.Session() as sess:
    sess.