In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
import collections

np.set_printoptions(precision=4, linewidth=200)

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from utils.reader import europarl_raw_data

In [3]:
from utils.nmt_graph import NMTModel

In [4]:
def show_dict_contents(d):
    """Print a one-line summary for every entry of a two-level nested dict.

    Each line has the form ``outer.inner: type=<type>`` followed by the
    array shape when the value is a ``numpy.ndarray``, or by the length and
    first-element type when the value is a ``list``.

    Args:
        d: Mapping whose values are themselves mappings
            (``{outer_key: {inner_key: value}}``).

    Returns:
        None. The summary is written to stdout.
    """
    for k, v in d.items():
        for k2, v2 in v.items():
            shape_info = ''
            if isinstance(v2, np.ndarray):
                shape_info = ' shape={0}'.format(v2.shape)

            list_info = ''
            if isinstance(v2, list):
                # An empty list has no first element to inspect; report
                # 'n/a' instead of raising IndexError on v2[0].
                contents_type = type(v2[0]) if v2 else 'n/a'
                list_info = ' len={0}, contents type={1}'.format(
                    len(v2),
                    contents_type
                )

            print('{0: <30}: type={1: <30}{2}{3}'.format(
                '{0}.{1}'.format(k, k2),
                str(type(v2)),
                shape_info,
                list_info,
            ))

In [5]:
def unvectorize_sentence(sentence, idx2word):
    """Turn a sequence of vocabulary indices back into a readable string.

    Args:
        sentence: Iterable of indices into ``idx2word``.
        idx2word: Indexable collection mapping an index to its word.

    Returns:
        The words joined by single spaces, with the ``<bos>``, ``<eos>``
        and ``<pad>`` marker tokens left out.
    """
    marker_tokens = ('<bos>', '<eos>', '<pad>')
    words = []
    for idx in sentence:
        word = idx2word[idx]
        if word in marker_tokens:
            continue
        words.append(word)
    return ' '.join(words)

In [6]:
def sample_group(data, group_num, num_samples=10):
    """Print randomly chosen sentence pairs from one training group.

    For every drawn row the source sentence (decoded with
    ``data['vocab']['lang1_idx2word']``) is printed first, then the target
    sentence (decoded with ``data['vocab']['lang2_idx2word']``), then a
    blank line.

    Args:
        data: Dataset dict providing ``data['train']['X']``,
            ``data['train']['y']`` and ``data['vocab']``.
        group_num: Index of the group inside ``data['train']['X']`` /
            ``data['train']['y']`` to draw from.
        num_samples: How many row indices to draw. ``np.random.choice`` is
            used with its default settings, so an index may repeat.
    """
    train = data['train']
    source_group = train['X'][group_num]
    for row in np.random.choice(len(source_group), size=num_samples):
        source_text = unvectorize_sentence(
            source_group[row], data['vocab']['lang1_idx2word'])
        print(source_text)
        target_text = unvectorize_sentence(
            train['y'][group_num][row], data['vocab']['lang2_idx2word'])
        print(target_text)
        print()

In [14]:
# Load the dataset and print a per-entry summary of what it contains.
data = europarl_raw_data()
show_dict_contents(data)

# List the (source shape, target shape) of every training group, ordered by
# the second dimension of the source array; groups whose source arrays have
# two columns or fewer are left out.
group_pairs = zip(data['train']['X'], data['train']['y'])
group_shapes = [
    (source.shape, target.shape)
    for source, target in sorted(group_pairs, key=lambda pair: pair[0].shape[1])
    if source.shape[1] > 2
]
print(group_shapes)

vocab.lang1_idx2word          : type=<class 'list'>                 len=93800, contents type=<class 'str'>
vocab.lang1_word2idx          : type=<class 'dict'>                
vocab.lang2_idx2word          : type=<class 'list'>                 len=44248, contents type=<class 'str'>
vocab.lang2_word2idx          : type=<class 'dict'>                
train.X                       : type=<class 'list'>                 len=4, contents type=<class 'numpy.ndarray'>
train.y                       : type=<class 'list'>                 len=4, contents type=<class 'numpy.ndarray'>
val.X                         : type=<class 'numpy.ndarray'>        shape=(160000, 604)
val.y                         : type=<class 'list'>                 len=160000, contents type=<class 'str'>
test.X                        : type=<class 'numpy.ndarray'>        shape=(160209, 640)
test.y                        : type=<class 'list'>                 len=160209, contents type=<class 'str'>
[((63624, 8), (63624, 32)), ((29

In [8]:
# Print 5 randomly drawn source/target sentence pairs from training group 0.
sample_group(data, 0, num_samples=5)

Ich habe kein anderes Land .
I do not have another country .

Gibt es Einwände ?
Are there any comments ?

Es geht noch weiter .
There is more .

Es wird uns gelingen .
We will succeed .

Aber das wissen wir .
But we know that .



In [23]:
# Shape of the source-side array of training group 1, the group the training
# loop slices its batches from.
data['train']['X'][1].shape

(377096, 24)

In [15]:
# Clear the default graph first so that re-running this cell rebuilds the
# model from scratch instead of adding to the ops already defined.
tf.reset_default_graph()

# NOTE(review): the meaning of the numeric arguments (128, 512, and 16/30/32
# below) is defined by NMTModel in utils.nmt_graph, which is not visible
# here; confirm against that module before changing them.
model = NMTModel(data['vocab']['lang1_idx2word'], data['vocab']['lang2_idx2word'], 128, 512)
all_graphs = model.make_all_graphs(16, data['train']['X'], data['train']['y'])
eval_graph = model.make_eval_graph(16, 30, 32, data['vocab']['lang2_word2idx']['<bos>'])

# Write the graph definition under 'logs' for TensorBoard. close() flushes
# pending events AND releases the event file; the previous flush()-only
# version left the file open, and the handle became unreachable as soon as
# the name `writer` was rebound elsewhere in the notebook.
writer = tf.summary.FileWriter(logdir='logs', graph=tf.get_default_graph())
writer.close()

In [11]:
# Summarise the placeholders, outputs and train ops exposed by the first
# training graph.
show_dict_contents(all_graphs[0]['inputs_and_outputs'])

placeholders.inputs           : type=<class 'tensorflow.python.framework.ops.Tensor'>
placeholders.targets          : type=<class 'tensorflow.python.framework.ops.Tensor'>
placeholders.learning_rate    : type=<class 'tensorflow.python.framework.ops.Tensor'>
placeholders.max_norm         : type=<class 'tensorflow.python.framework.ops.Tensor'>
outputs.loss                  : type=<class 'tensorflow.python.framework.ops.Tensor'>
outputs.num_correct_predictions: type=<class 'tensorflow.python.framework.ops.Tensor'>
train_ops.train_op            : type=<class 'tensorflow.python.framework.ops.Operation'>
train_ops.gradient_global_norm: type=<class 'tensorflow.python.framework.ops.Tensor'>


In [24]:
# --- Training configuration -------------------------------------------------
INITIAL_LR = 5e0
MAX_NORM = 1
BATCH_SIZE = 16          # rows fed per training step
STEPS_PER_REPORT = 100   # training steps between two validation printouts
NUM_EPOCHS = 5           # passes over the selected training group
EVAL_BATCH_SIZE = 16     # validation sentences decoded per printout
EVAL_INPUT_LEN = 30      # validation columns fed to the eval graph (after column 0)

# NOTE(review): batches are taken from training group 1 but are fed to
# all_graphs[2]. Whether graph index 2 is the graph built for group 1
# cannot be verified from this notebook -- confirm against
# NMTModel.make_all_graphs.
TRAIN_GROUP = 1
TRAIN_GRAPH = 2

train_X = data['train']['X'][TRAIN_GROUP]
train_y = data['train']['y'][TRAIN_GROUP]
graph_io = all_graphs[TRAIN_GRAPH]['inputs_and_outputs']
placeholders = graph_io['placeholders']

# Number of full STEPS_PER_REPORT-sized chunks that fit in one pass over the
# group. Derived from the data instead of hard-coded: for the 377096-row
# group this is 377096 // (16 * 100) == 235, the value used previously.
REPORTS_PER_EPOCH = len(train_X) // (BATCH_SIZE * STEPS_PER_REPORT)

# Defined up front so the final print works even if no training step runs.
outputs = None

with tf.Session() as sess:
    run_id = time.time()
    writer = tf.summary.FileWriter('logs/{0}'.format(run_id), sess.graph)
    coord = tf.train.Coordinator()
    threads = []
    # Everything that can fail or be interrupted runs inside try/finally so
    # the event file is closed and the queue-runner threads are stopped even
    # when the loop is aborted (e.g. by a KeyboardInterrupt from the
    # notebook's "interrupt kernel").
    try:
        sess.run(tf.global_variables_initializer())
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        learning_rate = INITIAL_LR
        max_norm = MAX_NORM
        # Fetch the outputs and the train ops together in a single run call.
        training_outputs = {
            **graph_io['outputs'],
            **graph_io['train_ops'],
        }
        for i in range(REPORTS_PER_EPOCH * NUM_EPOCHS):
            for j in range(STEPS_PER_REPORT):
                # Position of this step inside the current pass; wraps back
                # to the start of the group every REPORTS_PER_EPOCH chunks.
                step_in_epoch = (i % REPORTS_PER_EPOCH) * STEPS_PER_REPORT + j
                start_idx = step_in_epoch * BATCH_SIZE
                end_idx = start_idx + BATCH_SIZE
                # The first and last source columns are dropped --
                # presumably boundary tokens; TODO confirm against the reader.
                inputs = train_X[start_idx:end_idx, 1:-1]
                targets = train_y[start_idx:end_idx]
                outputs = sess.run(
                    training_outputs,
                    feed_dict={
                        placeholders['inputs']: inputs,
                        placeholders['targets']: targets,
                        # Constant rate; a decay of
                        # (10.0 / (10.0 + np.sqrt(i))) was tried and is
                        # currently disabled.
                        placeholders['learning_rate']: learning_rate,
                        placeholders['max_norm']: max_norm,
                    }
                )

            # Decode a random handful of validation sentences so progress
            # can be judged by eye.
            samples = np.random.choice(len(data['val']['X']), size=EVAL_BATCH_SIZE)
            eval_inputs = data['val']['X'][samples][:, 1:1 + EVAL_INPUT_LEN]
            correct_outputs = [data['val']['y'][sample] for sample in samples]
            eval_outputs = sess.run(
                eval_graph['outputs']['outputs'],
                feed_dict={
                    eval_graph['placeholders']['inputs']: eval_inputs,
                }
            )
            print('-' * 40)
            print('i = {0}'.format(i))
            # Metrics of the last training step of this chunk.
            print(outputs)
            for sample_idx, sample in enumerate(samples):
                print(unvectorize_sentence(eval_inputs[sample_idx], data['vocab']['lang1_idx2word']))
                print(correct_outputs[sample_idx])
                print(unvectorize_sentence(eval_outputs[sample_idx], data['vocab']['lang2_idx2word']))
                print()
            print('-' * 40)
    finally:
        # Bookkeeping
        writer.close()
        coord.request_stop()
        coord.join(threads)

print(outputs)

----------------------------------------
i = 0
{'loss': 5.7078514, 'num_correct_predictions': 204, 'train_op': None, 'gradient_global_norm': 0.86170268}
Wenn es der Kommission ernst mit ihren Maßnahmen gegen das Rauchen ist , dann könnte sie sich und der europäischen Öffentlichkeit einen besseren Dienst erweisen , wenn sie sich auf
<bos> If the Commission is serious about its anti-smoking policy then it would serve itself and the European public better by concentrating on education programmes rather than seeking publicity by ill-conceived and badly executed means . <eos>
This a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a

Daher enthält die Richtlinie eine Reihe von Bestimmungen zur Selbstregulierung , worüber ich sehr zufrieden bin .
<bos> A number of provisions have also been included in this regard , which I am very pleased about . <eos>
This a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a

Darüber besteht bislang noch keine Einigkeit , es finden viele D

KeyboardInterrupt: 