In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders 
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, EncoderDecoderAttn, DecoderAttnRNN
from train_eval import count_parameters, summarize_results, plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval
import importlib
import pickle as pkl 
import torch

In [2]:
# Experiment configuration: every tunable value lives in this cell. The next
# cell copies these constants into `params`, which is handed to train_and_eval.

# model identification
MODEL_NAME = 'zh-seq2seq-rnn-attn'
SRC_LANG = 'zh'
TARG_LANG = 'en'

# data processing params
SRC_MAX_SENTENCE_LEN = 20
TARG_MAX_SENTENCE_LEN = 20
SRC_VOCAB_SIZE = 30000
TARG_VOCAB_SIZE = 30000

# model architecture params
RNN_CELL_TYPE = 'gru'
NUM_LAYERS = 2
ENC_HIDDEN_DIM = 256  # previously tried: 512
# Decoder width is tied to the encoder width.
# NOTE(review): presumably 2x because the encoder is bidirectional -- confirm in model.py
DEC_HIDDEN_DIM = 2 * ENC_HIDDEN_DIM
TEACHER_FORCING_RATIO = 1
CLIP_GRAD_MAX_NORM = 1
ENC_DROPOUT = 0.2  # TODO: to actually implement
DEC_DROPOUT = 0.2  # TODO: to actually implement
# This notebook builds the attention model (DecoderAttnRNN / EncoderDecoderAttn)
# and MODEL_NAME says "attn", so the flag recorded in `params` must say so too.
# It was previously False, which mislabelled the run in the saved parameters.
USE_ATTN = True

# training params
BATCH_SIZE = 64  # previously tried: 32
NUM_EPOCHS = 20
LR = 0.0003  # previously tried: 0.0005
OPTIMIZER = 'Adam'
LAZY_TRAIN = False

In [3]:
# Snapshot of the configuration constants as a plain dict, so the settings
# can be saved next to the results later.
params = {
    # identification
    'model_name': MODEL_NAME,
    'src_lang': SRC_LANG,
    'targ_lang': TARG_LANG,
    'rnn_cell_type': RNN_CELL_TYPE,
    # data processing
    'src_max_sentence_len': SRC_MAX_SENTENCE_LEN,
    'targ_max_sentence_len': TARG_MAX_SENTENCE_LEN,
    'src_vocab_size': SRC_VOCAB_SIZE,
    'targ_vocab_size': TARG_VOCAB_SIZE,
    # architecture
    'num_layers': NUM_LAYERS,
    'enc_hidden_dim': ENC_HIDDEN_DIM,
    'dec_hidden_dim': DEC_HIDDEN_DIM,
    'teacher_forcing_ratio': TEACHER_FORCING_RATIO,
    'clip_grad_max_norm': CLIP_GRAD_MAX_NORM,
    'enc_dropout': ENC_DROPOUT,
    'dec_dropout': DEC_DROPOUT,
    'use_attn': USE_ATTN,
    # training
    'batch_size': BATCH_SIZE,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LR,
    'optimizer': OPTIMIZER,
    'lazy_train': LAZY_TRAIN,
}

In [4]:
# One-time step: building the vocab takes a long time, so it is cached to a pickle and reloaded in the next cell. Uncomment to regenerate the cache.
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [5]:
# Reload the cached vocab from pickle instead of regenerating it.
# NOTE: unpickling can execute arbitrary code -- only load a vocab file you produced yourself.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
with open(vocab_filename, "rb") as vocab_file:  # closes the handle even if unpickling fails
    vocab = pkl.load(vocab_file)

# Process the corpus three times with the same settings; the two extra copies
# pass `sample_limit`.
# NOTE(review): presumably sample_limit caps the number of sentence pairs kept
# (one batch's worth and 1000 respectively) -- confirm in data_processing.
data = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab)
data_minibatch = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab,
                              sample_limit=BATCH_SIZE)
data_minitrain = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab,
                              sample_limit=1000)

In [6]:
# Build one set of dataloaders per processed dataset (full, single-batch, 1000-sample),
# all with the same sentence-length limits and batch size.
loaders_full, loaders_minibatch, loaders_minitrain = [
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_minibatch, data_minitrain)
]

In [7]:
# Define the model: an RNN encoder plus an attention decoder, both initialised
# from the pretrained word vectors stored in `vocab`.

# Encoder over the source language.
src_pretrained_emb = get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id'])
encoder = EncoderRNN(
    rnn_cell_type=RNN_CELL_TYPE,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    enc_dropout=ENC_DROPOUT,
    pretrained_word2vec=src_pretrained_emb,
)

# Alternative without attention (kept for reference, not executed):
# decoder = DecoderRNN(dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
#                      targ_vocab_size=TARG_VOCAB_SIZE, targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
#                      pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
# model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id'])

# Decoder with attention over the target language.
targ_pretrained_emb = get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id'])
decoder = DecoderAttnRNN(
    rnn_cell_type=RNN_CELL_TYPE,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    targ_vocab_size=TARG_VOCAB_SIZE,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    dec_dropout=DEC_DROPOUT,
    pretrained_word2vec=targ_pretrained_emb,
)

# Wrap both halves together with the target token -> id mapping.
targ_token2id = vocab[TARG_LANG]['token2id']
model = EncoderDecoderAttn(encoder, decoder, targ_token2id)

In [None]:
# Train and periodically evaluate the model; rebinds `model` and collects `results`.
# NOTE(review): the log below reports Train Loss / Train BLEU as 0.00 at every
# checkpoint -- presumably lazy_eval=True skips train-set evaluation; confirm in train_eval.
model, results = train_and_eval(
    model=model,
    loaders_full=loaders_full,
    loaders_minibatch=loaders_minibatch,
    loaders_minitrain=loaders_minitrain,
    params=params,
    vocab=vocab,
    print_intermediate=100,
    save_checkpoint=True,
    save_to_log=True,
    lazy_eval=True,
    print_attn=True,
    inspect_samples=1,
)

Epoch: 0.00, Train Loss: 0.00, Val Loss: 10.23, Train BLEU: 0.00, Val BLEU: 0.07, Minutes Elapsed: 0.17
Sampling from val predictions...
Source: 超过 70 的 家庭 家庭暴力 暴力 谋杀 发生 生在 受害 受害者 结束 这段 关系 后 在 她 离开 之后 因为
Reference: over 70 percent of domestic violence murders happen after the victim has ended the relationship , after she &apos;s
Model: <SOS> soaring soaring soaring the the sleeper <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0409, 0.0424, 0.0447, 0.0474, 0.0460, 0.0486, 0.0511, 0.0518, 0.0488,
         0.0495, 0.0511, 0.0521, 0.0505, 0.0494, 0.0515, 0.0537, 0.0556, 0.0557,
         0.0550, 0.0544],
        [0.0409, 0.0424, 0.0447, 0.0474, 0.0459, 0.0485, 0.0511, 0.0518, 0.0488,
         0.0495, 0.0511, 0.0521, 0.0505, 0.0494, 0.0515, 0.0537, 0.0556, 0.0557,
         0.0550, 0.0544],
        [0.0409, 0.0424, 0.0447, 0.0474, 0.0459, 0.0485, 0.0511, 0.0518, 0.0488,
         0.0495, 0.0510, 0.0521, 0.0505, 0.0494, 0.0515, 0.053

Epoch: 0.16, Train Loss: 0.00, Val Loss: 6.20, Train BLEU: 0.00, Val BLEU: 2.34, Minutes Elapsed: 15.32
Sampling from val predictions...
Source: 我们 是 一个 自由 的 组织 由来 来自 不同 同行 <UNK> 行业 不同 <UNK> 城市 的 <UNK> 组成 大家 全都
Reference: what we do , we &apos;re a <UNK> kind of group , where it &apos;s composed of gardeners from
Model: <SOS> and &apos;s , , , , , , , the the , . , . . . <EOS> .
Attention Weights: tensor([[0.0273, 0.0744, 0.0804, 0.0872, 0.0982, 0.0917, 0.0893, 0.0834, 0.0736,
         0.0551, 0.0095, 0.0618, 0.0473, 0.0074, 0.0478, 0.0354, 0.0038, 0.0190,
         0.0069, 0.0005],
        [0.0373, 0.0652, 0.0718, 0.0765, 0.0811, 0.0799, 0.0792, 0.0766, 0.0715,
         0.0611, 0.0226, 0.0649, 0.0549, 0.0188, 0.0526, 0.0415, 0.0108, 0.0227,
         0.0099, 0.0014],
        [0.0402, 0.0608, 0.0665, 0.0702, 0.0732, 0.0731, 0.0729, 0.0715, 0.0683,
         0.0614, 0.0303, 0.0640, 0.0570, 0.0264, 0.0547, 0.0458, 0.0172, 0.0284,
         0.0150, 0.0031],
        [0.0413, 0.0599, 0.0652, 0.

Epoch: 0.33, Train Loss: 0.00, Val Loss: 6.09, Train BLEU: 0.00, Val BLEU: 1.80, Minutes Elapsed: 30.47
Sampling from val predictions...
Source: 我们 用 的 就是 清洁 <UNK> 用 的 钢丝 <UNK> 钢丝 <UNK> 可以 <UNK> 很小 小块 我们 把 这些 些小
Reference: so what we can do is use steel wool just to clean <UNK> , and the steel wool we
Model: <SOS> and , , a a , , , , , the the . <EOS> <EOS> . . . <EOS>
Attention Weights: tensor([[0.3186, 0.1996, 0.0811, 0.0643, 0.0076, 0.0012, 0.0447, 0.0184, 0.0035,
         0.0005, 0.0025, 0.0005, 0.0083, 0.0006, 0.0138, 0.0017, 0.1391, 0.0703,
         0.0215, 0.0022],
        [0.1303, 0.1314, 0.0921, 0.0866, 0.0243, 0.0071, 0.0808, 0.0480, 0.0162,
         0.0040, 0.0129, 0.0041, 0.0280, 0.0047, 0.0374, 0.0087, 0.1388, 0.0940,
         0.0429, 0.0078],
        [0.1058, 0.1088, 0.0859, 0.0829, 0.0335, 0.0134, 0.0798, 0.0548, 0.0248,
         0.0086, 0.0209, 0.0088, 0.0372, 0.0096, 0.0456, 0.0153, 0.1157, 0.0866,
         0.0485, 0.0136],
        [0.0817, 0.0847, 0.0748, 0.0739, 0.04

Epoch: 0.49, Train Loss: 0.00, Val Loss: 5.89, Train BLEU: 0.00, Val BLEU: 4.00, Minutes Elapsed: 45.58
Sampling from val predictions...
Source: 你 可以 以是 是非 非凡 的 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: you can be extraordinary . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> i i &apos;t a . . <EOS> . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9983, 0.0016, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.9873, 0.0112, 0.0007, 0.0005, 0.0001, 0.0001, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.8697, 0.0973, 0.0163, 0.0093, 0.0032, 0.0028, 0.0013, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0

Epoch: 0.66, Train Loss: 0.00, Val Loss: 5.80, Train BLEU: 0.00, Val BLEU: 3.34, Minutes Elapsed: 60.73
Sampling from val predictions...
Source: 事实 实是 康 纳 在 一个 周五 的 晚上 回到 家 告诉 我 他 辞掉 了 他 的 工作 他
Reference: instead , <UNK> came home one friday evening and he told me that he had quit his job that
Model: <SOS> and , the , i i i , i i , to . . . . . <EOS> .
Attention Weights: tensor([[7.3082e-01, 9.8949e-03, 1.8067e-02, 1.6319e-02, 9.7914e-02, 4.6987e-02,
         3.4414e-03, 1.8736e-02, 1.1331e-03, 1.5048e-03, 5.5812e-04, 3.8547e-03,
         5.0632e-02, 8.3380e-05, 1.2204e-05, 1.9021e-05, 1.5853e-05, 2.0359e-06,
         8.4031e-07, 2.3728e-06],
        [3.6100e-01, 2.2830e-02, 6.7364e-02, 5.7900e-02, 2.1157e-01, 9.9073e-02,
         1.5521e-02, 4.7642e-02, 5.7081e-03, 6.1564e-03, 2.1261e-03, 6.2002e-03,
         9.5707e-02, 6.6784e-04, 1.2931e-04, 1.6424e-04, 1.6261e-04, 3.2094e-05,
         1.3005e-05, 3.2296e-05],
        [6.0216e-02, 3.5948e-02, 9.0614e-02, 8.9853e-02, 1.5982e-01, 1.2

Epoch: 0.77, Train Loss: 0.00, Val Loss: 5.87, Train BLEU: 0.00, Val BLEU: 3.74, Minutes Elapsed: 70.88
Sampling from val predictions...
Source: 我 能够 结束 自己 疯狂 的 爱 的 故事 靠 的 是 打破 沉默 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: i was able to end my own crazy love story by breaking the silence . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> i &apos;m to to to to the , , the the <EOS> . <EOS> . <EOS> <EOS> . <EOS>
Attention Weights: tensor([[0.9941, 0.0058, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.9393, 0.0595, 0.0009, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.2786, 0.5369, 0.0942, 0.0605, 0.0094, 0.0046, 0.0035, 0.0024, 0.0010,
         0.0021, 0.0019, 0.0020, 0.0008, 0.0012, 0.0011, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.

Epoch: 0.93, Train Loss: 0.00, Val Loss: 5.75, Train BLEU: 0.00, Val BLEU: 3.65, Minutes Elapsed: 86.12
Sampling from val predictions...
Source: 27 岁 时 我 做出 一个 决定 只 基于 人们 的 需求 提供 提供援助 援助 我 创立 了 一套 工作
Reference: i decided when i was 27 years old to only respond to people , and i invented a system
Model: <SOS> i i to to the to , and , , , . . . <EOS> . . . <EOS>
Attention Weights: tensor([[5.3005e-04, 7.5433e-03, 1.8679e-01, 8.0422e-01, 5.1440e-04, 1.8013e-04,
         1.0196e-04, 5.6642e-05, 8.9066e-06, 4.0099e-05, 1.8845e-06, 9.5221e-07,
         4.8574e-07, 7.0377e-07, 1.6002e-06, 1.0179e-05, 9.0752e-08, 1.6808e-07,
         1.3340e-07, 1.8132e-07],
        [1.6853e-03, 2.0949e-02, 2.1279e-01, 7.6274e-01, 1.0953e-03, 3.8615e-04,
         1.7814e-04, 1.0795e-04, 1.2660e-05, 4.1695e-05, 3.2854e-06, 1.0662e-06,
         5.9550e-07, 7.4261e-07, 1.9921e-06, 7.6480e-06, 1.4341e-07, 2.5923e-07,
         1.3801e-07, 1.4697e-07],
        [4.5900e-03, 2.0091e-02, 7.2747e-02, 7.0222e-01, 7.6994e

Epoch: 1.00, Train Loss: 0.00, Val Loss: 5.76, Train BLEU: 0.00, Val BLEU: 4.17, Minutes Elapsed: 92.34
Sampling from val predictions...
Source: 在 西藏 <UNK> <UNK> 文化 中 <UNK> 尤其 显得 重要 <UNK> 在 那些 类似 西藏 的 地方 既 没有 条件
Reference: in tibetan culture , they are performing very important sky burials . in places like tibet , there are
Model: <SOS> the the of , the the &apos;t , , , . . <EOS> <EOS> . <EOS> . <EOS> <EOS>
Attention Weights: tensor([[0.4050, 0.0118, 0.0006, 0.0017, 0.1365, 0.2564, 0.0089, 0.1258, 0.0204,
         0.0119, 0.0008, 0.0072, 0.0117, 0.0003, 0.0001, 0.0004, 0.0001, 0.0003,
         0.0001, 0.0000],
        [0.3590, 0.0299, 0.0026, 0.0050, 0.1582, 0.2812, 0.0139, 0.0826, 0.0231,
         0.0190, 0.0021, 0.0086, 0.0116, 0.0010, 0.0004, 0.0009, 0.0003, 0.0004,
         0.0001, 0.0000],
        [0.0246, 0.0226, 0.0122, 0.0224, 0.1302, 0.2213, 0.0495, 0.1429, 0.0890,
         0.0891, 0.0197, 0.0434, 0.1059, 0.0128, 0.0037, 0.0050, 0.0025, 0.0021,
         0.0008, 0.0003],
     

Epoch: 1.11, Train Loss: 0.00, Val Loss: 5.64, Train BLEU: 0.00, Val BLEU: 3.59, Minutes Elapsed: 102.48
Sampling from val predictions...
Source: 而 如今 我 已经 是 一名 自豪 的 <UNK> 学院 的 毕业 毕业生 业生 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: instead , i stand here a proud graduate of <UNK> college . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> and , i i a a a of a a . . <EOS> <EOS> . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.1747, 0.1829, 0.6396, 0.0024, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0662, 0.1060, 0.8254, 0.0022, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0219, 0.1080, 0.7765, 0.0857, 0.0059, 0.0005, 0.0007, 0.0002, 0.0000,
         0.0001, 0.0001, 0.0000, 0.0000, 0.0001, 0.0002, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
    

Epoch: 1.27, Train Loss: 0.00, Val Loss: 5.61, Train BLEU: 0.00, Val BLEU: 4.22, Minutes Elapsed: 117.80
Sampling from val predictions...
Source: 还 不算 毁坏 得 太严 严重 但是 被 <UNK> <UNK> 的 地方 导致 照片 片中 中小 小女 小女孩 女孩 的
Reference: not terribly damaged , but where the water had caused that <UNK> on the girl &apos;s face had to
Model: <SOS> it &apos;s , , , , , , , , , , <UNK> . . . <EOS> . <EOS>
Attention Weights: tensor([[9.4284e-01, 5.5886e-02, 7.4071e-04, 3.1587e-04, 1.8037e-05, 1.1719e-04,
         7.4198e-05, 9.6756e-06, 2.8357e-07, 2.3391e-07, 8.7274e-07, 6.1466e-07,
         2.5165e-07, 6.1795e-08, 4.1738e-08, 4.7329e-08, 9.1714e-09, 7.6305e-09,
         4.9439e-08, 6.6408e-08],
        [2.3586e-01, 3.7974e-01, 2.2887e-01, 8.9015e-02, 1.0557e-02, 2.5798e-02,
         1.9728e-02, 8.0094e-03, 3.5064e-04, 2.7392e-04, 7.2933e-04, 5.6149e-04,
         2.3814e-04, 7.0466e-05, 4.9845e-05, 5.3822e-05, 1.0173e-05, 8.4716e-06,
         3.7362e-05, 3.8998e-05],
        [2.7285e-02, 1.2134e-01, 2.2525e-

Epoch: 1.38, Train Loss: 0.00, Val Loss: 5.61, Train BLEU: 0.00, Val BLEU: 3.38, Minutes Elapsed: 127.98
Sampling from val predictions...
Source: 当 他 长大 了 他 变得 更加 独特 这些 特别 别的 地方 更加 明显 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: and as he grew older , he grew more different , and the differences became more obvious . <EOS>
Model: <SOS> and he he , , , , , , to , . the &apos;s . . . . <EOS>
Attention Weights: tensor([[0.0284, 0.9607, 0.0032, 0.0010, 0.0066, 0.0001, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0030, 0.9505, 0.0251, 0.0023, 0.0185, 0.0004, 0.0001, 0.0000, 0.0001,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0038, 0.4543, 0.1896, 0.0415, 0.2765, 0.0253, 0.0030, 0.0010, 0.0018,
         0.0007, 0.0008, 0.0003, 0.0003, 0.0003, 0.0007, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0036, 0.232

Epoch: 1.55, Train Loss: 0.00, Val Loss: 5.50, Train BLEU: 0.00, Val BLEU: 3.94, Minutes Elapsed: 143.29
Sampling from val predictions...
Source: 我 原本 以为 世界 上 只有 我 一个 个人 会 继续 留在 一个 一个打 我 的 男人 人身 身边 但是
Reference: i would have told you myself that i was the last person on earth who would stay with a
Model: <SOS> i i a my my my my my my my my , , my , <EOS> . . <EOS>
Attention Weights: tensor([[9.2633e-01, 8.4187e-03, 3.1097e-02, 2.6569e-02, 4.4943e-03, 1.0225e-03,
         1.8527e-03, 5.9032e-05, 1.2667e-04, 1.5741e-05, 1.9457e-06, 2.1723e-06,
         1.7177e-06, 5.3046e-07, 6.4603e-06, 2.5326e-07, 4.4308e-07, 2.3778e-07,
         1.7695e-07, 1.1698e-07],
        [1.0911e-01, 9.9887e-02, 4.9030e-01, 2.3672e-01, 2.9422e-02, 1.0896e-02,
         1.4888e-02, 2.5293e-03, 4.2254e-03, 7.4227e-04, 2.3138e-04, 2.9212e-04,
         1.4735e-04, 7.4046e-05, 2.4914e-04, 7.2767e-05, 1.1744e-04, 5.8948e-05,
         3.0135e-05, 1.2053e-05],
        [4.9277e-02, 6.6026e-02, 4.5769e-01, 2.4680e-01, 5.3

Epoch: 1.66, Train Loss: 0.00, Val Loss: 5.49, Train BLEU: 0.00, Val BLEU: 4.19, Minutes Elapsed: 153.50
Sampling from val predictions...
Source: 食物 是 问题 食物 也 是 解决 解决方案 方案 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: food is the problem and food is the solution . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> the is is a of the is <EOS> . . <EOS> <EOS> . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9762, 0.0225, 0.0013, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.8917, 0.0854, 0.0218, 0.0008, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1872, 0.2940, 0.4490, 0.0410, 0.0138, 0.0046, 0.0026, 0.0017, 0.0031,
         0.0029, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000

Epoch: 1.82, Train Loss: 0.00, Val Loss: 5.46, Train BLEU: 0.00, Val BLEU: 4.12, Minutes Elapsed: 168.83
Sampling from val predictions...
Source: 这个 个人 还 得 去 <UNK> 条约 和 接见 <UNK> <UNK> <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: this guy has to go and sign treaties and meet foreign <UNK> . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> and &apos;s of they to to the and the <EOS> . <EOS> <EOS> <EOS> <EOS> . <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9836, 0.0163, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1393, 0.8301, 0.0274, 0.0018, 0.0007, 0.0001, 0.0003, 0.0001, 0.0000,
         0.0000, 0.0001, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0822, 0.7056, 0.1707, 0.0226, 0.0100, 0.0010, 0.0028, 0.0007, 0.0007,
         0.0005, 0.0009, 0.0022, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0

Epoch: 1.99, Train Loss: 0.00, Val Loss: 5.47, Train BLEU: 0.00, Val BLEU: 4.15, Minutes Elapsed: 184.20
Sampling from val predictions...
Source: 我 不能 披露 太多 我 离开 朝鲜 时 的 细节 但是 我 只能 说 那 是 在 饥荒 中 最
Reference: i can &apos;t reveal many details &#91; about &#93; how i left north korea , but i only can
Model: <SOS> i i &apos;t my my i , i i , i i a the the . . . .
Attention Weights: tensor([[6.3450e-01, 2.3022e-01, 2.1254e-02, 1.0652e-01, 3.0188e-03, 1.7539e-04,
         1.1694e-03, 4.8315e-04, 2.0362e-04, 6.2013e-04, 1.4570e-03, 3.3760e-04,
         2.7218e-05, 6.4545e-06, 7.3466e-07, 1.7616e-07, 2.2553e-08, 1.3311e-07,
         1.4360e-07, 1.1162e-07],
        [4.7713e-02, 7.6442e-01, 1.1394e-01, 6.0669e-02, 7.4317e-03, 2.1035e-03,
         1.6637e-03, 4.7165e-04, 2.0173e-04, 3.1039e-04, 5.4506e-04, 3.1343e-04,
         1.3900e-04, 3.4434e-05, 1.4322e-05, 7.0077e-06, 2.3254e-06, 8.8150e-06,
         5.0153e-06, 2.9720e-06],
        [2.9640e-02, 5.5310e-01, 1.0471e-01, 1.7024e-01, 8.1801e-0

Epoch: 2.05, Train Loss: 0.00, Val Loss: 5.41, Train BLEU: 0.00, Val BLEU: 3.64, Minutes Elapsed: 190.47
Sampling from val predictions...
Source: 你们 应该 看看 这些 垃圾 掌声 你们 应该 看看 我们 塞 了 多少 垃圾 给 这些 满怀 信任 的 非洲
Reference: you should see the rubbish — -- you should see the rubbish that we have <UNK> on unsuspecting african
Model: <SOS> you you , , , of we we &apos;re the the the of the in in . <EOS> .
Attention Weights: tensor([[8.0456e-01, 1.8576e-01, 7.2864e-03, 6.8758e-04, 3.6355e-04, 9.9578e-05,
         1.0277e-03, 1.7211e-04, 2.9879e-05, 4.2794e-06, 1.9653e-06, 1.1723e-06,
         4.1025e-06, 2.5386e-07, 7.3778e-08, 6.2882e-08, 2.0923e-08, 1.4084e-08,
         4.4098e-09, 2.5148e-08],
        [2.1034e-01, 3.2271e-01, 1.8543e-01, 1.9679e-01, 4.5871e-02, 6.8194e-03,
         1.8496e-02, 6.6353e-03, 2.0660e-03, 2.1093e-03, 8.0778e-04, 4.8336e-04,
         1.2062e-03, 1.1971e-04, 2.9935e-05, 4.4424e-05, 1.7393e-05, 1.1751e-05,
         5.6496e-06, 1.5567e-05],
        [8.6693e-02, 1.4276e-01,

Epoch: 2.16, Train Loss: 0.00, Val Loss: 5.47, Train BLEU: 0.00, Val BLEU: 4.30, Minutes Elapsed: 200.74
Sampling from val predictions...
Source: 我 确实 实有 太多 从没 机会 穿 的 8 英寸 高跟鞋 但是 免费 的 东西 却是 我 在 现实 现实生活
Reference: i do have too many <UNK> heels which i never get to wear , except for earlier , but
Model: <SOS> i i &apos;t have to to , , i my a my . . . . <EOS> . <EOS>
Attention Weights: tensor([[8.2044e-01, 1.4672e-01, 1.3374e-02, 1.8543e-02, 5.7780e-04, 2.5195e-04,
         3.2771e-05, 8.5966e-06, 2.7030e-06, 1.9053e-05, 9.6405e-06, 7.1923e-06,
         2.7602e-06, 9.8856e-07, 3.3320e-06, 1.7936e-06, 5.6119e-07, 3.1129e-08,
         5.7075e-08, 3.5415e-08],
        [1.3380e-01, 4.6649e-01, 1.3086e-01, 2.2230e-01, 2.6961e-02, 1.6171e-02,
         1.5599e-03, 4.8559e-04, 1.3567e-04, 3.7973e-04, 2.3904e-04, 2.4245e-04,
         1.3006e-04, 5.3420e-05, 8.2865e-05, 4.1455e-05, 4.0792e-05, 6.7421e-06,
         9.0517e-06, 5.1429e-06],
        [6.7233e-02, 2.1612e-01, 1.1741e-01, 3.8631e-01, 9

Epoch: 2.27, Train Loss: 0.00, Val Loss: 5.38, Train BLEU: 0.00, Val BLEU: 4.28, Minutes Elapsed: 211.00
Sampling from val predictions...
Source: 我 发现 美国 国人 能 看出 我们 变化 中 的 脆弱 脆弱性 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: i find that americans see the fragility in changes . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> i i the the , the the of the , we &apos;re have <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9981, 0.0012, 0.0006, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.4118, 0.4435, 0.1381, 0.0055, 0.0009, 0.0001, 0.0001, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1429, 0.1712, 0.5516, 0.1137, 0.0132, 0.0018, 0.0020, 0.0009, 0.0006,
         0.0004, 0.0004, 0.0003, 0.0009, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
 

Epoch: 2.44, Train Loss: 0.00, Val Loss: 5.33, Train BLEU: 0.00, Val BLEU: 4.28, Minutes Elapsed: 226.37
Sampling from val predictions...
Source: 这些 井 有 差不多 不多 有 90 多米 深 他们 <UNK> <UNK> 出来 装满 <UNK> 的 袋子 这些 石头 被
Reference: the shafts are up to 300 feet deep , and they carry out heavy bags of stone that later
Model: <SOS> these are of they <UNK> they their their their they they they them them them . <EOS> . <EOS>
Attention Weights: tensor([[7.2936e-01, 1.7139e-02, 8.2352e-02, 1.0701e-01, 5.7900e-02, 4.1197e-03,
         9.3258e-04, 4.2123e-04, 1.6961e-04, 5.7366e-04, 1.1135e-06, 1.1350e-06,
         2.4205e-06, 9.2868e-07, 3.8596e-07, 2.4242e-06, 1.3419e-06, 3.7807e-06,
         3.9726e-07, 8.9233e-07],
        [1.0111e-01, 4.2588e-02, 1.3966e-01, 3.8347e-01, 2.8029e-01, 3.3560e-02,
         9.1147e-03, 2.9899e-03, 1.5913e-03, 5.0837e-03, 7.3389e-05, 8.7018e-05,
         1.0766e-04, 4.7483e-05, 3.0008e-05, 5.6430e-05, 3.7861e-05, 7.7384e-05,
         1.4485e-05, 1.7078e-05],
        [2.49

Epoch: 2.55, Train Loss: 0.00, Val Loss: 5.35, Train BLEU: 0.00, Val BLEU: 4.39, Minutes Elapsed: 236.65
Sampling from val predictions...
Source: 我 的 行为 很 有 可能 会 让 他们 的 境遇 比 现在 更糟 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: i could get them in a worse situation than they were already in . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> my mean , be , , , , , they , . . . . <EOS> . <EOS> <EOS>
Attention Weights: tensor([[0.8693, 0.0149, 0.0864, 0.0214, 0.0031, 0.0048, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0330, 0.0204, 0.8012, 0.1032, 0.0222, 0.0194, 0.0005, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0310, 0.0084, 0.1955, 0.3389, 0.1307, 0.2679, 0.0234, 0.0013, 0.0016,
         0.0001, 0.0004, 0.0001, 0.0001, 0.0002, 0.0002, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0196, 0.0036

Epoch: 2.71, Train Loss: 0.00, Val Loss: 5.32, Train BLEU: 0.00, Val BLEU: 4.54, Minutes Elapsed: 252.09
Sampling from val predictions...
Source: 我 11 岁 那年 记得 得有 一天 早晨 醒来 听见 家里 有 愉悦 的 声音 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: when i was 11 , i remember waking up one morning to the sound of joy in my house
Model: <SOS> i i was in , , i a a a a a <EOS> . . <EOS> . <EOS> .
Attention Weights: tensor([[0.9430, 0.0273, 0.0252, 0.0013, 0.0030, 0.0001, 0.0001, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0888, 0.2131, 0.6465, 0.0297, 0.0170, 0.0012, 0.0021, 0.0007, 0.0003,
         0.0001, 0.0001, 0.0001, 0.0001, 0.0000, 0.0001, 0.0001, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0753, 0.1122, 0.6764, 0.0544, 0.0511, 0.0052, 0.0123, 0.0043, 0.0022,
         0.0014, 0.0009, 0.0008, 0.0007, 0.0004, 0.0008, 0.0017, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0543, 0.0586, 0.6305, 0.1031, 0

Epoch: 2.88, Train Loss: 0.00, Val Loss: 5.34, Train BLEU: 0.00, Val BLEU: 4.27, Minutes Elapsed: 267.54
Sampling from val predictions...
Source: 没有 有人 会 拒绝 幸运 的 是 也 没有 有人 拿 着 我们 的 相机 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: no one &apos;s ever refused , and luckily no one &apos;s ever run off with our camera . <EOS>
Model: <SOS> no you , , to to , the &apos;s they the the . . <EOS> <EOS> . . <EOS>
Attention Weights: tensor([[0.8476, 0.1496, 0.0027, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0878, 0.7132, 0.1493, 0.0202, 0.0254, 0.0005, 0.0006, 0.0005, 0.0009,
         0.0008, 0.0001, 0.0001, 0.0003, 0.0000, 0.0001, 0.0001, 0.0002, 0.0000,
         0.0000, 0.0000],
        [0.0699, 0.3023, 0.1754, 0.1595, 0.1443, 0.0104, 0.0140, 0.0161, 0.0367,
         0.0281, 0.0031, 0.0024, 0.0085, 0.0013, 0.0024, 0.0061, 0.0195, 0.0000,
         0.0000, 0.0000],
        [0.0326

Epoch: 2.99, Train Loss: 0.00, Val Loss: 5.31, Train BLEU: 0.00, Val BLEU: 4.49, Minutes Elapsed: 277.80
Sampling from val predictions...
Source: 但是 言语 语词 词汇 对于 政治 中心 之 作用 是非 非常 重要 的 并且 所有 的 政客 都 明白 他们
Reference: but it &apos;s very important that words are at the center of politics , and all politicians know they
Model: <SOS> but the , &apos;t a , , do the , , of , , they they are them <EOS>
Attention Weights: tensor([[0.0654, 0.0282, 0.0016, 0.0446, 0.6808, 0.0158, 0.0752, 0.0246, 0.0424,
         0.0061, 0.0018, 0.0043, 0.0002, 0.0004, 0.0017, 0.0000, 0.0033, 0.0001,
         0.0035, 0.0000],
        [0.0224, 0.1594, 0.0105, 0.1341, 0.3217, 0.1137, 0.1698, 0.0187, 0.0273,
         0.0034, 0.0020, 0.0061, 0.0002, 0.0009, 0.0030, 0.0001, 0.0059, 0.0001,
         0.0006, 0.0001],
        [0.0067, 0.0150, 0.0057, 0.0579, 0.2705, 0.1665, 0.2183, 0.0488, 0.0647,
         0.0263, 0.0298, 0.0419, 0.0025, 0.0094, 0.0160, 0.0013, 0.0165, 0.0004,
         0.0016, 0.0003],
        [0.0027, 0.00

Epoch: 3.05, Train Loss: 0.00, Val Loss: 5.29, Train BLEU: 0.00, Val BLEU: 3.79, Minutes Elapsed: 284.09
Sampling from val predictions...
Source: 但 它们 会 损坏 你 的 挡风 挡风玻璃 玻璃 你 肯定 不 高兴 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: but they can damage your windshield , so you &apos;re not happy with that . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> but they &apos;re to your your , you you don &apos;t . . <EOS> . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0136, 0.9672, 0.0184, 0.0002, 0.0001, 0.0000, 0.0000, 0.0001, 0.0002,
         0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0026, 0.9027, 0.0891, 0.0026, 0.0018, 0.0000, 0.0001, 0.0004, 0.0006,
         0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0061, 0.1743, 0.5455, 0.1291, 0.1008, 0.0066, 0.0064, 0.0095, 0.0092,
         0.0040, 0.0025, 0.0021, 0.0008, 0.0032, 0.0000, 0.0000, 0.0000, 0.0000,
       

Epoch: 3.22, Train Loss: 0.00, Val Loss: 5.24, Train BLEU: 0.00, Val BLEU: 5.14, Minutes Elapsed: 299.50
Sampling from val predictions...
Source: 谢谢 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: thank you . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> thank you . <EOS> you <EOS> <EOS> <EOS> <EOS> you you you you you you you you you you
Attention Weights: tensor([[0.9956, 0.0044, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.9921, 0.0079, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.9574, 0.0426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.

Epoch: 3.38, Train Loss: 0.00, Val Loss: 5.31, Train BLEU: 0.00, Val BLEU: 4.37, Minutes Elapsed: 314.90
Sampling from val predictions...
Source: 他 无所 无所谓 所谓 宗教 宗教信仰 信仰 的 差异 还有 他 从来 从来没 说 过 谎 <EOS> <PAD> <PAD> <PAD>
Reference: he doesn &apos;t care about religious differences , and get this : he has never told a lie .
Model: <SOS> he didn &apos;t know any the , and but he didn &apos;t . &apos;t &apos;t . <EOS> <EOS> .
Attention Weights: tensor([[0.9842, 0.0011, 0.0125, 0.0001, 0.0000, 0.0000, 0.0000, 0.0002, 0.0015,
         0.0004, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.3546, 0.1514, 0.4484, 0.0090, 0.0035, 0.0010, 0.0051, 0.0040, 0.0168,
         0.0033, 0.0020, 0.0005, 0.0002, 0.0001, 0.0000, 0.0000, 0.0001, 0.0000,
         0.0000, 0.0000],
        [0.1514, 0.0886, 0.5483, 0.0718, 0.0222, 0.0058, 0.0257, 0.0164, 0.0393,
         0.0095, 0.0073, 0.0029, 0.0024, 0.0024, 0.0011, 0.0010, 0.0040, 0.0000,
         0.0000, 0.00

Epoch: 3.55, Train Loss: 0.00, Val Loss: 5.25, Train BLEU: 0.00, Val BLEU: 4.80, Minutes Elapsed: 330.32
Sampling from val predictions...
Source: 但是 言语 语词 词汇 对于 政治 中心 之 作用 是非 非常 重要 的 并且 所有 的 政客 都 明白 他们
Reference: but it &apos;s very important that words are at the center of politics , and all politicians know they
Model: <SOS> but the &apos;s important important , the &apos;s the , , , , , , they are them <EOS>
Attention Weights: tensor([[0.2486, 0.0689, 0.0012, 0.0752, 0.4425, 0.0282, 0.0659, 0.0135, 0.0258,
         0.0072, 0.0065, 0.0054, 0.0002, 0.0004, 0.0012, 0.0001, 0.0077, 0.0001,
         0.0015, 0.0000],
        [0.0692, 0.2762, 0.0115, 0.2610, 0.1427, 0.0884, 0.1092, 0.0091, 0.0135,
         0.0025, 0.0040, 0.0038, 0.0002, 0.0006, 0.0014, 0.0001, 0.0061, 0.0001,
         0.0005, 0.0000],
        [0.0078, 0.0088, 0.0039, 0.0726, 0.1710, 0.2650, 0.3024, 0.0478, 0.0409,
         0.0139, 0.0242, 0.0206, 0.0014, 0.0030, 0.0050, 0.0005, 0.0104, 0.0002,
         0.0006, 0.0001],
  

Epoch: 3.71, Train Loss: 0.00, Val Loss: 5.20, Train BLEU: 0.00, Val BLEU: 5.08, Minutes Elapsed: 345.76
Sampling from val predictions...
Source: 我们 看 这个 样品 现在 还有 有点 烫 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: so we still have the specimen here . it &apos;s quite warm . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> we we we this this , , this this &apos;s a a . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.8711, 0.1264, 0.0023, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0882, 0.8478, 0.0596, 0.0022, 0.0006, 0.0009, 0.0003, 0.0000, 0.0002,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1507, 0.4926, 0.2912, 0.0243, 0.0108, 0.0131, 0.0097, 0.0008, 0.0068,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,

Epoch: 3.82, Train Loss: 0.00, Val Loss: 5.23, Train BLEU: 0.00, Val BLEU: 4.84, Minutes Elapsed: 356.06
Sampling from val predictions...
Source: 然而 与 创业 创业者 业者 打交道 交道 有 个 秘诀 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: however , there is a secret to work with entrepreneurs . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> but , a , a of of of a the . <EOS> <EOS> . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.6436, 0.1390, 0.1464, 0.0128, 0.0205, 0.0074, 0.0015, 0.0061, 0.0037,
         0.0057, 0.0133, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1458, 0.1639, 0.4889, 0.0273, 0.0538, 0.0335, 0.0090, 0.0118, 0.0108,
         0.0264, 0.0289, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0693, 0.0495, 0.2140, 0.0390, 0.0716, 0.0840, 0.0513, 0.0899, 0.0967,
         0.1151, 0.1197, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
      

Epoch: 3.93, Train Loss: 0.00, Val Loss: 5.34, Train BLEU: 0.00, Val BLEU: 4.65, Minutes Elapsed: 366.35
Sampling from val predictions...
Source: 康 纳 第一 第一次 一次 打 我 是 在 我们 婚礼 的 五天 前 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: <UNK> first physically attacked me five days before our wedding . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> the : : , i i the ago <EOS> . . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0074, 0.0377, 0.6804, 0.1295, 0.1122, 0.0098, 0.0191, 0.0006, 0.0002,
         0.0006, 0.0000, 0.0001, 0.0012, 0.0006, 0.0006, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0088, 0.0513, 0.8472, 0.0734, 0.0156, 0.0011, 0.0022, 0.0001, 0.0000,
         0.0001, 0.0000, 0.0000, 0.0001, 0.0000, 0.0001, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0068, 0.0428, 0.5855, 0.1789, 0.1301, 0.0354, 0.0120, 0.0014, 0.0003,
         0.0005, 0.0001, 0.0003, 0.0016, 0.0006, 0.0037, 0.0000, 0.0000, 0.0000,
        

Epoch: 4.00, Train Loss: 0.00, Val Loss: 5.28, Train BLEU: 0.00, Val BLEU: 4.91, Minutes Elapsed: 372.66
Sampling from val predictions...
Source: 这个 个人 还 得 去 <UNK> 条约 和 接见 <UNK> <UNK> <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: this guy has to go and sign treaties and meet foreign <UNK> . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> this &apos;s is a <UNK> with and and the <EOS> . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.5153, 0.4825, 0.0020, 0.0002, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0323, 0.9382, 0.0148, 0.0088, 0.0041, 0.0005, 0.0006, 0.0001, 0.0002,
         0.0001, 0.0001, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0972, 0.5074, 0.1599, 0.0929, 0.0537, 0.0117, 0.0150, 0.0059, 0.0108,
         0.0094, 0.0130, 0.0230, 0.0000, 0.0000, 0.0000, 0.0000, 0.

Epoch: 4.11, Train Loss: 0.00, Val Loss: 5.19, Train BLEU: 0.00, Val BLEU: 4.51, Minutes Elapsed: 382.98
Sampling from val predictions...
Source: 照片 片中 中美 美丽 的 和服 必须 通过 过手 手绘 处理 或者 一点 一点一点 一点 拼接 而 成 <UNK> 配上
Reference: the <UNK> in this shot pretty much had to be <UNK> , or <UNK> together , picking out the
Model: <SOS> the , of the , , , , , be and and <EOS> . . <EOS> <EOS> . <EOS>
Attention Weights: tensor([[2.8535e-01, 3.5222e-01, 1.7358e-01, 1.5906e-01, 5.2992e-03, 1.0520e-02,
         8.2944e-03, 1.1040e-04, 3.7901e-05, 1.0536e-04, 5.5387e-04, 4.4237e-03,
         2.7083e-04, 1.2035e-05, 1.0287e-04, 1.4491e-05, 8.1513e-06, 9.1695e-06,
         3.6205e-06, 2.3322e-05],
        [7.5046e-02, 2.8411e-01, 2.4544e-01, 3.5021e-01, 1.6677e-02, 1.8973e-02,
         8.0205e-03, 3.7115e-04, 3.9781e-05, 7.6286e-05, 1.0786e-04, 7.6161e-04,
         8.7394e-05, 1.3492e-05, 3.7174e-05, 1.3080e-05, 8.4862e-06, 2.3563e-06,
         1.0198e-06, 1.8721e-06],
        [4.5666e-02, 1.7331e-01, 2.2004e-0

Epoch: 4.22, Train Loss: 0.00, Val Loss: 5.11, Train BLEU: 0.00, Val BLEU: 5.47, Minutes Elapsed: 393.32
Sampling from val predictions...
Source: 我们 在 巴士 上 的 旅途 有 一周 之久 好几 几次 都 差点 被 抓住 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: the journey by bus took one week , and we were almost caught several times . <EOS> <PAD> <PAD>
Model: <SOS> we spent of we we we the , the , was been to the . . <EOS> <EOS> .
Attention Weights: tensor([[0.7352, 0.0079, 0.0003, 0.0076, 0.0177, 0.1327, 0.0188, 0.0282, 0.0159,
         0.0246, 0.0055, 0.0044, 0.0006, 0.0001, 0.0000, 0.0007, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.2202, 0.0707, 0.0071, 0.0220, 0.0234, 0.4913, 0.0286, 0.0600, 0.0221,
         0.0144, 0.0237, 0.0047, 0.0098, 0.0004, 0.0003, 0.0014, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.2769, 0.0405, 0.0178, 0.0458, 0.0465, 0.3135, 0.0470, 0.0832, 0.0324,
         0.0330, 0.0268, 0.0091, 0.0123, 0.0019, 0.0014, 0.0120, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.6584,

Epoch: 4.33, Train Loss: 0.00, Val Loss: 5.39, Train BLEU: 0.00, Val BLEU: 4.87, Minutes Elapsed: 403.61
Sampling from val predictions...
Source: 我 把 花园 和 土地 视为 画布 而 植物 和 树木 就是 我 在 画布 上 的 装饰 <EOS> <PAD>
Reference: i use the garden , the soil , like it &apos;s a piece of cloth , and the plants
Model: <SOS> i i a in and and the , , , , the i . i . <EOS> i .
Attention Weights: tensor([[0.9466, 0.0444, 0.0043, 0.0037, 0.0010, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1077, 0.7792, 0.0915, 0.0134, 0.0062, 0.0010, 0.0001, 0.0001, 0.0002,
         0.0001, 0.0003, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0589, 0.2354, 0.3960, 0.0766, 0.1447, 0.0279, 0.0099, 0.0042, 0.0096,
         0.0029, 0.0120, 0.0041, 0.0021, 0.0010, 0.0005, 0.0020, 0.0036, 0.0040,
         0.0047, 0.0000],
        [0.0045, 0.0079, 0.2850, 0.0768, 0.2120, 0.0504, 0.

Epoch: 4.49, Train Loss: 0.00, Val Loss: 5.18, Train BLEU: 0.00, Val BLEU: 5.62, Minutes Elapsed: 419.07
Sampling from val predictions...
Source: 这 很 怪 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: that was awkward . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> it &apos;s a . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.8713, 0.1174, 0.0059, 0.0054, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.2408, 0.4360, 0.2972, 0.0261, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1232, 0.5159, 0.3294, 0.0315, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000,

Epoch: 4.60, Train Loss: 0.00, Val Loss: 5.16, Train BLEU: 0.00, Val BLEU: 5.24, Minutes Elapsed: 429.36
Sampling from val predictions...
Source: 这些 光会 反射 射进 进入 房间 一些 会 返回 回到 门上 然后 后进 进入 照相 照相机 相机 这样 我们 就
Reference: it &apos;s going to bounce , go inside the room , some of that is going to reflect back
Model: <SOS> these , to to be back and , and , , and we we , . . . .
Attention Weights: tensor([[4.6027e-01, 3.8181e-02, 7.5439e-02, 5.0506e-03, 9.5843e-03, 5.2556e-02,
         2.7625e-01, 7.8316e-02, 2.5167e-03, 1.1873e-03, 5.8926e-05, 2.2530e-05,
         1.4503e-05, 9.4098e-06, 5.2670e-06, 1.3241e-06, 2.0082e-06, 5.2104e-04,
         1.1150e-05, 7.6358e-08],
        [7.5332e-02, 4.7504e-02, 2.7120e-01, 7.2605e-02, 6.9432e-02, 1.6438e-01,
         5.7650e-02, 1.2797e-01, 6.6265e-02, 4.1663e-02, 3.2173e-03, 6.7726e-04,
         5.6558e-04, 5.2290e-04, 1.9458e-04, 7.5504e-05, 1.1976e-04, 4.0424e-04,
         2.1333e-04, 1.0713e-05],
        [9.7575e-03, 9.2908e-03, 6.4512e-02, 4.9320e-0

Epoch: 4.71, Train Loss: 0.00, Val Loss: 5.17, Train BLEU: 0.00, Val BLEU: 5.29, Minutes Elapsed: 439.69
Sampling from val predictions...
Source: 看看 以 一位 位非 非洲 女性 的 眼光 我们 所 带来 的 损害 有 多 大 <EOS> <PAD> <PAD> <PAD>
Reference: read it from an african woman , the damage that we have done . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> look , the a , of , in , of our are to . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9968, 0.0016, 0.0009, 0.0001, 0.0003, 0.0001, 0.0000, 0.0001, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.6979, 0.0393, 0.1318, 0.0095, 0.0331, 0.0333, 0.0023, 0.0314, 0.0110,
         0.0014, 0.0018, 0.0005, 0.0038, 0.0004, 0.0006, 0.0009, 0.0010, 0.0000,
         0.0000, 0.0000],
        [0.1535, 0.0682, 0.1372, 0.0373, 0.1395, 0.0920, 0.0141, 0.0749, 0.0585,
         0.0231, 0.0393, 0.0094, 0.0500, 0.0087, 0.0187, 0.0324, 0.0431, 0.0000,
         0.0000, 0.0000],
        [0.0188, 0.029

Epoch: 4.93, Train Loss: 0.00, Val Loss: 5.28, Train BLEU: 0.00, Val BLEU: 5.03, Minutes Elapsed: 460.37
Sampling from val predictions...
Source: 尽管 她们 被 抓住 了 但是 <UNK> 巨大 的 国际 <UNK> 舆论 <UNK> 压力 她们 最终 被 释放 了 <EOS>
Reference: even though they were caught , they were eventually released after heavy international pressure . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> and they they they , , , they the , the the . . <EOS> <EOS> <EOS> . <EOS>
Attention Weights: tensor([[1.0899e-01, 8.8196e-01, 8.6218e-03, 2.7010e-04, 1.8200e-05, 2.0886e-05,
         3.5542e-06, 2.0424e-05, 3.4665e-06, 4.5659e-05, 2.8812e-06, 9.4374e-06,
         3.4989e-06, 1.3047e-05, 1.1550e-05, 1.0745e-06, 7.4694e-07, 2.2303e-07,
         3.5908e-07, 2.4323e-06],
        [1.6442e-02, 8.7785e-01, 8.4943e-02, 1.2796e-02, 2.7427e-04, 5.7257e-04,
         1.7563e-04, 1.0617e-03, 1.9223e-04, 4.1019e-03, 2.9134e-04, 5.1144e-04,
         1.7592e-04, 2.9530e-04, 1.4770e-04, 2.7333e-05, 1.3596e-05, 9.2995e-06,
         8.7489e-06, 1.1223e

Epoch: 5.00, Train Loss: 0.00, Val Loss: 5.23, Train BLEU: 0.00, Val BLEU: 5.02, Minutes Elapsed: 466.68
Sampling from val predictions...
Source: 嗯 跳舞 是 人类 众多 的 活动 之一 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: so , dancing is one of the most human of activities . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> well , is is is a the of of of . . <EOS> <EOS> . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9998, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.9576, 0.0419, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0001, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.2375, 0.7242, 0.0202, 0.0055, 0.0028, 0.0003, 0.0019, 0.0061, 0.0014,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.

Epoch: 5.16, Train Loss: 0.00, Val Loss: 5.22, Train BLEU: 0.00, Val BLEU: 5.01, Minutes Elapsed: 482.17
Sampling from val predictions...
Source: 我 养 了 一只 黑色 <UNK> 拉布 <UNK> 布拉 拉多 猎犬 <UNK> <UNK> 奥 <UNK> 面包 面包车 包车 <EOS> <PAD>
Reference: my dog is a black lab , and i drive a <UNK> odyssey <UNK> . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> i i <UNK> a <UNK> , , , , , a <UNK> , . <EOS> <EOS> <EOS> . <EOS>
Attention Weights: tensor([[0.9657, 0.0114, 0.0033, 0.0184, 0.0008, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0002, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0619, 0.5555, 0.0652, 0.2927, 0.0156, 0.0022, 0.0013, 0.0010, 0.0005,
         0.0012, 0.0012, 0.0002, 0.0002, 0.0001, 0.0002, 0.0003, 0.0001, 0.0002,
         0.0003, 0.0000],
        [0.0232, 0.0636, 0.1199, 0.6027, 0.0870, 0.0169, 0.0079, 0.0059, 0.0031,
         0.0090, 0.0093, 0.0018, 0.0018, 0.0013, 0.0028, 0.0087, 0.0055, 0.0127,
         0.0165, 0.0000],
        [0.0672, 0.

Epoch: 5.27, Train Loss: 0.00, Val Loss: 5.11, Train BLEU: 0.00, Val BLEU: 5.31, Minutes Elapsed: 492.52
Sampling from val predictions...
Source: 我们 有志 志愿 志愿者 愿者 团队 帮助 创业 辅助 商 为 客户 集结 资源 及 人力 我们 发现 当地 当地人
Reference: we have groups of volunteers supporting the enterprise facilitator to help you to find resources and people and we
Model: <SOS> we we to to to , and and , and we us we . . . <EOS> . <EOS>
Attention Weights: tensor([[9.8937e-01, 1.7815e-03, 4.1156e-03, 1.6221e-04, 2.6662e-04, 3.5262e-03,
         5.0243e-04, 1.4222e-04, 7.6211e-05, 2.8987e-05, 6.0737e-06, 2.6304e-06,
         1.6226e-06, 4.3517e-06, 4.2305e-06, 3.0836e-06, 2.1392e-06, 3.4254e-08,
         1.2740e-08, 3.3305e-09],
        [2.7851e-01, 2.4195e-01, 3.3535e-01, 2.2013e-02, 2.0651e-02, 4.9579e-02,
         4.1706e-02, 7.4502e-03, 1.7363e-03, 5.7289e-04, 1.6415e-04, 7.3170e-05,
         5.6580e-05, 5.7275e-05, 2.5127e-05, 3.6357e-05, 3.6886e-05, 2.4837e-05,
         3.8830e-06, 1.2775e-06],
        [1.0619e-01, 5.

Epoch: 5.38, Train Loss: 0.00, Val Loss: 5.21, Train BLEU: 0.00, Val BLEU: 4.83, Minutes Elapsed: 502.87
Sampling from val predictions...
Source: 我 过去 讨厌 狮子 但是 现在 就是 是因为 因为 我 的 发明 拯救 了 我 爸爸 的 牛 以及 狮子
Reference: i used to hate lions , but now because my invention is saving my father &apos;s cows and the
Model: <SOS> i i a because , because because i my my my is my my my . <EOS> . <EOS>
Attention Weights: tensor([[6.9224e-01, 2.1632e-01, 2.8987e-03, 3.6074e-03, 3.3029e-02, 4.1466e-02,
         1.0061e-02, 9.1840e-05, 2.5080e-04, 2.5800e-05, 3.2877e-06, 1.4915e-06,
         9.2572e-07, 1.2300e-07, 4.6976e-07, 7.7521e-08, 6.9967e-08, 1.6382e-07,
         1.4279e-07, 7.5868e-08],
        [2.5985e-02, 7.0323e-01, 2.4858e-01, 2.0608e-02, 4.9580e-04, 2.9427e-04,
         6.6117e-04, 5.0303e-05, 4.6492e-05, 2.4270e-05, 4.7069e-06, 1.1426e-05,
         5.6976e-06, 1.0296e-06, 1.5715e-06, 2.1457e-06, 6.2677e-07, 6.6095e-07,
         6.5426e-07, 5.3151e-07],
        [7.6811e-02, 3.1334e-01, 2.7694

Epoch: 5.49, Train Loss: 0.00, Val Loss: 5.15, Train BLEU: 0.00, Val BLEU: 5.70, Minutes Elapsed: 513.24
Sampling from val predictions...
Source: 而 可持续性 持续 持续性 的 搞笑 之处 处在 在于 你 必须 维持 它 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: the funny thing about sustainability , you have to sustain it . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> and the of of that , you can to it it . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0652, 0.2703, 0.4402, 0.1539, 0.0123, 0.0568, 0.0007, 0.0002, 0.0003,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0150, 0.0807, 0.3899, 0.1548, 0.0297, 0.3022, 0.0189, 0.0043, 0.0036,
         0.0008, 0.0001, 0.0000, 0.0000, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0407, 0.0503, 0.2921, 0.2823, 0.0843, 0.1574, 0.0494, 0.0187, 0.0108,
         0.0035, 0.0014, 0.0006, 0.0005, 0.0081, 0.0000, 0.0000, 0.0000, 0.0000,
     

Epoch: 5.66, Train Loss: 0.00, Val Loss: 5.13, Train BLEU: 0.00, Val BLEU: 5.45, Minutes Elapsed: 528.81
Sampling from val predictions...
Source: 她们 带 着 我 走下 一段 <UNK> 狭窄 的 楼梯 到 了 一个 肮脏 昏暗 的 地下 地下室 <EOS> <PAD>
Reference: they ushered me down a narrow set of stairs that led to this dirty , dimly fluorescent lit basement
Model: <SOS> they they me a a a of , a , a a a . . . . <EOS> <EOS>
Attention Weights: tensor([[0.9588, 0.0384, 0.0026, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.2591, 0.4584, 0.1129, 0.1571, 0.0097, 0.0010, 0.0002, 0.0003, 0.0002,
         0.0005, 0.0001, 0.0001, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0002, 0.0000],
        [0.0228, 0.0850, 0.1329, 0.6503, 0.0880, 0.0113, 0.0026, 0.0021, 0.0005,
         0.0023, 0.0003, 0.0005, 0.0005, 0.0001, 0.0000, 0.0000, 0.0001, 0.0002,
         0.0005, 0.0000],
        [0.0376, 0.0121, 0.020

Epoch: 5.77, Train Loss: 0.00, Val Loss: 5.29, Train BLEU: 0.00, Val BLEU: 5.54, Minutes Elapsed: 539.20
Sampling from val predictions...
Source: 图象 象是 强大 的 但 同时 又 是 表面 <UNK> 的 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: image is powerful , but also image is superficial . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> the is the , but it &apos;s the <EOS> . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.5010, 0.2078, 0.2707, 0.0037, 0.0114, 0.0050, 0.0001, 0.0000, 0.0000,
         0.0000, 0.0001, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.1179, 0.0639, 0.7049, 0.0138, 0.0728, 0.0231, 0.0005, 0.0002, 0.0000,
         0.0000, 0.0003, 0.0026, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0443, 0.0544, 0.4674, 0.0339, 0.2880, 0.0832, 0.0110, 0.0032, 0.0003,
         0.0006, 0.0025, 0.0112, 0.0000, 0.0000, 0.0000, 0.0000, 0.00

Epoch: 5.93, Train Loss: 0.00, Val Loss: 5.24, Train BLEU: 0.00, Val BLEU: 5.31, Minutes Elapsed: 554.79
Sampling from val predictions...
Source: 见到 他 是 在 一个 收容 收容所 所里 free the <UNK> 组织 用于 <UNK> 奴役 受害 受害者 的 一个 地方
Reference: i met him at a shelter where free the slaves <UNK> victims of slavery . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> she was a a a a of a a a of a the <EOS> . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.1885e-02, 9.7155e-01, 1.2684e-02, 2.4145e-03, 2.6639e-04, 5.8961e-04,
         4.0956e-04, 2.3395e-05, 1.0759e-04, 1.6818e-05, 4.3100e-06, 1.4082e-05,
         1.0797e-06, 1.8564e-07, 9.0339e-07, 6.6034e-06, 4.2451e-06, 5.9254e-06,
         4.7520e-06, 6.4062e-06],
        [8.4084e-03, 6.5557e-01, 2.1906e-01, 9.8239e-02, 7.5921e-03, 6.1119e-03,
         1.6397e-03, 3.2996e-04, 1.3266e-03, 4.0070e-04, 1.8687e-04, 5.9743e-04,
         8.5648e-05, 2.1979e-05, 4.2621e-05, 6.5949e-05, 5.9891e-05, 3.7297e-05,
         1.2278e-04, 9.5450e-05],
        [1.1580e-02, 1.3650e

Epoch: 6.00, Train Loss: 0.00, Val Loss: 5.19, Train BLEU: 0.00, Val BLEU: 5.12, Minutes Elapsed: 561.14
Sampling from val predictions...
Source: 这个 个人 还 得 去 <UNK> 条约 和 接见 <UNK> <UNK> <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: this guy has to go and sign treaties and meet foreign <UNK> . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> this person is a to with and and the <EOS> . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.3968, 0.5792, 0.0225, 0.0012, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0309, 0.9279, 0.0224, 0.0132, 0.0036, 0.0003, 0.0007, 0.0001, 0.0003,
         0.0001, 0.0001, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0616, 0.4068, 0.2330, 0.1209, 0.1031, 0.0133, 0.0154, 0.0046, 0.0093,
         0.0080, 0.0113, 0.0128, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000

In [None]:
# Preview the first rows of the experiment-log summary, restricted to the
# creation timestamp, the main training settings and the validation loss.
experiment_summary = summarize_results(load_experiment_log())
experiment_summary[
    ['dt_created', 'num_epochs', 'learning_rate', 'clip_grad_max_norm', 'val_loss']
].head()

In [None]:
# Plot the learning curve for `results`.
# NOTE(review): `results` is not defined in this part of the notebook; presumably it is
# the object produced by the training cell above — confirm before relying on the plot.
plot_single_learning_curve(results)

In [None]:
# Log line recorded by hand from a run, kept next to its learning curve:
#   Epoch: 199.00, Train Loss: 0.32, Val Loss: 13.19, Train BLEU: 98.94, Val BLEU: 0.27
# NOTE(review): train BLEU near 99 with val BLEU below 1 indicates the model fit its
# training data without generalising — presumably an overfitting sanity check; confirm.
plot_single_learning_curve(results)

In [None]:
# Run with attention, where the energies were computed with the operand order switched:
#   energies = v_broadcast.bmm(torch.tanh(self.attn(concat)).transpose(1, 2))
# Log line recorded by hand from that run:
#   Epoch: 199.00, Train Loss: 0.63, Val Loss: 12.82, Train BLEU: 92.05, Val BLEU: 0.38
# NOTE(review): `results` is not defined in this part of the notebook — confirm it still
# refers to that run before trusting the plot.
plot_single_learning_curve(results)

In [None]:
# Print the first 20 entries of the source-language id2token table, each prefixed by
# its position. zip(range(20), ...) stops after 20 items, so the rest of the vocabulary
# is never walked just to be filtered out by an `if i < 20` check.
for i, token in zip(range(20), vocab[SRC_LANG]['id2token']):
    print("{}: {}".format(i, token))

In [None]:
# Print the first 20 entries of the target-language id2token table, each prefixed by
# its position. zip(range(20), ...) stops after 20 items, so the rest of the vocabulary
# is never walked just to be filtered out by an `if i < 20` check.
for i, token in zip(range(20), vocab[TARG_LANG]['id2token']):
    print("{}: {}".format(i, token))

In [None]:
# Scratch check of slicing followed by view() on a small integer tensor.
# torch is already imported in the notebook's first cell, so it is not re-imported here.
x = torch.arange(0, 3*5*10).view(3, 5, 10)  # values 0..149 laid out as (3, 5, 10)
print(x)
# Drop the first block along dim 0, leaving shape (2, 5, 10).
y = x[1:, :, :]
print(y)
# Merge the two leading dims so every row is one length-10 vector: shape (10, 10).
z = y.view(-1, 10)
print(z)

In [None]:
# Scratch check: flatten a (5, 2) tensor directly and after swapping its two axes.
t = torch.arange(2 * 5).view(5, 2)
u = t.contiguous().view(-1)   # flatten in the original (5, 2) row order
v = t.permute(1, 0)           # axes swapped: shape (2, 5)
w = v.contiguous().view(-1)   # make the permuted layout contiguous, then flatten
# One print call, newline-separated: same output as printing t, u, v, w in turn.
print(t, u, v, w, sep="\n")

In [None]:
# Scratch check: split a flat vector of 2*1*300 values into blocks of shape (1, 300).
a = torch.arange(2 * 1 * 300)
b = a.view(-1, 1, 300)   # -1 lets torch infer the leading dimension
print(a)
print(b.shape)

In [None]:
# Decode the source-side ids of the first training batch into space-joined token strings.
id2token = vocab[SRC_LANG]['id2token']  # same table for every batch, so look it up once


def to_token(token_ids):
    """Return the tokens for one sequence of ids joined by single spaces."""
    return ' '.join([id2token[idx] for idx in token_ids])


for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(full_loaders['train']):
    # (debug) print i, src_idxs / targ_idxs (and their .size()), src_lens and targ_lens
    # here to inspect the raw batch.
    test_tensor = src_idxs
    # .long().tolist() gives nested lists of Python ints directly; unlike the previous
    # .numpy().astype(int).tolist() round trip it also works for tensors that live on
    # the GPU or track gradients.
    list_of_lists = test_tensor.long().tolist()
    list_of_lists_tokens = [to_token(token_ids) for token_ids in list_of_lists]
    break  # only the first batch is inspected