In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders, text2tokens
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, EncoderDecoderAttn, DecoderAttnRNN, \
    DecoderDotAttnRNN
from train_eval import count_parameters, summarize_results, plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval
import importlib
import pickle as pkl 
import torch

In [2]:
# Notebook configuration: every tunable value lives in this cell.

# --- reproducibility ---
# Seed the numpy and torch global RNGs before any data loaders or models are built, so
# that numpy/torch randomness in later cells is repeatable across Restart & Run All.
# NOTE(review): Python's built-in `random` module is not seeded here because this notebook
# does not import it; seed it too if data_processing / train_eval draw from it.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- model identification ---
MODEL_NAME = 'zh-seq2seq-rnn-dot-attn'
SRC_LANG = 'zh'
TARG_LANG = 'en'

# --- data processing params ---
SRC_MAX_SENTENCE_LEN = 10
TARG_MAX_SENTENCE_LEN = 10
SRC_VOCAB_SIZE = 30000
TARG_VOCAB_SIZE = 30000

# --- model architecture params ---
RNN_CELL_TYPE = 'gru'
NUM_LAYERS = 2
ENC_HIDDEN_DIM = 256  # original cell also noted 512 as an alternative
# Decoder state is twice the encoder width
# (presumably because the encoder is bidirectional -- TODO confirm in model.py).
DEC_HIDDEN_DIM = 2 * ENC_HIDDEN_DIM
TEACHER_FORCING_RATIO = 1
CLIP_GRAD_MAX_NORM = 1
ENC_DROPOUT = 0.2  # TODO (carried over from original): "to actually implement"
DEC_DROPOUT = 0.2  # TODO (carried over from original): "to actually implement"
USE_ATTN = True

# --- training params ---
BATCH_SIZE = 64  # original cell also noted 32 as an alternative
NUM_EPOCHS = 200
LR = 0.0003  # original cell also noted 0.0005 as an alternative
OPTIMIZER = 'Adam'
LAZY_TRAIN = False

In [3]:
# Snapshot of the configuration constants, passed to train_and_eval below so the
# settings of this run can be saved alongside its results.
params = dict(
    # model identification
    model_name=MODEL_NAME,
    src_lang=SRC_LANG,
    targ_lang=TARG_LANG,
    rnn_cell_type=RNN_CELL_TYPE,
    # data processing
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    src_vocab_size=SRC_VOCAB_SIZE,
    targ_vocab_size=TARG_VOCAB_SIZE,
    # architecture
    num_layers=NUM_LAYERS,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    teacher_forcing_ratio=TEACHER_FORCING_RATIO,
    clip_grad_max_norm=CLIP_GRAD_MAX_NORM,
    enc_dropout=ENC_DROPOUT,
    dec_dropout=DEC_DROPOUT,
    use_attn=USE_ATTN,
    # training
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optimizer=OPTIMIZER,
    lazy_train=LAZY_TRAIN,
)

In [4]:
# # One-off vocab build: slow to run, so the result is pickled and reloaded on later runs (uncomment this cell to regenerate)
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [5]:
# Load the cached vocabulary; if the pickle does not exist yet, build it once and cache it.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
try:
    # SECURITY NOTE: unpickling can execute arbitrary code -- only load a vocab file
    # that was written by this project, never one from an untrusted source.
    # `with` guarantees the file handle is closed (the bare open() used before leaked it).
    with open(vocab_filename, "rb") as vocab_file:
        vocab = pkl.load(vocab_file)
except FileNotFoundError:
    # Slow path (vocab generation takes a long time): build once, then cache for future runs.
    print("{} not found; generating vocab (this takes a long time)...".format(vocab_filename))
    vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
    with open(vocab_filename, "wb") as vocab_file:
        pkl.dump(vocab, vocab_file)

# Three views of the corpus, all processed with the same sentence-length limits and vocab.
# NOTE(review): filter_long=False presumably keeps over-length sentences (truncated rather
# than dropped) -- confirm in data_processing.process_data.
data = process_data(
    SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab,
    filter_long=False)
# sample_limit=BATCH_SIZE: presumably a single batch worth of examples -- confirm.
data_minibatch = process_data(
    SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab,
    sample_limit=BATCH_SIZE, filter_long=False)
# sample_limit=1000: a small subset of the data.
data_minitrain = process_data(
    SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab,
    sample_limit=1000, filter_long=False)

In [6]:
# Build one set of dataloaders per dataset (full, single-minibatch, mini-train), in that
# order, all sharing the same sentence-length limits and batch size.
loaders_full, loaders_minibatch, loaders_minitrain = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_minibatch, data_minitrain)
)

In [7]:
# define model

encoder = EncoderRNN(rnn_cell_type=RNN_CELL_TYPE, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
                     src_max_sentence_len=SRC_MAX_SENTENCE_LEN, enc_dropout=ENC_DROPOUT,
                     pretrained_word2vec=get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id']))

# Target-side pretrained embeddings are needed by whichever decoder is selected; build them once.
targ_pretrained_emb = get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id'])

# Build only the decoder selected by USE_ATTN. Previously the no-attention decoder/model
# was always constructed and then immediately overwritten by the attention variant.
if USE_ATTN:
    # with dot attention
    decoder = DecoderDotAttnRNN(rnn_cell_type=RNN_CELL_TYPE, dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM,
                                num_layers=NUM_LAYERS, targ_vocab_size=TARG_VOCAB_SIZE, src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
                                targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, dec_dropout=DEC_DROPOUT,
                                pretrained_word2vec=targ_pretrained_emb)

    # alternative attention decoder (same keyword arguments); swap in for DecoderDotAttnRNN to use it:
    # decoder = DecoderAttnRNN(rnn_cell_type=RNN_CELL_TYPE, dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM,
    #                          num_layers=NUM_LAYERS, targ_vocab_size=TARG_VOCAB_SIZE, src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    #                          targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, dec_dropout=DEC_DROPOUT,
    #                          pretrained_word2vec=targ_pretrained_emb)

    model = EncoderDecoderAttn(encoder, decoder, vocab[TARG_LANG]['token2id'])
else:
    # without attention
    decoder = DecoderRNN(dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
                         targ_vocab_size=TARG_VOCAB_SIZE, targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
                         pretrained_word2vec=targ_pretrained_emb)
    model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id'])

In [None]:
# Train the model and collect its results; `model` is rebound to the value returned by
# train_and_eval.
# NOTE(review): the captured log reports Train Loss / Train BLEU as 0.00 at every step --
# presumably because lazy_eval=True skips train-set metrics; confirm in train_eval.
# NOTE(review): print_attn=True prints a full attention tensor for each inspected sample,
# which makes the cell output very long.
model, results = train_and_eval(
    model=model,
    loaders_full=loaders_full,
    loaders_minibatch=loaders_minibatch,
    loaders_minitrain=loaders_minitrain,
    params=params,
    vocab=vocab,
    print_intermediate=100,
    save_checkpoint=True,
    save_to_log=True,
    lazy_eval=True,
    print_attn=True,
    inspect_samples=1,
)

Epoch: 0.00, Train Loss: 0.00, Val Loss: 10.22, Train BLEU: 0.00, Val BLEU: 0.21, Minutes Elapsed: 0.08
Sampling from val predictions...
Source: 她 两年 年前 退休 了 结果 却 把 我家 变成
Reference: she retired two years ago , only to turn
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0001, 0.0017, 0.0191, 0.2104, 0.1899, 0.4758, 0.1011, 0.0019,
         0.0000],
        [0.0015, 0.0039, 0.0185, 0.0621, 0.2173, 0.2102, 0.3286, 0.1397, 0.0169,
         0.0015],
        [0.0094, 0.0199, 0.0567, 0.1006, 0.1973, 0.1928, 0.2352, 0.1383, 0.0400,
         0.0098],
        [0.0248, 0.0425, 0.0912, 0.1195, 0.1787, 0.1703, 0.1828, 0.1198, 0.0500,
         0.0203],
        [0.0391, 0.0593, 0.1085, 0.1249, 0.1636, 0.1541, 0.1543, 0.1088, 0.0566,
         0.0307],
        [0.0516, 0.0710, 0.1146, 0.1248, 0.1560, 0.1464, 0.1409, 0.1005, 0.0579,
         0.0362],
        [0.0595, 0.0740, 0.1114, 0.1225, 0.1506, 0.1446, 0.1348, 0.0982, 0.0618,
         0.0426],
        [0.0670,

Epoch: 0.24, Train Loss: 0.00, Val Loss: 5.83, Train BLEU: 0.00, Val BLEU: 3.11, Minutes Elapsed: 12.51
Sampling from val predictions...
Source: 所以 如果 摄影 摄影师 就 在 这里 灯光 就 在
Reference: so if the photographer is right there and the
Model: <SOS> and we we we , we the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0009, 0.9196, 0.0001, 0.0794,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.9846, 0.0000, 0.0000, 0.0151, 0.0000, 0.0000, 0.0000, 0.0003,
         0.0000],
        [0.0000,

Epoch: 0.42, Train Loss: 0.00, Val Loss: 5.60, Train BLEU: 0.00, Val BLEU: 2.77, Minutes Elapsed: 21.84
Sampling from val predictions...
Source: 在 加德 <UNK> 我 由 一些 从前 是 性 奴
Reference: in kathmandu , i was escorted by women who
Model: <SOS> and , , i i to , , ,
Attention Weights: tensor([[1.2489e-32, 3.7988e-28, 1.3232e-18, 2.2805e-29, 6.5388e-09, 2.1991e-11,
         2.5004e-12, 9.7186e-01, 2.8144e-02, 1.4819e-07],
        [9.8947e-02, 7.5292e-01, 5.5424e-05, 2.4275e-07, 1.4753e-01, 5.4556e-04,
         9.9324e-09, 2.7251e-07, 2.3719e-09, 2.3534e-13],
        [3.6037e-02, 9.6380e-01, 6.2407e-09, 1.6413e-04, 2.3395e-11, 2.5608e-16,
         9.1138e-20, 5.6749e-21, 5.1576e-23, 5.4442e-24],
        [2.8056e-15, 7.6525e-13, 1.1030e-15, 1.0000e+00, 1.0025e-18, 1.2645e-18,
         3.4176e-18, 1.4931e-22, 2.1754e-24, 1.2485e-24],
        [3.6802e-21, 2.1580e-20, 5.4749e-22, 1.0000e+00, 4.2186e-22, 1.1960e-22,
         4.5544e-20, 4.3155e-24, 1.4332e-27, 7.3037e-27],
        [1.5938e-14, 6.901

Epoch: 0.60, Train Loss: 0.00, Val Loss: 5.39, Train BLEU: 0.00, Val BLEU: 3.01, Minutes Elapsed: 31.20
Sampling from val predictions...
Source: 非常 非常感谢 感谢 肯尼 肯尼亚 尼亚 <UNK> <UNK> 在 我
Reference: <UNK> <UNK> . it means in my language ,
Model: <SOS> thank you you you you very . . .
Attention Weights: tensor([[3.9777e-26, 1.3082e-30, 2.6479e-36, 3.7835e-44, 5.8434e-43, 1.3236e-35,
         1.7014e-25, 1.4655e-19, 9.2228e-12, 1.0000e+00],
        [9.2496e-02, 9.0657e-01, 1.6101e-05, 1.0486e-18, 5.4863e-23, 2.9517e-22,
         1.8006e-22, 8.7288e-22, 2.2075e-15, 9.1755e-04],
        [5.9099e-07, 1.0000e+00, 1.1230e-06, 1.1959e-18, 1.9588e-21, 1.0768e-22,
         2.5299e-28, 2.5929e-31, 1.8534e-30, 2.0817e-17],
        [2.8664e-05, 9.9996e-01, 8.5684e-09, 1.8936e-20, 6.9622e-23, 1.0285e-21,
         1.9205e-23, 8.4594e-24, 1.1567e-17, 7.8809e-06],
        [7.8457e-05, 9.7864e-01, 4.8869e-08, 6.7727e-20, 1.1110e-22, 4.2182e-21,
         4.3976e-22, 2.5654e-21, 2.8240e-14, 2.1283e-02],
       

Epoch: 0.78, Train Loss: 0.00, Val Loss: 5.31, Train BLEU: 0.00, Val BLEU: 3.38, Minutes Elapsed: 40.56
Sampling from val predictions...
Source: 谁 还 会 认真 认真对待 真对 对待 他 如果 他
Reference: and who &apos;s going to take him seriously if
Model: <SOS> and &apos;s can &apos;t to to to to to
Attention Weights: tensor([[1.5600e-24, 1.6127e-37, 1.4973e-32, 5.6362e-25, 2.3097e-26, 3.7017e-17,
         1.3979e-12, 9.9999e-01, 1.7095e-08, 1.0064e-05],
        [1.0000e+00, 1.2666e-07, 9.6307e-12, 4.7812e-15, 3.2019e-25, 4.0484e-22,
         1.0508e-25, 1.7482e-14, 4.5859e-25, 1.3770e-16],
        [9.9883e-01, 1.1721e-03, 6.9983e-08, 2.8615e-10, 1.9603e-21, 5.6884e-19,
         2.1363e-28, 6.5282e-23, 3.2300e-31, 6.5071e-27],
        [9.9734e-01, 2.6500e-03, 4.4711e-06, 9.6505e-07, 1.0626e-14, 3.0761e-11,
         8.4428e-20, 1.6694e-14, 3.6160e-21, 1.1277e-19],
        [4.6390e-07, 1.6552e-04, 1.8426e-03, 9.9799e-01, 1.9106e-09, 8.8490e-09,
         2.2130e-15, 3.0598e-09, 5.4921e-13, 1.4139e-12],
    

Epoch: 0.96, Train Loss: 0.00, Val Loss: 5.14, Train BLEU: 0.00, Val BLEU: 4.08, Minutes Elapsed: 49.92
Sampling from val predictions...
Source: 对 是 的 所以 以是 这样 的 在 来电 期间
Reference: yeah . yeah . so what will happen is
Model: <SOS> it &apos;s that &apos;s it is &apos;s the the
Attention Weights: tensor([[3.4658e-15, 2.6777e-17, 8.3892e-19, 2.1691e-21, 5.5301e-15, 1.9912e-03,
         9.9353e-01, 2.1835e-03, 1.2908e-03, 1.0009e-03],
        [9.6046e-01, 3.9525e-02, 1.1068e-05, 9.3768e-17, 4.4915e-18, 1.0669e-12,
         9.6433e-16, 2.4529e-20, 2.6741e-24, 2.4560e-24],
        [1.8615e-03, 9.7739e-01, 2.0751e-02, 3.0168e-11, 1.4603e-14, 3.5047e-11,
         1.1090e-16, 9.2423e-22, 4.4300e-28, 1.5000e-29],
        [1.4773e-09, 1.4152e-03, 7.8086e-01, 2.1683e-01, 2.6315e-06, 8.9002e-04,
         8.4113e-08, 3.0536e-11, 9.1619e-17, 4.7196e-17],
        [8.9766e-11, 9.7555e-06, 9.7718e-01, 6.1990e-04, 5.7019e-06, 2.0841e-02,
         1.3304e-03, 8.5986e-06, 9.7697e-11, 1.8865e-11],
        [

Epoch: 1.12, Train Loss: 0.00, Val Loss: 5.08, Train BLEU: 0.00, Val BLEU: 4.43, Minutes Elapsed: 58.28
Sampling from val predictions...
Source: 笑 我 记得 得有 次 一个 妈妈 和 女儿 在
Reference: i remember this time , there was this mother
Model: <SOS> oh , , &apos;s a a , a a
Attention Weights: tensor([[3.0530e-13, 2.5526e-28, 7.9458e-28, 3.3608e-30, 2.8826e-23, 2.2036e-19,
         2.7554e-07, 1.0564e-07, 1.0000e+00, 9.4330e-10],
        [2.5704e-04, 9.9974e-01, 1.0403e-14, 6.5455e-23, 2.0681e-21, 1.4785e-24,
         2.3627e-19, 2.4318e-23, 7.0173e-21, 2.6207e-27],
        [3.0822e-12, 1.0000e+00, 1.0090e-10, 1.3362e-17, 9.3312e-22, 9.4066e-26,
         3.7715e-25, 2.7337e-33, 5.2957e-33, 3.4847e-38],
        [4.0185e-19, 1.0677e-05, 9.9834e-01, 1.6498e-03, 5.9683e-07, 3.7753e-10,
         2.3144e-16, 1.0931e-23, 3.0374e-27, 2.0627e-25],
        [1.1785e-21, 5.7636e-14, 5.0456e-05, 9.6504e-01, 7.7452e-04, 3.4132e-02,
         3.8003e-06, 1.6255e-13, 3.2823e-14, 2.6460e-18],
        [3.1988e-18, 1

Epoch: 1.30, Train Loss: 0.00, Val Loss: 5.01, Train BLEU: 0.00, Val BLEU: 5.15, Minutes Elapsed: 67.67
Sampling from val predictions...
Source: 所以 到 最后 参议 参议院 议院 胜利 了 众议 众议院
Reference: and so , in the end , the senate
Model: <SOS> so , , , , , , , ,
Attention Weights: tensor([[4.0944e-01, 2.2907e-18, 1.7235e-17, 1.4649e-22, 2.5989e-19, 4.7711e-16,
         5.7312e-09, 5.8889e-01, 1.6736e-03, 1.1950e-06],
        [1.0000e+00, 6.7925e-07, 1.5956e-10, 1.2468e-16, 5.3865e-18, 1.7645e-19,
         3.2118e-20, 2.3508e-14, 8.9689e-18, 9.1738e-21],
        [9.0014e-03, 8.8030e-01, 1.1066e-01, 3.7247e-05, 6.8272e-08, 8.2056e-12,
         2.2833e-15, 4.6564e-14, 3.7010e-17, 2.5927e-18],
        [1.9586e-06, 7.9897e-01, 6.5494e-02, 1.3040e-01, 5.0898e-03, 3.6618e-05,
         7.1267e-09, 2.7541e-11, 4.0746e-14, 6.9478e-13],
        [2.6109e-11, 3.3098e-04, 4.2384e-04, 7.5154e-01, 2.3657e-01, 1.1129e-02,
         1.3588e-06, 6.9949e-11, 3.0114e-13, 4.0848e-12],
        [1.8262e-12, 1.9561e-04, 2.

Epoch: 1.48, Train Loss: 0.00, Val Loss: 4.92, Train BLEU: 0.00, Val BLEU: 4.37, Minutes Elapsed: 77.09
Sampling from val predictions...
Source: 对 我 来说 这 是 人们 问 过 的 最
Reference: to me , this is the saddest and most
Model: <SOS> and i , , is what to that ,
Attention Weights: tensor([[5.6500e-07, 2.4371e-26, 2.4023e-19, 9.3439e-22, 1.3010e-16, 2.0174e-18,
         1.6076e-05, 2.9951e-03, 9.0236e-03, 9.8796e-01],
        [2.9301e-05, 9.9997e-01, 5.2797e-13, 1.1890e-14, 2.5593e-20, 5.9479e-14,
         9.8133e-21, 6.7122e-22, 1.4684e-24, 1.3633e-23],
        [1.3375e-08, 1.0000e+00, 5.5935e-11, 2.4650e-11, 1.2196e-19, 6.3264e-12,
         3.2820e-27, 1.0599e-31, 3.7149e-36, 2.5749e-35],
        [8.6270e-16, 1.0000e+00, 8.8931e-10, 1.2641e-06, 1.8052e-13, 3.6518e-10,
         2.2698e-23, 1.8672e-30, 1.0203e-35, 1.5327e-35],
        [1.1799e-16, 9.8534e-05, 4.1552e-04, 9.8674e-01, 8.8066e-09, 1.2748e-02,
         1.1046e-15, 1.2726e-19, 7.8412e-25, 1.1276e-25],
        [1.6378e-15, 3.9567e-0

Epoch: 1.66, Train Loss: 0.00, Val Loss: 4.87, Train BLEU: 0.00, Val BLEU: 4.22, Minutes Elapsed: 86.45
Sampling from val predictions...
Source: 这 原理 是 我们 可以 把 一些 光 打到 门上
Reference: the idea is that we could shine some light
Model: <SOS> and this is that we can to to ,
Attention Weights: tensor([[9.6550e-03, 5.7323e-22, 3.4829e-13, 4.8127e-22, 1.0147e-20, 3.2492e-14,
         8.8619e-09, 3.6810e-05, 9.8969e-01, 6.2211e-04],
        [1.0000e+00, 6.1689e-19, 6.2331e-20, 1.1050e-24, 7.8688e-36, 2.6903e-36,
         1.2332e-34, 3.1749e-38, 3.2912e-37, 2.5616e-40],
        [1.0000e+00, 3.0486e-07, 1.6412e-09, 5.0438e-17, 6.5935e-29, 5.1652e-35,
         3.3863e-36, 1.2002e-40, 2.0959e-41, 1.4503e-42],
        [9.1848e-08, 3.2995e-07, 1.0000e+00, 8.5981e-07, 8.0946e-19, 5.5100e-27,
         2.7861e-28, 3.4097e-28, 2.3802e-30, 8.0454e-34],
        [5.3882e-06, 2.3293e-10, 6.5715e-01, 3.4284e-01, 3.5789e-11, 1.6715e-16,
         3.9024e-19, 1.8470e-20, 1.4994e-19, 5.3923e-24],
        [5.8958e

Epoch: 1.84, Train Loss: 0.00, Val Loss: 4.81, Train BLEU: 0.00, Val BLEU: 4.59, Minutes Elapsed: 95.87
Sampling from val predictions...
Source: 你 可以 看到 这个 电灯 电灯泡 灯泡 是 <UNK> 的
Reference: as you can see , the bulbs face outside
Model: <SOS> you can can see that the &apos;s of is
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.1668, 0.0000, 0.0003, 0.8329, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0084, 0.0006, 0.0166, 0.9744, 0.0001, 0.0000,
         0.0000],
        [

Epoch: 2.00, Train Loss: 0.00, Val Loss: 4.77, Train BLEU: 0.00, Val BLEU: 5.48, Minutes Elapsed: 104.29
Sampling from val predictions...
Source: 我们 还给 她 复原 照片 的 那天 又 恰好 是
Reference: the day i gave her the photos also happened
Model: <SOS> we we we we to to , , and
Attention Weights: tensor([[1.0000e+00, 3.2367e-29, 1.7007e-20, 1.4393e-25, 1.0203e-22, 4.1413e-16,
         1.8462e-09, 5.1254e-12, 4.6865e-09, 1.1015e-06],
        [1.0000e+00, 4.1207e-24, 6.3096e-15, 1.6280e-31, 9.9548e-34, 5.5575e-33,
         2.3950e-28, 9.1064e-35, 4.9071e-35, 1.5535e-33],
        [1.0000e+00, 1.3508e-16, 1.6465e-10, 3.4132e-27, 3.5214e-33, 3.9814e-37,
         1.0509e-33, 1.6788e-42, 2.1832e-42, 2.9147e-43],
        [9.9066e-01, 7.1613e-12, 9.3434e-03, 8.4786e-20, 4.5566e-26, 1.7696e-28,
         1.8563e-25, 9.4183e-36, 1.5004e-36, 5.7994e-37],
        [1.9411e-03, 7.5523e-11, 9.9806e-01, 2.8069e-14, 4.7113e-19, 1.3312e-23,
         1.0472e-20, 1.3360e-30, 1.7663e-30, 3.6162e-31],
        [2.0458e-18,

Epoch: 2.18, Train Loss: 0.00, Val Loss: 4.73, Train BLEU: 0.00, Val BLEU: 4.91, Minutes Elapsed: 113.71
Sampling from val predictions...
Source: 一个 真正 的 学校 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: a real school . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> one &apos;s of . <EOS> . <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9519, 0.0000, 0.0000, 0.0002, 0.0478, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.9985, 0.0015, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0002, 0.9893, 0.0060, 0.0046, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0004, 0.0016, 0.9980, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0171, 0.3825, 0.6002, 0.0003, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0462, 0.9357, 0.0180, 0.0000, 0.0000, 0.0000, 0.0000,


Epoch: 2.36, Train Loss: 0.00, Val Loss: 4.73, Train BLEU: 0.00, Val BLEU: 4.61, Minutes Elapsed: 123.19
Sampling from val predictions...
Source: 为了 改变 世界 我们 迫切 迫切需要 需要 从 我们 的
Reference: we desperately need great communication from our scientists and
Model: <SOS> and to to to change we we world ,
Attention Weights: tensor([[1.0479e-05, 8.9692e-11, 2.5370e-23, 1.3907e-11, 6.2106e-11, 9.2476e-10,
         1.1178e-02, 9.8881e-01, 7.3945e-11, 1.2151e-09],
        [9.8901e-01, 1.0989e-02, 1.2182e-15, 2.3501e-12, 2.8802e-17, 1.0492e-21,
         5.8384e-19, 3.3504e-20, 1.0970e-21, 2.8608e-29],
        [4.6476e-02, 9.5352e-01, 2.9016e-10, 5.0051e-14, 2.1259e-18, 4.5060e-22,
         7.9236e-25, 7.1173e-26, 1.6987e-30, 1.8160e-38],
        [4.8608e-11, 7.4793e-03, 9.9252e-01, 5.1010e-12, 5.8249e-14, 5.3497e-13,
         7.7186e-15, 7.2821e-17, 3.4863e-23, 7.3822e-32],
        [1.0489e-15, 1.2043e-06, 9.9999e-01, 5.6427e-06, 5.8701e-10, 4.2656e-10,
         1.9198e-11, 6.6988e-14, 1.2016e-17, 2

Epoch: 2.54, Train Loss: 0.00, Val Loss: 4.67, Train BLEU: 0.00, Val BLEU: 5.60, Minutes Elapsed: 132.73
Sampling from val predictions...
Source: 我 只能 能指 指望 free the <UNK> 组织 能够 在
Reference: i &apos;d have to rely on free the slaves
Model: <SOS> i i &apos;t to to the to <UNK> ,
Attention Weights: tensor([[1.0000e+00, 1.5766e-28, 3.0597e-32, 1.3260e-26, 9.7537e-36, 2.3010e-34,
         4.0927e-31, 1.7028e-25, 5.8114e-17, 4.3774e-18],
        [1.0000e+00, 1.1247e-23, 3.0380e-26, 1.8053e-22, 1.2459e-35, 4.1702e-37,
         1.5391e-37, 1.0622e-35, 1.9366e-33, 7.7266e-37],
        [1.0000e+00, 2.2512e-10, 1.7780e-12, 2.2145e-10, 4.8410e-29, 3.9591e-35,
         6.3568e-38, 1.0878e-36, 4.4640e-36, 1.5359e-36],
        [7.1708e-20, 3.0080e-09, 4.8072e-08, 1.0000e+00, 1.9071e-16, 9.1198e-23,
         2.6640e-26, 3.3209e-26, 2.1431e-24, 7.0648e-26],
        [1.0604e-27, 3.5294e-18, 9.1267e-13, 1.0000e+00, 3.8914e-16, 5.1950e-22,
         9.9542e-25, 3.1330e-28, 1.0372e-26, 3.0688e-27],
       

Epoch: 2.72, Train Loss: 0.00, Val Loss: 4.62, Train BLEU: 0.00, Val BLEU: 5.43, Minutes Elapsed: 142.34
Sampling from val predictions...
Source: 在 这 之后 不久 当 我 走过 一个 火车 火车站
Reference: soon after , when i was walking past a
Model: <SOS> and after when , i was a to a
Attention Weights: tensor([[9.9976e-01, 3.5138e-12, 3.0878e-06, 1.0860e-06, 3.6104e-07, 3.1052e-18,
         4.1685e-09, 6.3530e-07, 1.3795e-05, 2.1823e-04],
        [2.5558e-04, 9.9696e-01, 2.7865e-03, 1.7256e-08, 9.9256e-14, 8.2484e-21,
         1.4602e-20, 6.2510e-24, 7.8490e-23, 2.9558e-23],
        [5.6791e-10, 1.0000e+00, 5.5101e-07, 2.7528e-09, 1.3583e-09, 4.0850e-14,
         1.0101e-22, 9.7275e-27, 1.3217e-29, 7.5086e-30],
        [1.7908e-14, 7.0257e-06, 6.3214e-06, 7.2811e-01, 2.6905e-01, 2.8219e-03,
         1.2513e-11, 1.7499e-16, 2.9847e-17, 2.1934e-18],
        [2.7453e-16, 7.5363e-10, 5.7162e-08, 9.7501e-02, 8.9510e-01, 7.4019e-03,
         2.9198e-14, 2.3355e-17, 1.4324e-14, 1.0403e-15],
        [4.9411e-17,

Epoch: 2.90, Train Loss: 0.00, Val Loss: 4.59, Train BLEU: 0.00, Val BLEU: 5.54, Minutes Elapsed: 151.79
Sampling from val predictions...
Source: 我们 西方 西方人 方人 是 帝国 帝国主义 <UNK> 国主 主义
Reference: we western people are <UNK> , <UNK> missionaries ,
Model: <SOS> we we the , the <UNK> the , ,
Attention Weights: tensor([[1.0000e+00, 1.8537e-24, 4.6637e-28, 5.8872e-24, 2.3920e-27, 1.5105e-21,
         6.7214e-26, 1.6445e-23, 9.9774e-19, 1.4466e-12],
        [1.0000e+00, 2.9264e-22, 3.5500e-27, 4.3646e-27, 7.7740e-35, 2.7002e-32,
         1.9994e-38, 7.3738e-38, 2.8705e-37, 1.3635e-32],
        [1.0000e+00, 1.0041e-13, 1.8387e-16, 1.8675e-15, 3.3674e-20, 2.4339e-22,
         3.7914e-29, 1.2238e-28, 1.7375e-28, 1.2292e-27],
        [8.1990e-09, 4.4087e-02, 5.2803e-01, 3.2446e-01, 9.8178e-02, 5.2447e-03,
         2.9996e-10, 2.1226e-09, 9.0123e-12, 1.2241e-10],
        [2.5176e-21, 3.3719e-16, 1.7537e-11, 1.5761e-08, 1.0000e+00, 8.0914e-09,
         9.2381e-12, 5.5714e-10, 1.9529e-11, 8.1341e-14],


Epoch: 3.06, Train Loss: 0.00, Val Loss: 4.59, Train BLEU: 0.00, Val BLEU: 5.55, Minutes Elapsed: 160.27
Sampling from val predictions...
Source: 不过 过为 为了 真实 的 表明 词汇 和 政治 怎样
Reference: but to really show you how words and politics
Model: <SOS> but , , , , the , the the
Attention Weights: tensor([[9.2578e-01, 7.7853e-18, 6.6567e-13, 8.2349e-12, 3.9407e-06, 6.8731e-02,
         1.7729e-07, 7.9990e-07, 2.6777e-09, 5.4850e-03],
        [9.6598e-01, 3.3809e-02, 2.0799e-04, 4.0486e-12, 1.8263e-12, 4.1531e-13,
         3.4266e-18, 7.0701e-18, 6.4799e-19, 3.1353e-16],
        [7.8573e-08, 9.9963e-01, 3.7427e-04, 1.2768e-12, 1.6182e-19, 2.6250e-20,
         2.0725e-26, 2.6969e-30, 8.1393e-29, 1.1669e-31],
        [1.4355e-08, 4.8412e-01, 5.1588e-01, 5.4190e-07, 3.9706e-15, 1.5032e-15,
         9.7732e-19, 1.0077e-23, 2.0288e-23, 8.8632e-27],
        [7.8653e-16, 2.3159e-09, 1.4752e-01, 8.5248e-01, 3.1386e-11, 1.3572e-11,
         4.4990e-15, 1.2393e-24, 1.2377e-22, 2.9566e-27],
        [3.5448e

Epoch: 3.24, Train Loss: 0.00, Val Loss: 4.56, Train BLEU: 0.00, Val BLEU: 6.33, Minutes Elapsed: 169.65
Sampling from val predictions...
Source: <UNK> 今年 22 岁 又 高 又 帅 <EOS> <PAD>
Reference: <UNK> is 22 , tall and very handsome .
Model: <SOS> and was was . old . . . .
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6314, 0.3686,
         0.0000],
        [0.0000, 0.9935, 0.0065, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.1423, 0.8577, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.1753, 0.8122, 0.0126, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0057, 0.3304, 0.6620, 0.0019, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0005, 0.0002, 0.0143, 0.0113, 0.9737,
         0.0000],
        [0.0000, 0.0007, 0.4052, 0.5941, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.

Epoch: 3.42, Train Loss: 0.00, Val Loss: 4.54, Train BLEU: 0.00, Val BLEU: 5.87, Minutes Elapsed: 179.03
Sampling from val predictions...
Source: 我 想要 知道 并用 我 的 观点 来 阐释 阿富汗
Reference: i want to look at how , in my
Model: <SOS> i i to know what what i i i
Attention Weights: tensor([[9.8725e-01, 9.1003e-15, 7.8355e-18, 8.3345e-14, 1.2748e-02, 5.2708e-28,
         2.3667e-22, 1.4149e-18, 4.6595e-20, 1.2585e-16],
        [1.0000e+00, 6.5460e-17, 2.1511e-24, 6.2836e-31, 4.2685e-21, 7.3848e-43,
         6.1657e-44, 1.3678e-40, 2.3640e-42, 8.0894e-39],
        [1.0000e+00, 3.2796e-07, 4.1793e-13, 1.7508e-21, 5.1610e-16, 6.3102e-38,
         4.7964e-41, 8.3097e-43, 2.2421e-44, 3.0590e-42],
        [1.2318e-15, 3.0221e-03, 9.9698e-01, 6.8002e-10, 2.3414e-13, 6.9327e-21,
         9.9585e-18, 1.7191e-22, 8.8835e-25, 3.0962e-26],
        [2.4699e-15, 2.7979e-06, 9.1422e-01, 6.1100e-02, 2.4681e-02, 9.1268e-16,
         7.2774e-14, 1.3209e-18, 3.8780e-19, 6.0750e-23],
        [1.6995e-17, 4.3764e-12

Epoch: 3.60, Train Loss: 0.00, Val Loss: 4.53, Train BLEU: 0.00, Val BLEU: 5.74, Minutes Elapsed: 188.40
Sampling from val predictions...
Source: 我们 时常 拍照 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: we take photos constantly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> we we to . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0129, 0.9871, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.1119, 0.8881, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0088, 0.1841, 0.8071, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.9910, 0.0011, 0.0056, 0.0022, 0.0000, 0.0000, 0.0000, 0.0000, 

Epoch: 3.78, Train Loss: 0.00, Val Loss: 4.51, Train BLEU: 0.00, Val BLEU: 5.57, Minutes Elapsed: 197.82
Sampling from val predictions...
Source: 我 想要 知道 并用 我 的 观点 来 阐释 阿富汗
Reference: i want to look at how , in my
Model: <SOS> i i to understand what my i i my
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.9999, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0070, 0.9329, 0.0601, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0002, 0.9998, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0

Epoch: 3.99, Train Loss: 0.00, Val Loss: 4.48, Train BLEU: 0.00, Val BLEU: 6.10, Minutes Elapsed: 208.86
Sampling from val predictions...
Source: 150 年前 一个 <UNK> 的 价格 <UNK> 上 美国 国一
Reference: a hundred and fifty years ago , an agricultural
Model: <SOS> the years of ago half ago the the was
Attention Weights: tensor([[4.8410e-04, 5.1101e-04, 9.0509e-14, 1.2811e-13, 4.9599e-13, 1.2597e-08,
         4.0959e-04, 1.2679e-03, 9.9588e-01, 1.4504e-03],
        [2.3870e-03, 9.9761e-01, 5.0062e-12, 9.5331e-16, 3.6127e-18, 5.5454e-20,
         1.1811e-20, 2.5407e-24, 6.0738e-24, 2.1745e-27],
        [5.9597e-04, 9.9940e-01, 6.5656e-11, 8.9447e-17, 1.4615e-18, 1.5595e-22,
         5.5313e-23, 2.4985e-26, 8.5466e-28, 4.0452e-29],
        [1.8107e-05, 9.9998e-01, 4.9046e-09, 4.4271e-12, 8.7474e-15, 5.5227e-16,
         6.8118e-19, 2.3607e-21, 3.8862e-20, 8.4644e-22],
        [4.1044e-09, 1.0000e+00, 2.3490e-08, 1.6080e-09, 1.8097e-10, 3.1317e-10,
         2.6031e-12, 1.7879e-13, 7.1237e-13, 3.7165e-

Epoch: 4.15, Train Loss: 0.00, Val Loss: 4.50, Train BLEU: 0.00, Val BLEU: 5.96, Minutes Elapsed: 217.27
Sampling from val predictions...
Source: 我 不能 给 他们 钱 我 什么 也 给 不了
Reference: i couldn &apos;t give them money , nothing .
Model: <SOS> i i &apos;t do them them them me i
Attention Weights: tensor([[6.3246e-01, 6.5925e-25, 7.6249e-30, 1.0512e-23, 3.4353e-19, 3.6754e-01,
         6.1479e-14, 8.9322e-11, 8.3267e-10, 7.4941e-09],
        [1.0000e+00, 1.8671e-28, 4.8762e-36, 1.1894e-30, 2.1309e-38, 3.6315e-23,
         1.1538e-38, 1.1308e-38, 1.8640e-40, 7.1901e-41],
        [1.0000e+00, 1.0535e-18, 1.6482e-24, 2.1979e-22, 2.3326e-36, 1.9396e-24,
         1.0312e-38, 8.4793e-42, 9.7951e-43, 3.8030e-41],
        [5.1728e-13, 3.0559e-08, 1.9037e-02, 9.8096e-01, 9.2153e-17, 1.4909e-18,
         1.2314e-27, 6.8349e-27, 2.7442e-26, 2.4481e-22],
        [4.8830e-14, 5.5091e-11, 1.1188e-01, 8.8812e-01, 5.6449e-15, 2.0656e-15,
         5.5952e-26, 7.0658e-28, 2.2302e-26, 5.3389e-25],
        [4.1

Epoch: 4.33, Train Loss: 0.00, Val Loss: 4.46, Train BLEU: 0.00, Val BLEU: 5.85, Minutes Elapsed: 228.11
Sampling from val predictions...
Source: 在 这个 世界 上有 <UNK> <UNK> <UNK> <EOS> <PAD> <PAD>
Reference: so there &apos;s two types of <UNK> in this
Model: <SOS> and this world a world in the . .
Attention Weights: tensor([[0.9900, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0100, 0.0000,
         0.0000],
        [0.1603, 0.0013, 0.8383, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0003, 0.9248, 0.0749, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0973, 0.0021, 0.0362, 0.7706, 0.0868, 0.0071, 0.0000,
         0.

Epoch: 4.51, Train Loss: 0.00, Val Loss: 4.46, Train BLEU: 0.00, Val BLEU: 5.68, Minutes Elapsed: 238.57
Sampling from val predictions...
Source: 没有 能 过人 的 窗户 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: there were no windows large enough to climb through
Model: <SOS> it &apos;s no beautiful . . . <EOS> .
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.8263, 0.1737, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0009, 0.0768, 0.1682, 0.7541, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0004, 0.9996, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.0000, 0.0000, 0.0003, 0.9996, 0.0000, 0.0000, 0.0000, 0.0000,
   

Epoch: 4.69, Train Loss: 0.00, Val Loss: 4.44, Train BLEU: 0.00, Val BLEU: 6.40, Minutes Elapsed: 251.97
Sampling from val predictions...
Source: 他们 第一 第一天 一天 来时 看到 了 稻草 <UNK> <UNK>
Reference: they will come the first day and they see
Model: <SOS> they they the the the of , the the
Attention Weights: tensor([[6.7287e-06, 2.8327e-24, 2.7660e-19, 8.6006e-18, 1.0826e-19, 3.7416e-17,
         1.9853e-16, 2.2706e-07, 8.7400e-03, 9.9125e-01],
        [1.0000e+00, 5.7115e-19, 9.0513e-17, 2.9089e-15, 1.3826e-21, 2.3495e-27,
         2.8264e-27, 9.7573e-27, 8.3580e-30, 9.2888e-31],
        [1.0000e+00, 6.8679e-15, 4.6076e-14, 2.5398e-12, 1.7287e-14, 2.3970e-17,
         7.3584e-20, 7.3020e-21, 1.3203e-24, 4.5321e-26],
        [1.4664e-13, 9.6625e-09, 4.0787e-06, 8.6893e-01, 1.0970e-01, 2.0972e-02,
         3.8782e-04, 1.2656e-06, 7.4362e-13, 1.4442e-15],
        [4.8731e-18, 7.7574e-09, 2.8049e-06, 8.2247e-02, 2.7819e-01, 4.8308e-04,
         2.2528e-05, 6.3164e-01, 7.4023e-03, 1.7165e-05],
   

Epoch: 4.87, Train Loss: 0.00, Val Loss: 4.43, Train BLEU: 0.00, Val BLEU: 6.21, Minutes Elapsed: 265.48
Sampling from val predictions...
Source: 可 我们 还是 希望 哪天 可以 逃离 这个 屋子 去
Reference: &quot; we hope still , though , that we
Model: <SOS> and we , to to &quot; we the we
Attention Weights: tensor([[1.0000e+00, 1.7239e-25, 5.9155e-28, 4.0207e-21, 5.3231e-19, 1.5913e-12,
         8.1535e-21, 6.2567e-28, 7.3570e-22, 4.7624e-06],
        [1.7509e-04, 9.9982e-01, 1.9456e-18, 5.3231e-11, 1.3099e-16, 6.7576e-18,
         1.6376e-24, 5.6862e-29, 2.7097e-26, 2.4807e-20],
        [2.1580e-15, 1.0000e+00, 9.3745e-22, 2.2494e-17, 3.5843e-27, 2.6571e-31,
         1.0598e-40, 5.5476e-41, 3.1725e-42, 2.2432e-40],
        [1.9025e-15, 1.0000e+00, 7.2757e-20, 8.6241e-10, 2.7587e-17, 2.4036e-21,
         9.6637e-30, 1.1008e-30, 1.2761e-33, 7.7349e-33],
        [4.4809e-28, 5.0475e-15, 3.4211e-18, 1.0000e+00, 1.9102e-07, 6.9557e-15,
         1.6610e-23, 1.0745e-21, 1.0096e-24, 9.3306e-26],
        [3.5042e

Epoch: 5.03, Train Loss: 0.00, Val Loss: 4.42, Train BLEU: 0.00, Val BLEU: 6.49, Minutes Elapsed: 277.58
Sampling from val predictions...
Source: 到 时人 人们 将 会 被 <UNK> 淹没 <EOS> <PAD>
Reference: they were already drowning in manure . <EOS> <PAD>
Model: <SOS> and people be to . <EOS> . <EOS> <EOS>
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.9948, 0.0052, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0026, 0.9382, 0.0592, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0011, 0.3298, 0.6676, 0.0000, 0.0000, 0.0015, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.0033, 0.6438, 0.0005, 0.0001, 0.3401, 0.0113, 0.0004, 0.0005,
         0.0000],
        [0.0000, 0.0001, 0.0001, 0.0000, 0.0001, 0.9498, 0.0452, 0.0028, 0.0020,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9995, 0.0001, 0.0003, 0.0001,
         0.

Epoch: 5.21, Train Loss: 0.00, Val Loss: 4.42, Train BLEU: 0.00, Val BLEU: 6.64, Minutes Elapsed: 290.93
Sampling from val predictions...
Source: 我 到底 是 韩国 国人 还是 朝鲜 <UNK> <EOS> <PAD>
Reference: am i south korean or north korean ? <EOS>
Model: <SOS> i i i the the . . <EOS> <EOS>
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0155, 0.0000, 0.0000, 0.9845, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0005, 0.9855, 0.0109, 0.0030, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [

Epoch: 5.39, Train Loss: 0.00, Val Loss: 4.40, Train BLEU: 0.00, Val BLEU: 6.48, Minutes Elapsed: 304.24
Sampling from val predictions...
Source: 因为 这次 什么 都 没 发生 <EOS> <PAD> <PAD> <PAD>
Reference: because this time it wasn &apos;t coming up .
Model: <SOS> because nothing happened happening happened happening <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9999, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0001, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.4952, 0.5048, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.5659, 0.4341, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.8188, 0.1809, 0.0003, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.8755, 0.0097, 0.1148, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0001, 0.9999, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.

Epoch: 5.57, Train Loss: 0.00, Val Loss: 4.40, Train BLEU: 0.00, Val BLEU: 6.75, Minutes Elapsed: 317.67
Sampling from val predictions...
Source: 我们 的 科学 科学家 学家 和 工程 工程师 们 解决
Reference: our scientists and engineers are the ones that are
Model: <SOS> our scientists engineers engineers engineers to to . solve
Attention Weights: tensor([[1.0000e+00, 8.1862e-29, 1.1484e-19, 5.2999e-19, 2.4346e-19, 1.0019e-22,
         1.1879e-18, 9.2204e-17, 1.6515e-20, 6.3550e-15],
        [1.0000e+00, 6.0360e-25, 1.1804e-16, 3.4711e-13, 4.6021e-14, 2.3918e-25,
         5.3125e-25, 4.1785e-24, 3.8525e-28, 8.4808e-29],
        [9.9203e-01, 5.1038e-17, 1.5516e-12, 3.3965e-06, 7.9669e-03, 5.8733e-11,
         3.1543e-16, 6.9203e-16, 7.1656e-21, 8.5956e-25],
        [9.8983e-15, 5.5626e-18, 6.2953e-12, 4.1091e-08, 9.9945e-01, 5.4879e-04,
         1.3012e-10, 2.0286e-07, 3.4621e-10, 2.1934e-18],
        [5.0854e-21, 4.3152e-21, 1.6140e-17, 5.1300e-13, 1.3658e-06, 9.9999e-01,
         1.5235e-07, 2.3609e-07, 1.

Epoch: 5.75, Train Loss: 0.00, Val Loss: 4.40, Train BLEU: 0.00, Val BLEU: 6.49, Minutes Elapsed: 330.88
Sampling from val predictions...
Source: 五年 年前 我 曾 有 过 一个 有如 漫游 爱丽丝
Reference: five years ago , i experienced a bit of
Model: <SOS> i years ago , i was a a <UNK>
Attention Weights: tensor([[1.0000e+00, 2.1435e-07, 3.7475e-29, 6.8099e-29, 1.8752e-34, 4.2413e-31,
         2.2077e-33, 2.9753e-28, 1.3733e-19, 5.5786e-15],
        [9.6136e-01, 3.8644e-02, 1.8503e-15, 2.0451e-19, 3.1288e-28, 2.7437e-28,
         9.7545e-31, 3.0696e-33, 1.2766e-31, 2.4973e-29],
        [5.2360e-04, 9.9948e-01, 1.4515e-12, 1.2169e-20, 4.9817e-29, 2.6361e-29,
         1.7281e-34, 9.2514e-39, 1.1623e-40, 6.0526e-39],
        [1.5666e-14, 1.1350e-02, 9.8865e-01, 1.0170e-14, 8.9306e-25, 1.6074e-25,
         1.4430e-32, 2.7955e-34, 1.2797e-32, 2.2911e-31],
        [3.4605e-24, 6.2882e-13, 1.0000e+00, 1.3798e-18, 3.9567e-29, 2.3832e-28,
         1.5384e-34, 5.3395e-33, 2.9766e-30, 3.5708e-29],
        [2.6089e-20

Epoch: 5.93, Train Loss: 0.00, Val Loss: 4.36, Train BLEU: 0.00, Val BLEU: 6.41, Minutes Elapsed: 344.14
Sampling from val predictions...
Source: <UNK> 期间 我 又 被 打 了 两次 <EOS> <PAD>
Reference: it happened twice more on the <UNK> . <EOS>
Model: <SOS> i was a i i later time . <EOS>
Attention Weights: tensor([[0.8429, 0.0000, 0.0000, 0.0000, 0.0000, 0.1565, 0.0001, 0.0005, 0.0000,
         0.0000],
        [0.6298, 0.3702, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.9981, 0.0019, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0007, 0.0045, 0.0000, 0.0577, 0.9122, 0.0064, 0.0184, 0.0000,
         0.0000],
        [0.0000, 0.8769, 0.0030, 0.0000, 0.0003, 0.1072, 0.0000, 0.0126, 0.0000,
         0.0000],
        [

Epoch: 6.09, Train Loss: 0.00, Val Loss: 4.39, Train BLEU: 0.00, Val BLEU: 6.13, Minutes Elapsed: 356.00
Sampling from val predictions...
Source: 比如 艾哈迈 艾哈迈德 哈迈德 迈德 这 不是 真名 我 也
Reference: like ahmed . that &apos;s not his real name
Model: <SOS> for , , , is not . . .
Attention Weights: tensor([[9.9983e-01, 5.3642e-13, 1.1360e-13, 1.9710e-12, 1.2645e-11, 1.6595e-14,
         1.6162e-04, 1.8946e-10, 1.7551e-13, 5.2715e-06],
        [1.0000e+00, 3.8345e-07, 2.7126e-08, 7.4712e-10, 2.9293e-10, 1.2890e-13,
         1.0430e-18, 4.2832e-29, 2.3555e-29, 4.2496e-27],
        [9.9999e-01, 5.5994e-06, 3.7203e-08, 1.7917e-10, 7.5987e-11, 7.3874e-12,
         1.5106e-17, 5.3550e-31, 2.4148e-29, 4.5986e-31],
        [1.6363e-02, 3.4118e-02, 5.5018e-01, 3.2922e-01, 6.0522e-02, 9.2135e-03,
         3.7719e-04, 9.6132e-12, 6.7930e-10, 3.4810e-15],
        [3.5860e-10, 1.0700e-11, 7.7827e-10, 8.8015e-09, 4.3526e-08, 9.9991e-01,
         9.1900e-05, 2.5536e-14, 3.7482e-08, 1.2104e-13],
        [4.2734e-1

Epoch: 6.27, Train Loss: 0.00, Val Loss: 4.37, Train BLEU: 0.00, Val BLEU: 6.41, Minutes Elapsed: 369.26
Sampling from val predictions...
Source: 我 不知 知道 那 张 是 怎么 怎么回事 回事 回事儿
Reference: that was , i don &apos;t know what happened
Model: <SOS> i don &apos;t i i &apos;t know what the
Attention Weights: tensor([[1.0000e+00, 4.2150e-20, 5.2943e-25, 3.3612e-27, 3.0597e-23, 3.0159e-14,
         1.6162e-07, 8.2403e-10, 3.4228e-11, 5.3981e-10],
        [1.0000e+00, 1.5530e-25, 3.2645e-32, 2.2095e-32, 4.1198e-39, 5.2312e-35,
         1.1058e-30, 1.6614e-31, 3.1386e-32, 4.8196e-32],
        [1.0000e+00, 4.5636e-17, 2.6518e-21, 6.8886e-30, 9.9380e-42, 1.1240e-38,
         9.7411e-38, 4.6503e-36, 1.3776e-36, 1.0683e-36],
        [1.0000e+00, 5.3972e-13, 3.2474e-11, 3.1978e-18, 1.8398e-31, 2.0646e-27,
         1.0266e-28, 9.6637e-29, 1.6539e-30, 9.7227e-30],
        [1.0000e+00, 7.9223e-13, 1.5547e-11, 5.7167e-12, 3.2458e-22, 1.8833e-19,
         1.6654e-20, 1.4029e-21, 1.1363e-22, 8.3353e-25],
   

Epoch: 6.45, Train Loss: 0.00, Val Loss: 4.37, Train BLEU: 0.00, Val BLEU: 6.75, Minutes Elapsed: 382.56
Sampling from val predictions...
Source: 所以 很多 很多年 多年 年前 我 有 个 想法 我们
Reference: so years and years ago , i had this
Model: <SOS> so , years , , , i was a
Attention Weights: tensor([[9.9976e-01, 3.1597e-18, 1.9515e-19, 9.5121e-17, 2.4109e-15, 7.4646e-17,
         6.9733e-20, 2.4326e-24, 4.1846e-15, 2.4213e-04],
        [2.2157e-01, 7.7840e-01, 2.9073e-05, 1.7053e-07, 3.0789e-08, 1.5939e-12,
         2.0197e-19, 1.6544e-21, 5.1897e-21, 1.2846e-16],
        [1.3984e-08, 7.8566e-01, 1.3444e-01, 2.2493e-02, 5.7412e-02, 4.4321e-10,
         1.1179e-18, 3.1752e-22, 2.9788e-29, 1.0491e-26],
        [6.0226e-18, 7.8766e-09, 2.5982e-05, 1.9470e-04, 9.9978e-01, 2.6472e-10,
         7.5052e-19, 6.8806e-23, 2.2530e-30, 1.6373e-27],
        [2.7562e-26, 1.1471e-16, 9.7673e-13, 7.0613e-08, 7.8096e-02, 9.2190e-01,
         2.3462e-13, 4.1947e-14, 8.4109e-17, 1.8686e-16],
        [3.6476e-25, 1.0993

Epoch: 6.66, Train Loss: 0.00, Val Loss: 4.38, Train BLEU: 0.00, Val BLEU: 6.19, Minutes Elapsed: 398.07
Sampling from val predictions...
Source: 嗯 跳舞 是 人类 众多 的 活动 之一 <EOS> <PAD>
Reference: so , dancing is one of the most human
Model: <SOS> well , , , the of the most of
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0928, 0.4385, 0.2685, 0.2002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.0135, 0.0203, 0.9661, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9986, 0.0014, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9429, 0.0571, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 

Epoch: 6.84, Train Loss: 0.00, Val Loss: 4.38, Train BLEU: 0.00, Val BLEU: 6.45, Minutes Elapsed: 411.38
Sampling from val predictions...
Source: 见到 他 是 在 一个 收容 收容所 所里 free the
Reference: i met him at a shelter where free the
Model: <SOS> he he he a a a , , was
Attention Weights: tensor([[1.0000e+00, 2.9722e-25, 2.9774e-32, 1.5575e-30, 3.7045e-38, 1.8636e-30,
         2.9678e-28, 2.5976e-26, 1.0124e-27, 9.2025e-23],
        [9.9969e-01, 3.1340e-04, 6.7749e-21, 2.2327e-19, 2.9131e-27, 3.6319e-24,
         3.0333e-24, 3.0014e-27, 1.6246e-30, 2.7822e-29],
        [6.7791e-04, 9.9932e-01, 1.3050e-16, 2.9153e-15, 5.9726e-22, 4.7722e-25,
         1.4793e-25, 2.8761e-32, 1.2735e-36, 2.0636e-37],
        [3.2802e-06, 5.3248e-04, 7.6488e-05, 9.9895e-01, 4.4151e-04, 4.7029e-11,
         7.1834e-15, 1.3652e-19, 1.4118e-23, 2.4718e-26],
        [1.1089e-13, 4.1934e-12, 1.5775e-15, 5.2842e-05, 9.9967e-01, 2.4888e-04,
         2.3521e-05, 6.0559e-09, 6.4742e-11, 2.1744e-14],
        [6.5481e-15, 4.6

Epoch: 7.00, Train Loss: 0.00, Val Loss: 4.38, Train BLEU: 0.00, Val BLEU: 6.70, Minutes Elapsed: 423.25
Sampling from val predictions...
Source: 谢谢 各位 掌声 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: thank you . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> thank you . <EOS> . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.9996, 0.0004, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.3391, 0.6609, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.9999, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0027, 0.9973, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0

Epoch: 7.18, Train Loss: 0.00, Val Loss: 4.38, Train BLEU: 0.00, Val BLEU: 5.88, Minutes Elapsed: 436.51
Sampling from val predictions...
Source: 我们 西方 西方人 方人 是 帝国 帝国主义 <UNK> 国主 主义
Reference: we western people are <UNK> , <UNK> missionaries ,
Model: <SOS> we we the the the the the , and
Attention Weights: tensor([[1.0000e+00, 9.7717e-23, 2.2606e-27, 3.5475e-25, 1.3820e-27, 8.3413e-22,
         4.5620e-23, 2.6817e-20, 4.5444e-15, 7.9766e-12],
        [1.0000e+00, 2.6442e-22, 2.3863e-28, 2.8463e-28, 9.3633e-32, 6.9246e-33,
         1.4186e-36, 3.2201e-36, 6.3256e-35, 4.4634e-33],
        [1.0000e+00, 3.1354e-17, 8.6269e-20, 1.2019e-19, 3.2986e-18, 4.8521e-23,
         3.0676e-29, 2.1922e-30, 1.3093e-29, 9.2372e-30],
        [7.7766e-10, 3.8250e-03, 6.8426e-05, 6.3928e-05, 9.9505e-01, 9.9738e-04,
         4.4052e-09, 1.8593e-10, 4.5160e-15, 6.6100e-17],
        [7.0981e-17, 2.0958e-10, 1.3843e-10, 1.7829e-08, 9.9999e-01, 8.6026e-06,
         2.5591e-08, 8.3877e-10, 2.6440e-13, 2.3791e-18]

Epoch: 7.36, Train Loss: 0.00, Val Loss: 4.42, Train BLEU: 0.00, Val BLEU: 6.39, Minutes Elapsed: 450.02
Sampling from val predictions...
Source: 这 是 菜园 在 街上 的 原因 <EOS> <PAD> <PAD>
Reference: this is on the street for a reason .
Model: <SOS> this is the the ground . the . .
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.9120, 0.0863, 0.0017, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0075, 0.9920, 0.0005, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0020, 0.9972, 0.0006, 0.0000, 0.0000, 0.0001, 0.0000,
         0.0000],
        [0.02

Epoch: 7.54, Train Loss: 0.00, Val Loss: 4.37, Train BLEU: 0.00, Val BLEU: 6.41, Minutes Elapsed: 463.40
Sampling from val predictions...
Source: 我 想 这 就是 自然 如何 绘制 的 西红柿 一
Reference: i thought , this is how nature paints a
Model: <SOS> i think that this &apos;s the of think to
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.3104, 0.0001, 0.6895, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.9998, 0.0002, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0

Epoch: 7.72, Train Loss: 0.00, Val Loss: 4.37, Train BLEU: 0.00, Val BLEU: 6.58, Minutes Elapsed: 476.76
Sampling from val predictions...
Source: <UNK> <UNK> 之下 砸 石场 的 男人 个个 健壮 但
Reference: at first glance , the pounding site seems full
Model: <SOS> and <UNK> , , the &apos;s of of the
Attention Weights: tensor([[1.0005e-08, 5.7376e-12, 4.3690e-16, 1.5827e-08, 3.1544e-07, 3.3009e-09,
         8.1968e-01, 1.5588e-02, 1.6473e-01, 2.9714e-06],
        [2.2343e-02, 9.7752e-01, 4.7727e-07, 1.3639e-04, 5.1338e-08, 1.5296e-11,
         1.5683e-06, 2.3349e-11, 8.4584e-16, 1.0844e-24],
        [2.4832e-04, 9.9751e-01, 1.8604e-03, 3.7952e-04, 9.4383e-08, 2.6346e-11,
         4.5582e-08, 2.6875e-12, 4.5721e-18, 5.0277e-27],
        [5.7726e-08, 1.8404e-03, 6.7657e-01, 3.2053e-01, 9.4554e-04, 5.6875e-05,
         5.8501e-05, 8.7938e-10, 3.2853e-16, 3.7919e-20],
        [4.8857e-13, 1.5375e-09, 8.3666e-02, 8.8569e-01, 5.4619e-04, 1.1295e-04,
         2.9980e-02, 5.3285e-08, 4.1777e-13, 8.8941e-14],


Epoch: 7.90, Train Loss: 0.00, Val Loss: 4.35, Train BLEU: 0.00, Val BLEU: 6.60, Minutes Elapsed: 490.07
Sampling from val predictions...
Source: 你 可以 成为 美国 总统 或是 下一代 一代 互联 互联网
Reference: you could be the president of the united states
Model: <SOS> you can take the to of germany american states
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.9993, 0.0007, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9991, 0.0009, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0032, 0.9968, 0.0000, 0.0000, 0.0000, 0.0000,
         0

Epoch: 8.09, Train Loss: 0.00, Val Loss: 4.39, Train BLEU: 0.00, Val BLEU: 5.98, Minutes Elapsed: 504.23
Sampling from val predictions...
Source: 我们 怎么 怎么样 能够 从 邻里 <UNK> <UNK> <UNK> 更多
Reference: how can we lend and borrow more things without
Model: <SOS> how do we start from from ? ? ?
Attention Weights: tensor([[9.9199e-01, 7.2436e-11, 1.8931e-23, 1.9287e-21, 2.2272e-27, 5.9924e-20,
         8.7233e-15, 2.2306e-09, 5.1915e-06, 8.0056e-03],
        [1.0000e+00, 1.3020e-12, 1.7988e-19, 1.4646e-25, 1.9450e-41, 1.9389e-38,
         4.1721e-36, 9.7588e-34, 2.3427e-32, 5.6464e-30],
        [1.0000e+00, 1.8331e-10, 7.5848e-11, 2.9683e-17, 1.2604e-32, 2.2036e-34,
         3.7239e-36, 6.0554e-37, 3.8455e-37, 2.5819e-34],
        [2.2122e-04, 8.8146e-11, 1.7440e-05, 9.9976e-01, 1.1309e-16, 9.3129e-22,
         5.6776e-22, 4.3662e-23, 6.1729e-23, 1.2836e-20],
        [3.9448e-19, 7.5293e-21, 9.2173e-14, 1.4431e-02, 9.8557e-01, 1.9658e-07,
         5.3043e-10, 6.5041e-13, 3.6259e-15, 5.4239e-15]

Epoch: 8.30, Train Loss: 0.00, Val Loss: 4.38, Train BLEU: 0.00, Val BLEU: 6.97, Minutes Elapsed: 519.74
Sampling from val predictions...
Source: 当时 我 最 不想 做 的 就是 离开 纽约 离开
Reference: now , the last thing i wanted to do
Model: <SOS> and one was was thing i want to do
Attention Weights: tensor([[7.6673e-01, 8.1087e-26, 4.8112e-28, 5.6085e-26, 2.4191e-22, 2.6755e-12,
         7.7958e-12, 1.3627e-08, 1.6019e-01, 7.3078e-02],
        [7.5011e-02, 9.2499e-01, 1.9903e-12, 6.6582e-18, 3.5667e-28, 5.1866e-19,
         1.0628e-20, 1.3519e-23, 1.4627e-17, 1.2783e-21],
        [2.4873e-08, 1.0000e+00, 4.8675e-14, 1.3387e-18, 1.2216e-30, 2.7877e-31,
         4.1540e-35, 3.6516e-40, 3.9828e-36, 9.6639e-39],
        [7.6138e-11, 1.0000e+00, 1.4883e-12, 8.8253e-16, 4.4510e-27, 4.2603e-30,
         7.9724e-33, 4.5823e-35, 2.3072e-33, 5.4641e-36],
        [3.0975e-09, 1.0000e+00, 5.1508e-11, 3.8459e-11, 1.1145e-21, 3.3425e-27,
         3.6147e-31, 1.0801e-34, 7.9498e-33, 6.1796e-34],
        [3.4436e-08

Epoch: 8.51, Train Loss: 0.00, Val Loss: 4.38, Train BLEU: 0.00, Val BLEU: 6.04, Minutes Elapsed: 535.28
Sampling from val predictions...
Source: 在 他 叔叔 死 后 <UNK> 要 继续 背负 他
Reference: when his uncle died , <UNK> inherited his uncle
Model: <SOS> after his wife , , , would to to
Attention Weights: tensor([[1.2624e-08, 1.0938e-23, 8.1681e-19, 2.8833e-18, 1.1010e-12, 2.1807e-11,
         9.4593e-09, 9.5071e-03, 8.0302e-01, 1.8747e-01],
        [1.0360e-04, 9.9863e-01, 1.8938e-04, 1.0710e-03, 3.2013e-06, 1.9595e-12,
         4.0834e-15, 2.0251e-12, 3.5177e-16, 1.0892e-14],
        [4.7289e-08, 9.9952e-01, 4.8316e-04, 2.3821e-08, 1.4407e-09, 1.9488e-16,
         2.4942e-28, 8.0091e-28, 3.1519e-31, 2.3130e-29],
        [6.4422e-14, 9.9889e-01, 1.1064e-03, 5.2215e-07, 5.4600e-11, 1.7812e-18,
         1.6527e-28, 3.0879e-29, 5.0039e-32, 3.5346e-30],
        [1.1016e-15, 4.1436e-11, 1.5921e-05, 9.9983e-01, 1.5349e-04, 7.3487e-11,
         7.7674e-18, 9.2316e-21, 6.2122e-23, 3.3602e-21],
        

Epoch: 8.72, Train Loss: 0.00, Val Loss: 4.36, Train BLEU: 0.00, Val BLEU: 6.65, Minutes Elapsed: 550.82
Sampling from val predictions...
Source: 你 可以 又 一次 看到 波纹 在 桌子 上 展开
Reference: you can watch the ripples again washing over the
Model: <SOS> you you see , again of the the the
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.8701, 0.1299, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9034, 0.0065, 0.0901, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0002, 0.9997, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0001, 0.8864, 0.1133, 0.0002, 0.0000,
         0.0000],
        

Epoch: 8.90, Train Loss: 0.00, Val Loss: 4.36, Train BLEU: 0.00, Val BLEU: 6.91, Minutes Elapsed: 564.21
Sampling from val predictions...
Source: 此时 此时此刻 此刻 你 可能 在 想 哇 这 才
Reference: right now , maybe you &apos;re thinking , &quot;
Model: <SOS> and , , you you &apos;re be to &quot;
Attention Weights: tensor([[9.9878e-01, 4.9196e-07, 1.4788e-21, 5.1286e-36, 4.6573e-23, 1.6358e-19,
         6.8985e-23, 3.2532e-21, 9.3625e-08, 1.2152e-03],
        [1.0166e-06, 1.0000e+00, 1.2234e-09, 5.9369e-17, 2.2646e-20, 1.8775e-24,
         1.8249e-24, 3.0459e-27, 2.5941e-24, 1.9452e-23],
        [2.6030e-11, 9.9953e-01, 4.6515e-04, 1.0793e-11, 1.0462e-19, 1.7343e-24,
         6.6536e-28, 8.6416e-36, 8.8405e-33, 1.6231e-34],
        [3.1587e-17, 6.4659e-05, 8.6936e-02, 9.1300e-01, 3.8276e-19, 1.3160e-25,
         3.3566e-23, 4.4185e-25, 1.3194e-26, 3.1278e-29],
        [2.8537e-23, 3.8897e-13, 4.5319e-08, 1.0000e+00, 1.5370e-24, 4.2190e-33,
         4.7014e-31, 1.9405e-31, 1.4441e-29, 2.2323e-34],
   

Epoch: 9.06, Train Loss: 0.00, Val Loss: 4.41, Train BLEU: 0.00, Val BLEU: 6.43, Minutes Elapsed: 576.07
Sampling from val predictions...
Source: 据 保守 估计 当今 全世界 世界 有 超过 2700 万
Reference: a conservative estimate tells us there are more than
Model: <SOS> and is estimates , the the the the in
Attention Weights: tensor([[9.9980e-01, 5.9080e-13, 1.0123e-10, 5.5576e-12, 3.2426e-13, 1.5439e-07,
         2.5578e-07, 1.9382e-04, 1.1489e-06, 3.3499e-06],
        [9.9995e-01, 4.7183e-05, 2.0390e-06, 2.8803e-12, 1.4577e-11, 1.0708e-09,
         1.6612e-15, 1.9194e-13, 8.1476e-15, 1.5929e-15],
        [4.8879e-01, 5.1120e-01, 9.9361e-06, 3.0071e-08, 2.1926e-11, 5.4796e-12,
         8.7972e-20, 3.5781e-21, 7.9740e-24, 6.3789e-26],
        [1.6899e-09, 4.5631e-05, 1.7908e-02, 3.6595e-03, 9.5292e-01, 2.5463e-02,
         7.4141e-14, 1.9559e-15, 3.7922e-17, 6.9658e-16],
        [9.8790e-11, 2.3784e-08, 4.8866e-03, 3.2992e-01, 6.4490e-01, 2.0295e-02,
         1.5665e-07, 1.3581e-09, 1.3889e-13, 1.0297e-

Epoch: 9.24, Train Loss: 0.00, Val Loss: 4.40, Train BLEU: 0.00, Val BLEU: 6.67, Minutes Elapsed: 589.37
Sampling from val predictions...
Source: 有着 如此 肥沃 的 山谷 当地 当地人 人居 居然 不
Reference: and we were amazed that the local people ,
Model: <SOS> it &apos;s , very to that are of of
Attention Weights: tensor([[5.3312e-01, 2.0724e-09, 2.3723e-09, 8.9244e-13, 2.2423e-11, 1.8885e-10,
         1.5475e-08, 4.7665e-07, 6.1859e-04, 4.6626e-01],
        [4.1870e-01, 5.5161e-01, 2.9686e-02, 2.5278e-11, 1.2434e-14, 1.3088e-16,
         6.6314e-20, 1.2493e-22, 1.8179e-22, 4.1943e-20],
        [4.1747e-03, 9.2946e-01, 6.6367e-02, 2.1512e-09, 1.2564e-11, 6.3595e-15,
         1.6533e-19, 2.1006e-23, 5.6808e-26, 5.4346e-27],
        [7.4644e-07, 6.4976e-01, 3.5010e-01, 1.3201e-04, 2.7618e-06, 1.0436e-08,
         1.7219e-13, 5.8861e-18, 1.5896e-21, 1.1059e-24],
        [7.3646e-10, 9.9983e-01, 1.7194e-04, 5.7814e-07, 5.8085e-11, 1.2550e-14,
         8.7942e-17, 2.9325e-17, 5.9226e-18, 9.7176e-23],
        

Epoch: 9.42, Train Loss: 0.00, Val Loss: 4.40, Train BLEU: 0.00, Val BLEU: 6.80, Minutes Elapsed: 602.64
Sampling from val predictions...
Source: 他 只是 压力 太大 了 婚礼 的 筹备 和 我家
Reference: he had just been really stressed out by the
Model: <SOS> he &apos;s only a , , , and and
Attention Weights: tensor([[9.9999e-01, 9.9241e-29, 2.2617e-32, 7.6040e-28, 2.9841e-23, 8.6225e-19,
         2.1331e-19, 2.0293e-17, 6.6584e-12, 8.3757e-06],
        [1.0000e+00, 1.2874e-21, 3.6730e-26, 6.9874e-29, 2.6479e-33, 1.9563e-36,
         6.7919e-39, 8.7469e-40, 4.0674e-39, 5.7910e-34],
        [1.0000e+00, 1.6421e-14, 3.3279e-23, 6.5154e-27, 2.0492e-35, 8.6180e-43,
         1.6073e-42, 2.8026e-45, 3.4052e-43, 7.8665e-38],
        [2.1126e-13, 5.9374e-10, 9.9218e-01, 7.8169e-03, 1.0213e-14, 2.2635e-20,
         5.9004e-27, 8.4817e-28, 4.5043e-27, 7.4635e-21],
        [1.2383e-29, 9.1885e-19, 9.2219e-05, 9.9991e-01, 1.6107e-09, 1.0774e-11,
         1.5483e-18, 2.5540e-19, 5.4210e-16, 3.6141e-15],
        [2.829

Epoch: 9.60, Train Loss: 0.00, Val Loss: 4.36, Train BLEU: 0.00, Val BLEU: 6.42, Minutes Elapsed: 615.83
Sampling from val predictions...
Source: 我们 正 努力 与 当地 的 居民 们 沟通 <EOS>
Reference: we &apos;re working with local communities . <EOS> <PAD>
Model: <SOS> we &apos;re to to the people . <EOS> <EOS>
Attention Weights: tensor([[1.0000e+00, 6.0065e-30, 1.2959e-27, 4.8831e-33, 1.5166e-26, 8.4432e-22,
         7.7710e-16, 2.7102e-13, 2.0138e-09, 1.2937e-12],
        [1.0000e+00, 1.1801e-28, 4.4181e-25, 5.7538e-32, 1.5256e-33, 2.3595e-36,
         3.3787e-33, 3.5108e-34, 2.0888e-38, 4.2039e-45],
        [1.0000e+00, 2.6990e-18, 1.0897e-16, 9.9494e-18, 7.3239e-25, 1.4183e-29,
         2.3615e-30, 6.4846e-31, 4.3784e-38, 1.4013e-45],
        [6.9596e-19, 3.2003e-12, 8.1446e-02, 9.1855e-01, 8.6212e-06, 2.6711e-17,
         4.1501e-21, 6.8999e-27, 1.8868e-31, 4.0642e-39],
        [3.2905e-19, 1.0860e-15, 5.5364e-08, 9.9972e-01, 2.8124e-04, 1.0730e-17,
         7.0923e-23, 3.5985e-30, 1.0626e-38, 

Epoch: 9.78, Train Loss: 0.00, Val Loss: 4.38, Train BLEU: 0.00, Val BLEU: 6.43, Minutes Elapsed: 629.11
Sampling from val predictions...
Source: 我 想来 分享 几个 有用 的 方法 告诉 你们 如何
Reference: i want to share a few keys on how
Model: <SOS> i i a share you a of to telling
Attention Weights: tensor([[2.2508e-07, 2.2251e-20, 1.8204e-21, 2.7577e-22, 2.3683e-21, 1.6424e-19,
         1.6732e-15, 3.7668e-03, 9.9544e-01, 7.9007e-04],
        [1.0000e+00, 2.5122e-19, 8.9370e-19, 8.9572e-24, 6.7040e-31, 4.5973e-29,
         1.4605e-32, 1.7135e-27, 4.3531e-23, 2.7658e-28],
        [1.0000e+00, 5.6801e-09, 3.9001e-10, 5.2356e-16, 3.3217e-26, 5.7839e-34,
         9.3769e-38, 3.1032e-34, 1.0197e-33, 2.8821e-39],
        [1.9667e-26, 1.0567e-11, 9.9998e-01, 1.5864e-05, 2.3062e-15, 4.4990e-26,
         7.1893e-30, 4.4864e-30, 5.3666e-31, 1.0038e-32],
        [8.3548e-21, 7.1136e-08, 9.7332e-01, 2.6685e-02, 1.1883e-09, 1.7410e-15,
         6.4366e-19, 3.0840e-14, 3.7843e-19, 7.1054e-20],
        [6.1943e-22, 5

Epoch: 9.96, Train Loss: 0.00, Val Loss: 4.36, Train BLEU: 0.00, Val BLEU: 6.85, Minutes Elapsed: 642.37
Sampling from val predictions...
Source: 我 想 介绍 给 你们 认识 我 的 两个 兄弟
Reference: now i &apos;d like to introduce you to my
Model: <SOS> i want want to to introduce you you two
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.9565, 0.0425, 0.0010, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.4912, 0.5088, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.

Epoch: 10.15, Train Loss: 0.00, Val Loss: 4.40, Train BLEU: 0.00, Val BLEU: 6.78, Minutes Elapsed: 656.58
Sampling from val predictions...
Source: 这种 人 不存 存在 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: it doesn &apos;t exist . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> and &apos;s &apos;t exist . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.9860, 0.0000, 0.0000, 0.0000, 0.0140, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.0000, 0.0001, 0.9998, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9986, 0.0014, 0.0000, 0.0000, 0.0

Epoch: 10.36, Train Loss: 0.00, Val Loss: 4.41, Train BLEU: 0.00, Val BLEU: 6.56, Minutes Elapsed: 672.16
Sampling from val predictions...
Source: 希望 不 像 中间 那 张 那么 怪 <EOS> <PAD>
Reference: hopefully less awkward than that one in the middle
Model: <SOS> you don don the . . <EOS> <EOS> .
Attention Weights: tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0472, 0.3347, 0.6180, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0001, 0.7568, 0.2431, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.3267, 0.0000, 0.6733, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0847, 0.0000, 0.9153, 0.0000, 0.0000,
         0.0000],
 

In [None]:
# Show the first five rows of the experiment log, restricted to the run
# timestamp, the main training hyperparameters and the validation loss.
# NOTE(review): assumes summarize_results returns a pandas DataFrame with these columns — confirm.
summary_cols = ['dt_created', 'num_epochs', 'learning_rate', 'clip_grad_max_norm', 'val_loss']
experiment_summary = summarize_results(load_experiment_log())
experiment_summary[summary_cols].head()

In [None]:
# Plot the learning curve for the experiment held in `results`.
# NOTE(review): `results` is not defined in this part of the notebook — presumably
# it comes from the training cell; confirm it exists after Restart & Run All.
plot_single_learning_curve(results)

In [None]:
# Final logged metrics, presumably pasted from an earlier run:
#   Epoch: 199.00, Train Loss: 0.32, Val Loss: 13.19, Train BLEU: 98.94, Val BLEU: 0.27
# Train BLEU is close to 99 while val BLEU stays below 1, i.e. a very large train/val gap.
plot_single_learning_curve(results)

In [None]:
# Variant run with the attention energies computed as
#   energies = v_broadcast.bmm(torch.tanh(self.attn(concat)).transpose(1, 2))  # switched order
# Final logged metrics noted for that variant:
#   Epoch: 199.00, Train Loss: 0.63, Val Loss: 12.82, Train BLEU: 92.05, Val BLEU: 0.38
plot_single_learning_curve(results)

In [None]:
# Print the first 20 entries of the source-language id -> token table.
# Stop as soon as 20 entries have been shown instead of walking the whole
# vocabulary only to skip every remaining item.
for i, token in enumerate(vocab[SRC_LANG]['id2token']):
    if i >= 20:
        break
    print("{}: {}".format(i, token))

In [None]:
# Print the first 20 entries of the target-language id -> token table.
# Stop as soon as 20 entries have been shown instead of walking the whole
# vocabulary only to skip every remaining item.
for i, token in enumerate(vocab[TARG_LANG]['id2token']):
    if i >= 20:
        break
    print("{}: {}".format(i, token))

In [None]:
import torch

# Scratch check: how slicing along dim 0 and view() rearrange a 3-D tensor.
x = torch.arange(150).view(3, 5, 10)  # values 0..149 laid out as (3, 5, 10)
print(x)
# Drop the first slice along dim 0, leaving shape (2, 5, 10).
y = x[1:]
print(y)
# Merge the two leading dims while keeping rows of length 10 -> (10, 10).
z = y.view(-1, 10)
print(z)

In [None]:
# Scratch check: flattening order before and after swapping the two dims.
t = torch.arange(10).view(5, 2)  # rows are [0, 1], [2, 3], ..., [8, 9]
print(t)
# Flattening the original layout gives 0, 1, 2, ... in order.
u = t.contiguous().view(-1)
print(u)
# Swap the two dims -> shape (2, 5): evens in row 0, odds in row 1.
v = t.transpose(0, 1)
print(v)
# contiguous() materialises the swapped layout before view(), so the
# flattened result is all evens followed by all odds.
w = v.contiguous().view(-1)
print(w)

In [None]:
# Scratch check: view(-1, 1, 300) infers the leading dim from the element count.
a = torch.arange(600)  # 2 * 1 * 300 elements
print(a)
b = a.reshape(-1, 1, 300)  # leading dim is inferred as 2
print(b.shape)

In [None]:
# Pull the first batch from the full training loader and map its source
# indices back to token strings (one space-joined string per sentence).
for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(loaders_full['train']):
    test_tensor = src_idxs
    id2token = vocab[SRC_LANG]['id2token']
    # Convert the index tensor to nested Python lists of ints.
    list_of_lists = test_tensor.numpy().astype(int).tolist()

    def to_token(l):
        """Look up each index of `l` in id2token and join the tokens with spaces."""
        return ' '.join(id2token[idx] for idx in l)

    list_of_lists_tokens = list(map(to_token, list_of_lists))
    break  # only the first batch is needed

In [None]:
# Scratch check of a masked fill, modelled on this call (apparently copied
# from the attention code):
#attn.data.masked_fill_(self.mask, -float('inf'))
# The dangling `test_tensor.` line (an unfinished attribute access) was a
# syntax error that kept this cell from running at all, so it is removed.
# The fill is done out of place on a float copy: re-running the cell gives the
# same result, the batch tensor that `test_tensor` refers to is left unchanged,
# and a float dtype can actually represent inf.
# NOTE(review): 2 is presumably the <PAD> id — confirm against the vocab listing.
masked_tensor = test_tensor.float().masked_fill(test_tensor == 2, float('inf'))
masked_tensor