In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders 
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, DecoderSimpleRNN, EncoderSimpleRNN, \
    Attention, DecoderAttnRNN, DecoderRNNV2, EncoderDecoderAttention, EncoderSimpleRNN_Test, DecoderAttnRNN_Test, \
    DecoderRNN_Test
from train_eval import count_parameters, summarize_results, \
    plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval, train_and_eval_attn 
import importlib
import pickle as pkl 
import torch

In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration: every tunable knob for this run lives in this cell.
# ---------------------------------------------------------------------------

# Run identity: experiment name and the translation direction (source -> target).
MODEL_NAME = 'zh-seq2seq-rnn-attention'
SRC_LANG, TARG_LANG = 'zh', 'en'

# Data processing: sentence-length limits and vocabulary sizes per side.
SRC_MAX_SENTENCE_LEN = TARG_MAX_SENTENCE_LEN = 10
SRC_VOCAB_SIZE = 30000
TARG_VOCAB_SIZE = 30000

# Model architecture.
RNN_CELL_TYPE = 'gru'
NUM_LAYERS = 2
ENC_HIDDEN_DIM = 256  # 512 was noted as an alternative setting
DEC_HIDDEN_DIM = ENC_HIDDEN_DIM * 2  # decoder width is tied to the encoder width
TEACHER_FORCING_RATIO = 1
CLIP_GRAD_MAX_NORM = 1
# NOTE(review): both dropout values were originally annotated "to actually
# implement"; they are passed to the encoder/decoder constructors, but whether
# the model applies them is not visible from this notebook -- confirm.
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2

# Training loop.
BATCH_SIZE = 32
NUM_EPOCHS = 200
LR = 5e-4
OPTIMIZER = 'Adam'
LAZY_TRAIN = True

In [3]:
# Snapshot of the configuration above, keyed by lowercase name, so it can be
# saved alongside the results later.
params = {
    # run identity
    'model_name': MODEL_NAME,
    'src_lang': SRC_LANG,
    'targ_lang': TARG_LANG,
    'rnn_cell_type': RNN_CELL_TYPE,
    # data processing
    'src_max_sentence_len': SRC_MAX_SENTENCE_LEN,
    'targ_max_sentence_len': TARG_MAX_SENTENCE_LEN,
    'src_vocab_size': SRC_VOCAB_SIZE,
    'targ_vocab_size': TARG_VOCAB_SIZE,
    # architecture
    'num_layers': NUM_LAYERS,
    'enc_hidden_dim': ENC_HIDDEN_DIM,
    'dec_hidden_dim': DEC_HIDDEN_DIM,
    'teacher_forcing_ratio': TEACHER_FORCING_RATIO,
    'clip_grad_max_norm': CLIP_GRAD_MAX_NORM,
    'enc_dropout': ENC_DROPOUT,
    'dec_dropout': DEC_DROPOUT,
    # training
    'batch_size': BATCH_SIZE,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LR,
    'optimizer': OPTIMIZER,
    'lazy_train': LAZY_TRAIN,
}

In [4]:
#vocab_test = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)

In [5]:
#vocab['zh']['id2token'][987]

In [6]:
#vocab['zh']['token2id']['森林']

In [7]:
#vocab['en']['token2id']['activity']

In [8]:
#vocab['en']['id2token'][987]

In [9]:
# # takes a long time to process, save to pickle for reimport in future 
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [10]:
# Reload the cached vocabulary from its pickle and build the datasets from it.
#
# NOTE(review): pickle.load can execute arbitrary code while deserialising.
# Only load a vocab file you produced yourself (presumably via the
# commented-out generate_vocab + pkl.dump cell in this notebook).
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# Use a context manager so the file handle is closed deterministically; the
# previous `pkl.load(open(...))` form left the handle open.
with open(vocab_filename, "rb") as vocab_file:
    vocab = pkl.load(vocab_file)

# Three datasets built from the same vocab: the default call, plus two calls
# with sample_limit set (BATCH_SIZE and 1000).
# presumably sample_limit caps the number of samples processed -- TODO confirm
# against data_processing.process_data.
data = process_data(SRC_LANG, TARG_LANG, vocab)
data_minibatch = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=BATCH_SIZE)
data_minitrain = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=1000)

In [11]:
# # takes a long time to process, save to pickle for reimport in future 
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab-fake.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [12]:
# vocab_filename = "{}-{}-vocab-fake.p".format(SRC_LANG, TARG_LANG)
# vocab = pkl.load(open(vocab_filename, "rb"))
# data = process_data(SRC_LANG, TARG_LANG, vocab)
# limited_data = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=BATCH_SIZE) 

In [13]:
# Build one set of dataloaders per dataset, all sharing the same sentence-length
# limits and batch size. The generator is consumed in order by the unpacking, so
# the calls run in the sequence: data, data_minibatch, data_minitrain.
loaders_full, loaders_minibatch, loaders_minitrain = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_minibatch, data_minitrain)
)

In [14]:
# Assemble the attention seq2seq model from the configuration constants.
#
# Earlier experiments in this cell tried EncoderRNN / EncoderSimpleRNN as the
# encoder and DecoderRNN / DecoderRNNV2 / DecoderSimpleRNN / DecoderAttnRNN /
# DecoderAttnRNN_Test as the decoder. The active configuration below uses the
# *_Test encoder together with DecoderRNN_Test with attn=True.


def _pretrained_embedding(lang):
    """Call get_pretrained_emb with the 'word2vec' and 'token2id' entries of vocab[lang]."""
    lang_vocab = vocab[lang]
    return get_pretrained_emb(lang_vocab['word2vec'], lang_vocab['token2id'])


encoder = EncoderSimpleRNN_Test(
    rnn_cell_type=RNN_CELL_TYPE,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    enc_dropout=ENC_DROPOUT,
    pretrained_word2vec=_pretrained_embedding(SRC_LANG),
)

decoder = DecoderRNN_Test(
    rnn_cell_type=RNN_CELL_TYPE,
    attn=True,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    targ_vocab_size=TARG_VOCAB_SIZE,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    dec_dropout=DEC_DROPOUT,
    pretrained_word2vec=_pretrained_embedding(TARG_LANG),
)

# The wrapper also receives the target-language token -> id mapping.
model = EncoderDecoderAttention(encoder, decoder, vocab[TARG_LANG]['token2id'])

In [15]:
# Run training/evaluation for the attention model. The call returns a pair that
# is unpacked into `model` (rebinding the name) and `results`.
# NOTE(review): the exact meaning of the boolean flags and of inspect_samples is
# defined in train_eval.train_and_eval_attn -- confirm there before changing them.
model, results = train_and_eval_attn(
    model=model,
    loaders_full=loaders_full,
    loaders_minibatch=loaders_minibatch,
    loaders_minitrain=loaders_minitrain,
    params=params,
    vocab=vocab,
    print_intermediate=True,
    save_checkpoint=True,
    save_to_log=True,
    lazy_eval=False,
    inspect_samples=1,
)

Epoch: 0.00, Train Loss: 10.12, Val Loss: 10.24, Train BLEU: 0.31, Val BLEU: 0.20
Sampling from training predictions...
Source: 泰坦 泰坦尼克 泰坦尼克号 坦尼 尼克 号 是 拿 了 不少
Reference: the truth of the matter is that the titanic
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[0.0976, 0.0948, 0.0933, 0.0972, 0.0977, 0.1007, 0.1033, 0.1045, 0.1055,
         0.1054],
        [0.0976, 0.0949, 0.0934, 0.0973, 0.0977, 0.1007, 0.1032, 0.1044, 0.1054,
         0.1054],
        [0.0977, 0.0949, 0.0934, 0.0973, 0.0977, 0.1007, 0.1032, 0.1044, 0.1054,
         0.1053],
        [0.0977, 0.0949, 0.0934, 0.0973, 0.0977, 0.1007, 0.1032, 0.1044, 0.1054,
         0.1053],
        [0.0976, 0.0949, 0.0934, 0.0973, 0.0977, 0.1007, 0.1032, 0.1044, 0.1054,
         0.1053],
        [0.0976, 0.0949, 0.0934, 0.0973, 0.0977, 0.1007, 0.1032, 0.1044, 0.1054,
         0.1053],
        [0.0976, 0.0949, 0.0934, 0.0973, 0.0977, 0.1007, 0.1032, 0.1044, 0.1054,
         0.1053],
        [0.0976, 0.0949, 0

Epoch: 4.00, Train Loss: 8.39, Val Loss: 9.58, Train BLEU: 0.28, Val BLEU: 0.19
Sampling from training predictions...
Source: 其实 实地 地球 上 最长 的 山脉 都 在 海洋
Reference: and in the oceans , there are the longest
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0612, 0.0898, 0.1079, 0.1261, 0.1271, 0.1294, 0.1204, 0.1093, 0.0829,
         0.0459],
        [0.0595, 0.0889, 0.1079, 0.1270, 0.1282, 0.1307, 0.1214, 0.1097, 0.0822,
         0.0445],
        [0.0590, 0.0887, 0.1080, 0.1274, 0.1287, 0.1312, 0.1217, 0.1098, 0.0818,
         0.0439],
        [0.0588, 0.0887, 0.1080, 0.1275, 0.1288, 0.1313, 0.1217, 0.1097, 0.0817,
         0.0437],
        [0.0589, 0.0887, 0.1081, 0.1275, 0.1288, 0.1313, 0.1217, 0.1097, 0.0816,
         0.0437],
        [0.0590, 0.0888, 0.1081, 0.1275, 0.1288, 0.1312, 0.1216, 0.1096, 0.0816,
         0.0438],
        [0.0591, 0.0888, 0.1081, 0.1274, 0.1287, 0.1312, 0.1216, 0.1096, 0.0816,
         0.0438],
        [0.0591, 0.0889, 0.1081, 0

Epoch: 8.00, Train Loss: 6.46, Val Loss: 8.90, Train BLEU: 0.28, Val BLEU: 0.19
Sampling from training predictions...
Source: 大多 大多数 多数 地震 和 火山 喷发 也 都 发生
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0243, 0.0450, 0.1048, 0.1369, 0.1536, 0.1503, 0.1453, 0.1303, 0.0855,
         0.0241],
        [0.0200, 0.0398, 0.1022, 0.1387, 0.1586, 0.1553, 0.1495, 0.1323, 0.0827,
         0.0208],
        [0.0196, 0.0393, 0.1019, 0.1390, 0.1593, 0.1560, 0.1500, 0.1323, 0.0821,
         0.0204],
        [0.0195, 0.0393, 0.1018, 0.1390, 0.1594, 0.1561, 0.1501, 0.1323, 0.0821,
         0.0203],
        [0.0195, 0.0392, 0.1017, 0.1390, 0.1595, 0.1562, 0.1501, 0.1323, 0.0821,
         0.0203],
        [0.0195, 0.0392, 0.1017, 0.1390, 0.1595, 0.1562, 0.1501, 0.1323, 0.0821,
         0.0204],
        [0.0195, 0.0393, 0.1017, 0.1390, 0.1594, 0.1562, 0.1501, 0.1323, 0.0821,
         0.0204],
        [0.0195, 0.0393, 

Epoch: 12.00, Train Loss: 5.13, Val Loss: 8.64, Train BLEU: 0.28, Val BLEU: 0.19
Sampling from training predictions...
Source: 原来 它 是 海洋 洋中 最长 的 生物 <EOS> <PAD>
Reference: this turns out to be the longest creature in
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[4.1061e-02, 1.2376e-01, 1.5786e-01, 1.6129e-01, 1.3720e-01, 1.5914e-01,
         1.4181e-01, 7.1598e-02, 6.2811e-03, 3.5813e-10],
        [3.4871e-02, 1.2066e-01, 1.6063e-01, 1.6532e-01, 1.3904e-01, 1.6356e-01,
         1.4323e-01, 6.7415e-02, 5.2709e-03, 2.3881e-09],
        [3.4640e-02, 1.2031e-01, 1.6051e-01, 1.6536e-01, 1.3926e-01, 1.6376e-01,
         1.4350e-01, 6.7430e-02, 5.2320e-03, 2.7607e-09],
        [3.4593e-02, 1.2022e-01, 1.6038e-01, 1.6527e-01, 1.3935e-01, 1.6379e-01,
         1.4362e-01, 6.7544e-02, 5.2317e-03, 2.7052e-09],
        [3.4605e-02, 1.2021e-01, 1.6032e-01, 1.6521e-01, 1.3937e-01, 1.6377e-01,
         1.4365e-01, 6.7621e-02, 5.2455e-03, 2.6572e-09],
        [3.4602e-02, 

Epoch: 16.00, Train Loss: 4.48, Val Loss: 8.96, Train BLEU: 0.36, Val BLEU: 0.28
Sampling from training predictions...
Source: 我们 将 用 一些 影片 来讲 讲述 一些 深海 海里
Reference: and we &apos;re going to tell you some stories
Model: <SOS> it the the the the the , , ,
Attention Weights: tensor([[0.0360, 0.0929, 0.1135, 0.1218, 0.1264, 0.1285, 0.1283, 0.1235, 0.0996,
         0.0294],
        [0.0299, 0.0902, 0.1150, 0.1250, 0.1304, 0.1326, 0.1318, 0.1250, 0.0959,
         0.0244],
        [0.0300, 0.0901, 0.1148, 0.1249, 0.1302, 0.1325, 0.1317, 0.1250, 0.0961,
         0.0247],
        [0.0297, 0.0898, 0.1147, 0.1249, 0.1303, 0.1326, 0.1318, 0.1252, 0.0963,
         0.0248],
        [0.0295, 0.0896, 0.1146, 0.1249, 0.1303, 0.1327, 0.1319, 0.1253, 0.0964,
         0.0248],
        [0.0295, 0.0896, 0.1146, 0.1249, 0.1303, 0.1327, 0.1319, 0.1253, 0.0965,
         0.0248],
        [0.0295, 0.0896, 0.1146, 0.1249, 0.1303, 0.1327, 0.1319, 0.1253, 0.0965,
         0.0248],
        [0.0294, 0.0895, 0.1146, 

Epoch: 20.00, Train Loss: 4.24, Val Loss: 9.57, Train BLEU: 0.36, Val BLEU: 0.28
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> it it the the the the the , ,
Attention Weights: tensor([[1.4442e-01, 2.8202e-01, 3.0928e-01, 2.3631e-01, 2.7964e-02, 8.2845e-11,
         8.2845e-11, 8.2845e-11, 8.2845e-11, 8.2845e-11],
        [1.2958e-01, 2.8921e-01, 3.2282e-01, 2.3477e-01, 2.3620e-02, 1.6870e-09,
         1.6870e-09, 1.6870e-09, 1.6870e-09, 1.6870e-09],
        [1.2939e-01, 2.8895e-01, 3.2274e-01, 2.3503e-01, 2.3894e-02, 2.5539e-09,
         2.5539e-09, 2.5539e-09, 2.5539e-09, 2.5539e-09],
        [1.2794e-01, 2.8901e-01, 3.2375e-01, 2.3580e-01, 2.3497e-02, 2.7803e-09,
         2.7803e-09, 2.7803e-09, 2.7803e-09, 2.7803e-09],
        [1.2709e-01, 2.8906e-01, 3.2437e-01, 2.3621e-01, 2.3274e-02, 2.7796e-09,
         2.7796e-09, 2.7796e-09, 2.7796e-09, 2.7796e-09],
        [1.2

Epoch: 23.00, Train Loss: 4.16, Val Loss: 9.99, Train BLEU: 7.29, Val BLEU: 0.96
Sampling from training predictions...
Source: 原来 它 是 海洋 洋中 最长 的 生物 <EOS> <PAD>
Reference: this turns out to be the longest creature in
Model: <SOS> it &apos;s the the the the the the the
Attention Weights: tensor([[5.6943e-02, 1.1686e-01, 1.3540e-01, 1.4259e-01, 1.3888e-01, 1.4666e-01,
         1.4131e-01, 1.0691e-01, 1.4443e-02, 2.1127e-11],
        [4.8269e-02, 1.1464e-01, 1.3754e-01, 1.4613e-01, 1.4110e-01, 1.5136e-01,
         1.4485e-01, 1.0408e-01, 1.2043e-02, 4.7428e-10],
        [4.8242e-02, 1.1461e-01, 1.3759e-01, 1.4619e-01, 1.4093e-01, 1.5135e-01,
         1.4483e-01, 1.0403e-01, 1.2221e-02, 8.0587e-10],
        [4.7219e-02, 1.1421e-01, 1.3770e-01, 1.4651e-01, 1.4112e-01, 1.5185e-01,
         1.4535e-01, 1.0411e-01, 1.1914e-02, 9.4395e-10],
        [4.6614e-02, 1.1394e-01, 1.3775e-01, 1.4671e-01, 1.4127e-01, 1.5219e-01,
         1.4568e-01, 1.0416e-01, 1.1699e-02, 9.3244e-10],
        [4.6419e-0

Epoch: 27.00, Train Loss: 4.08, Val Loss: 10.42, Train BLEU: 7.29, Val BLEU: 0.96
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> it &apos;s the the the the the the the
Attention Weights: tensor([[0.0508, 0.0947, 0.1085, 0.1161, 0.1194, 0.1211, 0.1204, 0.1172, 0.1044,
         0.0475],
        [0.0432, 0.0920, 0.1088, 0.1183, 0.1224, 0.1245, 0.1236, 0.1196, 0.1043,
         0.0431],
        [0.0433, 0.0921, 0.1089, 0.1185, 0.1225, 0.1246, 0.1236, 0.1195, 0.1040,
         0.0432],
        [0.0424, 0.0918, 0.1089, 0.1187, 0.1228, 0.1250, 0.1239, 0.1197, 0.1039,
         0.0429],
        [0.0419, 0.0916, 0.1089, 0.1188, 0.1229, 0.1252, 0.1241, 0.1198, 0.1040,
         0.0428],
        [0.0417, 0.0914, 0.1089, 0.1190, 0.1231, 0.1254, 0.1242, 0.1199, 0.1039,
         0.0425],
        [0.0415, 0.0914, 0.1089, 0.1190, 0.1232, 0.1255, 0.1243, 0.1199, 0.1038,
         0.0423],
        [0.0415, 0.0

Epoch: 31.00, Train Loss: 4.00, Val Loss: 10.76, Train BLEU: 5.94, Val BLEU: 0.22
Sampling from training predictions...
Source: 和 我们 合作 的 人们 帮 我们 找到 了 新
Reference: people that have partnered with us have given us
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[0.0619, 0.0977, 0.1094, 0.1155, 0.1177, 0.1190, 0.1173, 0.1135, 0.1011,
         0.0469],
        [0.0555, 0.0962, 0.1099, 0.1171, 0.1199, 0.1214, 0.1196, 0.1152, 0.1012,
         0.0441],
        [0.0562, 0.0964, 0.1099, 0.1170, 0.1197, 0.1211, 0.1193, 0.1148, 0.1009,
         0.0447],
        [0.0557, 0.0964, 0.1100, 0.1172, 0.1199, 0.1213, 0.1194, 0.1148, 0.1008,
         0.0445],
        [0.0549, 0.0962, 0.1101, 0.1174, 0.1201, 0.1216, 0.1196, 0.1150, 0.1008,
         0.0442],
        [0.0545, 0.0960, 0.1101, 0.1176, 0.1204, 0.1219, 0.1198, 0.1151, 0.1007,
         0.0438],
        [0.0543, 0.0959, 0.1102, 0.1177, 0.1205, 0.1220, 0.1199, 0.1151, 0.1006,
         0.0436],
        [0.0542, 0.0959, 0.

Epoch: 35.00, Train Loss: 3.93, Val Loss: 11.03, Train BLEU: 5.80, Val BLEU: 0.21
Sampling from training predictions...
Source: 还有 这些 摇晃 着 旋转 转着 的 触角 <EOS> <PAD>
Reference: it &apos;s got tentacles dangling , swirling around like
Model: <SOS> it &apos;s the the the the the the the
Attention Weights: tensor([[7.7468e-02, 1.2749e-01, 1.4003e-01, 1.4590e-01, 1.4334e-01, 1.2220e-01,
         1.3447e-01, 9.9775e-02, 9.3306e-03, 7.1641e-12],
        [7.1214e-02, 1.2622e-01, 1.4084e-01, 1.4765e-01, 1.4519e-01, 1.2284e-01,
         1.3616e-01, 9.9987e-02, 9.8934e-03, 3.2047e-10],
        [7.2643e-02, 1.2630e-01, 1.4031e-01, 1.4684e-01, 1.4437e-01, 1.2247e-01,
         1.3533e-01, 1.0050e-01, 1.1249e-02, 9.3389e-10],
        [7.2007e-02, 1.2644e-01, 1.4067e-01, 1.4724e-01, 1.4462e-01, 1.2224e-01,
         1.3555e-01, 1.0022e-01, 1.1017e-02, 1.1731e-09],
        [7.1100e-02, 1.2640e-01, 1.4108e-01, 1.4782e-01, 1.4514e-01, 1.2212e-01,
         1.3591e-01, 9.9901e-02, 1.0535e-02, 1.0623e-09],
    

Epoch: 39.00, Train Loss: 3.86, Val Loss: 11.22, Train BLEU: 7.29, Val BLEU: 0.97
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> it &apos;s the the the the the the the
Attention Weights: tensor([[0.0588, 0.0701, 0.1155, 0.1232, 0.1268, 0.1266, 0.1234, 0.1065, 0.1036,
         0.0455],
        [0.0541, 0.0685, 0.1155, 0.1242, 0.1284, 0.1283, 0.1247, 0.1074, 0.1041,
         0.0447],
        [0.0564, 0.0706, 0.1147, 0.1228, 0.1267, 0.1267, 0.1234, 0.1073, 0.1041,
         0.0472],
        [0.0566, 0.0710, 0.1149, 0.1229, 0.1266, 0.1265, 0.1231, 0.1071, 0.1039,
         0.0474],
        [0.0562, 0.0707, 0.1150, 0.1231, 0.1269, 0.1267, 0.1233, 0.1071, 0.1039,
         0.0471],
        [0.0556, 0.0702, 0.1152, 0.1235, 0.1273, 0.1271, 0.1236, 0.1070, 0.1038,
         0.0467],
        [0.0553, 0.0698, 0.1154, 0.1238, 0.1276, 0.1274, 0.1238, 0.1069, 0.1037,
         0.0464],
        [0.0551, 0.0697

Epoch: 43.00, Train Loss: 3.78, Val Loss: 11.35, Train BLEU: 7.45, Val BLEU: 1.00
Sampling from training predictions...
Source: 还有 前面 的 这个 是 推进 引擎 它 一会 一会儿
Reference: and it &apos;s got these jet thrusters up in
Model: <SOS> it &apos;s the the the , , , ,
Attention Weights: tensor([[0.0791, 0.1231, 0.1334, 0.1369, 0.1394, 0.1376, 0.1323, 0.1081, 0.0094,
         0.0007],
        [0.0738, 0.1221, 0.1344, 0.1386, 0.1414, 0.1394, 0.1333, 0.1066, 0.0094,
         0.0010],
        [0.0765, 0.1220, 0.1333, 0.1371, 0.1397, 0.1379, 0.1323, 0.1077, 0.0119,
         0.0016],
        [0.0773, 0.1223, 0.1332, 0.1369, 0.1393, 0.1375, 0.1319, 0.1076, 0.0123,
         0.0017],
        [0.0772, 0.1223, 0.1332, 0.1368, 0.1393, 0.1374, 0.1319, 0.1078, 0.0124,
         0.0017],
        [0.0770, 0.1222, 0.1333, 0.1369, 0.1394, 0.1376, 0.1320, 0.1077, 0.0122,
         0.0017],
        [0.0766, 0.1223, 0.1335, 0.1372, 0.1397, 0.1378, 0.1321, 0.1074, 0.0118,
         0.0016],
        [0.0763, 0.1223, 0.1336,

Epoch: 47.00, Train Loss: 3.70, Val Loss: 11.46, Train BLEU: 7.05, Val BLEU: 1.03
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> most &apos;s the the the the the , ,
Attention Weights: tensor([[0.0657, 0.1021, 0.1088, 0.1152, 0.1161, 0.1172, 0.1150, 0.1113, 0.0987,
         0.0500],
        [0.0603, 0.1002, 0.1087, 0.1164, 0.1177, 0.1191, 0.1168, 0.1128, 0.0989,
         0.0490],
        [0.0631, 0.1002, 0.1079, 0.1149, 0.1162, 0.1175, 0.1154, 0.1120, 0.0996,
         0.0532],
        [0.0646, 0.1005, 0.1077, 0.1144, 0.1155, 0.1167, 0.1148, 0.1115, 0.0997,
         0.0546],
        [0.0653, 0.1006, 0.1075, 0.1140, 0.1151, 0.1163, 0.1144, 0.1114, 0.0999,
         0.0555],
        [0.0655, 0.1007, 0.1076, 0.1140, 0.1150, 0.1162, 0.1143, 0.1112, 0.0998,
         0.0556],
        [0.0654, 0.1008, 0.1077, 0.1141, 0.1151, 0.1163, 0.1144, 0.1112, 0.0997,
         0.0553],
        [0.0653, 0.100

Epoch: 51.00, Train Loss: 3.62, Val Loss: 11.53, Train BLEU: 6.38, Val BLEU: 0.27
Sampling from training predictions...
Source: 但 我 想 告诉 你 的 是 当 你 站
Reference: but when you &apos;re standing at the beach ,
Model: <SOS> it of the the the the the , ,
Attention Weights: tensor([[0.0718, 0.1042, 0.1110, 0.1132, 0.1154, 0.1149, 0.1130, 0.1091, 0.0978,
         0.0496],
        [0.0662, 0.1031, 0.1118, 0.1147, 0.1174, 0.1168, 0.1148, 0.1104, 0.0976,
         0.0472],
        [0.0687, 0.1026, 0.1105, 0.1132, 0.1157, 0.1153, 0.1137, 0.1099, 0.0986,
         0.0519],
        [0.0705, 0.1027, 0.1099, 0.1123, 0.1147, 0.1144, 0.1129, 0.1095, 0.0990,
         0.0542],
        [0.0713, 0.1026, 0.1095, 0.1119, 0.1142, 0.1139, 0.1126, 0.1093, 0.0993,
         0.0554],
        [0.0720, 0.1026, 0.1094, 0.1117, 0.1139, 0.1137, 0.1123, 0.1091, 0.0993,
         0.0560],
        [0.0721, 0.1027, 0.1094, 0.1117, 0.1139, 0.1136, 0.1123, 0.1091, 0.0992,
         0.0560],
        [0.0722, 0.1027, 0.1094, 0.1117

Epoch: 55.00, Train Loss: 3.53, Val Loss: 11.57, Train BLEU: 7.34, Val BLEU: 0.30
Sampling from training predictions...
Source: 这 是 一种 种群 栖 动物 <EOS> <PAD> <PAD> <PAD>
Reference: it &apos;s a colonial animal . <EOS> <PAD> <PAD>
Model: <SOS> it &apos;s the the . . . . <EOS>
Attention Weights: tensor([[1.3145e-01, 1.8854e-01, 1.9348e-01, 1.9023e-01, 1.7852e-01, 1.1494e-01,
         2.8407e-03, 5.1834e-12, 5.1834e-12, 5.1834e-12],
        [1.2183e-01, 1.8861e-01, 1.9666e-01, 1.9421e-01, 1.8175e-01, 1.1350e-01,
         3.4495e-03, 2.5680e-10, 2.5680e-10, 2.5680e-10],
        [1.2417e-01, 1.8437e-01, 1.9240e-01, 1.9116e-01, 1.8115e-01, 1.2072e-01,
         6.0194e-03, 1.6961e-09, 1.6961e-09, 1.6961e-09],
        [1.2644e-01, 1.8257e-01, 1.9007e-01, 1.8923e-01, 1.8036e-01, 1.2378e-01,
         7.5547e-03, 3.1122e-09, 3.1122e-09, 3.1122e-09],
        [1.2774e-01, 1.8137e-01, 1.8865e-01, 1.8810e-01, 1.8001e-01, 1.2559e-01,
         8.5450e-03, 3.1807e-09, 3.1807e-09, 3.1807e-09],
        [1.28

Epoch: 59.00, Train Loss: 3.44, Val Loss: 11.64, Train BLEU: 7.44, Val BLEU: 0.31
Sampling from training predictions...
Source: 海洋 里 生物 的 多样 多样性 和 密度 要 比
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> we of the the the the the , ,
Attention Weights: tensor([[0.0652, 0.1113, 0.1173, 0.1212, 0.1195, 0.0929, 0.1172, 0.1097, 0.0965,
         0.0492],
        [0.0563, 0.1109, 0.1194, 0.1250, 0.1225, 0.0916, 0.1207, 0.1122, 0.0971,
         0.0445],
        [0.0591, 0.1097, 0.1174, 0.1227, 0.1203, 0.0916, 0.1192, 0.1121, 0.0987,
         0.0492],
        [0.0618, 0.1091, 0.1159, 0.1208, 0.1186, 0.0922, 0.1179, 0.1116, 0.0995,
         0.0526],
        [0.0629, 0.1086, 0.1152, 0.1199, 0.1178, 0.0927, 0.1173, 0.1114, 0.0999,
         0.0542],
        [0.0640, 0.1084, 0.1148, 0.1193, 0.1174, 0.0933, 0.1167, 0.1110, 0.0999,
         0.0553],
        [0.0646, 0.1084, 0.1147, 0.1190, 0.1172, 0.0936, 0.1164, 0.1107, 0.0997,
         0.0557],
        [0.0649, 0.1084, 0.1146, 

Epoch: 63.00, Train Loss: 3.35, Val Loss: 11.68, Train BLEU: 8.81, Val BLEU: 0.31
Sampling from training predictions...
Source: 是 我 最 喜欢 的 因为 它 哪 都 能动
Reference: it &apos;s one of my favorites , because it
Model: <SOS> it &apos;s the the the , , , ,
Attention Weights: tensor([[0.0764, 0.1183, 0.1255, 0.1271, 0.1263, 0.1226, 0.1176, 0.1078, 0.0769,
         0.0015],
        [0.0665, 0.1172, 0.1272, 0.1296, 0.1296, 0.1255, 0.1203, 0.1089, 0.0738,
         0.0014],
        [0.0689, 0.1152, 0.1248, 0.1273, 0.1277, 0.1244, 0.1203, 0.1103, 0.0786,
         0.0027],
        [0.0716, 0.1146, 0.1234, 0.1257, 0.1262, 0.1232, 0.1196, 0.1106, 0.0813,
         0.0038],
        [0.0729, 0.1141, 0.1225, 0.1248, 0.1253, 0.1226, 0.1194, 0.1110, 0.0829,
         0.0044],
        [0.0743, 0.1139, 0.1220, 0.1242, 0.1247, 0.1222, 0.1191, 0.1110, 0.0838,
         0.0049],
        [0.0750, 0.1139, 0.1219, 0.1241, 0.1245, 0.1220, 0.1188, 0.1108, 0.0840,
         0.0050],
        [0.0752, 0.1139, 0.1218, 0.124

Epoch: 67.00, Train Loss: 3.26, Val Loss: 11.73, Train BLEU: 8.79, Val BLEU: 0.29
Sampling from training predictions...
Source: 看到 这些 在 动 的 东西 了 吗 <EOS> <PAD>
Reference: but see all those different working things ? <EOS>
Model: <SOS> it &apos;s the the the . . <EOS> <EOS>
Attention Weights: tensor([[8.3363e-02, 1.4000e-01, 1.4748e-01, 1.4725e-01, 1.4262e-01, 1.3367e-01,
         1.2168e-01, 8.2907e-02, 1.0258e-03, 3.6252e-12],
        [7.1982e-02, 1.3649e-01, 1.4853e-01, 1.4998e-01, 1.4588e-01, 1.3694e-01,
         1.2482e-01, 8.3876e-02, 1.4969e-03, 1.8044e-10],
        [7.5137e-02, 1.3227e-01, 1.4383e-01, 1.4595e-01, 1.4325e-01, 1.3641e-01,
         1.2709e-01, 9.2551e-02, 3.5151e-03, 1.5678e-09],
        [7.8915e-02, 1.3073e-01, 1.4121e-01, 1.4328e-01, 1.4110e-01, 1.3527e-01,
         1.2737e-01, 9.6722e-02, 5.4063e-03, 3.7843e-09],
        [8.0520e-02, 1.2992e-01, 1.3995e-01, 1.4206e-01, 1.4016e-01, 1.3484e-01,
         1.2765e-01, 9.8565e-02, 6.3296e-03, 4.4394e-09],
        [8.19

Epoch: 70.00, Train Loss: 3.19, Val Loss: 11.77, Train BLEU: 8.44, Val BLEU: 0.27
Sampling from training predictions...
Source: 大卫 <UNK> 这位 是 比尔 <UNK> 我 是 大卫 <UNK>
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> this is bill is . . . . .
Attention Weights: tensor([[6.0788e-03, 2.1287e-06, 7.3506e-02, 1.0351e-01, 3.8702e-04, 5.7230e-06,
         4.3290e-01, 3.6069e-01, 2.2928e-02, 3.6855e-07],
        [8.1365e-03, 6.8467e-06, 7.1213e-02, 1.2527e-01, 5.9942e-04, 1.5246e-05,
         3.8371e-01, 3.8449e-01, 2.6561e-02, 1.4026e-06],
        [1.6044e-02, 4.7049e-05, 8.6561e-02, 1.5259e-01, 1.9617e-03, 9.1352e-05,
         3.3375e-01, 3.6717e-01, 4.1772e-02, 1.1359e-05],
        [2.1368e-02, 9.7239e-05, 9.5735e-02, 1.6450e-01, 3.1928e-03, 1.8123e-04,
         3.1401e-01, 3.4975e-01, 5.1146e-02, 2.4808e-05],
        [2.3775e-02, 1.2820e-04, 9.9194e-02, 1.6796e-01, 3.8778e-03, 2.3669e-04,
         3.0720e-01, 3.4223e-01, 5.5361e-02, 3.4337e-05],
        [2.4233e-02, 1.3962e-0

Epoch: 73.00, Train Loss: 3.11, Val Loss: 11.80, Train BLEU: 8.81, Val BLEU: 0.27
Sampling from training predictions...
Source: 原来 它 是 海洋 洋中 最长 的 生物 <EOS> <PAD>
Reference: this turns out to be the longest creature in
Model: <SOS> it &apos;s the the to . . <EOS> <EOS>
Attention Weights: tensor([[9.2496e-02, 1.6656e-01, 1.7680e-01, 1.5852e-01, 2.3131e-02, 1.5278e-01,
         1.4503e-01, 8.3900e-02, 7.8395e-04, 4.6303e-12],
        [8.1922e-02, 1.6025e-01, 1.7720e-01, 1.6429e-01, 2.7536e-02, 1.4988e-01,
         1.4742e-01, 8.9669e-02, 1.8382e-03, 3.4126e-10],
        [8.6825e-02, 1.5117e-01, 1.6579e-01, 1.5903e-01, 4.0532e-02, 1.4366e-01,
         1.4524e-01, 1.0239e-01, 5.3608e-03, 3.9953e-09],
        [9.0936e-02, 1.4728e-01, 1.5988e-01, 1.5460e-01, 4.8644e-02, 1.4035e-01,
         1.4276e-01, 1.0702e-01, 8.5285e-03, 1.1477e-08],
        [9.1604e-02, 1.4586e-01, 1.5804e-01, 1.5344e-01, 5.1023e-02, 1.3946e-01,
         1.4236e-01, 1.0863e-01, 9.5751e-03, 1.4891e-08],
        [9.2392e-0

Epoch: 77.00, Train Loss: 3.02, Val Loss: 11.83, Train BLEU: 9.81, Val BLEU: 0.27
Sampling from training predictions...
Source: 和 我们 合作 的 人们 帮 我们 找到 了 新
Reference: people that have partnered with us have given us
Model: <SOS> we of the the the the , the the
Attention Weights: tensor([[0.0534, 0.1010, 0.1154, 0.1218, 0.1230, 0.1220, 0.1171, 0.1080, 0.0916,
         0.0466],
        [0.0482, 0.0954, 0.1129, 0.1208, 0.1228, 0.1227, 0.1184, 0.1103, 0.0965,
         0.0521],
        [0.0571, 0.0957, 0.1092, 0.1151, 0.1169, 0.1170, 0.1145, 0.1092, 0.0999,
         0.0654],
        [0.0611, 0.0960, 0.1077, 0.1128, 0.1144, 0.1146, 0.1127, 0.1085, 0.1011,
         0.0710],
        [0.0617, 0.0959, 0.1072, 0.1122, 0.1138, 0.1141, 0.1124, 0.1085, 0.1017,
         0.0726],
        [0.0619, 0.0958, 0.1071, 0.1121, 0.1137, 0.1139, 0.1123, 0.1085, 0.1017,
         0.0730],
        [0.0622, 0.0958, 0.1070, 0.1120, 0.1136, 0.1138, 0.1122, 0.1084, 0.1017,
         0.0733],
        [0.0623, 0.0957, 0.107

Epoch: 81.00, Train Loss: 2.91, Val Loss: 11.88, Train BLEU: 11.32, Val BLEU: 0.28
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> we of the the the the the the the
Attention Weights: tensor([[0.0318, 0.0162, 0.1327, 0.1443, 0.1473, 0.1460, 0.1411, 0.0869, 0.1054,
         0.0482],
        [0.0389, 0.0268, 0.1236, 0.1365, 0.1407, 0.1403, 0.1371, 0.0900, 0.1069,
         0.0592],
        [0.0538, 0.0433, 0.1159, 0.1253, 0.1283, 0.1285, 0.1270, 0.0955, 0.1077,
         0.0748],
        [0.0592, 0.0498, 0.1130, 0.1214, 0.1241, 0.1244, 0.1234, 0.0967, 0.1077,
         0.0803],
        [0.0595, 0.0506, 0.1125, 0.1208, 0.1236, 0.1239, 0.1230, 0.0969, 0.1080,
         0.0813],
        [0.0592, 0.0501, 0.1128, 0.1210, 0.1238, 0.1242, 0.1232, 0.0969, 0.1078,
         0.0809],
        [0.0590, 0.0497, 0.1129, 0.1212, 0.1240, 0.1244, 0.1234, 0.0969, 0.1079,
         0.0806],
        [0.0588, 0.0492, 0.

Epoch: 84.00, Train Loss: 2.84, Val Loss: 11.91, Train BLEU: 12.25, Val BLEU: 0.28
Sampling from training predictions...
Source: 深海 海中 的 生命 大卫 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: life in the deep oceans <EOS> <PAD> <PAD> <PAD>
Model: <SOS> life in the deep . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.9398e-01, 3.4498e-01, 3.1917e-01, 1.3651e-01, 5.3489e-03, 4.1686e-08,
         1.0803e-07, 1.1435e-11, 1.1435e-11, 1.1435e-11],
        [1.6655e-01, 3.1076e-01, 3.1999e-01, 1.8402e-01, 1.8674e-02, 2.1631e-06,
         4.9692e-06, 1.0425e-09, 1.0425e-09, 1.0425e-09],
        [1.7199e-01, 2.6750e-01, 2.8476e-01, 2.1948e-01, 5.6120e-02, 5.3917e-05,
         1.0934e-04, 2.2813e-08, 2.2813e-08, 2.2813e-08],
        [1.7401e-01, 2.5313e-01, 2.7100e-01, 2.2664e-01, 7.4763e-02, 1.5989e-04,
         2.9693e-04, 8.5495e-08, 8.5495e-08, 8.5495e-08],
        [1.7283e-01, 2.5131e-01, 2.7076e-01, 2.2847e-01, 7.6110e-02, 1.8106e-04,
         3.2631e-04, 1.0485e-07, 1.0485e-07, 1.0485e-07],
  

Epoch: 88.00, Train Loss: 2.75, Val Loss: 11.97, Train BLEU: 13.20, Val BLEU: 0.28
Sampling from training predictions...
Source: 底下 这些 都 是 <UNK> 它们 上上 上上下下 上下 下下
Reference: it &apos;s got these fishing <UNK> on the bottom
Model: <SOS> it &apos;s got these these , , , ,
Attention Weights: tensor([[9.0825e-02, 1.5825e-01, 8.9971e-02, 1.8387e-02, 1.7608e-06, 9.0157e-02,
         1.0926e-01, 2.8153e-01, 1.6090e-01, 7.2269e-04],
        [1.0253e-01, 1.6873e-01, 1.3092e-01, 5.5786e-02, 1.0420e-04, 8.6725e-02,
         1.0695e-01, 2.0159e-01, 1.4218e-01, 4.4767e-03],
        [1.0978e-01, 1.5554e-01, 1.4471e-01, 9.4375e-02, 1.2315e-03, 9.0436e-02,
         1.0711e-01, 1.5178e-01, 1.2884e-01, 1.6196e-02],
        [1.1027e-01, 1.4996e-01, 1.4779e-01, 1.0989e-01, 2.7506e-03, 8.9822e-02,
         1.0492e-01, 1.3753e-01, 1.2413e-01, 2.2935e-02],
        [1.1040e-01, 1.5188e-01, 1.5187e-01, 1.1449e-01, 2.8981e-03, 8.6585e-02,
         1.0195e-01, 1.3419e-01, 1.2319e-01, 2.2544e-02],
        [1.1052e

Epoch: 91.00, Train Loss: 2.67, Val Loss: 12.01, Train BLEU: 14.11, Val BLEU: 0.29
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> we of the the the the the the the
Attention Weights: tensor([[0.0222, 0.0076, 0.1330, 0.1557, 0.1589, 0.1553, 0.1483, 0.0634, 0.1049,
         0.0508],
        [0.0420, 0.0274, 0.1196, 0.1339, 0.1375, 0.1373, 0.1355, 0.0840, 0.1090,
         0.0738],
        [0.0628, 0.0511, 0.1106, 0.1184, 0.1207, 0.1214, 0.1217, 0.0953, 0.1083,
         0.0898],
        [0.0661, 0.0557, 0.1085, 0.1157, 0.1180, 0.1189, 0.1195, 0.0963, 0.1080,
         0.0932],
        [0.0652, 0.0546, 0.1084, 0.1160, 0.1185, 0.1194, 0.1201, 0.0956, 0.1085,
         0.0937],
        [0.0633, 0.0521, 0.1089, 0.1172, 0.1198, 0.1208, 0.1214, 0.0947, 0.1089,
         0.0929],
        [0.0623, 0.0510, 0.1092, 0.1177, 0.1204, 0.1214, 0.1221, 0.0945, 0.1090,
         0.0924],
        [0.0615, 0.0496, 0.

Epoch: 95.00, Train Loss: 2.58, Val Loss: 12.07, Train BLEU: 15.73, Val BLEU: 0.31
Sampling from training predictions...
Source: 大卫 <UNK> 这位 是 比尔 <UNK> 我 是 大卫 <UNK>
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> this is bill lange . . &apos;m &apos;m dave
Attention Weights: tensor([[1.1864e-02, 4.7586e-06, 4.4490e-02, 9.3344e-02, 2.1182e-04, 4.5140e-06,
         3.6887e-01, 4.3567e-01, 4.5546e-02, 1.5759e-06],
        [1.8938e-02, 1.7597e-05, 5.8744e-02, 1.1466e-01, 5.2373e-04, 1.7810e-05,
         3.2789e-01, 4.2153e-01, 5.7676e-02, 5.8799e-06],
        [4.5027e-02, 2.6152e-04, 8.8279e-02, 1.4389e-01, 3.8029e-03, 2.6680e-04,
         2.7276e-01, 3.4819e-01, 9.7419e-02, 9.8552e-05],
        [6.3195e-02, 6.5208e-04, 1.0105e-01, 1.5862e-01, 7.3070e-03, 6.5609e-04,
         2.4092e-01, 3.1033e-01, 1.1701e-01, 2.4980e-04],
        [7.0116e-02, 8.5187e-04, 1.0260e-01, 1.6561e-01, 8.8675e-03, 8.4337e-04,
         2.2479e-01, 3.0079e-01, 1.2519e-01, 3.3582e-04],
        [7.

Epoch: 99.00, Train Loss: 2.49, Val Loss: 12.14, Train BLEU: 20.07, Val BLEU: 0.30
Sampling from training predictions...
Source: 还有 前面 的 这个 是 推进 引擎 它 一会 一会儿
Reference: and it &apos;s got these jet thrusters up in
Model: <SOS> and it &apos;s got these , , , ,
Attention Weights: tensor([[6.5814e-02, 1.3908e-01, 1.5639e-01, 1.6082e-01, 1.5119e-01, 1.4081e-01,
         1.1921e-01, 6.6476e-02, 2.0146e-04, 9.2932e-06],
        [6.7408e-02, 1.2536e-01, 1.4255e-01, 1.5036e-01, 1.4624e-01, 1.4153e-01,
         1.3057e-01, 9.2858e-02, 2.7737e-03, 3.4894e-04],
        [8.1673e-02, 1.1816e-01, 1.2922e-01, 1.3543e-01, 1.3455e-01, 1.3391e-01,
         1.3159e-01, 1.1545e-01, 1.6068e-02, 3.9615e-03],
        [8.8462e-02, 1.1401e-01, 1.2203e-01, 1.2707e-01, 1.2709e-01, 1.2775e-01,
         1.2834e-01, 1.2262e-01, 3.1840e-02, 1.0784e-02],
        [8.9529e-02, 1.1230e-01, 1.1967e-01, 1.2445e-01, 1.2472e-01, 1.2571e-01,
         1.2714e-01, 1.2445e-01, 3.7832e-02, 1.4187e-02],
        [8.7030e-02, 1.1173

Epoch: 102.00, Train Loss: 2.42, Val Loss: 12.17, Train BLEU: 23.64, Val BLEU: 0.30
Sampling from training predictions...
Source: 其实 它们 都 是 由 单独 的 动物 结合 合在
Reference: these are all individual animals banding together to make
Model: <SOS> these are all individual animals , , , ,
Attention Weights: tensor([[5.3608e-02, 1.2366e-01, 1.4240e-01, 1.4878e-01, 1.4877e-01, 1.4385e-01,
         1.2533e-01, 9.8124e-02, 1.5477e-02, 3.8611e-07],
        [5.9287e-02, 1.0995e-01, 1.2513e-01, 1.3242e-01, 1.3496e-01, 1.3537e-01,
         1.3089e-01, 1.2302e-01, 4.8896e-02, 7.3472e-05],
        [7.4174e-02, 1.0526e-01, 1.1399e-01, 1.1882e-01, 1.2131e-01, 1.2376e-01,
         1.2483e-01, 1.2776e-01, 8.8553e-02, 1.5420e-03],
        [7.8630e-02, 1.0342e-01, 1.1055e-01, 1.1475e-01, 1.1711e-01, 1.1993e-01,
         1.2213e-01, 1.2792e-01, 1.0154e-01, 4.0159e-03],
        [7.6370e-02, 1.0263e-01, 1.1033e-01, 1.1481e-01, 1.1731e-01, 1.2036e-01,
         1.2319e-01, 1.3009e-01, 1.0169e-01, 3.2233e-03],
       

Epoch: 105.00, Train Loss: 2.35, Val Loss: 12.21, Train BLEU: 25.83, Val BLEU: 0.29
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> and of the the , , the the the
Attention Weights: tensor([[0.0430, 0.0944, 0.1099, 0.1241, 0.1265, 0.1244, 0.1216, 0.1118, 0.0960,
         0.0484],
        [0.0562, 0.0921, 0.1071, 0.1118, 0.1158, 0.1138, 0.1141, 0.1099, 0.1039,
         0.0752],
        [0.0760, 0.0951, 0.1044, 0.1028, 0.1060, 0.1043, 0.1058, 0.1047, 0.1046,
         0.0963],
        [0.0766, 0.0944, 0.1040, 0.1016, 0.1051, 0.1033, 0.1050, 0.1043, 0.1053,
         0.1005],
        [0.0727, 0.0928, 0.1037, 0.1019, 0.1058, 0.1040, 0.1059, 0.1053, 0.1065,
         0.1014],
        [0.0689, 0.0914, 0.1038, 0.1026, 0.1069, 0.1049, 0.1070, 0.1063, 0.1075,
         0.1008],
        [0.0690, 0.0914, 0.1038, 0.1026, 0.1069, 0.1049, 0.1070, 0.1063, 0.1075,
         0.1006],
        [0.0682, 0.0913, 0

Epoch: 109.00, Train Loss: 2.27, Val Loss: 12.26, Train BLEU: 26.71, Val BLEU: 0.30
Sampling from training predictions...
Source: 海洋 的 平均 深度 是 两英里 英里 <EOS> <PAD> <PAD>
Reference: the average depth is about two miles . <EOS>
Model: <SOS> it &apos;s the to about . <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.0963e-01, 2.1770e-01, 2.3168e-01, 2.3047e-01, 1.6445e-01, 3.9103e-03,
         4.1815e-02, 3.4444e-04, 3.5305e-11, 3.5305e-11],
        [1.1368e-01, 1.7398e-01, 1.8982e-01, 1.9709e-01, 1.7525e-01, 4.0346e-02,
         9.6526e-02, 1.3310e-02, 1.6740e-08, 1.6740e-08],
        [1.1784e-01, 1.4238e-01, 1.5154e-01, 1.5779e-01, 1.5911e-01, 9.3001e-02,
         1.2591e-01, 5.2426e-02, 7.2137e-07, 7.2137e-07],
        [1.1562e-01, 1.3573e-01, 1.4486e-01, 1.5184e-01, 1.5730e-01, 1.0328e-01,
         1.2837e-01, 6.2990e-02, 3.7848e-06, 3.7848e-06],
        [1.1249e-01, 1.3785e-01, 1.4984e-01, 1.5942e-01, 1.6721e-01, 9.4737e-02,
         1.2658e-01, 5.1875e-02, 2.8780e-06, 2.8780e-06],
     

Epoch: 113.00, Train Loss: 2.18, Val Loss: 12.33, Train BLEU: 28.54, Val BLEU: 0.29
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> we have have have to , the the the
Attention Weights: tensor([[0.0136, 0.0035, 0.1316, 0.1683, 0.1800, 0.1721, 0.1561, 0.0311, 0.0890,
         0.0547],
        [0.0391, 0.0295, 0.1187, 0.1325, 0.1369, 0.1388, 0.1375, 0.0737, 0.1052,
         0.0881],
        [0.0633, 0.0612, 0.1074, 0.1123, 0.1138, 0.1177, 0.1212, 0.0972, 0.1051,
         0.1009],
        [0.0642, 0.0640, 0.1052, 0.1106, 0.1124, 0.1169, 0.1211, 0.0985, 0.1045,
         0.1027],
        [0.0631, 0.0623, 0.1050, 0.1111, 0.1132, 0.1179, 0.1223, 0.0973, 0.1046,
         0.1031],
        [0.0607, 0.0593, 0.1052, 0.1123, 0.1149, 0.1198, 0.1245, 0.0954, 0.1047,
         0.1032],
        [0.0593, 0.0571, 0.1056, 0.1132, 0.1161, 0.1211, 0.1257, 0.0941, 0.1049,
         0.1029],
        [0.0581, 0.0547, 

Epoch: 116.00, Train Loss: 2.11, Val Loss: 12.36, Train BLEU: 29.20, Val BLEU: 0.29
Sampling from training predictions...
Source: 原来 它 是 海洋 洋中 最长 的 生物 <EOS> <PAD>
Reference: this turns out to be the longest creature in
Model: <SOS> this &apos;s a to be . . <EOS> <EOS>
Attention Weights: tensor([[9.3308e-02, 1.7524e-01, 1.6949e-01, 6.7934e-02, 3.0616e-04, 1.7574e-01,
         2.0606e-01, 1.1064e-01, 1.2832e-03, 4.0201e-11],
        [8.6611e-02, 1.3649e-01, 1.4594e-01, 1.2491e-01, 1.1036e-02, 1.5376e-01,
         1.8051e-01, 1.4042e-01, 2.0312e-02, 1.1248e-08],
        [9.0366e-02, 1.1382e-01, 1.2749e-01, 1.5095e-01, 5.8362e-02, 1.2287e-01,
         1.3487e-01, 1.3396e-01, 6.7307e-02, 1.0254e-06],
        [8.9884e-02, 1.1026e-01, 1.2566e-01, 1.5723e-01, 7.5076e-02, 1.1221e-01,
         1.2277e-01, 1.2816e-01, 7.8738e-02, 8.6572e-06],
        [8.7194e-02, 1.1166e-01, 1.3131e-01, 1.6964e-01, 6.6532e-02, 1.0935e-01,
         1.2326e-01, 1.3025e-01, 7.0796e-02, 7.0986e-06],
        [8.6464e-

Epoch: 119.00, Train Loss: 2.05, Val Loss: 12.38, Train BLEU: 31.64, Val BLEU: 0.29
Sampling from training predictions...
Source: 和 我们 合作 的 人们 帮 我们 找到 了 新
Reference: people that have partnered with us have given us
Model: <SOS> and of the the , , , , the
Attention Weights: tensor([[0.0409, 0.0825, 0.1121, 0.1207, 0.1290, 0.1272, 0.1239, 0.1163, 0.0919,
         0.0556],
        [0.0517, 0.0813, 0.1049, 0.1099, 0.1184, 0.1149, 0.1172, 0.1163, 0.1027,
         0.0827],
        [0.0728, 0.0880, 0.1010, 0.1010, 0.1060, 0.1033, 0.1066, 0.1092, 0.1061,
         0.1062],
        [0.0735, 0.0876, 0.1001, 0.1000, 0.1047, 0.1023, 0.1057, 0.1085, 0.1073,
         0.1104],
        [0.0700, 0.0856, 0.0992, 0.0999, 0.1051, 0.1029, 0.1064, 0.1094, 0.1090,
         0.1125],
        [0.0662, 0.0836, 0.0987, 0.1000, 0.1058, 0.1036, 0.1074, 0.1106, 0.1105,
         0.1136],
        [0.0659, 0.0834, 0.0986, 0.1000, 0.1059, 0.1038, 0.1076, 0.1107, 0.1106,
         0.1135],
        [0.0654, 0.0832, 0.0985, 

Epoch: 122.00, Train Loss: 1.99, Val Loss: 12.42, Train BLEU: 32.88, Val BLEU: 0.29
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> and of the the , , , the the
Attention Weights: tensor([[0.0409, 0.0881, 0.1106, 0.1265, 0.1251, 0.1277, 0.1193, 0.1119, 0.0945,
         0.0555],
        [0.0517, 0.0857, 0.1078, 0.1120, 0.1163, 0.1153, 0.1143, 0.1107, 0.1045,
         0.0817],
        [0.0739, 0.0919, 0.1059, 0.0998, 0.1050, 0.1023, 0.1049, 0.1046, 0.1068,
         0.1047],
        [0.0742, 0.0912, 0.1051, 0.0988, 0.1042, 0.1015, 0.1044, 0.1044, 0.1075,
         0.1088],
        [0.0699, 0.0887, 0.1044, 0.0991, 0.1048, 0.1024, 0.1052, 0.1054, 0.1088,
         0.1113],
        [0.0662, 0.0869, 0.1042, 0.0994, 0.1056, 0.1032, 0.1061, 0.1064, 0.1098,
         0.1120],
        [0.0664, 0.0869, 0.1041, 0.0994, 0.1056, 0.1032, 0.1061, 0.1064, 0.1098,
         0.1121],
        [0.0654, 0.0867, 0.1

Epoch: 126.00, Train Loss: 1.90, Val Loss: 12.48, Train BLEU: 35.14, Val BLEU: 0.29
Sampling from training predictions...
Source: 这 是 一种 种群 栖 动物 <EOS> <PAD> <PAD> <PAD>
Reference: it &apos;s a colonial animal . <EOS> <PAD> <PAD>
Model: <SOS> it &apos;s a jelly . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[9.0371e-02, 1.6763e-01, 2.1102e-01, 2.2998e-01, 2.1795e-01, 8.1967e-02,
         1.0755e-03, 5.5892e-11, 5.5892e-11, 5.5892e-11],
        [9.2425e-02, 1.4719e-01, 1.8423e-01, 2.0821e-01, 2.1349e-01, 1.3486e-01,
         1.9595e-02, 1.2008e-08, 1.2008e-08, 1.2008e-08],
        [1.0574e-01, 1.3070e-01, 1.5279e-01, 1.6851e-01, 1.7583e-01, 1.6837e-01,
         9.8056e-02, 2.5997e-06, 2.5997e-06, 2.5997e-06],
        [1.0602e-01, 1.2486e-01, 1.4419e-01, 1.5819e-01, 1.6612e-01, 1.7300e-01,
         1.2751e-01, 3.6755e-05, 3.6755e-05, 3.6755e-05],
        [1.0035e-01, 1.2143e-01, 1.4295e-01, 1.5918e-01, 1.6949e-01, 1.8073e-01,
         1.2576e-01, 3.5676e-05, 3.5676e-05, 3.5676e-05],

Epoch: 129.00, Train Loss: 1.84, Val Loss: 12.51, Train BLEU: 37.64, Val BLEU: 0.27
Sampling from training predictions...
Source: 和 我们 合作 的 人们 帮 我们 找到 了 新
Reference: people that have partnered with us have given us
Model: <SOS> but of the the and and , , the
Attention Weights: tensor([[0.0384, 0.0795, 0.1122, 0.1190, 0.1260, 0.1307, 0.1251, 0.1139, 0.0968,
         0.0585],
        [0.0482, 0.0792, 0.1062, 0.1111, 0.1194, 0.1189, 0.1194, 0.1158, 0.1024,
         0.0795],
        [0.0719, 0.0856, 0.1003, 0.1001, 0.1067, 0.1030, 0.1076, 0.1105, 0.1065,
         0.1078],
        [0.0739, 0.0856, 0.0993, 0.0989, 0.1055, 0.1014, 0.1065, 0.1100, 0.1070,
         0.1120],
        [0.0702, 0.0831, 0.0986, 0.0986, 0.1059, 0.1016, 0.1074, 0.1110, 0.1084,
         0.1152],
        [0.0659, 0.0805, 0.0978, 0.0983, 0.1067, 0.1022, 0.1085, 0.1124, 0.1100,
         0.1177],
        [0.0656, 0.0803, 0.0979, 0.0984, 0.1069, 0.1023, 0.1087, 0.1125, 0.1099,
         0.1174],
        [0.0649, 0.0799, 0.09

Epoch: 133.00, Train Loss: 1.75, Val Loss: 12.56, Train BLEU: 37.09, Val BLEU: 0.28
Sampling from training predictions...
Source: 还有 前面 的 这个 是 推进 引擎 它 一会 一会儿
Reference: and it &apos;s got these jet thrusters up in
Model: <SOS> and it &apos;s got these jet thrusters thrusters in
Attention Weights: tensor([[0.0587, 0.1209, 0.1457, 0.1577, 0.1563, 0.1468, 0.1431, 0.0704, 0.0005,
         0.0000],
        [0.0604, 0.1138, 0.1361, 0.1480, 0.1507, 0.1444, 0.1451, 0.0967, 0.0042,
         0.0005],
        [0.0700, 0.1013, 0.1157, 0.1260, 0.1301, 0.1324, 0.1424, 0.1280, 0.0415,
         0.0126],
        [0.0745, 0.0889, 0.0967, 0.1041, 0.1065, 0.1125, 0.1243, 0.1327, 0.1062,
         0.0535],
        [0.0730, 0.0842, 0.0908, 0.0981, 0.1001, 0.1071, 0.1191, 0.1341, 0.1248,
         0.0687],
        [0.0674, 0.0801, 0.0879, 0.0965, 0.0998, 0.1082, 0.1235, 0.1452, 0.1289,
         0.0624],
        [0.0649, 0.0786, 0.0871, 0.0966, 0.1002, 0.1096, 0.1266, 0.1494, 0.1291,
         0.0580],
        [

Epoch: 137.00, Train Loss: 1.67, Val Loss: 12.62, Train BLEU: 39.55, Val BLEU: 0.26
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a jelly . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.7087e-01, 3.1574e-01, 3.6505e-01, 1.4589e-01, 2.4534e-03, 2.1434e-10,
         2.1434e-10, 2.1434e-10, 2.1434e-10, 2.1434e-10],
        [1.5936e-01, 2.6784e-01, 3.2896e-01, 2.1688e-01, 2.6956e-02, 1.3748e-08,
         1.3748e-08, 1.3748e-08, 1.3748e-08, 1.3748e-08],
        [1.5663e-01, 2.0424e-01, 2.4608e-01, 2.5091e-01, 1.4213e-01, 1.9880e-06,
         1.9880e-06, 1.9880e-06, 1.9880e-06, 1.9880e-06],
        [1.5123e-01, 1.8414e-01, 2.2013e-01, 2.4983e-01, 1.9448e-01, 3.4878e-05,
         3.4878e-05, 3.4878e-05, 3.4878e-05, 3.4878e-05],
        [1.4138e-01, 1.7709e-01, 2.1798e-01, 2.6079e-01, 2.0249e-01, 5.3648e-05,
         5.3648e-05, 5.3648e-05, 5.3648e-05, 5.364

Epoch: 141.00, Train Loss: 1.59, Val Loss: 12.65, Train BLEU: 43.03, Val BLEU: 0.31
Sampling from training predictions...
Source: 其实 它们 都 是 由 单独 的 动物 结合 合在
Reference: these are all individual animals banding together to make
Model: <SOS> these are all individual animals banding together make make
Attention Weights: tensor([[4.4661e-02, 1.0921e-01, 1.3253e-01, 1.4683e-01, 1.5699e-01, 1.5800e-01,
         1.4506e-01, 9.5267e-02, 1.1446e-02, 1.1799e-06],
        [4.6081e-02, 9.7131e-02, 1.1775e-01, 1.3116e-01, 1.4112e-01, 1.4755e-01,
         1.4679e-01, 1.2854e-01, 4.3762e-02, 1.1377e-04],
        [5.8116e-02, 8.4525e-02, 9.6158e-02, 1.0446e-01, 1.1067e-01, 1.2155e-01,
         1.2604e-01, 1.4923e-01, 1.3787e-01, 1.1376e-02],
        [6.3506e-02, 8.0855e-02, 8.9306e-02, 9.5631e-02, 1.0035e-01, 1.1112e-01,
         1.1575e-01, 1.4492e-01, 1.6318e-01, 3.5372e-02],
        [6.0527e-02, 7.8302e-02, 8.6971e-02, 9.3629e-02, 9.8694e-02, 1.1046e-01,
         1.1621e-01, 1.4983e-01, 1.7366e-01, 3

Epoch: 144.00, Train Loss: 1.54, Val Loss: 12.70, Train BLEU: 44.82, Val BLEU: 0.31
Sampling from training predictions...
Source: <UNK> 塞尔 <UNK> <UNK> 斯特 说 过 真正 的 探索
Reference: marcel proust said , &quot; the true voyage of
Model: <SOS> marcel proust said , &quot; true true voyage voyage
Attention Weights: tensor([[2.2558e-07, 1.4966e-07, 1.6490e-07, 4.9170e-07, 1.8706e-05, 1.3534e-02,
         1.1277e-01, 2.4824e-01, 3.2998e-01, 2.9546e-01],
        [1.0566e-07, 6.3480e-08, 7.1146e-08, 2.6247e-07, 2.7629e-05, 1.8328e-02,
         1.1204e-01, 2.2897e-01, 3.1166e-01, 3.2898e-01],
        [2.1260e-05, 1.2709e-05, 1.3438e-05, 3.9155e-05, 1.1790e-03, 4.5396e-02,
         1.2725e-01, 2.1770e-01, 2.8201e-01, 3.2638e-01],
        [5.2170e-04, 3.2369e-04, 3.3105e-04, 7.8884e-04, 8.7586e-03, 7.6797e-02,
         1.4071e-01, 2.0262e-01, 2.5059e-01, 3.1856e-01],
        [1.4641e-03, 9.1566e-04, 9.2663e-04, 2.0757e-03, 1.7479e-02, 8.8103e-02,
         1.3999e-01, 1.9188e-01, 2.3668e-01, 3.2049e-01

Epoch: 148.00, Train Loss: 1.46, Val Loss: 12.75, Train BLEU: 48.07, Val BLEU: 0.30
Sampling from training predictions...
Source: 但 我 想 告诉 你 的 是 当 你 站
Reference: but when you &apos;re standing at the beach ,
Model: <SOS> but when the &apos;re standing , , , ,
Attention Weights: tensor([[0.0413, 0.0806, 0.1067, 0.1300, 0.1343, 0.1348, 0.1237, 0.1083, 0.0896,
         0.0507],
        [0.0477, 0.0814, 0.1044, 0.1252, 0.1275, 0.1288, 0.1204, 0.1089, 0.0937,
         0.0621],
        [0.0697, 0.0862, 0.0987, 0.1111, 0.1110, 0.1130, 0.1105, 0.1068, 0.1019,
         0.0912],
        [0.0767, 0.0875, 0.0966, 0.1068, 0.1057, 0.1081, 0.1073, 0.1055, 0.1039,
         0.1019],
        [0.0740, 0.0848, 0.0945, 0.1062, 0.1049, 0.1081, 0.1079, 0.1067, 0.1061,
         0.1068],
        [0.0694, 0.0812, 0.0922, 0.1064, 0.1051, 0.1091, 0.1093, 0.1084, 0.1084,
         0.1104],
        [0.0681, 0.0803, 0.0917, 0.1067, 0.1055, 0.1097, 0.1099, 0.1089, 0.1087,
         0.1107],
        [0.0672, 0.0795, 0.0

Epoch: 152.00, Train Loss: 1.39, Val Loss: 12.79, Train BLEU: 52.95, Val BLEU: 0.31
Sampling from training predictions...
Source: 大部 大部分 部分 的 动物 也 都 生活 在 海洋
Reference: most of the animals are in the oceans .
Model: <SOS> most of the animals , the the the the
Attention Weights: tensor([[0.0369, 0.1179, 0.1366, 0.1325, 0.1347, 0.1142, 0.1094, 0.1028, 0.0814,
         0.0336],
        [0.0458, 0.1121, 0.1290, 0.1231, 0.1335, 0.1085, 0.1053, 0.1082, 0.0842,
         0.0503],
        [0.0772, 0.1125, 0.1147, 0.1077, 0.1169, 0.0980, 0.0967, 0.1036, 0.0876,
         0.0852],
        [0.0849, 0.1101, 0.1090, 0.1006, 0.1140, 0.0915, 0.0912, 0.1053, 0.0882,
         0.1052],
        [0.0790, 0.1072, 0.1071, 0.0976, 0.1167, 0.0879, 0.0879, 0.1094, 0.0880,
         0.1192],
        [0.0762, 0.1062, 0.1074, 0.0979, 0.1178, 0.0880, 0.0880, 0.1101, 0.0882,
         0.1202],
        [0.0753, 0.1057, 0.1077, 0.0982, 0.1183, 0.0883, 0.0883, 0.1103, 0.0882,
         0.1196],
        [0.0750, 0.1055, 0.10

Epoch: 156.00, Train Loss: 1.32, Val Loss: 12.82, Train BLEU: 56.75, Val BLEU: 0.29
Sampling from training predictions...
Source: 深海 海中 的 生命 大卫 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: life in the deep oceans <EOS> <PAD> <PAD> <PAD>
Model: <SOS> life in the deep oceans <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.8448e-01, 3.2385e-01, 3.2330e-01, 1.5964e-01, 8.7276e-03, 2.1881e-06,
         6.2102e-07, 5.0654e-10, 5.0654e-10, 5.0654e-10],
        [1.6853e-01, 2.7564e-01, 3.0485e-01, 2.2766e-01, 2.3320e-02, 6.4596e-06,
         2.0021e-06, 1.1234e-09, 1.1234e-09, 1.1234e-09],
        [1.2712e-01, 1.7868e-01, 2.1956e-01, 2.9336e-01, 1.7837e-01, 2.2431e-03,
         6.5835e-04, 1.5956e-07, 1.5956e-07, 1.5956e-07],
        [1.0376e-01, 1.2447e-01, 1.5415e-01, 2.6601e-01, 3.2145e-01, 2.3291e-02,
         6.8525e-03, 4.4379e-06, 4.4379e-06, 4.4379e-06],
        [9.3537e-02, 1.1155e-01, 1.4139e-01, 2.5929e-01, 3.5280e-01, 3.1929e-02,
         9.4693e-03, 1.3615e-05, 1.3615e-05, 1.361

Epoch: 159.00, Train Loss: 1.26, Val Loss: 12.86, Train BLEU: 57.35, Val BLEU: 0.30
Sampling from training predictions...
Source: 但 我 想 告诉 你 的 是 当 你 站
Reference: but when you &apos;re standing at the beach ,
Model: <SOS> but when you &apos;re standing , , , ,
Attention Weights: tensor([[0.0408, 0.0803, 0.1105, 0.1349, 0.1353, 0.1363, 0.1229, 0.1056, 0.0844,
         0.0491],
        [0.0465, 0.0807, 0.1074, 0.1299, 0.1291, 0.1310, 0.1207, 0.1065, 0.0883,
         0.0599],
        [0.0644, 0.0839, 0.1010, 0.1167, 0.1151, 0.1174, 0.1125, 0.1058, 0.0972,
         0.0861],
        [0.0723, 0.0849, 0.0975, 0.1109, 0.1085, 0.1115, 0.1089, 0.1049, 0.1009,
         0.0997],
        [0.0707, 0.0816, 0.0940, 0.1094, 0.1065, 0.1106, 0.1094, 0.1063, 0.1040,
         0.1074],
        [0.0670, 0.0778, 0.0914, 0.1095, 0.1059, 0.1112, 0.1106, 0.1080, 0.1063,
         0.1121],
        [0.0668, 0.0775, 0.0911, 0.1095, 0.1060, 0.1114, 0.1108, 0.1082, 0.1064,
         0.1124],
        [0.0666, 0.0771, 0.0

Epoch: 163.00, Train Loss: 1.20, Val Loss: 12.90, Train BLEU: 60.74, Val BLEU: 0.26
Sampling from training predictions...
Source: 大多 大多数 多数 地震 和 火山 喷发 也 都 发生
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> most of the earthquakes and the the the the
Attention Weights: tensor([[0.0412, 0.1004, 0.1654, 0.1684, 0.1436, 0.1346, 0.0889, 0.0697, 0.0570,
         0.0309],
        [0.0461, 0.1419, 0.1459, 0.1479, 0.1248, 0.1326, 0.0937, 0.0680, 0.0575,
         0.0416],
        [0.0729, 0.2088, 0.1188, 0.1140, 0.0965, 0.1183, 0.0892, 0.0630, 0.0588,
         0.0596],
        [0.0761, 0.2241, 0.1103, 0.1054, 0.0887, 0.1176, 0.0894, 0.0606, 0.0583,
         0.0696],
        [0.0709, 0.2411, 0.1059, 0.1022, 0.0841, 0.1209, 0.0891, 0.0558, 0.0548,
         0.0753],
        [0.0669, 0.2455, 0.1039, 0.1018, 0.0823, 0.1254, 0.0902, 0.0525, 0.0520,
         0.0795],
        [0.0653, 0.2453, 0.1032, 0.1024, 0.0826, 0.1268, 0.0906, 0.0521, 0.0516,
         0.0800],
        [0.06

Epoch: 167.00, Train Loss: 1.14, Val Loss: 12.92, Train BLEU: 64.53, Val BLEU: 0.26
Sampling from training predictions...
Source: 我 真 喜欢 这些 东西 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: i love that kind of stuff . <EOS> <PAD>
Model: <SOS> i &apos;s a kind jelly stuff . <EOS> <EOS>
Attention Weights: tensor([[1.3282e-01, 2.4329e-01, 2.9682e-01, 2.4963e-01, 7.5929e-02, 1.5086e-03,
         1.2910e-09, 1.2910e-09, 1.2910e-09, 1.2910e-09],
        [1.3461e-01, 2.1117e-01, 2.5664e-01, 2.4609e-01, 1.3678e-01, 1.4722e-02,
         1.6392e-08, 1.6392e-08, 1.6392e-08, 1.6392e-08],
        [1.1026e-01, 1.4294e-01, 1.7795e-01, 2.0917e-01, 2.3455e-01, 1.2512e-01,
         1.0500e-06, 1.0500e-06, 1.0500e-06, 1.0500e-06],
        [9.7230e-02, 1.1516e-01, 1.4113e-01, 1.7599e-01, 2.5525e-01, 2.1516e-01,
         2.0672e-05, 2.0672e-05, 2.0672e-05, 2.0672e-05],
        [8.5308e-02, 1.0064e-01, 1.2558e-01, 1.6605e-01, 2.7367e-01, 2.4850e-01,
         6.2827e-05, 6.2827e-05, 6.2827e-05, 6.2827e-05],
      

Epoch: 171.00, Train Loss: 1.07, Val Loss: 12.96, Train BLEU: 65.57, Val BLEU: 0.38
Sampling from training predictions...
Source: 大多 大多数 多数 地震 和 火山 喷发 也 都 发生
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> most of the earthquakes and volcanoes the the the
Attention Weights: tensor([[0.0452, 0.1422, 0.1754, 0.1716, 0.1292, 0.1405, 0.0817, 0.0514, 0.0393,
         0.0235],
        [0.0488, 0.2005, 0.1490, 0.1467, 0.1100, 0.1367, 0.0858, 0.0506, 0.0402,
         0.0318],
        [0.0764, 0.2843, 0.1155, 0.1061, 0.0818, 0.1164, 0.0803, 0.0476, 0.0431,
         0.0484],
        [0.0788, 0.3019, 0.1069, 0.0967, 0.0745, 0.1148, 0.0803, 0.0465, 0.0436,
         0.0561],
        [0.0724, 0.3232, 0.1016, 0.0930, 0.0697, 0.1182, 0.0803, 0.0425, 0.0405,
         0.0586],
        [0.0690, 0.3243, 0.0989, 0.0932, 0.0683, 0.1242, 0.0828, 0.0401, 0.0384,
         0.0609],
        [0.0676, 0.3213, 0.0985, 0.0940, 0.0688, 0.1262, 0.0840, 0.0400, 0.0383,
         0.0614],
       

Epoch: 175.00, Train Loss: 1.01, Val Loss: 13.00, Train BLEU: 68.31, Val BLEU: 0.43
Sampling from training predictions...
Source: 地球 的 大部 大部分 部分 都 是 海水 <EOS> <PAD>
Reference: most of the planet is ocean water . <EOS>
Model: <SOS> most of the planet is . . <EOS> <EOS>
Attention Weights: tensor([[1.0361e-01, 2.4059e-01, 3.9542e-01, 1.8284e-01, 4.1129e-02, 2.0565e-02,
         1.1805e-02, 3.8860e-03, 1.4186e-04, 1.1018e-10],
        [1.1527e-01, 1.9301e-01, 3.5095e-01, 2.2300e-01, 5.6680e-02, 2.7558e-02,
         1.8901e-02, 1.2538e-02, 2.0973e-03, 5.7847e-09],
        [1.3582e-01, 1.5815e-01, 2.6892e-01, 2.2674e-01, 7.4567e-02, 3.7135e-02,
         3.1497e-02, 4.2207e-02, 2.4967e-02, 1.1151e-06],
        [1.3505e-01, 1.4018e-01, 2.3471e-01, 2.2189e-01, 8.0020e-02, 4.0697e-02,
         3.7165e-02, 6.1948e-02, 4.8319e-02, 1.4200e-05],
        [1.2517e-01, 1.2976e-01, 2.2474e-01, 2.2586e-01, 8.0181e-02, 3.9222e-02,
         3.7699e-02, 7.4330e-02, 6.3005e-02, 3.7140e-05],
        [1.1972e-0

Epoch: 179.00, Train Loss: 0.95, Val Loss: 13.01, Train BLEU: 70.48, Val BLEU: 0.41
Sampling from training predictions...
Source: 底下 这些 都 是 <UNK> 它们 上上 上上下下 上下 下下
Reference: it &apos;s got these fishing <UNK> on the bottom
Model: <SOS> it &apos;s got these fishing <UNK> on the bottom
Attention Weights: tensor([[0.0464, 0.0934, 0.1464, 0.0887, 0.0004, 0.0892, 0.2398, 0.1733, 0.1089,
         0.0136],
        [0.0497, 0.0878, 0.1579, 0.1582, 0.0046, 0.0908, 0.1935, 0.1329, 0.0953,
         0.0291],
        [0.0513, 0.0704, 0.1340, 0.2675, 0.1199, 0.0672, 0.1085, 0.0593, 0.0530,
         0.0690],
        [0.0467, 0.0590, 0.1104, 0.2537, 0.2289, 0.0572, 0.0832, 0.0447, 0.0426,
         0.0735],
        [0.0447, 0.0567, 0.1070, 0.2438, 0.2653, 0.0509, 0.0733, 0.0398, 0.0400,
         0.0785],
        [0.0441, 0.0551, 0.1012, 0.2275, 0.2906, 0.0493, 0.0705, 0.0389, 0.0391,
         0.0837],
        [0.0428, 0.0532, 0.0974, 0.2238, 0.3044, 0.0473, 0.0675, 0.0379, 0.0382,
         0.0874],
   

Epoch: 183.00, Train Loss: 0.89, Val Loss: 13.04, Train BLEU: 75.59, Val BLEU: 0.45
Sampling from training predictions...
Source: 泰坦 泰坦尼克 泰坦尼克号 坦尼 尼克 号 是 拿 了 不少
Reference: the truth of the matter is that the titanic
Model: <SOS> the truth of the matter is the the titanic
Attention Weights: tensor([[0.0158, 0.0088, 0.0071, 0.0145, 0.1057, 0.1215, 0.1796, 0.2042, 0.1852,
         0.1576],
        [0.0215, 0.0209, 0.0160, 0.0311, 0.1161, 0.1234, 0.1662, 0.1812, 0.1653,
         0.1580],
        [0.0478, 0.1139, 0.0752, 0.1112, 0.1118, 0.0969, 0.1090, 0.1118, 0.1059,
         0.1165],
        [0.0539, 0.2035, 0.1194, 0.1520, 0.0875, 0.0705, 0.0746, 0.0756, 0.0733,
         0.0898],
        [0.0568, 0.2762, 0.1528, 0.1676, 0.0668, 0.0524, 0.0534, 0.0535, 0.0525,
         0.0681],
        [0.0590, 0.3777, 0.1882, 0.1668, 0.0389, 0.0293, 0.0298, 0.0308, 0.0316,
         0.0478],
        [0.0604, 0.3939, 0.1925, 0.1632, 0.0350, 0.0261, 0.0266, 0.0278, 0.0289,
         0.0455],
        [0.0603,

Epoch: 187.00, Train Loss: 0.83, Val Loss: 13.09, Train BLEU: 78.30, Val BLEU: 0.39
Sampling from training predictions...
Source: 当 你 站 在 海滩 上 或是 当 你 看到
Reference: part of the problem , i think , is
Model: <SOS> part of the problem , , , , ,
Attention Weights: tensor([[0.0259, 0.0582, 0.0944, 0.1374, 0.2400, 0.1294, 0.1378, 0.0795, 0.0570,
         0.0403],
        [0.0301, 0.0574, 0.0894, 0.1290, 0.2462, 0.1194, 0.1413, 0.0762, 0.0571,
         0.0539],
        [0.0480, 0.0640, 0.0857, 0.1131, 0.2078, 0.1047, 0.1294, 0.0780, 0.0673,
         0.1020],
        [0.0566, 0.0669, 0.0840, 0.1065, 0.1924, 0.0988, 0.1236, 0.0780, 0.0695,
         0.1237],
        [0.0529, 0.0613, 0.0782, 0.1003, 0.1973, 0.0935, 0.1250, 0.0762, 0.0697,
         0.1455],
        [0.0504, 0.0573, 0.0742, 0.0956, 0.1988, 0.0909, 0.1266, 0.0757, 0.0701,
         0.1605],
        [0.0493, 0.0557, 0.0723, 0.0935, 0.2014, 0.0893, 0.1267, 0.0747, 0.0696,
         0.1676],
        [0.0481, 0.0543, 0.0707, 0.0921, 0.205

Epoch: 191.00, Train Loss: 0.78, Val Loss: 13.11, Train BLEU: 79.78, Val BLEU: 0.41
Sampling from training predictions...
Source: 还有 这些 摇晃 着 旋转 转着 的 触角 <EOS> <PAD>
Reference: it &apos;s got tentacles dangling , swirling around like
Model: <SOS> it &apos;s got tentacles dangling , swirling around like
Attention Weights: tensor([[7.7244e-02, 1.7028e-01, 2.0916e-01, 2.1447e-01, 2.8408e-01, 3.3307e-02,
         6.0463e-03, 4.2268e-03, 1.1896e-03, 2.0122e-10],
        [8.0110e-02, 1.4813e-01, 1.8634e-01, 1.9399e-01, 2.9973e-01, 7.3016e-02,
         8.9189e-03, 6.7474e-03, 3.0207e-03, 8.6300e-10],
        [7.5967e-02, 1.0273e-01, 1.2979e-01, 1.3293e-01, 2.3728e-01, 2.6536e-01,
         1.7389e-02, 1.6234e-02, 2.2313e-02, 1.1953e-07],
        [7.5220e-02, 8.8837e-02, 1.0797e-01, 1.0903e-01, 1.9213e-01, 3.4446e-01,
         2.3223e-02, 2.1880e-02, 3.7245e-02, 2.8354e-06],
        [6.5956e-02, 7.6643e-02, 9.5540e-02, 9.8010e-02, 1.8607e-01, 3.9342e-01,
         2.0536e-02, 2.0372e-02, 4.3440e-0

Epoch: 195.00, Train Loss: 0.72, Val Loss: 13.12, Train BLEU: 82.03, Val BLEU: 0.40
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> we have to have a very special technology technology
Attention Weights: tensor([[0.0020, 0.0032, 0.1169, 0.1384, 0.1486, 0.1988, 0.3226, 0.0498, 0.0140,
         0.0059],
        [0.0046, 0.0154, 0.1189, 0.1278, 0.1314, 0.1785, 0.3006, 0.0908, 0.0213,
         0.0106],
        [0.0143, 0.0882, 0.1060, 0.0920, 0.0939, 0.1284, 0.2324, 0.1836, 0.0380,
         0.0232],
        [0.0198, 0.1296, 0.0981, 0.0810, 0.0833, 0.1126, 0.2076, 0.1986, 0.0418,
         0.0276],
        [0.0234, 0.1684, 0.0873, 0.0715, 0.0742, 0.1025, 0.1967, 0.2072, 0.0406,
         0.0283],
        [0.0255, 0.2084, 0.0764, 0.0619, 0.0646, 0.0926, 0.1900, 0.2159, 0.0375,
         0.0272],
        [0.0264, 0.2416, 0.0677, 0.0542, 0.0570, 0.0859, 0.1893, 0.2185, 0.0341,
         0.0253],
       

Epoch: 199.00, Train Loss: 0.67, Val Loss: 13.17, Train BLEU: 83.98, Val BLEU: 0.39
Sampling from training predictions...
Source: 和 我们 合作 的 人们 帮 我们 找到 了 新
Reference: people that have partnered with us have given us
Model: <SOS> people &apos;s have partnered with us us given us
Attention Weights: tensor([[0.0231, 0.0781, 0.1574, 0.1099, 0.1459, 0.1177, 0.1494, 0.1422, 0.0479,
         0.0286],
        [0.0272, 0.0748, 0.1486, 0.1039, 0.1505, 0.1089, 0.1563, 0.1524, 0.0465,
         0.0310],
        [0.0427, 0.0793, 0.1359, 0.1006, 0.1416, 0.1021, 0.1467, 0.1525, 0.0542,
         0.0443],
        [0.0530, 0.0826, 0.1287, 0.0986, 0.1338, 0.0997, 0.1387, 0.1513, 0.0610,
         0.0526],
        [0.0541, 0.0811, 0.1267, 0.0969, 0.1325, 0.0984, 0.1383, 0.1540, 0.0626,
         0.0556],
        [0.0488, 0.0732, 0.1231, 0.0894, 0.1355, 0.0916, 0.1451, 0.1753, 0.0611,
         0.0569],
        [0.0488, 0.0711, 0.1192, 0.0868, 0.1348, 0.0892, 0.1459, 0.1828, 0.0626,
         0.0587],
        [0

In [16]:
# Display the first five rows of the experiment summary, restricted to the
# run-configuration columns and the validation loss.
summary_columns = ['dt_created', 'num_epochs', 'learning_rate', 'clip_grad_max_norm', 'val_loss']
experiment_summary = summarize_results(load_experiment_log())
experiment_summary[summary_columns].head()

Unnamed: 0,dt_created,num_epochs,learning_rate,clip_grad_max_norm,val_loss
76,2018-12-05 19:40:19,200,0.0005,1.0,8.638109
75,2018-12-05 12:50:28,200,0.0005,1.0,8.559556
74,2018-12-05 12:01:54,200,0.0005,1.0,8.5509
73,2018-12-05 00:57:30,200,0.0005,1.0,8.644947
72,2018-12-05 00:36:58,200,0.0005,1.0,8.638907


In [None]:
# Plot the learning curve for `results`.
# NOTE(review): `results` is not assigned anywhere in this part of the notebook;
# presumably it is produced by the training cell — confirm it is in scope after
# a fresh Restart & Run All.
plot_single_learning_curve(results)

In [None]:
# Final-epoch metrics pasted from a training log (presumably the run plotted
# below — TODO confirm which run this refers to):
# Epoch: 199.00, Train Loss: 0.32, Val Loss: 13.19, Train BLEU: 98.94, Val BLEU: 0.27
# NOTE(review): `results` is not assigned in this part of the notebook; verify it is
# still the object these numbers describe before relying on the plot.
plot_single_learning_curve(results)

In [None]:
# Note on an attention variant, with the operand order of the bmm switched:
#   energies = v_broadcast.bmm(torch.tanh(self.attn(concat)).transpose(1, 2))
# Final-epoch metrics pasted from the training log for that variant
# (presumably the run plotted below — TODO confirm):
# Epoch: 199.00, Train Loss: 0.63, Val Loss: 12.82, Train BLEU: 92.05, Val BLEU: 0.38
plot_single_learning_curve(results)

In [None]:
# Preview the first 20 entries of the source-language id -> token table.
# Stop as soon as the preview is complete instead of walking every remaining
# vocabulary entry only to skip it.
for i, token in enumerate(vocab[SRC_LANG]['id2token']):
    if i >= 20:
        break
    print("{}: {}".format(i, token))

In [None]:
# Preview the first 20 entries of the target-language id -> token table.
# Stop as soon as the preview is complete instead of walking every remaining
# vocabulary entry only to skip it.
for i, token in enumerate(vocab[TARG_LANG]['id2token']):
    if i >= 20:
        break
    print("{}: {}".format(i, token))

In [None]:
import torch

# Scratch check of slicing followed by `view`: build a (3, 5, 10) tensor of
# consecutive integers, drop the first slice along dim 0, then collapse the
# remaining leading dims so every row keeps 10 entries.
x = torch.arange(3 * 5 * 10).view(3, 5, 10)
y = x[1:]
z = y.view(-1, 10)
print(x, y, z, sep='\n')

In [None]:
# Scratch check of how `permute` changes the order elements come out in when
# the tensor is flattened afterwards.
t = torch.arange(2 * 5).view(5, 2)
u = t.contiguous().view(-1)   # flatten the (5, 2) layout
v = t.permute(1, 0)           # swap the two dims -> (2, 5)
w = v.contiguous().view(-1)   # flatten the permuted layout
print(t, u, v, w, sep='\n')

In [None]:
# Scratch check: let `view` infer the leading dim when 2 * 1 * 300 consecutive
# integers are laid out as (-1, 1, 300).
a = torch.arange(2 * 1 * 300)
b = a.view(-1, 1, 300)
print(a, b.shape, sep='\n')

In [None]:
# Take the first batch yielded by the training loader and turn its source-side
# id rows back into space-joined token strings (one string per row).
for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(full_loaders['train']):
    # Debug prints for inspecting the raw batch; uncomment as needed:
    #     print(i)
    #     print(src_idxs.size()); print(src_idxs); print(src_lens)
    #     print(targ_idxs.size()); print(targ_idxs); print(targ_lens)
    id2token = vocab[SRC_LANG]['id2token']
    test_tensor = src_idxs
    list_of_lists = test_tensor.numpy().astype(int).tolist()

    def to_token(l):
        """Look up every id in `l` and join the tokens with single spaces."""
        return ' '.join(id2token[idx] for idx in l)

    list_of_lists_tokens = list(map(to_token, list_of_lists))
    break  # only the first batch is needed