In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders, text2tokens
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, EncoderDecoderAttn, DecoderAttnRNN
from train_eval import count_parameters, summarize_results, plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval
import importlib
import pickle as pkl 
import torch

In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration. Every constant here is copied into `params` in
# the next cell, so edit values here rather than downstream.
# ---------------------------------------------------------------------------

# Run identity
SRC_LANG, TARG_LANG = 'zh', 'en'
MODEL_NAME = 'zh-seq2seq-rnn-vanilla'
# NOTE(review): the name says "vanilla" while USE_ATTN is True below and the
# model cell builds the attention decoder -- confirm the name is intended
# before comparing logged runs.

# Data processing
SRC_MAX_SENTENCE_LEN = TARG_MAX_SENTENCE_LEN = 10  # same length limit on both sides
SRC_VOCAB_SIZE = TARG_VOCAB_SIZE = 30000           # same vocabulary size on both sides

# Model architecture
RNN_CELL_TYPE = 'gru'
NUM_LAYERS = 2
ENC_HIDDEN_DIM = 256                   # 512 was also tried (per the original note)
DEC_HIDDEN_DIM = 2 * ENC_HIDDEN_DIM    # decoder width is tied to twice the encoder width
USE_ATTN = True
TEACHER_FORCING_RATIO = 1              # presumably 1 means always teacher-force -- TODO confirm
CLIP_GRAD_MAX_NORM = 1
# TODO (carried over from the original): dropout was marked "to actually
# implement" -- confirm that model.py really applies these values.
ENC_DROPOUT = DEC_DROPOUT = 0.2

# Training
OPTIMIZER = 'Adam'
LR = 0.0003                            # 0.0005 was also tried (per the original note)
BATCH_SIZE = 32
NUM_EPOCHS = 200
LAZY_TRAIN = True                      # semantics live in train_eval.py -- TODO document

In [3]:
# Gather every hyperparameter into a single dict so the run's settings can be
# saved with its results later (it is passed to train_and_eval as `params`).
# Key names and insertion order are kept exactly as before.
params = {
    # identification
    'model_name': MODEL_NAME,
    'src_lang': SRC_LANG,
    'targ_lang': TARG_LANG,
    'rnn_cell_type': RNN_CELL_TYPE,
    # data processing
    'src_max_sentence_len': SRC_MAX_SENTENCE_LEN,
    'targ_max_sentence_len': TARG_MAX_SENTENCE_LEN,
    'src_vocab_size': SRC_VOCAB_SIZE,
    'targ_vocab_size': TARG_VOCAB_SIZE,
    # architecture
    'num_layers': NUM_LAYERS,
    'enc_hidden_dim': ENC_HIDDEN_DIM,
    'dec_hidden_dim': DEC_HIDDEN_DIM,
    'teacher_forcing_ratio': TEACHER_FORCING_RATIO,
    'clip_grad_max_norm': CLIP_GRAD_MAX_NORM,
    'enc_dropout': ENC_DROPOUT,
    'dec_dropout': DEC_DROPOUT,
    'use_attn': USE_ATTN,
    # training
    'batch_size': BATCH_SIZE,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LR,
    'optimizer': OPTIMIZER,
    'lazy_train': LAZY_TRAIN,
}

In [4]:
# # One-time step (slow): uncomment this cell to build the vocab and cache it as a pickle, so later runs can reload it instead of regenerating
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [5]:
# Reload the cached vocabulary from its pickle and build the three datasets.
# If the pickle is missing, regenerate it with generate_vocab (see the
# commented-out cell above).
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)

# Use a context manager so the file handle is closed as soon as the vocab is
# loaded; the previous bare open() left it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# vocab pickles that were generated locally.
with open(vocab_filename, "rb") as vocab_file:
    vocab = pkl.load(vocab_file)

# Full dataset, plus two reduced variants selected via sample_limit
# (presumably a cap on the number of sentence pairs -- confirm in
# data_processing.process_data).
data = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab)
data_minibatch = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=BATCH_SIZE)
data_minitrain = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=1000)

In [6]:
# Build one set of dataloaders per dataset (full, single-minibatch, mini-train).
# All three share the same sentence-length limits and batch size; the
# generator is consumed in order, so the calls run in the same sequence as
# three separate statements would.
loaders_full, loaders_minibatch, loaders_minitrain = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_minibatch, data_minitrain)
)

In [7]:
# Build the encoder, the attention decoder, and the combined model.
# NOTE(review): USE_ATTN from the config cell is not consulted here; the
# attention / no-attention choice is made by which decoder block below is
# left uncommented.

# Encoder, with source-language embeddings from get_pretrained_emb.
src_vocab = vocab[SRC_LANG]
encoder = EncoderRNN(
    rnn_cell_type=RNN_CELL_TYPE,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    enc_dropout=ENC_DROPOUT,
    pretrained_word2vec=get_pretrained_emb(src_vocab['word2vec'], src_vocab['token2id']),
)

# Alternative without attention (kept for reference, currently disabled):
# decoder = DecoderRNN(dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
#                      targ_vocab_size=TARG_VOCAB_SIZE, targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
#                      pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
# model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id'])

# Decoder with attention, using target-language embeddings.
targ_vocab = vocab[TARG_LANG]
decoder = DecoderAttnRNN(
    rnn_cell_type=RNN_CELL_TYPE,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    targ_vocab_size=TARG_VOCAB_SIZE,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    dec_dropout=DEC_DROPOUT,
    pretrained_word2vec=get_pretrained_emb(targ_vocab['word2vec'], targ_vocab['token2id']),
)

# The wrapper also receives the target token->id mapping.
model = EncoderDecoderAttn(encoder, decoder, targ_vocab['token2id'])

In [None]:
# Train and evaluate the model, printing intermediate metrics, one sampled
# prediction and its attention weights at each report.
# NOTE: `model` is rebound to the value returned by train_and_eval, so
# re-running this cell without re-running the model-definition cell starts
# from the returned model (presumably the trained one), not a fresh one.
# NOTE(review): in the logged samples nearly all attention mass sits on <PAD>
# source positions (e.g. four <PAD> slots at ~0.2462 each while the real
# tokens get ~0.0025), and the weights only spread over real tokens when the
# source has no padding. That pattern suggests the padding mask in the
# attention code is inverted -- verify in model.py.
model, results = train_and_eval(
    model=model,
    loaders_full=loaders_full,
    loaders_minibatch=loaders_minibatch,
    loaders_minitrain=loaders_minitrain,
    params=params,
    vocab=vocab,
    print_intermediate=True,
    save_checkpoint=True,
    save_to_log=True,
    lazy_eval=False,
    print_attn=True,
    inspect_samples=1,
)

Epoch: 0.00, Train Loss: 9.89, Val Loss: 10.26, Train BLEU: 0.01, Val BLEU: 0.10, Minutes Elapsed: 0.06
Sampling from training predictions...
Source: 它们 被叫 叫做 管道 蠕虫 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: these are called tube worms . <EOS> <PAD> <PAD>
Model: <SOS> yukon <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0024, 0.0025, 0.0025, 0.0026, 0.0026, 0.0026, 0.2462, 0.2462, 0.2462,
         0.2462],
        [0.0024, 0.0024, 0.0025, 0.0026, 0.0026, 0.0026, 0.2462, 0.2462, 0.2462,
         0.2462],
        [0.0024, 0.0024, 0.0025, 0.0026, 0.0026, 0.0025, 0.2462, 0.2462, 0.2462,
         0.2462],
        [0.0024, 0.0025, 0.0025, 0.0026, 0.0026, 0.0026, 0.2462, 0.2462, 0.2462,
         0.2462],
        [0.0024, 0.0025, 0.0025, 0.0026, 0.0026, 0.0026, 0.2462, 0.2462, 0.2462,
         0.2462],
        [0.0024, 0.0025, 0.0026, 0.0026, 0.0026, 0.0026, 0.2462, 0.2462, 0.2462,
         0.2462],
        [0.0024, 0.0025, 0.0026, 0.0026, 0.0027, 0.0026, 0.2461, 0

Epoch: 4.00, Train Loss: 8.51, Val Loss: 10.02, Train BLEU: 6.74, Val BLEU: 1.26, Minutes Elapsed: 0.31
Sampling from training predictions...
Source: 海洋 的 平均 深度 是 两英里 英里 <EOS> <PAD> <PAD>
Reference: the average depth is about two miles . <EOS>
Model: <SOS> &apos;s <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.0443e-06, 1.1413e-06, 1.2228e-06, 1.2769e-06, 1.3129e-06, 1.2846e-06,
         1.3633e-06, 1.3385e-06, 5.0000e-01, 5.0000e-01],
        [7.8016e-07, 8.5521e-07, 9.1709e-07, 9.5791e-07, 9.8453e-07, 9.6118e-07,
         1.0207e-06, 1.0009e-06, 5.0000e-01, 5.0000e-01],
        [6.7296e-07, 7.3875e-07, 7.9235e-07, 8.2773e-07, 8.5061e-07, 8.2943e-07,
         8.8035e-07, 8.6351e-07, 5.0000e-01, 5.0000e-01],
        [6.2724e-07, 6.8903e-07, 7.3906e-07, 7.7226e-07, 7.9354e-07, 7.7297e-07,
         8.1940e-07, 8.0376e-07, 5.0000e-01, 5.0000e-01],
        [6.0916e-07, 6.6911e-07, 7.1756e-07, 7.4986e-07, 7.7041e-07, 7.5011e-07,
         7.9421e-07, 7.7893e-07

Epoch: 7.00, Train Loss: 7.49, Val Loss: 9.74, Train BLEU: 1.83, Val BLEU: 1.22, Minutes Elapsed: 0.51
Sampling from training predictions...
Source: 学习 掌握 它 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: learn to manage it . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[3.8937e-08, 4.8224e-08, 5.3745e-08, 4.6473e-08, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [2.1736e-08, 2.7028e-08, 3.0253e-08, 2.6138e-08, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.7319e-08, 2.1511e-08, 2.4111e-08, 2.0907e-08, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.5947e-08, 1.9763e-08, 2.2130e-08, 1.9217e-08, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.5605e-08, 1.9295e-08, 2.1575e-08, 1.8746e-08, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.666

Epoch: 10.00, Train Loss: 6.56, Val Loss: 9.41, Train BLEU: 0.63, Val BLEU: 0.92, Minutes Elapsed: 0.69
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> . . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.7553e-08, 2.8826e-08, 3.5360e-08, 3.5067e-08, 2.2030e-08, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [8.1361e-09, 1.3557e-08, 1.6733e-08, 1.6585e-08, 1.0353e-08, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [6.3057e-09, 1.0514e-08, 1.2995e-08, 1.2896e-08, 8.0876e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [5.7915e-09, 9.6445e-09, 1.1922e-08, 1.1832e-08, 7.4404e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [5.6993e-09, 9.4725e-09, 1.1706e-08, 1.1617e-08, 7.3227e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01

Epoch: 13.00, Train Loss: 5.77, Val Loss: 9.19, Train BLEU: 0.57, Val BLEU: 0.98, Minutes Elapsed: 0.88
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> . . . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.1815e-08, 2.4382e-08, 3.1677e-08, 2.9625e-08, 1.4330e-08, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [4.6656e-09, 9.9761e-09, 1.3153e-08, 1.2219e-08, 5.7420e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [3.6959e-09, 7.9748e-09, 1.0560e-08, 9.8049e-09, 4.5941e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [3.4490e-09, 7.4728e-09, 9.9188e-09, 9.2077e-09, 4.3139e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [3.3889e-09, 7.3467e-09, 9.7582e-09, 9.0587e-09, 4.2507e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.

Epoch: 16.00, Train Loss: 5.12, Val Loss: 9.06, Train BLEU: 11.07, Val BLEU: 0.96, Minutes Elapsed: 1.08
Sampling from training predictions...
Source: 而 努力 去 保存 现状 则 是 徒劳 的 <EOS>
Reference: the concept of preservation is futile . <EOS> <PAD>
Model: <SOS> &apos;s . . . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0295, 0.0683, 0.1010, 0.1168, 0.1272, 0.1343, 0.1372, 0.1322, 0.1101,
         0.0434],
        [0.0270, 0.0656, 0.1002, 0.1174, 0.1287, 0.1367, 0.1398, 0.1340, 0.1098,
         0.0409],
        [0.0263, 0.0648, 0.1001, 0.1177, 0.1294, 0.1376, 0.1407, 0.1344, 0.1092,
         0.0399],
        [0.0258, 0.0644, 0.1001, 0.1179, 0.1297, 0.1380, 0.1411, 0.1345, 0.1089,
         0.0395],
        [0.0257, 0.0642, 0.1001, 0.1180, 0.1298, 0.1382, 0.1413, 0.1346, 0.1088,
         0.0393],
        [0.0256, 0.0641, 0.1002, 0.1181, 0.1299, 0.1383, 0.1413, 0.1346, 0.1086,
         0.0393],
        [0.0256, 0.0641, 0.1002, 0.1181, 0.1299, 0.1384, 0.1414, 0.1345, 0.1085,
       

Epoch: 19.00, Train Loss: 4.59, Val Loss: 9.02, Train BLEU: 11.09, Val BLEU: 0.96, Minutes Elapsed: 1.33
Sampling from training predictions...
Source: 这 是 一个 塔 组织 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s <UNK> structures . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> &apos;s . . . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[8.7299e-09, 2.0171e-08, 2.6375e-08, 2.9470e-08, 2.6501e-08, 1.1148e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.9400e-09, 7.2252e-09, 9.6820e-09, 1.0890e-08, 9.6065e-09, 3.7976e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.5900e-09, 6.4713e-09, 8.7168e-09, 9.8188e-09, 8.6104e-09, 3.3658e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.4271e-09, 6.1314e-09, 8.2918e-09, 9.3526e-09, 8.1795e-09, 3.1760e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.3101e-09, 5.8627e-09, 7.9452e-09, 8.9679e-09, 7.8340e-09, 3.0329e-09,
         2.5000e-01, 2.5000e-01

Epoch: 22.00, Train Loss: 4.21, Val Loss: 9.07, Train BLEU: 10.27, Val BLEU: 0.97, Minutes Elapsed: 1.58
Sampling from training predictions...
Source: 你 真的 能 看穿 它 的 脑袋 <EOS> <PAD> <PAD>
Reference: you can actually see through his head . <EOS>
Model: <SOS> &apos;s &apos;s . . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.5065e-08, 3.2215e-08, 4.0345e-08, 4.2501e-08, 4.4876e-08, 4.5282e-08,
         4.1047e-08, 1.8619e-08, 5.0000e-01, 5.0000e-01],
        [4.9382e-09, 1.1205e-08, 1.4381e-08, 1.5228e-08, 1.6108e-08, 1.6197e-08,
         1.4422e-08, 6.1604e-09, 5.0000e-01, 5.0000e-01],
        [4.4424e-09, 1.0235e-08, 1.3209e-08, 1.3996e-08, 1.4816e-08, 1.4880e-08,
         1.3176e-08, 5.5637e-09, 5.0000e-01, 5.0000e-01],
        [4.0910e-09, 9.5418e-09, 1.2374e-08, 1.3126e-08, 1.3904e-08, 1.3961e-08,
         1.2327e-08, 5.1628e-09, 5.0000e-01, 5.0000e-01],
        [3.7905e-09, 8.8974e-09, 1.1572e-08, 1.2287e-08, 1.3020e-08, 1.3073e-08,
         1.1527e-08, 4.8076e-09, 5.000

Epoch: 25.00, Train Loss: 3.94, Val Loss: 9.22, Train BLEU: 11.67, Val BLEU: 0.97, Minutes Elapsed: 1.83
Sampling from training predictions...
Source: 这 是 这些 热液 出口 口中 的 一个 <EOS> <PAD>
Reference: this is one of these hydrothermal vents . <EOS>
Model: <SOS> &apos;s &apos;s . . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[2.7367e-08, 5.4478e-08, 6.1339e-08, 6.1852e-08, 6.9681e-08, 7.1824e-08,
         7.3410e-08, 6.7029e-08, 3.3265e-08, 1.0000e+00],
        [8.6407e-09, 1.8284e-08, 2.0887e-08, 2.0964e-08, 2.3819e-08, 2.4559e-08,
         2.5039e-08, 2.2458e-08, 1.0492e-08, 1.0000e+00],
        [7.8455e-09, 1.6849e-08, 1.9285e-08, 1.9325e-08, 2.2028e-08, 2.2721e-08,
         2.3158e-08, 2.0653e-08, 9.5246e-09, 1.0000e+00],
        [7.1261e-09, 1.5516e-08, 1.7819e-08, 1.7852e-08, 2.0387e-08, 2.1042e-08,
         2.1450e-08, 1.9076e-08, 8.7253e-09, 1.0000e+00],
        [6.4834e-09, 1.4229e-08, 1.6387e-08, 1.6422e-08, 1.8767e-08, 1.9377e-08,
         1.9755e-08, 1.7545e-08, 7.992

Epoch: 28.00, Train Loss: 3.75, Val Loss: 9.41, Train BLEU: 11.41, Val BLEU: 0.95, Minutes Elapsed: 2.03
Sampling from training predictions...
Source: 这些 更 小 地动 动物 会 蜷缩 缩在 周围 <EOS>
Reference: there are smaller animals crawling around . <EOS> <PAD>
Model: <SOS> here &apos;s &apos;s . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0487, 0.0961, 0.1076, 0.1053, 0.1167, 0.1225, 0.1184, 0.1131, 0.1116,
         0.0599],
        [0.0454, 0.0958, 0.1091, 0.1061, 0.1187, 0.1250, 0.1202, 0.1135, 0.1106,
         0.0558],
        [0.0447, 0.0959, 0.1095, 0.1061, 0.1192, 0.1256, 0.1206, 0.1133, 0.1103,
         0.0547],
        [0.0440, 0.0958, 0.1097, 0.1062, 0.1195, 0.1261, 0.1209, 0.1132, 0.1102,
         0.0543],
        [0.0435, 0.0955, 0.1097, 0.1062, 0.1196, 0.1264, 0.1212, 0.1133, 0.1104,
         0.0543],
        [0.0431, 0.0953, 0.1097, 0.1062, 0.1197, 0.1265, 0.1213, 0.1133, 0.1104,
         0.0543],
        [0.0430, 0.0951, 0.1096, 0.1063, 0.1197, 0.1265, 0.1214, 0.1134, 0.11

Epoch: 31.00, Train Loss: 3.63, Val Loss: 9.61, Train BLEU: 11.39, Val BLEU: 0.95, Minutes Elapsed: 2.21
Sampling from training predictions...
Source: 它们 还 没有 被 研究 透 <EOS> <PAD> <PAD> <PAD>
Reference: they &apos;re very little understood . <EOS> <PAD> <PAD>
Model: <SOS> here &apos;s &apos;s . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[9.3834e-09, 1.6589e-08, 1.7700e-08, 1.8644e-08, 1.9001e-08, 1.9214e-08,
         1.1020e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.6802e-09, 5.0872e-09, 5.4871e-09, 5.7834e-09, 5.8579e-09, 5.8131e-09,
         3.0875e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.5541e-09, 4.9180e-09, 5.3119e-09, 5.6030e-09, 5.6687e-09, 5.6062e-09,
         2.9311e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.3536e-09, 4.5996e-09, 4.9821e-09, 5.2620e-09, 5.3264e-09, 5.2660e-09,
         2.7387e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.1338e-09, 4.2087e-09, 4.5706e-09, 4.8336e-09, 4.8971e-09, 4.8455e-09,
         2.5213e-09, 3.3

Epoch: 34.00, Train Loss: 3.55, Val Loss: 9.83, Train BLEU: 11.39, Val BLEU: 0.95, Minutes Elapsed: 2.36
Sampling from training predictions...
Source: 海洋 的 平均 深度 是 两英里 英里 <EOS> <PAD> <PAD>
Reference: the average depth is about two miles . <EOS>
Model: <SOS> here &apos;s &apos;s . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.5320e-08, 2.5357e-08, 2.6401e-08, 2.6963e-08, 2.7420e-08, 2.5852e-08,
         2.6731e-08, 1.5820e-08, 5.0000e-01, 5.0000e-01],
        [4.3393e-09, 7.7167e-09, 8.1038e-09, 8.2658e-09, 8.3614e-09, 7.6954e-09,
         7.8891e-09, 4.3114e-09, 5.0000e-01, 5.0000e-01],
        [4.1142e-09, 7.4287e-09, 7.8133e-09, 7.9752e-09, 8.0675e-09, 7.3982e-09,
         7.5734e-09, 4.0766e-09, 5.0000e-01, 5.0000e-01],
        [3.7278e-09, 6.8264e-09, 7.2016e-09, 7.3598e-09, 7.4554e-09, 6.8448e-09,
         7.0145e-09, 3.7715e-09, 5.0000e-01, 5.0000e-01],
        [3.3537e-09, 6.1942e-09, 6.5536e-09, 6.7101e-09, 6.8110e-09, 6.2769e-09,
         6.4436e-09, 3.4865e-09, 5.00

Epoch: 37.00, Train Loss: 3.49, Val Loss: 10.02, Train BLEU: 11.10, Val BLEU: 0.95, Minutes Elapsed: 2.53
Sampling from training predictions...
Source: 它们 还 没有 被 研究 透 <EOS> <PAD> <PAD> <PAD>
Reference: they &apos;re very little understood . <EOS> <PAD> <PAD>
Model: <SOS> here &apos;s &apos;s . . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.0961e-08, 1.7008e-08, 1.7802e-08, 1.8611e-08, 1.8480e-08, 1.8079e-08,
         1.0476e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [3.1314e-09, 5.1724e-09, 5.4325e-09, 5.6636e-09, 5.5706e-09, 5.3268e-09,
         2.8316e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.9127e-09, 4.8768e-09, 5.1325e-09, 5.3567e-09, 5.2659e-09, 5.0227e-09,
         2.6363e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.5537e-09, 4.3281e-09, 4.5704e-09, 4.7805e-09, 4.7126e-09, 4.5099e-09,
         2.3745e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.2778e-09, 3.8863e-09, 4.1174e-09, 4.3181e-09, 4.2732e-09, 4.1116e-09,
         2.1902e-09, 3.3333

Epoch: 40.00, Train Loss: 3.44, Val Loss: 10.18, Train BLEU: 11.09, Val BLEU: 0.82, Minutes Elapsed: 2.73
Sampling from training predictions...
Source: 而 努力 去 保存 现状 则 是 徒劳 的 <EOS>
Reference: the concept of preservation is futile . <EOS> <PAD>
Model: <SOS> the &apos;s &apos;s &apos;s . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0676, 0.0978, 0.1084, 0.1127, 0.1156, 0.1173, 0.1152, 0.1094, 0.1003,
         0.0557],
        [0.0656, 0.0989, 0.1103, 0.1146, 0.1174, 0.1189, 0.1162, 0.1093, 0.0981,
         0.0507],
        [0.0652, 0.0992, 0.1108, 0.1151, 0.1179, 0.1193, 0.1163, 0.1091, 0.0974,
         0.0498],
        [0.0642, 0.0987, 0.1105, 0.1150, 0.1179, 0.1194, 0.1166, 0.1096, 0.0980,
         0.0499],
        [0.0633, 0.0980, 0.1099, 0.1146, 0.1176, 0.1193, 0.1168, 0.1103, 0.0992,
         0.0510],
        [0.0624, 0.0971, 0.1091, 0.1139, 0.1171, 0.1191, 0.1169, 0.1110, 0.1006,
         0.0528],
        [0.0620, 0.0964, 0.1084, 0.1134, 0.1167, 0.1188, 0.1170, 0.1115, 0.1017,


Epoch: 43.00, Train Loss: 3.39, Val Loss: 10.32, Train BLEU: 11.24, Val BLEU: 1.02, Minutes Elapsed: 2.90
Sampling from training predictions...
Source: 看到 这些 在 动 的 东西 了 吗 <EOS> <PAD>
Reference: but see all those different working things ? <EOS>
Model: <SOS> here &apos;s &apos;s . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[4.1908e-08, 6.1230e-08, 6.7430e-08, 7.0879e-08, 7.1335e-08, 6.9062e-08,
         6.5786e-08, 5.7383e-08, 2.9991e-08, 1.0000e+00],
        [1.2574e-08, 1.9162e-08, 2.1202e-08, 2.2232e-08, 2.2277e-08, 2.1382e-08,
         2.0111e-08, 1.7059e-08, 8.1931e-09, 1.0000e+00],
        [1.0743e-08, 1.6530e-08, 1.8332e-08, 1.9242e-08, 1.9292e-08, 1.8522e-08,
         1.7422e-08, 1.4764e-08, 7.0569e-09, 1.0000e+00],
        [9.3272e-09, 1.4468e-08, 1.6091e-08, 1.6924e-08, 1.7008e-08, 1.6377e-08,
         1.5461e-08, 1.3163e-08, 6.3444e-09, 1.0000e+00],
        [7.9758e-09, 1.2423e-08, 1.3861e-08, 1.4623e-08, 1.4744e-08, 1.4262e-08,
         1.3535e-08, 1.1607e-08, 5.67

Epoch: 46.00, Train Loss: 3.34, Val Loss: 10.43, Train BLEU: 11.63, Val BLEU: 1.02, Minutes Elapsed: 3.05
Sampling from training predictions...
Source: 这 是 一种 种群 栖 动物 <EOS> <PAD> <PAD> <PAD>
Reference: it &apos;s a colonial animal . <EOS> <PAD> <PAD>
Model: <SOS> here &apos;s &apos;s a . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.5633e-08, 2.1645e-08, 2.3482e-08, 2.3585e-08, 2.2423e-08, 1.8752e-08,
         9.3796e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [5.0278e-09, 7.1239e-09, 7.7041e-09, 7.6661e-09, 7.1839e-09, 5.8162e-09,
         2.6899e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [4.1633e-09, 5.9308e-09, 6.4267e-09, 6.4037e-09, 6.0049e-09, 4.8599e-09,
         2.2420e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [3.5439e-09, 5.0735e-09, 5.5199e-09, 5.5245e-09, 5.2043e-09, 4.2402e-09,
         1.9821e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [3.0394e-09, 4.3623e-09, 4.7720e-09, 4.8062e-09, 4.5571e-09, 3.7472e-09,
         1.7798e-09, 3.3333e-01

Epoch: 49.00, Train Loss: 3.29, Val Loss: 10.51, Train BLEU: 11.65, Val BLEU: 1.02, Minutes Elapsed: 3.20
Sampling from training predictions...
Source: 它们 就是 这样 生活 的 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: that &apos;s how they like to live . <EOS>
Model: <SOS> here &apos;s &apos;s a . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.2788e-08, 1.7172e-08, 1.7659e-08, 1.6674e-08, 1.4063e-08, 6.4578e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [4.5496e-09, 6.1810e-09, 6.2979e-09, 5.8536e-09, 4.7879e-09, 2.0458e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [3.7081e-09, 5.0385e-09, 5.1367e-09, 4.7800e-09, 3.9095e-09, 1.6700e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [3.1595e-09, 4.2980e-09, 4.4027e-09, 4.1222e-09, 3.3983e-09, 1.4789e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.6510e-09, 3.6111e-09, 3.7270e-09, 3.5206e-09, 2.9333e-09, 1.3027e-09,
         2.5000e-01, 2.5000e-01,

Epoch: 52.00, Train Loss: 3.25, Val Loss: 10.56, Train BLEU: 11.68, Val BLEU: 0.22, Minutes Elapsed: 3.35
Sampling from training predictions...
Source: 非常 非常感谢 感谢 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: thank you very much . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[5.2109e-09, 6.2058e-09, 7.6044e-09, 3.1381e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.7668e-09, 2.0053e-09, 2.4310e-09, 9.5565e-10, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.4686e-09, 1.6568e-09, 2.0083e-09, 7.9981e-10, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.1757e-09, 1.3452e-09, 1.6294e-09, 6.6936e-10, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.1243e-09, 1.3182e-09, 1.5992e-09, 6.8107e-10, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1

Epoch: 55.00, Train Loss: 3.20, Val Loss: 10.54, Train BLEU: 11.58, Val BLEU: 0.22, Minutes Elapsed: 3.55
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s &apos;s . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.2702e-08, 1.5676e-08, 1.4965e-08, 1.1691e-08, 4.5296e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [5.6676e-09, 6.8955e-09, 6.4303e-09, 4.8493e-09, 1.7857e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [4.6105e-09, 5.5488e-09, 5.1633e-09, 3.8920e-09, 1.4437e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [3.6085e-09, 4.3134e-09, 4.0383e-09, 3.0814e-09, 1.1775e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [2.8795e-09, 3.4347e-09, 3.2497e-09, 2.5209e-09, 9.9428e-10, 2.0000e-01,
         2.0000e-01, 2.000

Epoch: 58.00, Train Loss: 3.15, Val Loss: 10.44, Train BLEU: 12.31, Val BLEU: 0.22, Minutes Elapsed: 3.72
Sampling from training predictions...
Source: 地球 的 大部 大部分 部分 都 是 海水 <EOS> <PAD>
Reference: most of the planet is ocean water . <EOS>
Model: <SOS> it &apos;s &apos;s a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.4435e-07, 1.8132e-07, 1.8072e-07, 1.6866e-07, 1.4595e-07, 1.2220e-07,
         9.9274e-08, 6.8067e-08, 2.3606e-08, 1.0000e+00],
        [8.5587e-08, 1.0831e-07, 1.0730e-07, 9.9253e-08, 8.4700e-08, 6.9699e-08,
         5.5439e-08, 3.6766e-08, 1.2265e-08, 1.0000e+00],
        [7.2413e-08, 9.0539e-08, 8.9105e-08, 8.2117e-08, 6.9842e-08, 5.7394e-08,
         4.5622e-08, 3.0246e-08, 1.0173e-08, 1.0000e+00],
        [5.8991e-08, 7.2362e-08, 7.0892e-08, 6.5395e-08, 5.5825e-08, 4.6116e-08,
         3.6877e-08, 2.4694e-08, 8.4999e-09, 1.0000e+00],
        [4.3926e-08, 5.2805e-08, 5.1599e-08, 4.7730e-08, 4.0938e-08, 3.4048e-08,
         2.7454e-08, 1.8621e-08, 6.5497e-09, 1.00

Epoch: 61.00, Train Loss: 3.11, Val Loss: 10.39, Train BLEU: 13.09, Val BLEU: 0.23, Minutes Elapsed: 3.90
Sampling from training predictions...
Source: 看 这里 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here they go . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[6.0272e-09, 5.8615e-09, 2.0743e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [2.8815e-09, 2.6336e-09, 9.2248e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [2.4329e-09, 2.1974e-09, 7.8156e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.7500e-09, 1.5787e-09, 5.8179e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.6377e-09, 1.4880e-09, 5.7052e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.

Epoch: 64.00, Train Loss: 3.06, Val Loss: 10.34, Train BLEU: 13.82, Val BLEU: 0.24, Minutes Elapsed: 4.09
Sampling from training predictions...
Source: 看 这里 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here they go . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[6.3112e-09, 5.9385e-09, 2.0342e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [3.1703e-09, 2.7955e-09, 9.6456e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [2.7374e-09, 2.3782e-09, 8.3594e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.9646e-09, 1.6969e-09, 6.1747e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.8019e-09, 1.5601e-09, 5.9004e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.

Epoch: 67.00, Train Loss: 3.01, Val Loss: 10.38, Train BLEU: 14.22, Val BLEU: 0.24, Minutes Elapsed: 4.25
Sampling from training predictions...
Source: 这里 有 蟹 还有 蠕虫 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s crabs here , worms here . <EOS>
Model: <SOS> here &apos;s &apos;s a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[4.1612e-08, 4.4039e-08, 3.6244e-08, 2.7755e-08, 1.7823e-08, 5.6332e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.8745e-08, 2.8544e-08, 2.2668e-08, 1.6923e-08, 1.0701e-08, 3.4327e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.5649e-08, 2.4355e-08, 1.9087e-08, 1.4226e-08, 9.0191e-09, 2.9465e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.0264e-08, 1.8553e-08, 1.4493e-08, 1.0869e-08, 6.9696e-09, 2.3516e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [1.4191e-08, 1.2574e-08, 9.8459e-09, 7.4665e-09, 4.8604e-09, 1.6906e-09,
         2.5000e-01, 2.5000e-01, 2.

Epoch: 70.00, Train Loss: 2.97, Val Loss: 10.34, Train BLEU: 14.10, Val BLEU: 0.24, Minutes Elapsed: 4.44
Sampling from training predictions...
Source: 这些 都 是 虾 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: those are all shrimp . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[2.2539e-08, 2.5366e-08, 2.1355e-08, 1.4604e-08, 4.6300e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.4340e-08, 1.5045e-08, 1.2267e-08, 8.2635e-09, 2.6672e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.2756e-08, 1.2863e-08, 1.0401e-08, 7.0218e-09, 2.3102e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.0002e-08, 9.7315e-09, 7.8662e-09, 5.3787e-09, 1.8421e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [7.4430e-09, 7.0150e-09, 5.7029e-09, 3.9690e-09, 1.4170e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 

Epoch: 73.00, Train Loss: 2.92, Val Loss: 10.46, Train BLEU: 14.32, Val BLEU: 0.24, Minutes Elapsed: 4.62
Sampling from training predictions...
Source: 这 是 这些 热液 出口 口中 的 一个 <EOS> <PAD>
Reference: this is one of these hydrothermal vents . <EOS>
Model: <SOS> it &apos;s &apos;s a . . . <EOS> <EOS>
Attention Weights: tensor([[4.1562e-07, 5.6350e-07, 5.5698e-07, 4.8381e-07, 3.1436e-07, 2.1857e-07,
         1.5927e-07, 9.9309e-08, 3.4591e-08, 1.0000e+00],
        [3.4825e-07, 4.6268e-07, 4.5108e-07, 3.8242e-07, 2.3533e-07, 1.5800e-07,
         1.1301e-07, 6.9599e-08, 2.4421e-08, 1.0000e+00],
        [3.0843e-07, 3.8961e-07, 3.7212e-07, 3.1287e-07, 1.9075e-07, 1.2865e-07,
         9.2655e-08, 5.7385e-08, 2.0496e-08, 1.0000e+00],
        [2.4967e-07, 2.9725e-07, 2.7896e-07, 2.3431e-07, 1.4437e-07, 9.8791e-08,
         7.2120e-08, 4.5464e-08, 1.6808e-08, 1.0000e+00],
        [1.8657e-07, 2.0838e-07, 1.9269e-07, 1.6265e-07, 1.0181e-07, 7.1136e-08,
         5.2821e-08, 3.3954e-08, 1.3000e-08, 1.0

Epoch: 76.00, Train Loss: 2.87, Val Loss: 10.53, Train BLEU: 14.65, Val BLEU: 0.24, Minutes Elapsed: 4.77
Sampling from training predictions...
Source: 这些 更 小 地动 动物 会 蜷缩 缩在 周围 <EOS>
Reference: there are smaller animals crawling around . <EOS> <PAD>
Model: <SOS> the are smaller preservation crawling . . <EOS> <EOS>
Attention Weights: tensor([[0.1239, 0.1794, 0.1776, 0.1524, 0.1287, 0.0895, 0.0641, 0.0412, 0.0316,
         0.0116],
        [0.1193, 0.1806, 0.1813, 0.1571, 0.1299, 0.0885, 0.0627, 0.0398, 0.0300,
         0.0106],
        [0.1221, 0.1829, 0.1825, 0.1574, 0.1290, 0.0871, 0.0614, 0.0385, 0.0290,
         0.0101],
        [0.1302, 0.1814, 0.1771, 0.1533, 0.1272, 0.0876, 0.0626, 0.0398, 0.0302,
         0.0107],
        [0.1439, 0.1786, 0.1689, 0.1466, 0.1238, 0.0877, 0.0641, 0.0420, 0.0323,
         0.0121],
        [0.1588, 0.1706, 0.1561, 0.1377, 0.1198, 0.0895, 0.0681, 0.0472, 0.0372,
         0.0151],
        [0.1641, 0.1595, 0.1439, 0.1301, 0.1172, 0.0930, 0.0742, 0.0545

Epoch: 79.00, Train Loss: 2.82, Val Loss: 10.55, Train BLEU: 14.61, Val BLEU: 0.24, Minutes Elapsed: 4.96
Sampling from training predictions...
Source: 它们 实际 实际上 是 感光 <UNK> 光器 器官 <EOS> <PAD>
Reference: it &apos;s actually a <UNK> organ . <EOS> <PAD>
Model: <SOS> it &apos;s &apos;s a . . . <EOS> <EOS>
Attention Weights: tensor([[4.1761e-07, 6.2928e-07, 6.0014e-07, 5.7275e-07, 4.1753e-07, 2.2153e-07,
         1.3115e-07, 9.7364e-08, 3.7797e-08, 1.0000e+00],
        [3.4631e-07, 5.2638e-07, 4.9755e-07, 4.6653e-07, 3.2847e-07, 1.6667e-07,
         9.7335e-08, 7.1650e-08, 2.8055e-08, 1.0000e+00],
        [2.8605e-07, 4.1467e-07, 3.8495e-07, 3.5488e-07, 2.4890e-07, 1.2771e-07,
         7.5701e-08, 5.6318e-08, 2.2601e-08, 1.0000e+00],
        [2.2313e-07, 2.9988e-07, 2.7296e-07, 2.5033e-07, 1.7768e-07, 9.4737e-08,
         5.7653e-08, 4.3405e-08, 1.8149e-08, 1.0000e+00],
        [1.7143e-07, 2.1386e-07, 1.9132e-07, 1.7443e-07, 1.2583e-07, 6.9662e-08,
         4.4091e-08, 3.3760e-08, 1.4635e-0

Epoch: 82.00, Train Loss: 2.78, Val Loss: 10.70, Train BLEU: 15.46, Val BLEU: 0.24, Minutes Elapsed: 5.14
Sampling from training predictions...
Source: 这 是 这些 热液 出口 口中 的 一个 <EOS> <PAD>
Reference: this is one of these hydrothermal vents . <EOS>
Model: <SOS> it &apos;s &apos;s a . . . <EOS> <EOS>
Attention Weights: tensor([[4.3907e-07, 6.1626e-07, 6.1503e-07, 5.3360e-07, 3.2080e-07, 2.1354e-07,
         1.5067e-07, 9.1910e-08, 3.9547e-08, 1.0000e+00],
        [3.5429e-07, 4.9517e-07, 4.9103e-07, 4.1755e-07, 2.3925e-07, 1.5592e-07,
         1.0913e-07, 6.6377e-08, 2.8905e-08, 1.0000e+00],
        [2.7451e-07, 3.6325e-07, 3.5298e-07, 2.9881e-07, 1.7255e-07, 1.1452e-07,
         8.1306e-08, 5.0160e-08, 2.2486e-08, 1.0000e+00],
        [2.0433e-07, 2.4986e-07, 2.3772e-07, 2.0344e-07, 1.2190e-07, 8.3823e-08,
         6.1160e-08, 3.8813e-08, 1.8064e-08, 1.0000e+00],
        [1.5173e-07, 1.6997e-07, 1.5900e-07, 1.3829e-07, 8.7776e-08, 6.3183e-08,
         4.7640e-08, 3.1301e-08, 1.5179e-08, 1.0

Epoch: 85.00, Train Loss: 2.73, Val Loss: 10.73, Train BLEU: 15.28, Val BLEU: 0.23, Minutes Elapsed: 5.30
Sampling from training predictions...
Source: 只是 一个 难以 难以置信 置信 的 故事 <EOS> <PAD> <PAD>
Reference: it &apos;s an incredible story . <EOS> <PAD> <PAD>
Model: <SOS> it &apos;s a a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.3154e-07, 1.5618e-07, 1.1592e-07, 7.5336e-08, 9.3570e-08, 6.5648e-08,
         3.9127e-08, 1.7087e-08, 5.0000e-01, 5.0000e-01],
        [9.3628e-08, 1.0889e-07, 7.9988e-08, 5.1095e-08, 6.1777e-08, 4.3248e-08,
         2.5855e-08, 1.1497e-08, 5.0000e-01, 5.0000e-01],
        [6.9602e-08, 7.8063e-08, 5.7583e-08, 3.7376e-08, 4.5569e-08, 3.2508e-08,
         1.9797e-08, 9.0789e-09, 5.0000e-01, 5.0000e-01],
        [4.9433e-08, 5.2970e-08, 3.9695e-08, 2.6782e-08, 3.2802e-08, 2.4346e-08,
         1.5432e-08, 7.3907e-09, 5.0000e-01, 5.0000e-01],
        [3.7608e-08, 3.8111e-08, 2.9453e-08, 2.1152e-08, 2.6220e-08, 2.0474e-08,
         1.3678e-08, 6.9214e-09, 5.0000e

Epoch: 88.00, Train Loss: 2.68, Val Loss: 10.88, Train BLEU: 15.52, Val BLEU: 0.24, Minutes Elapsed: 5.47
Sampling from training predictions...
Source: 这 是 这些 热液 出口 口中 的 一个 <EOS> <PAD>
Reference: this is one of these hydrothermal vents . <EOS>
Model: <SOS> it &apos;s &apos;s a . . . <EOS> <EOS>
Attention Weights: tensor([[3.4154e-07, 5.2403e-07, 5.2736e-07, 4.6273e-07, 3.0334e-07, 2.0384e-07,
         1.4048e-07, 8.5360e-08, 4.0933e-08, 1.0000e+00],
        [2.4108e-07, 3.7045e-07, 3.7413e-07, 3.2785e-07, 2.1527e-07, 1.4526e-07,
         1.0048e-07, 6.1633e-08, 3.0095e-08, 1.0000e+00],
        [1.5997e-07, 2.3524e-07, 2.3557e-07, 2.0948e-07, 1.4699e-07, 1.0380e-07,
         7.4136e-08, 4.6827e-08, 2.3685e-08, 1.0000e+00],
        [1.1536e-07, 1.5744e-07, 1.5528e-07, 1.4236e-07, 1.0848e-07, 8.1431e-08,
         6.0937e-08, 4.0108e-08, 2.0916e-08, 1.0000e+00],
        [8.6962e-08, 1.0929e-07, 1.0673e-07, 1.0094e-07, 8.4846e-08, 6.7996e-08,
         5.3420e-08, 3.6887e-08, 2.0052e-08, 1.0

Epoch: 91.00, Train Loss: 2.64, Val Loss: 10.94, Train BLEU: 16.23, Val BLEU: 0.24, Minutes Elapsed: 5.63
Sampling from training predictions...
Source: 它们 还 没有 被 研究 透 <EOS> <PAD> <PAD> <PAD>
Reference: they &apos;re very little understood . <EOS> <PAD> <PAD>
Model: <SOS> it &apos;s &apos;s a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[7.6484e-08, 9.8985e-08, 8.1659e-08, 6.1748e-08, 4.2883e-08, 2.9042e-08,
         1.3247e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [5.1671e-08, 6.6201e-08, 5.4216e-08, 4.0799e-08, 2.8315e-08, 1.9253e-08,
         8.9820e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [3.4375e-08, 4.4249e-08, 3.7129e-08, 2.8864e-08, 2.0548e-08, 1.4291e-08,
         6.9033e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.5255e-08, 3.2541e-08, 2.8206e-08, 2.3062e-08, 1.7198e-08, 1.2397e-08,
         6.2015e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.0170e-08, 2.5812e-08, 2.3633e-08, 2.0889e-08, 1.6726e-08, 1.2840e-08,
         6.9249e-09, 3.3333e-

Epoch: 94.00, Train Loss: 2.60, Val Loss: 11.07, Train BLEU: 17.97, Val BLEU: 0.24, Minutes Elapsed: 5.78
Sampling from training predictions...
Source: 它们 实际 实际上 是 感光 <UNK> 光器 器官 <EOS> <PAD>
Reference: it &apos;s actually a <UNK> organ . <EOS> <PAD>
Model: <SOS> it &apos;s the a . . . <EOS> <EOS>
Attention Weights: tensor([[2.5441e-07, 4.9901e-07, 4.8433e-07, 5.1533e-07, 3.6609e-07, 1.4802e-07,
         9.7874e-08, 8.1519e-08, 3.9923e-08, 1.0000e+00],
        [1.8040e-07, 3.5788e-07, 3.4694e-07, 3.7641e-07, 2.6990e-07, 1.0863e-07,
         7.3021e-08, 6.1055e-08, 3.0786e-08, 1.0000e+00],
        [1.1686e-07, 2.1981e-07, 2.1226e-07, 2.3455e-07, 1.7508e-07, 7.6210e-08,
         5.5531e-08, 4.7634e-08, 2.4921e-08, 1.0000e+00],
        [8.4814e-08, 1.4819e-07, 1.4286e-07, 1.6106e-07, 1.2797e-07, 6.2509e-08,
         5.0315e-08, 4.4410e-08, 2.3836e-08, 1.0000e+00],
        [6.2378e-08, 9.9794e-08, 9.6071e-08, 1.1048e-07, 9.4097e-08, 5.2414e-08,
         4.7016e-08, 4.2877e-08, 2.4062e-08, 1

Epoch: 97.00, Train Loss: 2.55, Val Loss: 11.14, Train BLEU: 17.98, Val BLEU: 0.25, Minutes Elapsed: 5.94
Sampling from training predictions...
Source: 这 只是 x 翼 死亡 水母 <EOS> <PAD> <PAD> <PAD>
Reference: here &apos;s the <UNK> death jelly . <EOS> <PAD>
Model: <SOS> here &apos;s a a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[8.0510e-08, 1.0175e-07, 7.6084e-08, 5.5305e-08, 3.5094e-08, 2.2892e-08,
         1.1801e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [5.8029e-08, 7.1690e-08, 5.2040e-08, 3.7141e-08, 2.3382e-08, 1.5261e-08,
         8.0650e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [3.8733e-08, 4.7153e-08, 3.5505e-08, 2.6012e-08, 1.6801e-08, 1.1194e-08,
         6.0907e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.9434e-08, 3.5608e-08, 2.8918e-08, 2.2432e-08, 1.5360e-08, 1.0650e-08,
         5.9307e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [2.3439e-08, 2.8026e-08, 2.4905e-08, 2.0602e-08, 1.5090e-08, 1.1082e-08,
         6.4645e-09, 3.3333e-01, 3.3333e-

Epoch: 100.00, Train Loss: 2.50, Val Loss: 11.24, Train BLEU: 18.74, Val BLEU: 0.25, Minutes Elapsed: 6.11
Sampling from training predictions...
Source: 非常 非常感谢 感谢 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: thank you very much . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> thank &apos;s a . . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[5.9834e-09, 5.6428e-09, 1.6685e-08, 8.8948e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.4504e-09, 1.2466e-09, 3.6144e-09, 1.9682e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.2671e-09, 1.0595e-09, 2.8851e-09, 1.6335e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.5920e-09, 1.3490e-09, 3.6286e-09, 2.1386e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [4.6820e-09, 4.3814e-09, 1.1302e-08, 7.1314e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6

Epoch: 103.00, Train Loss: 2.45, Val Loss: 11.31, Train BLEU: 21.46, Val BLEU: 0.24, Minutes Elapsed: 6.28
Sampling from training predictions...
Source: 这些 柱子 可以 有 几层 <UNK> 层楼 高 <EOS> <PAD>
Reference: those pillars get up to several stories . <EOS>
Model: <SOS> this &apos;s the a is . . . <EOS>
Attention Weights: tensor([[2.5980e-07, 5.6572e-07, 6.2119e-07, 5.8554e-07, 3.9371e-07, 1.2903e-07,
         7.2381e-08, 7.2380e-08, 3.5185e-08, 1.0000e+00],
        [2.2274e-07, 4.9393e-07, 5.4872e-07, 5.2637e-07, 3.5403e-07, 1.1237e-07,
         6.2225e-08, 6.2039e-08, 3.1085e-08, 1.0000e+00],
        [1.5912e-07, 3.3628e-07, 3.6803e-07, 3.5534e-07, 2.4318e-07, 8.0421e-08,
         4.8410e-08, 4.8851e-08, 2.5095e-08, 1.0000e+00],
        [1.1954e-07, 2.3606e-07, 2.5052e-07, 2.4526e-07, 1.7744e-07, 6.5587e-08,
         4.5092e-08, 4.6273e-08, 2.4138e-08, 1.0000e+00],
        [8.6032e-08, 1.5660e-07, 1.6281e-07, 1.6197e-07, 1.2526e-07, 5.3128e-08,
         4.1918e-08, 4.4713e-08, 2.4318e-08, 1.0

Epoch: 106.00, Train Loss: 2.40, Val Loss: 11.40, Train BLEU: 21.86, Val BLEU: 0.24, Minutes Elapsed: 6.52
Sampling from training predictions...
Source: 这 是 一个 火山 山脊 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: this is a volcanic ridge . <EOS> <PAD> <PAD>
Model: <SOS> here &apos;s a a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[5.0745e-08, 6.9497e-08, 5.3306e-08, 3.3834e-08, 2.1393e-08, 1.0683e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [3.2636e-08, 4.2452e-08, 3.1598e-08, 1.9810e-08, 1.2510e-08, 6.3478e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.3321e-08, 3.0003e-08, 2.2628e-08, 1.4512e-08, 9.3270e-09, 4.8720e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.0386e-08, 2.6392e-08, 2.1155e-08, 1.4633e-08, 9.9319e-09, 5.3729e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [1.8968e-08, 2.4799e-08, 2.1713e-08, 1.6709e-08, 1.2462e-08, 7.3009e-09,
         2.5000e-01, 2.5000e-01, 2.5000e

Epoch: 109.00, Train Loss: 2.35, Val Loss: 11.48, Train BLEU: 21.78, Val BLEU: 0.24, Minutes Elapsed: 6.72
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a here . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[2.5109e-08, 3.4127e-08, 2.5281e-08, 1.7461e-08, 8.6188e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.0234e-08, 1.3172e-08, 9.6380e-09, 6.6677e-09, 3.3689e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [8.4197e-09, 1.0669e-08, 8.0396e-09, 5.7214e-09, 3.0191e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [9.8715e-09, 1.2666e-08, 1.0223e-08, 7.7265e-09, 4.3137e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.5927e-08, 2.1396e-08, 1.9320e-08, 1.6211e-08, 1.0049e-08, 2.0000e-01,
         2.0000e-01, 2.0000e

Epoch: 112.00, Train Loss: 2.30, Val Loss: 11.56, Train BLEU: 22.92, Val BLEU: 0.24, Minutes Elapsed: 6.89
Sampling from training predictions...
Source: 你 真的 能 看穿 它 的 脑袋 <EOS> <PAD> <PAD>
Reference: you can actually see through his head . <EOS>
Model: <SOS> it &apos;s the a is . . . <EOS>
Attention Weights: tensor([[1.9023e-07, 4.3460e-07, 4.1443e-07, 2.5689e-07, 1.5303e-07, 8.4809e-08,
         4.7439e-08, 2.2746e-08, 5.0000e-01, 5.0000e-01],
        [1.7823e-07, 4.0479e-07, 3.9785e-07, 2.4794e-07, 1.4645e-07, 8.1152e-08,
         4.5460e-08, 2.2038e-08, 5.0000e-01, 5.0000e-01],
        [1.2273e-07, 2.6422e-07, 2.7271e-07, 1.8163e-07, 1.1512e-07, 6.7163e-08,
         3.9112e-08, 1.9660e-08, 5.0000e-01, 5.0000e-01],
        [1.0466e-07, 2.0527e-07, 2.2398e-07, 1.6663e-07, 1.1870e-07, 7.5817e-08,
         4.7335e-08, 2.4440e-08, 5.0000e-01, 5.0000e-01],
        [7.2366e-08, 1.3051e-07, 1.5270e-07, 1.2840e-07, 1.0429e-07, 7.4795e-08,
         5.1453e-08, 2.8478e-08, 5.0000e-01, 5.0000e-0

Epoch: 115.00, Train Loss: 2.25, Val Loss: 11.63, Train BLEU: 22.81, Val BLEU: 0.24, Minutes Elapsed: 7.07
Sampling from training predictions...
Source: 看到 这些 在 动 的 东西 了 吗 <EOS> <PAD>
Reference: but see all those different working things ? <EOS>
Model: <SOS> it &apos;s the a is . . . <EOS>
Attention Weights: tensor([[3.2480e-07, 8.7257e-07, 1.1199e-06, 9.6036e-07, 6.2430e-07, 3.7057e-07,
         2.0769e-07, 1.2416e-07, 5.3745e-08, 1.0000e+00],
        [3.0667e-07, 8.0413e-07, 1.0587e-06, 9.4425e-07, 6.2594e-07, 3.7514e-07,
         2.1252e-07, 1.2789e-07, 5.6242e-08, 1.0000e+00],
        [2.2527e-07, 5.4929e-07, 7.3303e-07, 6.9872e-07, 4.9983e-07, 3.2447e-07,
         1.9743e-07, 1.2430e-07, 5.7233e-08, 1.0000e+00],
        [2.0091e-07, 4.3892e-07, 5.7451e-07, 5.8599e-07, 4.6489e-07, 3.3637e-07,
         2.2697e-07, 1.5324e-07, 7.3664e-08, 1.0000e+00],
        [1.4057e-07, 2.7773e-07, 3.6471e-07, 3.9839e-07, 3.5126e-07, 2.8584e-07,
         2.1806e-07, 1.6076e-07, 8.4186e-08, 1.0000e+

Epoch: 118.00, Train Loss: 2.20, Val Loss: 11.67, Train BLEU: 25.10, Val BLEU: 0.24, Minutes Elapsed: 7.22
Sampling from training predictions...
Source: 它们 还 没有 被 研究 透 <EOS> <PAD> <PAD> <PAD>
Reference: they &apos;re very little understood . <EOS> <PAD> <PAD>
Model: <SOS> it &apos;s a a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.1655e-07, 2.1350e-07, 2.0098e-07, 1.3671e-07, 7.5737e-08, 4.8734e-08,
         2.1323e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [1.1342e-07, 2.0685e-07, 1.9227e-07, 1.2989e-07, 7.2524e-08, 4.6755e-08,
         2.0440e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [7.9450e-08, 1.4728e-07, 1.4100e-07, 1.0114e-07, 6.0251e-08, 4.0336e-08,
         1.8461e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [7.0488e-08, 1.2950e-07, 1.3070e-07, 1.0425e-07, 6.9120e-08, 4.9496e-08,
         2.3986e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [4.4212e-08, 8.0727e-08, 8.9360e-08, 8.4552e-08, 6.7501e-08, 5.4459e-08,
         3.0531e-08, 3.3333e-01, 3

Epoch: 121.00, Train Loss: 2.15, Val Loss: 11.79, Train BLEU: 25.20, Val BLEU: 0.24, Minutes Elapsed: 7.39
Sampling from training predictions...
Source: 这些 都 是 虾 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: those are all shrimp . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a a . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[2.4636e-08, 4.2732e-08, 3.8014e-08, 2.5751e-08, 1.1588e-08, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.2200e-08, 1.9393e-08, 1.6930e-08, 1.1631e-08, 5.3469e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.2465e-08, 1.9017e-08, 1.7097e-08, 1.2300e-08, 6.0636e-09, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [1.6289e-08, 2.5455e-08, 2.4553e-08, 1.9192e-08, 1.0380e-08, 2.0000e-01,
         2.0000e-01, 2.0000e-01, 2.0000e-01, 2.0000e-01],
        [2.3626e-08, 4.0019e-08, 4.3815e-08, 3.9570e-08, 2.5850e-08, 2.0000e-01,
         2.0000e-01, 2.0000e-01,

Epoch: 124.00, Train Loss: 2.10, Val Loss: 11.81, Train BLEU: 25.23, Val BLEU: 0.24, Minutes Elapsed: 7.57
Sampling from training predictions...
Source: 这些 柱子 可以 有 几层 <UNK> 层楼 高 <EOS> <PAD>
Reference: those pillars get up to several stories . <EOS>
Model: <SOS> this &apos;s the is is . . . <EOS>
Attention Weights: tensor([[2.1567e-07, 6.7538e-07, 9.7809e-07, 9.1809e-07, 4.5254e-07, 6.8047e-08,
         5.2752e-08, 6.3211e-08, 3.1692e-08, 1.0000e+00],
        [2.2962e-07, 7.0298e-07, 1.0537e-06, 1.0177e-06, 5.1316e-07, 7.5942e-08,
         6.0844e-08, 7.4188e-08, 3.7790e-08, 1.0000e+00],
        [1.8878e-07, 5.2293e-07, 7.5740e-07, 7.5012e-07, 4.1089e-07, 7.3028e-08,
         6.5991e-08, 8.3773e-08, 4.3899e-08, 1.0000e+00],
        [1.6785e-07, 4.1906e-07, 5.7293e-07, 5.7345e-07, 3.4833e-07, 7.5959e-08,
         7.9737e-08, 1.0130e-07, 5.4155e-08, 1.0000e+00],
        [1.2177e-07, 2.7572e-07, 3.6165e-07, 3.7110e-07, 2.5367e-07, 7.0005e-08,
         8.4227e-08, 1.0861e-07, 6.1413e-08, 1.

Epoch: 127.00, Train Loss: 2.05, Val Loss: 11.91, Train BLEU: 27.53, Val BLEU: 0.24, Minutes Elapsed: 7.74
Sampling from training predictions...
Source: 这里 有 蟹 还有 蠕虫 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s crabs here , worms here . <EOS>
Model: <SOS> here &apos;s a a here . . <EOS> <EOS>
Attention Weights: tensor([[7.3102e-08, 1.2395e-07, 1.2090e-07, 6.5902e-08, 3.7034e-08, 1.6975e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [6.7119e-08, 1.0703e-07, 1.0312e-07, 5.8154e-08, 3.3731e-08, 1.5476e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [5.4604e-08, 8.8035e-08, 8.7224e-08, 5.3342e-08, 3.2995e-08, 1.5938e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [5.0458e-08, 8.0657e-08, 8.4785e-08, 5.9532e-08, 4.0829e-08, 2.1318e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [3.6929e-08, 5.9329e-08, 6.7396e-08, 5.6335e-08, 4.4009e-08, 2.5933e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-

Epoch: 130.00, Train Loss: 2.01, Val Loss: 11.97, Train BLEU: 27.86, Val BLEU: 0.24, Minutes Elapsed: 7.90
Sampling from training predictions...
Source: 你 真的 能 看穿 它 的 脑袋 <EOS> <PAD> <PAD>
Reference: you can actually see through his head . <EOS>
Model: <SOS> this &apos;s the a is stories . . <EOS>
Attention Weights: tensor([[2.0841e-07, 6.7157e-07, 9.6830e-07, 6.1874e-07, 3.4530e-07, 1.5916e-07,
         7.4233e-08, 3.2777e-08, 5.0000e-01, 5.0000e-01],
        [2.4769e-07, 7.6208e-07, 1.1715e-06, 7.7721e-07, 4.3812e-07, 2.0872e-07,
         1.0076e-07, 4.4771e-08, 5.0000e-01, 5.0000e-01],
        [2.0362e-07, 5.7329e-07, 9.0261e-07, 6.3573e-07, 3.9290e-07, 2.0599e-07,
         1.0755e-07, 5.0327e-08, 5.0000e-01, 5.0000e-01],
        [1.8235e-07, 4.4610e-07, 6.7952e-07, 5.2463e-07, 3.7262e-07, 2.2397e-07,
         1.3087e-07, 6.5280e-08, 5.0000e-01, 5.0000e-01],
        [1.2681e-07, 2.7238e-07, 4.1024e-07, 3.5439e-07, 2.9405e-07, 2.0852e-07,
         1.3889e-07, 7.6345e-08, 5.0000e-01, 5

Epoch: 133.00, Train Loss: 1.95, Val Loss: 12.03, Train BLEU: 28.95, Val BLEU: 0.24, Minutes Elapsed: 8.07
Sampling from training predictions...
Source: 看 这里 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here they go . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> here ! good . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[3.9843e-09, 6.1051e-09, 4.4171e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.1928e-09, 1.6559e-09, 1.1803e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.6026e-09, 2.1783e-09, 1.6166e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [5.6052e-09, 8.0206e-09, 6.4885e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.1914e-07, 1.9585e-07, 2.0678e-07, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.42

Epoch: 136.00, Train Loss: 1.94, Val Loss: 12.10, Train BLEU: 29.84, Val BLEU: 0.25, Minutes Elapsed: 8.23
Sampling from training predictions...
Source: 这些 蠕虫 每星期 星期 长 好几 <UNK> <EOS> <PAD> <PAD>
Reference: the worms are growing inches per week ! <EOS>
Model: <SOS> the &apos;s the is several . . <EOS> <EOS>
Attention Weights: tensor([[1.0257e-07, 3.6249e-07, 5.4019e-07, 4.2878e-07, 1.3433e-07, 2.8822e-08,
         3.6728e-09, 4.9907e-09, 5.0000e-01, 5.0000e-01],
        [1.3399e-07, 4.6495e-07, 7.1777e-07, 5.9074e-07, 1.8311e-07, 4.0482e-08,
         5.0611e-09, 7.0574e-09, 5.0000e-01, 5.0000e-01],
        [1.3731e-07, 4.3441e-07, 6.4818e-07, 5.4575e-07, 1.8849e-07, 4.8148e-08,
         6.8464e-09, 9.5487e-09, 5.0000e-01, 5.0000e-01],
        [1.1670e-07, 3.2538e-07, 4.4847e-07, 3.8656e-07, 1.6214e-07, 5.1114e-08,
         8.1589e-09, 1.1761e-08, 5.0000e-01, 5.0000e-01],
        [7.7799e-08, 1.8498e-07, 2.3814e-07, 2.2758e-07, 1.2914e-07, 5.6334e-08,
         1.1351e-08, 1.6834e-08, 5.0

Epoch: 139.00, Train Loss: 1.89, Val Loss: 12.12, Train BLEU: 30.71, Val BLEU: 0.24, Minutes Elapsed: 8.40
Sampling from training predictions...
Source: 地球 的 大部 大部分 部分 都 是 海水 <EOS> <PAD>
Reference: most of the planet is ocean water . <EOS>
Model: <SOS> this is the is is stories . . <EOS>
Attention Weights: tensor([[3.3778e-07, 1.1420e-06, 1.6357e-06, 2.2696e-06, 1.1769e-06, 5.5003e-07,
         2.2482e-07, 9.9519e-08, 4.8801e-08, 9.9999e-01],
        [4.5356e-07, 1.4638e-06, 2.1221e-06, 3.0832e-06, 1.7048e-06, 8.1394e-07,
         3.4559e-07, 1.5788e-07, 7.7410e-08, 9.9999e-01],
        [4.4464e-07, 1.3128e-06, 1.8371e-06, 2.6307e-06, 1.5516e-06, 8.1189e-07,
         3.8134e-07, 1.8705e-07, 9.5535e-08, 9.9999e-01],
        [3.9734e-07, 1.0600e-06, 1.3733e-06, 1.8325e-06, 1.1969e-06, 7.1208e-07,
         3.8077e-07, 2.0825e-07, 1.1285e-07, 9.9999e-01],
        [3.0419e-07, 7.1824e-07, 8.7810e-07, 1.1310e-06, 8.6523e-07, 6.1253e-07,
         3.8611e-07, 2.3837e-07, 1.3981e-07, 9.9999e-01

Epoch: 142.00, Train Loss: 1.90, Val Loss: 12.19, Train BLEU: 30.68, Val BLEU: 0.26, Minutes Elapsed: 8.58
Sampling from training predictions...
Source: 它们 就是 这样 生活 的 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: that &apos;s how they like to live . <EOS>
Model: <SOS> it &apos;s a a like . . <EOS> <EOS>
Attention Weights: tensor([[7.6686e-08, 1.1600e-07, 8.9804e-08, 4.3557e-08, 2.1335e-08, 9.9696e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [1.0785e-07, 1.5405e-07, 1.1587e-07, 5.6812e-08, 2.8366e-08, 1.3015e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [8.6173e-08, 1.2505e-07, 9.6691e-08, 5.1143e-08, 2.7385e-08, 1.3227e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [7.3694e-08, 1.0690e-07, 9.0813e-08, 5.5721e-08, 3.3579e-08, 1.7451e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [4.9539e-08, 7.2860e-08, 7.3592e-08, 5.6477e-08, 4.0215e-08, 2.4137e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-01,

Epoch: 145.00, Train Loss: 2.04, Val Loss: 12.22, Train BLEU: 28.12, Val BLEU: 0.25, Minutes Elapsed: 8.74
Sampling from training predictions...
Source: 而 努力 去 保存 现状 则 是 徒劳 的 <EOS>
Reference: the concept of preservation is futile . <EOS> <PAD>
Model: <SOS> the concept of preservation is futile . <EOS> <EOS>
Attention Weights: tensor([[0.0384, 0.1260, 0.1676, 0.1945, 0.2533, 0.1234, 0.0624, 0.0221, 0.0087,
         0.0037],
        [0.0332, 0.1076, 0.1463, 0.1733, 0.2468, 0.1521, 0.0861, 0.0343, 0.0142,
         0.0061],
        [0.0305, 0.1001, 0.1404, 0.1692, 0.2547, 0.1592, 0.0900, 0.0354, 0.0145,
         0.0061],
        [0.0365, 0.1139, 0.1514, 0.1780, 0.2474, 0.1443, 0.0795, 0.0308, 0.0128,
         0.0055],
        [0.0277, 0.0768, 0.1002, 0.1182, 0.2101, 0.1985, 0.1430, 0.0726, 0.0363,
         0.0165],
        [0.0490, 0.1238, 0.1514, 0.1670, 0.2136, 0.1348, 0.0841, 0.0421, 0.0225,
         0.0117],
        [0.0308, 0.0517, 0.0562, 0.0587, 0.0849, 0.1136, 0.1354, 0.1545, 0.167

Epoch: 148.00, Train Loss: 1.84, Val Loss: 12.27, Train BLEU: 31.75, Val BLEU: 0.27, Minutes Elapsed: 8.90
Sampling from training predictions...
Source: 恩 厉害 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: oh ! good going . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> oh ! good . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.7121e-09, 3.0271e-09, 2.5926e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [5.2679e-10, 8.2489e-10, 6.8318e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [7.6425e-10, 1.1783e-09, 9.9467e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [3.2761e-09, 5.2474e-09, 4.7183e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [8.3187e-08, 1.4942e-07, 1.7271e-07, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-0

Epoch: 151.00, Train Loss: 1.77, Val Loss: 12.27, Train BLEU: 32.34, Val BLEU: 0.26, Minutes Elapsed: 9.07
Sampling from training predictions...
Source: 学习 掌握 它 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: learn to manage it . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a shrimp . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[3.1155e-09, 4.8160e-09, 5.0385e-09, 3.2696e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.3973e-09, 1.9084e-09, 1.8606e-09, 1.1789e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.6894e-09, 2.2923e-09, 2.2671e-09, 1.5081e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [3.5571e-09, 5.3237e-09, 5.6071e-09, 3.9888e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.5403e-08, 2.7307e-08, 3.3840e-08, 3.0811e-08, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6

Epoch: 154.00, Train Loss: 1.71, Val Loss: 12.37, Train BLEU: 31.77, Val BLEU: 0.26, Minutes Elapsed: 9.25
Sampling from training predictions...
Source: 这 是 一种 种群 栖 动物 <EOS> <PAD> <PAD> <PAD>
Reference: it &apos;s a colonial animal . <EOS> <PAD> <PAD>
Model: <SOS> here &apos;s a a . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[5.5887e-08, 1.0079e-07, 1.0557e-07, 6.1417e-08, 2.7964e-08, 1.3429e-08,
         7.1883e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [6.5464e-08, 1.0626e-07, 1.0793e-07, 6.3461e-08, 3.0093e-08, 1.4963e-08,
         7.9791e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [5.5436e-08, 8.8255e-08, 9.0746e-08, 5.7624e-08, 3.0165e-08, 1.6282e-08,
         9.1867e-09, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [4.8193e-08, 7.6167e-08, 8.3342e-08, 6.3221e-08, 4.0809e-08, 2.5817e-08,
         1.5727e-08, 3.3333e-01, 3.3333e-01, 3.3333e-01],
        [4.2537e-08, 6.7176e-08, 8.2721e-08, 7.9921e-08, 6.8803e-08, 5.5375e-08,
         3.9658e-08, 3.3333e-01, 3.3333e

Epoch: 157.00, Train Loss: 1.70, Val Loss: 12.38, Train BLEU: 33.00, Val BLEU: 0.26, Minutes Elapsed: 9.42
Sampling from training predictions...
Source: 看 这里 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here they go . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> oh ! good . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.2402e-09, 2.3259e-09, 2.2142e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [3.8372e-10, 6.2785e-10, 5.6844e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [5.6108e-10, 9.1021e-10, 8.3276e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [2.7754e-09, 4.7650e-09, 4.6961e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [1.2530e-07, 2.5577e-07, 3.3628e-07, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286

Epoch: 160.00, Train Loss: 1.65, Val Loss: 12.40, Train BLEU: 33.02, Val BLEU: 0.26, Minutes Elapsed: 9.59
Sampling from training predictions...
Source: 看 这里 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here they go . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> oh ! go . <EOS> <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.0571e-09, 2.0258e-09, 2.0084e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [3.2821e-10, 5.4728e-10, 5.1384e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [4.9007e-10, 8.0840e-10, 7.6360e-10, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [2.9391e-09, 5.1423e-09, 5.2713e-09, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-01, 1.4286e-01, 1.4286e-01],
        [2.1355e-07, 4.5058e-07, 6.2363e-07, 1.4286e-01, 1.4286e-01, 1.4286e-01,
         1.4286e-01, 1.4286e-

Epoch: 163.00, Train Loss: 1.62, Val Loss: 12.46, Train BLEU: 32.84, Val BLEU: 0.26, Minutes Elapsed: 9.75
Sampling from training predictions...
Source: 这里 有 蟹 还有 蠕虫 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s crabs here , worms here . <EOS>
Model: <SOS> here &apos;s a a here . . <EOS> <EOS>
Attention Weights: tensor([[2.7281e-08, 3.4093e-08, 3.3665e-08, 1.5398e-08, 7.6577e-09, 4.1892e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [3.6404e-08, 4.0117e-08, 3.8558e-08, 1.7871e-08, 8.9042e-09, 4.7329e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [3.1607e-08, 3.5547e-08, 3.4756e-08, 1.7890e-08, 9.7474e-09, 5.4683e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.6622e-08, 3.2441e-08, 3.3996e-08, 2.1835e-08, 1.4003e-08, 8.3380e-09,
         2.5000e-01, 2.5000e-01, 2.5000e-01, 2.5000e-01],
        [2.2282e-08, 3.0060e-08, 3.5251e-08, 3.0111e-08, 2.4098e-08, 1.6553e-08,
         2.5000e-01, 2.5000e-01, 2.5000e-

Epoch: 166.00, Train Loss: 1.58, Val Loss: 12.48, Train BLEU: 34.62, Val BLEU: 0.26, Minutes Elapsed: 9.93
Sampling from training predictions...
Source: 只是 一个 难以 难以置信 置信 的 故事 <EOS> <PAD> <PAD>
Reference: it &apos;s an incredible story . <EOS> <PAD> <PAD>
Model: <SOS> it &apos;s an incredible story . <EOS> <EOS> <EOS>
Attention Weights: tensor([[8.1898e-08, 1.9480e-07, 2.5753e-07, 1.3004e-07, 9.3945e-08, 3.9947e-08,
         1.7316e-08, 9.0109e-09, 5.0000e-01, 5.0000e-01],
        [1.3966e-07, 3.1665e-07, 4.0974e-07, 2.0071e-07, 1.3968e-07, 5.9841e-08,
         2.6116e-08, 1.3098e-08, 5.0000e-01, 5.0000e-01],
        [1.0996e-07, 2.4526e-07, 3.1528e-07, 1.6477e-07, 1.2060e-07, 5.6819e-08,
         2.6914e-08, 1.4206e-08, 5.0000e-01, 5.0000e-01],
        [8.6261e-08, 1.7933e-07, 2.2087e-07, 1.3200e-07, 1.1247e-07, 6.5971e-08,
         3.6820e-08, 2.0807e-08, 5.0000e-01, 5.0000e-01],
        [6.0894e-08, 1.1424e-07, 1.4390e-07, 1.1447e-07, 1.1628e-07, 9.6232e-08,
         7.3504e-08, 5.08

Epoch: 169.00, Train Loss: 1.52, Val Loss: 12.52, Train BLEU: 35.64, Val BLEU: 0.26, Minutes Elapsed: 10.10
Sampling from training predictions...
Source: 它 是否 是否是 很 敏感 呢 是 的 <EOS> <PAD>
Reference: is it sensitive ? yes . <EOS> <PAD> <PAD>
Model: <SOS> is &apos;s sensitive of . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.9461e-07, 1.0478e-06, 1.3903e-06, 1.6492e-06, 6.7878e-07, 2.3586e-07,
         8.7453e-08, 3.4311e-08, 1.7367e-08, 9.9999e-01],
        [3.3627e-07, 1.5763e-06, 2.0098e-06, 2.4538e-06, 1.0610e-06, 3.8455e-07,
         1.5462e-07, 6.4432e-08, 3.2843e-08, 9.9999e-01],
        [3.0168e-07, 1.3001e-06, 1.5935e-06, 1.9412e-06, 8.8356e-07, 3.5903e-07,
         1.6181e-07, 7.3414e-08, 3.8970e-08, 9.9999e-01],
        [2.2357e-07, 7.6937e-07, 8.6575e-07, 1.0829e-06, 6.1716e-07, 3.3550e-07,
         1.9519e-07, 1.0807e-07, 6.2718e-08, 1.0000e+00],
        [1.4634e-07, 3.6732e-07, 3.9983e-07, 5.1891e-07, 4.4494e-07, 3.7875e-07,
         3.2069e-07, 2.4441e-07, 1.7816e-07, 1

Epoch: 172.00, Train Loss: 1.48, Val Loss: 12.57, Train BLEU: 36.74, Val BLEU: 0.28, Minutes Elapsed: 10.26
Sampling from training predictions...
Source: 非常 非常感谢 感谢 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: thank you very much . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> thank you very much . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[1.2988e-09, 1.4284e-09, 6.0684e-09, 9.2807e-09, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.1342e-10, 9.3305e-11, 3.5932e-10, 5.4093e-10, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [7.4417e-11, 5.5983e-11, 1.9477e-10, 2.7895e-10, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.6749e-10, 1.2502e-10, 5.0541e-10, 7.3022e-10, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 1.6667e-01, 1.6667e-01, 1.6667e-01],
        [1.8810e-09, 2.0507e-09, 9.4859e-09, 1.5074e-08, 1.6667e-01, 1.6667e-01,
         1.6667e-01, 

Epoch: 175.00, Train Loss: 1.44, Val Loss: 12.64, Train BLEU: 37.83, Val BLEU: 0.26, Minutes Elapsed: 10.42
Sampling from training predictions...
Source: 这 是 这些 热液 出口 口中 的 一个 <EOS> <PAD>
Reference: this is one of these hydrothermal vents . <EOS>
Model: <SOS> this is the is is hydrothermal . . <EOS>
Attention Weights: tensor([[2.7149e-07, 1.1793e-06, 1.8169e-06, 1.5689e-06, 4.3751e-07, 1.2875e-07,
         4.6611e-08, 1.8119e-08, 9.5667e-09, 9.9999e-01],
        [3.9913e-07, 1.6251e-06, 2.5401e-06, 2.2539e-06, 6.9170e-07, 2.1939e-07,
         8.1977e-08, 3.2672e-08, 1.7169e-08, 9.9999e-01],
        [3.7927e-07, 1.4158e-06, 2.1343e-06, 1.8859e-06, 6.4431e-07, 2.3479e-07,
         9.9166e-08, 4.2540e-08, 2.3120e-08, 9.9999e-01],
        [3.1509e-07, 1.0036e-06, 1.3796e-06, 1.1991e-06, 4.9970e-07, 2.3722e-07,
         1.2569e-07, 6.4255e-08, 3.7287e-08, 1.0000e+00],
        [2.3477e-07, 6.1681e-07, 7.8897e-07, 7.0800e-07, 4.0540e-07, 2.5943e-07,
         1.7480e-07, 1.0957e-07, 7.0003e-08,

Epoch: 178.00, Train Loss: 1.38, Val Loss: 12.66, Train BLEU: 44.20, Val BLEU: 0.26, Minutes Elapsed: 10.59
Sampling from training predictions...
Source: 这些 更 小 地动 动物 会 蜷缩 缩在 周围 <EOS>
Reference: there are smaller animals crawling around . <EOS> <PAD>
Model: <SOS> there are smaller animals crawling around . <EOS> <EOS>
Attention Weights: tensor([[0.0553, 0.1403, 0.3091, 0.3902, 0.0751, 0.0169, 0.0067, 0.0022, 0.0024,
         0.0019],
        [0.0447, 0.1184, 0.2861, 0.4332, 0.0812, 0.0199, 0.0085, 0.0027, 0.0029,
         0.0023],
        [0.0402, 0.1159, 0.2943, 0.4400, 0.0776, 0.0180, 0.0074, 0.0023, 0.0024,
         0.0019],
        [0.0527, 0.1365, 0.3103, 0.4011, 0.0702, 0.0160, 0.0067, 0.0023, 0.0024,
         0.0019],
        [0.0434, 0.1298, 0.2886, 0.3880, 0.0922, 0.0291, 0.0142, 0.0048, 0.0055,
         0.0044],
        [0.0657, 0.1448, 0.2828, 0.3583, 0.0831, 0.0279, 0.0156, 0.0070, 0.0080,
         0.0068],
        [0.0032, 0.0050, 0.0079, 0.0130, 0.0112, 0.0228, 0.0585, 0.

Epoch: 181.00, Train Loss: 1.33, Val Loss: 12.71, Train BLEU: 44.55, Val BLEU: 0.27, Minutes Elapsed: 10.76
Sampling from training predictions...
Source: 看到 这些 在 动 的 东西 了 吗 <EOS> <PAD>
Reference: but see all those different working things ? <EOS>
Model: <SOS> but see the see different ocean . . <EOS>
Attention Weights: tensor([[2.0771e-07, 8.9360e-07, 1.7339e-06, 1.5293e-06, 4.8197e-07, 1.6177e-07,
         4.8383e-08, 1.6944e-08, 8.9076e-09, 9.9999e-01],
        [3.1845e-07, 1.3108e-06, 2.6354e-06, 2.4812e-06, 8.0059e-07, 2.7805e-07,
         8.3298e-08, 2.9202e-08, 1.5296e-08, 9.9999e-01],
        [3.4960e-07, 1.3085e-06, 2.5869e-06, 2.5063e-06, 8.7023e-07, 3.3407e-07,
         1.1283e-07, 4.2497e-08, 2.2466e-08, 9.9999e-01],
        [2.8446e-07, 9.3048e-07, 1.5722e-06, 1.4294e-06, 5.7649e-07, 2.7724e-07,
         1.1962e-07, 5.3756e-08, 3.0594e-08, 9.9999e-01],
        [2.1463e-07, 5.8058e-07, 8.5707e-07, 8.0941e-07, 4.2924e-07, 2.7183e-07,
         1.5320e-07, 8.4392e-08, 5.4033e-0

In [None]:
# Display the key hyperparameter / validation-loss columns for the first rows
# of the summarized experiment log (presumably a pandas DataFrame - confirm).
summary_columns = ['dt_created', 'num_epochs', 'learning_rate', 'clip_grad_max_norm', 'val_loss']
experiment_summary = summarize_results(load_experiment_log())
experiment_summary[summary_columns].head()

In [None]:
# Plot the learning curve for `results`, which is defined outside this cell
# (presumably the output of the training run above - confirm).
plot_single_learning_curve(results)

In [None]:
# Recorded final log line (presumably from an earlier run - confirm), kept for reference:
#   Epoch: 199.00, Train Loss: 0.32, Val Loss: 13.19, Train BLEU: 98.94, Val BLEU: 0.27
# Train BLEU is close to 99 while val BLEU stays below 1, i.e. a very large train/val gap.
plot_single_learning_curve(results)

In [None]:
# Recorded result with the attention energies computed in switched operand order:
#   energies = v_broadcast.bmm(torch.tanh(self.attn(concat)).transpose(1, 2))
# Final log line for that variant:
#   Epoch: 199.00, Train Loss: 0.63, Val Loss: 12.82, Train BLEU: 92.05, Val BLEU: 0.38
plot_single_learning_curve(results)

In [None]:
# Preview the first 20 entries of the source-language vocabulary as "index: token".
# zip() against range(20) stops after 20 items, so the rest of the id2token
# collection is never iterated just to be skipped.
for i, token in zip(range(20), vocab[SRC_LANG]['id2token']):
    print("{}: {}".format(i, token))

In [None]:
# Preview the first 20 entries of the target-language vocabulary as "index: token".
# zip() against range(20) stops after 20 items, so the rest of the id2token
# collection is never iterated just to be skipped.
for i, token in zip(range(20), vocab[TARG_LANG]['id2token']):
    print("{}: {}".format(i, token))

In [None]:
# Scratch check: drop the first slice along dim 0, then merge the remaining
# leading dims into rows of 10 and confirm the element order is preserved.
# (torch is already imported in the notebook's first cell.)
x = torch.arange(3 * 5 * 10).view(3, 5, 10)  # values 0..149 arranged as (3, 5, 10)
y = x[1:]                                    # keeps slices 1 and 2 -> (2, 5, 10)
z = y.view(-1, 10)                           # collapse leading dims -> (10, 10)
for tensor in (x, y, z):
    print(tensor)

In [None]:
# Scratch check: flatten a (5, 2) tensor directly and after swapping its dims.
# The permuted tensor is made contiguous before .view(-1), so the flattened
# result follows the permuted (2, 5) layout rather than the original one.
t = torch.arange(2 * 5).view(5, 2)
u = t.contiguous().view(-1)  # row-major flatten of t
v = t.permute(1, 0)          # dims swapped -> (2, 5)
w = v.contiguous().view(-1)  # row-major flatten of the swapped layout
for tensor in (t, u, v, w):
    print(tensor)

In [None]:
# Scratch check: 2*1*300 consecutive integers viewed as (-1, 1, 300); the
# leading dimension is inferred from the element count.
a = torch.arange(2 * 1 * 300)
print(a)
b = a.view(-1, 1, 300)
print(b.shape)

In [None]:
# Decode the source-side token ids of the first training batch back into
# space-separated token strings (one string per sentence in the batch).
id2token = vocab[SRC_LANG]['id2token']  # loop-invariant lookup, done once


def to_token(token_ids):
    """Return the tokens for a sequence of vocabulary ids joined by single spaces."""
    return ' '.join(id2token[idx] for idx in token_ids)


for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(full_loaders['train']):
    # src_idxs / targ_idxs / src_lens / targ_lens can be printed here to inspect the batch.
    test_tensor = src_idxs
    list_of_lists = test_tensor.numpy().astype(int).tolist()
    list_of_lists_tokens = list(map(to_token, list_of_lists))
    break  # only the first batch is needed