In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders, text2tokens
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, EncoderDecoderAttn, DecoderAttnRNN, \
    DecoderDotAttnRNN
from train_eval import count_parameters, summarize_results, plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval
import importlib
import pickle as pkl 
import torch

In [2]:
# ---- Experiment identity ----
# NOTE(review): the name says "vanilla" while USE_ATTN below is True -- confirm
# this is the label you want attached to this run.
MODEL_NAME = 'zh-seq2seq-rnn-vanilla'
SRC_LANG, TARG_LANG = 'zh', 'en'

# ---- Data processing ----
# Per-side sentence-length and vocabulary-size settings.
SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN = 10, 10
SRC_VOCAB_SIZE, TARG_VOCAB_SIZE = 30000, 30000

# ---- Model architecture ----
RNN_CELL_TYPE = 'gru'
NUM_LAYERS = 2
ENC_HIDDEN_DIM = 256  # author's alternative value: 512
DEC_HIDDEN_DIM = ENC_HIDDEN_DIM * 2  # decoder width is defined as twice the encoder width
TEACHER_FORCING_RATIO = 1
CLIP_GRAD_MAX_NORM = 1
# TODO (from the original notes): dropout still has "to actually" be implemented.
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2
USE_ATTN = True

# ---- Training ----
BATCH_SIZE = 32
NUM_EPOCHS = 200
LR = 3e-4  # author's alternative value: 0.0005
OPTIMIZER = 'Adam'
LAZY_TRAIN = True
LAZY_TRAIN = True

In [3]:
# Snapshot of every hyperparameter above, kept as a dict so it can be saved
# together with the results later.
params = dict(
    # identification
    model_name=MODEL_NAME,
    src_lang=SRC_LANG,
    targ_lang=TARG_LANG,
    rnn_cell_type=RNN_CELL_TYPE,
    # data processing
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    src_vocab_size=SRC_VOCAB_SIZE,
    targ_vocab_size=TARG_VOCAB_SIZE,
    # architecture
    num_layers=NUM_LAYERS,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    teacher_forcing_ratio=TEACHER_FORCING_RATIO,
    clip_grad_max_norm=CLIP_GRAD_MAX_NORM,
    enc_dropout=ENC_DROPOUT,
    dec_dropout=DEC_DROPOUT,
    use_attn=USE_ATTN,
    # training
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optimizer=OPTIMIZER,
    lazy_train=LAZY_TRAIN,
)

In [4]:
# # One-time step (slow): build the vocab and save it to a pickle so later runs can reload it instead of regenerating it
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [5]:
# Reload the previously pickled vocabulary instead of regenerating it.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# SECURITY NOTE: unpickling can execute arbitrary code -- only load vocab files
# that were produced by this project.
# The context manager guarantees the file handle is closed after loading
# (the previous bare open() left it to the garbage collector).
with open(vocab_filename, "rb") as vocab_file:
    vocab = pkl.load(vocab_file)

# Full dataset (no sample limit passed).
data = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, filter_long=False)
# Tiny subset limited to BATCH_SIZE samples.
data_minibatch = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=BATCH_SIZE, filter_long=False)
# Small subset limited to 1000 samples.
data_minitrain = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=1000, filter_long=False)

In [6]:
# Build one set of dataloaders per dataset variant (full, single-minibatch,
# mini-train), all with the same sentence-length limits and batch size.
loaders_full, loaders_minibatch, loaders_minitrain = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_minibatch, data_minitrain)
)

In [10]:
# Define the model: a shared encoder plus exactly ONE decoder variant.
# Only the active variant is constructed; the alternatives stay commented out so
# that no throw-away decoder (and its pretrained-embedding load) is built and
# then silently overwritten.

encoder = EncoderRNN(rnn_cell_type=RNN_CELL_TYPE, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS, 
                     src_max_sentence_len=SRC_MAX_SENTENCE_LEN, enc_dropout=ENC_DROPOUT, 
                     pretrained_word2vec=get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id']))

# without attention (inactive: previously built here and immediately replaced below)
# decoder = DecoderRNN(dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
#                      targ_vocab_size=TARG_VOCAB_SIZE, targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, 
#                      pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
# model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id'])

# with attention (inactive)
# decoder = DecoderAttnRNN(rnn_cell_type=RNN_CELL_TYPE, dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, 
#                          num_layers=NUM_LAYERS, targ_vocab_size=TARG_VOCAB_SIZE, src_max_sentence_len=SRC_MAX_SENTENCE_LEN, 
#                          targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, dec_dropout=DEC_DROPOUT, 
#                          pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
# model = EncoderDecoderAttn(encoder, decoder, vocab[TARG_LANG]['token2id']) 

# with dot attention (ACTIVE: this is the model that gets trained)
decoder = DecoderDotAttnRNN(rnn_cell_type=RNN_CELL_TYPE, dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, 
                         num_layers=NUM_LAYERS, targ_vocab_size=TARG_VOCAB_SIZE, src_max_sentence_len=SRC_MAX_SENTENCE_LEN, 
                         targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, dec_dropout=DEC_DROPOUT, 
                         pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
model = EncoderDecoderAttn(encoder, decoder, vocab[TARG_LANG]['token2id']) 

In [11]:
# Run training + evaluation; rebinds `model` to the first returned value and
# keeps the second in `results`.
model, results = train_and_eval(
    # what to train and on which data
    model=model,
    loaders_full=loaders_full,
    loaders_minibatch=loaders_minibatch,
    loaders_minitrain=loaders_minitrain,
    params=params,
    vocab=vocab,
    # reporting / persistence switches
    print_intermediate=10000,
    save_checkpoint=True,
    save_to_log=True,
    lazy_eval=False,
    print_attn=True,
    inspect_samples=1,
)

Epoch: 0.00, Train Loss: 10.16, Val Loss: 10.26, Train BLEU: 0.30, Val BLEU: 0.20, Minutes Elapsed: 0.05
Sampling from training predictions...
Source: 其实 它们 都 是 由 单独 的 动物 结合 合在
Reference: these are all individual animals banding together to make
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[3.9910e-05, 2.9069e-03, 1.0208e-01, 3.6750e-01, 4.0648e-01, 8.8430e-02,
         3.1263e-02, 1.2789e-03, 2.3951e-05, 9.3850e-08],
        [3.3833e-03, 2.6489e-02, 1.5200e-01, 2.8847e-01, 2.9621e-01, 1.3614e-01,
         7.9065e-02, 1.5985e-02, 2.1182e-03, 1.4055e-04],
        [2.2636e-02, 6.5902e-02, 1.6035e-01, 2.2654e-01, 2.2016e-01, 1.4209e-01,
         1.0183e-01, 4.2611e-02, 1.4208e-02, 3.6687e-03],
        [3.5100e-02, 7.6346e-02, 1.4396e-01, 1.8920e-01, 1.8635e-01, 1.4278e-01,
         1.1560e-01, 6.4330e-02, 3.2909e-02, 1.3419e-02],
        [3.6192e-02, 7.1498e-02, 1.2580e-01, 1.6447e-01, 1.6782e-01, 1.4454e-01,
         1.2899e-01, 8.2730e-02, 5.2829e-02, 2.513

Epoch: 3.00, Train Loss: 9.49, Val Loss: 9.98, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.18
Sampling from training predictions...
Source: 大部 大部分 部分 的 动物 也 都 生活 在 海洋
Reference: most of the animals are in the oceans .
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[4.4584e-16, 2.4386e-11, 8.6877e-07, 2.9402e-03, 3.0255e-02, 7.6506e-01,
         2.0142e-01, 3.2837e-04, 5.6051e-07, 2.9604e-13],
        [1.9850e-10, 1.8560e-07, 1.4400e-04, 2.0886e-02, 8.5475e-02, 6.5590e-01,
         2.3399e-01, 3.5379e-03, 6.7646e-05, 7.9794e-09],
        [3.8896e-08, 7.0878e-06, 1.1895e-03, 4.6502e-02, 1.2730e-01, 5.9404e-01,
         2.2287e-01, 7.6905e-03, 4.0198e-04, 4.3180e-07],
        [4.1802e-07, 3.6872e-05, 3.0814e-03, 6.5660e-02, 1.4562e-01, 5.5369e-01,
         2.1953e-01, 1.1346e-02, 1.0307e-03, 3.5780e-06],
        [1.4691e-06, 8.8874e-05, 5.0601e-03, 7.8309e-02, 1.5344e-01, 5.2691e-01,
         2.2002e-01, 1.4316e-02, 1.8415e-03, 1.2878e-05],
        [3

Epoch: 6.00, Train Loss: 8.42, Val Loss: 9.54, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.32
Sampling from training predictions...
Source: 泰坦 泰坦尼克 泰坦尼克号 坦尼 尼克 号 是 拿 了 不少
Reference: the truth of the matter is that the titanic
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[5.6471e-34, 3.1455e-34, 1.3270e-29, 2.5074e-25, 6.7483e-13, 1.3452e-04,
         4.0460e-01, 5.9519e-01, 7.3319e-05, 2.3104e-14],
        [5.2461e-23, 6.0788e-23, 2.7972e-20, 8.1447e-17, 2.6119e-08, 5.6843e-03,
         6.3813e-01, 3.5587e-01, 3.1553e-04, 7.4121e-11],
        [1.8207e-19, 2.1727e-19, 1.4556e-17, 3.5304e-14, 8.4146e-07, 2.0137e-02,
         7.2293e-01, 2.5660e-01, 3.3014e-04, 5.0103e-10],
        [6.2726e-18, 8.3116e-18, 1.9769e-16, 3.5868e-13, 2.8573e-06, 3.0305e-02,
         7.3599e-01, 2.3334e-01, 3.6601e-04, 1.3038e-09],
        [4.4454e-17, 6.3081e-17, 8.4591e-16, 1.2665e-12, 5.2419e-06, 3.6274e-02,
         7.3322e-01, 2.3007e-01, 4.3088e-04, 2.6254e-09],
  

Epoch: 10.00, Train Loss: 7.07, Val Loss: 9.02, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.51
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[8.3223e-38, 4.6517e-29, 4.5040e-12, 1.5797e-04, 9.6930e-01, 3.0542e-02,
         9.7546e-07, 5.6022e-19, 3.6370e-21, 6.4480e-38],
        [6.7508e-23, 4.6718e-17, 3.5667e-07, 7.3121e-03, 8.8302e-01, 1.0939e-01,
         2.7492e-04, 3.2939e-11, 3.1259e-13, 2.6438e-23],
        [5.6450e-19, 5.5130e-14, 7.0303e-06, 1.9870e-02, 8.4487e-01, 1.3423e-01,
         1.0313e-03, 2.0634e-09, 3.0165e-11, 2.1723e-19],
        [1.2495e-17, 6.3469e-13, 1.9726e-05, 2.8042e-02, 8.2681e-01, 1.4345e-01,
         1.6826e-03, 8.4942e-09, 1.5670e-10, 6.7912e-18],
        [4.2198e-17, 1.6827e-12, 2.9384e-05, 3.1978e-02, 8.1620e-01, 1.4967e-01,
         2.1246e-03, 1.6280e-08, 3.5117e-10, 3.6924e-17],
  

Epoch: 13.00, Train Loss: 6.24, Val Loss: 8.76, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.67
Sampling from training predictions...
Source: 深海 海中 的 生命 大卫 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: life in the deep oceans <EOS> <PAD> <PAD> <PAD>
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.9935, 0.0065, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.9124, 0.0876, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0002, 0.8733, 0.1265, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0004, 0.8516, 0.1480, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0004, 0.8336, 0.1660, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0004, 0.8214, 0.1782, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0004, 0.8138, 0.1858, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
    

Epoch: 17.00, Train Loss: 5.37, Val Loss: 8.58, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.88
Sampling from training predictions...
Source: <UNK> 塞尔 <UNK> <UNK> 斯特 说 过 真正 的 探索
Reference: marcel proust said , &quot; the true voyage of
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6071, 0.3929, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0033, 0.7517, 0.2450, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0182, 0.8129, 0.1690, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0304, 0.8156, 0.1539, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0367, 0.8110, 0.1523, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0399, 0.8070, 0.1530, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0419, 0.8041, 0.1539, 0.0001,
         0.00

Epoch: 21.00, Train Loss: 4.80, Val Loss: 8.62, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 1.05
Sampling from training predictions...
Source: 其实 它们 都 是 由 单独 的 动物 结合 合在
Reference: these are all individual animals banding together to make
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0500, 0.7016, 0.2446, 0.0039, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0003, 0.0612, 0.4628, 0.4471, 0.0286, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0009, 0.0806, 0.4488, 0.4386, 0.0311, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0014, 0.0885, 0.4356, 0.4400, 0.0345, 0.0001, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0017, 0.0921, 0.4282, 0.4411, 0.0368, 0.0001, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0019, 0.0937, 0.4243, 0.4419, 0.0381, 0.0001, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0020, 0.0946, 0.4224, 0.4419, 0.0389, 0.0001, 0.0000,
         0.0

Epoch: 25.00, Train Loss: 4.49, Val Loss: 8.90, Train BLEU: 0.33, Val BLEU: 0.21, Minutes Elapsed: 1.20
Sampling from training predictions...
Source: 大部 大部分 部分 的 动物 也 都 生活 在 海洋
Reference: most of the animals are in the oceans .
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[1.5554e-43, 1.2728e-17, 6.1069e-07, 9.5274e-03, 1.8452e-01, 6.7038e-01,
         1.3540e-01, 1.7978e-04, 2.0372e-12, 1.0816e-40],
        [2.8816e-23, 5.4197e-10, 2.8161e-04, 4.3478e-02, 3.2891e-01, 4.7056e-01,
         1.5234e-01, 4.4271e-03, 1.9279e-07, 3.9513e-21],
        [2.7784e-19, 2.4426e-08, 1.1156e-03, 6.1041e-02, 3.4119e-01, 4.3379e-01,
         1.5622e-01, 6.6316e-03, 8.6366e-07, 1.9519e-18],
        [1.9202e-17, 1.3934e-07, 2.0584e-03, 7.0952e-02, 3.3949e-01, 4.2022e-01,
         1.5910e-01, 8.1797e-03, 2.0228e-06, 6.5776e-17],
        [1.5539e-16, 3.2946e-07, 2.7769e-03, 7.6228e-02, 3.3613e-01, 4.1351e-01,
         1.6195e-01, 9.3993e-03, 3.4757e-06, 5.7364e-16],
        [4

Epoch: 29.00, Train Loss: 4.34, Val Loss: 9.26, Train BLEU: 0.36, Val BLEU: 0.28, Minutes Elapsed: 1.35
Sampling from training predictions...
Source: 大多 大多数 多数 地震 和 火山 喷发 也 都 发生
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> it the the the the the the , ,
Attention Weights: tensor([[1.3032e-43, 2.8793e-21, 1.2330e-07, 4.0611e-03, 3.4849e-01, 4.8599e-01,
         1.5950e-01, 1.9614e-03, 6.7656e-11, 9.5624e-39],
        [5.5914e-23, 6.4144e-10, 4.0676e-04, 5.7394e-02, 3.2585e-01, 4.1406e-01,
         1.9367e-01, 8.6121e-03, 8.2612e-07, 1.6559e-20],
        [1.0587e-18, 8.2570e-08, 1.7275e-03, 7.5742e-02, 2.8829e-01, 3.8938e-01,
         2.2866e-01, 1.6197e-02, 5.7153e-06, 1.4037e-17],
        [1.1037e-16, 7.8169e-07, 3.3829e-03, 8.8695e-02, 2.8482e-01, 3.7052e-01,
         2.3190e-01, 2.0673e-02, 1.4205e-05, 5.6710e-16],
        [9.0842e-16, 2.2627e-06, 4.5170e-03, 9.4068e-02, 2.8162e-01, 3.6155e-01,
         2.3434e-01, 2.3883e-02, 2.4835e-05, 5.4857e-15],
    

Epoch: 32.00, Train Loss: 4.28, Val Loss: 9.51, Train BLEU: 0.36, Val BLEU: 0.28, Minutes Elapsed: 1.46
Sampling from training predictions...
Source: 但 我 想 告诉 你 的 是 当 你 站
Reference: but when you &apos;re standing at the beach ,
Model: <SOS> it the the the the the , , ,
Attention Weights: tensor([[1.0788e-36, 8.2897e-12, 1.8319e-04, 3.6300e-02, 3.8018e-01, 4.6029e-01,
         1.2156e-01, 1.4873e-03, 4.7287e-10, 4.3669e-35],
        [1.0084e-19, 3.4792e-07, 3.6266e-03, 1.0414e-01, 3.4237e-01, 3.7611e-01,
         1.6130e-01, 1.2450e-02, 5.3526e-06, 1.9025e-18],
        [5.3366e-16, 3.7272e-06, 6.3043e-03, 1.1009e-01, 2.9133e-01, 3.6028e-01,
         2.0372e-01, 2.8226e-02, 4.3179e-05, 1.4399e-15],
        [3.2091e-14, 1.2485e-05, 8.7162e-03, 1.1383e-01, 2.7659e-01, 3.4506e-01,
         2.1623e-01, 3.9435e-02, 1.2910e-04, 8.5484e-14],
        [2.3350e-13, 2.2704e-05, 1.0201e-02, 1.1478e-01, 2.6928e-01, 3.3612e-01,
         2.2198e-01, 4.7350e-02, 2.5574e-04, 1.1532e-12],
        [5.8505e

Epoch: 36.00, Train Loss: 4.21, Val Loss: 9.77, Train BLEU: 0.33, Val BLEU: 0.21, Minutes Elapsed: 1.61
Sampling from training predictions...
Source: 泰坦 泰坦尼克 泰坦尼克号 坦尼 尼克 号 是 拿 了 不少
Reference: the truth of the matter is that the titanic
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0004, 0.3033, 0.6754, 0.0209, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0001, 0.0860, 0.5459, 0.3396, 0.0284, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0003, 0.1202, 0.4760, 0.3527, 0.0507, 0.0001,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0008, 0.1450, 0.4417, 0.3439, 0.0683, 0.0002,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0016, 0.1682, 0.4156, 0.3305, 0.0836, 0.0005,
         0.0000],
        [0.0000, 0.0000, 0.0001, 0.0021, 0.1818, 0.3985, 0.3216, 0.0949, 0.0009,
         0.0000],
        [0.0000, 0.0000, 0.0001, 0.0025, 0.1898, 0.3891, 0.3159, 0.1013, 0.0013,
         0.0000],
    

Epoch: 40.00, Train Loss: 4.16, Val Loss: 10.00, Train BLEU: 0.33, Val BLEU: 0.21, Minutes Elapsed: 1.75
Sampling from training predictions...
Source: 还有 前面 的 这个 是 推进 引擎 它 一会 一会儿
Reference: and it &apos;s got these jet thrusters up in
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0007, 0.1202, 0.6411, 0.2358, 0.0021, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0140, 0.2151, 0.4469, 0.3013, 0.0228, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0300, 0.2368, 0.3685, 0.3220, 0.0427, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0002, 0.0401, 0.2326, 0.3455, 0.3217, 0.0598, 0.0001, 0.0000,
         0.0000],
        [0.0000, 0.0004, 0.0430, 0.2112, 0.3197, 0.3355, 0.0898, 0.0004, 0.0000,
         0.0000],
        [0.0000, 0.0006, 0.0442, 0.1978, 0.3030, 0.3412, 0.1123, 0.0008, 0.0000,
         0.0000],
        [0.0000, 0.0008, 0.0449, 0.1904, 0.2936, 0.3433, 0.1257, 0.0013, 0.0000,
         0.0000],
     

Epoch: 44.00, Train Loss: 4.11, Val Loss: 10.19, Train BLEU: 0.33, Val BLEU: 0.21, Minutes Elapsed: 1.91
Sampling from training predictions...
Source: 底下 这些 都 是 <UNK> 它们 上上 上上下下 上下 下下
Reference: it &apos;s got these fishing <UNK> on the bottom
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0772, 0.4108, 0.0000, 0.5120, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0559, 0.2205, 0.0030, 0.7117, 0.0079, 0.0009, 0.0000,
         0.0000],
        [0.0000, 0.0006, 0.1366, 0.2913, 0.0041, 0.5594, 0.0034, 0.0046, 0.0000,
         0.0000],
        [0.0000, 0.0021, 0.1721, 0.3159, 0.0037, 0.4853, 0.0040, 0.0168, 0.0000,
         0.0000],
        [0.0000, 0.0040, 0.1737, 0.2821, 0.0033, 0.4784, 0.0061, 0.0523, 0.0000,
         0.0000],
        [0.0000, 0.0053, 0.1636, 0.2491, 0.0031, 0.4704, 0.0073, 0.1013, 0.0000,
         0.0000],
        [0.0000, 0.0063, 0.1600, 0.2307, 0.0028, 0.4647, 0.0073, 0.1282, 0.0000,
         0.000

Epoch: 48.00, Train Loss: 4.06, Val Loss: 10.34, Train BLEU: 1.31, Val BLEU: 0.21, Minutes Elapsed: 2.06
Sampling from training predictions...
Source: 原因 在于 我们 一直 没 把 海洋 当回事 回事 回事儿
Reference: and the problem , i think , is that
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0017, 0.1448, 0.6839, 0.1695, 0.0001, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0371, 0.3093, 0.4710, 0.1792, 0.0033, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0005, 0.0721, 0.3041, 0.4135, 0.2021, 0.0078, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0014, 0.0793, 0.2794, 0.3755, 0.2435, 0.0208, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0026, 0.0801, 0.2528, 0.3440, 0.2772, 0.0432, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0036, 0.0803, 0.2383, 0.3239, 0.2915, 0.0624, 0.0001, 0.0000,
         0.0000],
        [0.0000, 0.0043, 0.0815, 0.2332, 0.3147, 0.2947, 0.0716, 0.0001, 0.0000,
         0.0000],
        [0.0

Epoch: 52.00, Train Loss: 4.01, Val Loss: 10.45, Train BLEU: 5.92, Val BLEU: 0.20, Minutes Elapsed: 2.28
Sampling from training predictions...
Source: 我们 用 的 是 深海 潜水 潜水艇 <UNK> 号 和
Reference: we use the submarine alvin and we use cameras
Model: <SOS> it the the the the the the the the
Attention Weights: tensor([[2.1848e-35, 1.6784e-08, 4.2893e-02, 9.1739e-01, 3.9713e-02, 3.9938e-07,
         3.6453e-30, 5.7297e-40, 4.3907e-12, 2.8345e-34],
        [1.3507e-17, 1.7380e-04, 1.8597e-01, 6.8404e-01, 1.2958e-01, 2.4087e-04,
         5.4968e-17, 1.6851e-19, 2.3570e-06, 6.1132e-19],
        [2.2875e-12, 3.2241e-03, 2.5573e-01, 5.8990e-01, 1.5023e-01, 7.1046e-04,
         2.9266e-15, 2.1330e-15, 2.0837e-04, 1.7901e-14],
        [2.0669e-10, 5.8948e-03, 2.1945e-01, 5.3160e-01, 2.3425e-01, 3.3237e-03,
         6.0563e-13, 3.3745e-12, 5.4878e-03, 8.3297e-11],
        [4.8525e-09, 8.6376e-03, 1.9503e-01, 4.7777e-01, 2.8033e-01, 7.0996e-03,
         1.6404e-11, 2.2021e-10, 3.1134e-02, 1.5855e-08],
 

Epoch: 56.00, Train Loss: 3.97, Val Loss: 10.53, Train BLEU: 7.41, Val BLEU: 1.01, Minutes Elapsed: 2.45
Sampling from training predictions...
Source: 还有 前面 的 这个 是 推进 引擎 它 一会 一会儿
Reference: and it &apos;s got these jet thrusters up in
Model: <SOS> it &apos;s the the the the , , ,
Attention Weights: tensor([[0.0000, 0.0000, 0.0096, 0.2977, 0.5989, 0.0935, 0.0003, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0659, 0.3885, 0.4005, 0.1404, 0.0047, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0012, 0.1089, 0.3376, 0.3861, 0.1569, 0.0092, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0025, 0.0970, 0.2788, 0.3722, 0.2190, 0.0305, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0042, 0.0887, 0.2392, 0.3485, 0.2566, 0.0626, 0.0001, 0.0000,
         0.0000],
        [0.0000, 0.0052, 0.0796, 0.2091, 0.3266, 0.2802, 0.0988, 0.0005, 0.0000,
         0.0000],
        [0.0000, 0.0058, 0.0754, 0.1952, 0.3139, 0.2873, 0.1215, 0.0010, 0.0000,
         0.0000],
       

Epoch: 60.00, Train Loss: 3.92, Val Loss: 10.58, Train BLEU: 7.46, Val BLEU: 1.01, Minutes Elapsed: 2.60
Sampling from training predictions...
Source: 其实 它们 都 是 由 单独 的 动物 结合 合在
Reference: these are all individual animals banding together to make
Model: <SOS> it &apos;s the the the the , , ,
Attention Weights: tensor([[0.0000, 0.0000, 0.0110, 0.3170, 0.5299, 0.1367, 0.0054, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0672, 0.3207, 0.3858, 0.1983, 0.0279, 0.0001, 0.0000,
         0.0000],
        [0.0000, 0.0018, 0.1143, 0.3028, 0.3515, 0.1887, 0.0408, 0.0002, 0.0000,
         0.0000],
        [0.0000, 0.0041, 0.1143, 0.2701, 0.3315, 0.2109, 0.0680, 0.0010, 0.0000,
         0.0000],
        [0.0000, 0.0064, 0.1071, 0.2405, 0.3148, 0.2277, 0.1000, 0.0035, 0.0000,
         0.0000],
        [0.0000, 0.0073, 0.0945, 0.2142, 0.3003, 0.2416, 0.1336, 0.0084, 0.0000,
         0.0000],
        [0.0000, 0.0081, 0.0926, 0.2093, 0.2959, 0.2416, 0.1415, 0.0109, 0.0000,
         0.000

Epoch: 64.00, Train Loss: 3.87, Val Loss: 10.63, Train BLEU: 7.26, Val BLEU: 1.07, Minutes Elapsed: 2.77
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> it &apos;s the the the the . . .
Attention Weights: tensor([[0.0000, 0.1780, 0.8220, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.4653, 0.5347, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.8828, 0.1172, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0001, 0.8104, 0.1896, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0003, 0.7336, 0.2660, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0006, 0.6385, 0.3607, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0008, 0.6091, 0.3897, 0.0005, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
     

Epoch: 68.00, Train Loss: 3.82, Val Loss: 10.66, Train BLEU: 7.31, Val BLEU: 1.07, Minutes Elapsed: 2.95
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> and &apos;s the the the the , , ,
Attention Weights: tensor([[7.1588e-36, 7.6948e-10, 4.5748e-04, 2.1098e-01, 3.2081e-01, 4.3340e-01,
         3.4066e-02, 2.9541e-04, 3.8729e-10, 2.2599e-31],
        [4.8452e-18, 8.7371e-05, 3.8583e-02, 3.2190e-01, 3.1216e-01, 2.5631e-01,
         6.6739e-02, 4.2106e-03, 2.5211e-06, 7.1868e-18],
        [2.1881e-12, 4.9969e-03, 8.5433e-02, 3.8257e-01, 2.3688e-01, 2.2763e-01,
         5.7269e-02, 5.2062e-03, 1.0951e-05, 7.4804e-15],
        [1.6744e-10, 1.1220e-02, 9.7276e-02, 3.5552e-01, 2.2410e-01, 2.3168e-01,
         7.0296e-02, 9.8228e-03, 8.0729e-05, 1.0175e-11],
        [4.6581e-09, 1.7271e-02, 9.9071e-02, 3.2623e-01, 2.1596e-01, 2.3990e-01,
         8.5206e-02, 1.6062e-02, 2.9864e-04, 8.8973e-10],


Epoch: 72.00, Train Loss: 3.77, Val Loss: 10.71, Train BLEU: 7.32, Val BLEU: 1.07, Minutes Elapsed: 3.10
Sampling from training predictions...
Source: 看到 这些 在 动 的 东西 了 吗 <EOS> <PAD>
Reference: but see all those different working things ? <EOS>
Model: <SOS> it &apos;s the the the . . . .
Attention Weights: tensor([[0.0000, 0.0000, 0.0252, 0.5221, 0.4194, 0.0333, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0013, 0.1922, 0.4537, 0.2829, 0.0691, 0.0007, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0242, 0.3743, 0.3876, 0.1794, 0.0336, 0.0009, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0434, 0.3910, 0.3585, 0.1682, 0.0365, 0.0024, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0560, 0.3678, 0.3428, 0.1789, 0.0483, 0.0062, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0618, 0.3460, 0.3305, 0.1893, 0.0599, 0.0125, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0648, 0.3419, 0.3250, 0.1903, 0.0624, 0.0155, 0.0000, 0.0000,
         0.0000],


Epoch: 76.00, Train Loss: 3.71, Val Loss: 10.75, Train BLEU: 7.41, Val BLEU: 1.07, Minutes Elapsed: 3.25
Sampling from training predictions...
Source: 大卫 <UNK> 通过 潜水 潜水艇 拍下 的 影片 把 我们
Reference: with vibrant video clips captured by submarines , david
Model: <SOS> we of the the , , , , ,
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0014, 0.0000, 0.0192, 0.7239, 0.2544, 0.0010,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.5407, 0.0014, 0.0395, 0.2266, 0.1768, 0.0150,
         0.0000],
        [0.0000, 0.0000, 0.0002, 0.8635, 0.0001, 0.0173, 0.0766, 0.0361, 0.0062,
         0.0000],
        [0.0000, 0.0000, 0.0012, 0.8615, 0.0001, 0.0190, 0.0739, 0.0331, 0.0112,
         0.0000],
        [0.0000, 0.0000, 0.0056, 0.8551, 0.0001, 0.0212, 0.0709, 0.0322, 0.0149,
         0.0000],
        [0.0000, 0.0000, 0.0119, 0.8179, 0.0001, 0.0255, 0.0812, 0.0396, 0.0237,
         0.0001],
        [0.0000, 0.0000, 0.0156, 0.7893, 0.0001, 0.0279, 0.0905, 0.0464, 0.0300,
         0.0002],
 

Epoch: 80.00, Train Loss: 3.65, Val Loss: 10.79, Train BLEU: 7.37, Val BLEU: 1.07, Minutes Elapsed: 3.40
Sampling from training predictions...
Source: 我们 这 有 不少 精彩 的 泰坦 泰坦尼克 坦尼 尼克
Reference: we &apos;ve got some of the most incredible video
Model: <SOS> we &apos;s the the the the the the ,
Attention Weights: tensor([[6.7178e-29, 1.1808e-05, 1.5148e-01, 5.2084e-01, 3.1506e-01, 1.2607e-02,
         2.5904e-09, 2.5223e-44, 4.0217e-43, 1.7796e-43],
        [5.2514e-15, 4.4788e-03, 2.9379e-01, 3.8251e-01, 2.6977e-01, 4.9428e-02,
         2.3094e-05, 3.9571e-24, 1.4992e-23, 9.1708e-24],
        [5.8210e-11, 5.4886e-02, 4.6779e-01, 2.5900e-01, 1.7102e-01, 4.7283e-02,
         2.7094e-05, 3.4325e-23, 1.1680e-21, 1.4477e-19],
        [1.8467e-09, 7.7096e-02, 4.2910e-01, 2.3000e-01, 1.8030e-01, 8.3143e-02,
         3.6395e-04, 7.7401e-19, 3.1902e-18, 1.4020e-15],
        [2.5496e-08, 9.1186e-02, 3.9439e-01, 2.1687e-01, 1.8628e-01, 1.0995e-01,
         1.3256e-03, 1.2191e-16, 6.9438e-16, 4.5350e-

Epoch: 83.00, Train Loss: 3.61, Val Loss: 10.82, Train BLEU: 7.72, Val BLEU: 1.02, Minutes Elapsed: 3.51
Sampling from training predictions...
Source: 但 我 想 告诉 你 的 是 当 你 站
Reference: but when you &apos;re standing at the beach ,
Model: <SOS> we &apos;s the the the the the the the
Attention Weights: tensor([[6.8670e-29, 6.4954e-07, 1.5717e-02, 1.3882e-01, 5.0651e-01, 2.8396e-01,
         5.3762e-02, 1.2186e-03, 2.7882e-08, 2.1785e-28],
        [4.2015e-15, 7.6261e-04, 8.4068e-02, 1.9147e-01, 3.6427e-01, 2.4659e-01,
         9.9617e-02, 1.3162e-02, 5.3341e-05, 2.6859e-15],
        [8.5722e-11, 2.0401e-02, 2.5044e-01, 1.8722e-01, 2.9664e-01, 1.6126e-01,
         6.8951e-02, 1.4900e-02, 1.7127e-04, 5.9576e-13],
        [2.3830e-09, 3.4176e-02, 2.7149e-01, 1.8450e-01, 2.7334e-01, 1.4933e-01,
         6.7664e-02, 1.8946e-02, 5.4972e-04, 2.6949e-10],
        [4.6524e-08, 4.5574e-02, 2.6338e-01, 1.7664e-01, 2.5727e-01, 1.5016e-01,
         7.6915e-02, 2.8365e-02, 1.6986e-03, 1.7504e-08],
     

Epoch: 87.00, Train Loss: 3.55, Val Loss: 10.86, Train BLEU: 6.93, Val BLEU: 0.89, Minutes Elapsed: 3.65
Sampling from training predictions...
Source: 当 你 站 在 海滩 上 或是 当 你 看到
Reference: part of the problem , i think , is
Model: <SOS> we &apos;s the the the the the the the
Attention Weights: tensor([[1.9718e-28, 6.5322e-07, 1.3345e-02, 2.4587e-01, 3.4856e-01, 3.5045e-01,
         4.0430e-02, 1.3481e-03, 2.1205e-08, 2.7705e-31],
        [1.6843e-15, 2.4302e-04, 3.9971e-02, 2.0902e-01, 3.0609e-01, 3.1339e-01,
         1.1115e-01, 2.0041e-02, 8.8929e-05, 1.5323e-16],
        [3.1473e-11, 8.1460e-03, 1.5665e-01, 2.9257e-01, 2.0310e-01, 2.4422e-01,
         6.5298e-02, 2.9624e-02, 4.0585e-04, 3.8603e-14],
        [9.6077e-10, 1.3845e-02, 1.6969e-01, 2.8389e-01, 1.8892e-01, 2.3360e-01,
         6.5535e-02, 4.2957e-02, 1.5758e-03, 3.9369e-11],
        [2.4728e-08, 1.7933e-02, 1.5191e-01, 2.5138e-01, 1.8081e-01, 2.3646e-01,
         8.0130e-02, 7.4924e-02, 6.4531e-03, 7.3809e-09],
        [2.026

Epoch: 91.00, Train Loss: 3.48, Val Loss: 10.90, Train BLEU: 7.25, Val BLEU: 0.24, Minutes Elapsed: 3.80
Sampling from training predictions...
Source: 当 你 站 在 海滩 上 或是 当 你 看到
Reference: part of the problem , i think , is
Model: <SOS> we of the the the the the the the
Attention Weights: tensor([[2.8551e-28, 6.2512e-07, 1.0285e-02, 2.0522e-01, 3.5879e-01, 3.6853e-01,
         5.4991e-02, 2.1793e-03, 6.4237e-08, 4.5369e-30],
        [5.6434e-16, 9.7572e-05, 2.0933e-02, 1.4924e-01, 2.9380e-01, 3.4023e-01,
         1.5792e-01, 3.7481e-02, 2.9078e-04, 1.5396e-15],
        [7.5583e-12, 3.4101e-03, 9.5375e-02, 2.4128e-01, 2.1256e-01, 2.9215e-01,
         9.6407e-02, 5.7311e-02, 1.4998e-03, 4.8094e-13],
        [2.3208e-10, 6.0609e-03, 1.0728e-01, 2.3446e-01, 1.8793e-01, 2.7966e-01,
         9.4012e-02, 8.4691e-02, 5.9035e-03, 4.5451e-10],
        [6.4298e-09, 7.6315e-03, 8.9517e-02, 1.9020e-01, 1.6469e-01, 2.7034e-01,
         1.1018e-01, 1.4471e-01, 2.2727e-02, 6.6483e-08],
        [4.4936e-08

Epoch: 95.00, Train Loss: 3.42, Val Loss: 10.95, Train BLEU: 6.96, Val BLEU: 0.25, Minutes Elapsed: 3.94
Sampling from training predictions...
Source: 我们 用 的 是 深海 潜水 潜水艇 <UNK> 号 和
Reference: we use the submarine alvin and we use cameras
Model: <SOS> we of the the the the the the the
Attention Weights: tensor([[0.0000, 0.0031, 0.9704, 0.0265, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0307, 0.8048, 0.1645, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0514, 0.6853, 0.2633, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0447, 0.5852, 0.3701, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0381, 0.4906, 0.4712, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0266, 0.3824, 0.5909, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0192, 0.3087, 0.6718, 0.0003, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
    

Epoch: 99.00, Train Loss: 3.35, Val Loss: 11.00, Train BLEU: 6.97, Val BLEU: 0.25, Minutes Elapsed: 4.09
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> we of the the the the the the the
Attention Weights: tensor([[4.1602e-33, 2.2169e-08, 1.3839e-03, 1.4486e-01, 3.1594e-01, 4.5540e-01,
         8.0103e-02, 2.3233e-03, 1.1672e-07, 2.5790e-25],
        [5.0328e-19, 1.0916e-05, 6.9828e-03, 8.9550e-02, 2.0679e-01, 3.9068e-01,
         2.4012e-01, 6.5092e-02, 7.6442e-04, 9.8855e-13],
        [7.1157e-15, 3.1707e-04, 1.5755e-02, 1.8164e-01, 1.7904e-01, 3.9299e-01,
         1.6296e-01, 6.4580e-02, 2.7227e-03, 6.5278e-10],
        [2.8399e-13, 6.0844e-04, 1.8626e-02, 2.0049e-01, 1.6689e-01, 3.7571e-01,
         1.4997e-01, 7.8204e-02, 9.5016e-03, 4.9567e-07],
        [1.8742e-11, 1.0885e-03, 2.1013e-02, 1.9543e-01, 1.5688e-01, 3.4837e-01,
         1.5151e-01, 1.0032e-01, 2.5372e-02, 2.4249e-05],


Epoch: 103.00, Train Loss: 3.29, Val Loss: 11.05, Train BLEU: 7.25, Val BLEU: 0.22, Minutes Elapsed: 4.24
Sampling from training predictions...
Source: 大卫 <UNK> 通过 潜水 潜水艇 拍下 的 影片 把 我们
Reference: with vibrant video clips captured by submarines , david
Model: <SOS> with of the the , , , , ,
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.2032, 0.0000, 0.0000, 0.0319, 0.5225, 0.2424,
         0.0000],
        [0.2339, 0.0000, 0.0000, 0.5763, 0.0010, 0.0000, 0.0007, 0.0226, 0.1632,
         0.0023],
        [0.2328, 0.0000, 0.0000, 0.4517, 0.0000, 0.0000, 0.0024, 0.0248, 0.2586,
         0.0296],
        [0.4890, 0.0000, 0.0000, 0.0343, 0.0000, 0.0000, 0.0006, 0.0037, 0.0816,
         0.3909],
        [0.5479, 0.0000, 0.0000, 0.0030, 0.0000, 0.0000, 0.0001, 0.0007, 0.0219,
         0.4263],
        [0.0196, 0.0000, 0.0000, 0.0005, 0.0000, 0.0000, 0.0001, 0.0006, 0.0230,
         0.9561],
        [0.0011, 0.0000, 0.0000, 0.0002, 0.0000, 0.0000, 0.0001, 0.0007, 0.0244,
         0.9736]

Epoch: 107.00, Train Loss: 3.22, Val Loss: 11.09, Train BLEU: 7.28, Val BLEU: 0.22, Minutes Elapsed: 4.38
Sampling from training predictions...
Source: 还有 这些 摇晃 着 旋转 转着 的 触角 <EOS> <PAD>
Reference: it &apos;s got tentacles dangling , swirling around like
Model: <SOS> it &apos;s the the . . . . .
Attention Weights: tensor([[0.0000, 0.0000, 0.0735, 0.4202, 0.5063, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0146, 0.0962, 0.6995, 0.1881, 0.0017, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0007, 0.0624, 0.3475, 0.5616, 0.0101, 0.0177, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0026, 0.0868, 0.4801, 0.3382, 0.0072, 0.0850, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0024, 0.0687, 0.4112, 0.3241, 0.0251, 0.1677, 0.0006, 0.0002,
         0.0000],
        [0.0000, 0.0000, 0.0013, 0.0085, 0.0082, 0.0006, 0.0130, 0.0008, 0.9677,
         0.0000],
        [0.0000, 0.0000, 0.0012, 0.0074, 0.0075, 0.0002, 0.0089, 0.0011, 0.9737,
         0

Epoch: 111.00, Train Loss: 3.15, Val Loss: 11.12, Train BLEU: 8.09, Val BLEU: 0.22, Minutes Elapsed: 4.53
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> it &apos;s the the . . . . .
Attention Weights: tensor([[0.0000, 0.0161, 0.9839, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0033, 0.9966, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0289, 0.9710, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0475, 0.9469, 0.0045, 0.0010, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0010, 0.0335, 0.0026, 0.9629, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        

Epoch: 115.00, Train Loss: 3.09, Val Loss: 11.18, Train BLEU: 8.93, Val BLEU: 0.22, Minutes Elapsed: 4.68
Sampling from training predictions...
Source: 还有 这些 摇晃 着 旋转 转着 的 触角 <EOS> <PAD>
Reference: it &apos;s got tentacles dangling , swirling around like
Model: <SOS> it &apos;s a the . . . . <EOS>
Attention Weights: tensor([[0.0000, 0.0000, 0.0632, 0.4315, 0.5052, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0048, 0.0382, 0.4229, 0.5334, 0.0007, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0005, 0.0406, 0.2530, 0.6728, 0.0147, 0.0186, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0032, 0.0773, 0.4498, 0.3452, 0.0076, 0.1168, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0028, 0.0607, 0.3567, 0.3335, 0.0358, 0.2015, 0.0009, 0.0081,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0001, 0.0001, 0.0000, 0.0003, 0.0000, 0.9994,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0002, 0.0002, 0.0000, 0.0002, 0.0001, 0.9993,
        

Epoch: 119.00, Train Loss: 3.02, Val Loss: 11.22, Train BLEU: 9.23, Val BLEU: 0.23, Minutes Elapsed: 4.82
Sampling from training predictions...
Source: 这 是 一种 种群 栖 动物 <EOS> <PAD> <PAD> <PAD>
Reference: it &apos;s a colonial animal . <EOS> <PAD> <PAD>
Model: <SOS> it &apos;s a the . . . . .
Attention Weights: tensor([[0.0000, 0.0001, 0.2783, 0.7184, 0.0032, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0479, 0.6905, 0.2614, 0.0000, 0.0001, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0010, 0.0574, 0.4839, 0.4577, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0041, 0.0459, 0.2428, 0.7040, 0.0003, 0.0029, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0003, 0.0020, 0.0107, 0.0001, 0.9869, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
         0.0000

Epoch: 123.00, Train Loss: 2.95, Val Loss: 11.28, Train BLEU: 9.78, Val BLEU: 0.23, Minutes Elapsed: 4.97
Sampling from training predictions...
Source: 海洋 的 平均 深度 是 两英里 英里 <EOS> <PAD> <PAD>
Reference: the average depth is about two miles . <EOS>
Model: <SOS> it &apos;s a the about . . . .
Attention Weights: tensor([[0.0000, 0.0000, 0.1786, 0.8053, 0.0160, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0088, 0.1024, 0.0761, 0.8126, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0008, 0.0881, 0.4491, 0.4600, 0.0021, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0098, 0.1734, 0.3485, 0.4664, 0.0019, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0122, 0.1628, 0.3292, 0.4663, 0.0295, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0078, 0.1094, 0.2661, 0.4764, 0.0760, 0.0000, 0.0642, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0001, 0.0001, 0.0000, 0.0000, 0.9998, 0.0000,
         0.0000]

Epoch: 127.00, Train Loss: 2.89, Val Loss: 11.33, Train BLEU: 11.14, Val BLEU: 0.24, Minutes Elapsed: 5.12
Sampling from training predictions...
Source: 大多 大多数 多数 地震 和 火山 喷发 也 都 发生
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> most of the the the the the the the
Attention Weights: tensor([[1.8012e-38, 1.4551e-22, 5.4191e-07, 1.1527e-02, 1.3683e-01, 7.1075e-01,
         1.3847e-01, 2.4272e-03, 5.3517e-07, 3.6580e-25],
        [9.4230e-23, 1.1021e-07, 7.5789e-06, 6.1300e-03, 2.9855e-02, 4.5402e-01,
         4.2404e-01, 8.2387e-02, 3.5671e-03, 2.5705e-12],
        [1.0184e-16, 5.7319e-06, 1.3156e-03, 3.4925e-02, 1.2514e-01, 2.8438e-01,
         2.8774e-01, 2.5369e-01, 1.2805e-02, 5.6284e-11],
        [1.7491e-14, 3.1321e-04, 6.8595e-03, 4.0557e-02, 1.3271e-01, 1.1728e-01,
         1.5586e-01, 4.7773e-01, 6.8693e-02, 5.6535e-08],
        [3.3948e-13, 2.0419e-03, 1.2766e-02, 5.8321e-02, 1.2122e-01, 1.5314e-01,
         2.0682e-01, 3.6769e-01, 7.7996e-02, 2.1312e-0

Epoch: 131.00, Train Loss: 2.82, Val Loss: 11.36, Train BLEU: 11.88, Val BLEU: 0.26, Minutes Elapsed: 5.27
Sampling from training predictions...
Source: 地球 的 大部 大部分 部分 都 是 海水 <EOS> <PAD>
Reference: most of the planet is ocean water . <EOS>
Model: <SOS> it &apos;s a to . . <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0000, 0.0000, 0.0459, 0.8316, 0.1199, 0.0027, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0253, 0.6869, 0.2226, 0.0614, 0.0038, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0005, 0.1239, 0.2445, 0.3478, 0.2658, 0.0176, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0185, 0.3081, 0.1440, 0.2227, 0.2868, 0.0198, 0.0000, 0.0001,
         0.0000],
        [0.0000, 0.0109, 0.1197, 0.0857, 0.1049, 0.0777, 0.0070, 0.0000, 0.5940,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
         0.0000

Epoch: 135.00, Train Loss: 2.75, Val Loss: 11.40, Train BLEU: 12.50, Val BLEU: 0.26, Minutes Elapsed: 5.50
Sampling from training predictions...
Source: 大家 想想 海洋 占 了 地球 球面 面积 的 75
Reference: when you think about it , the oceans are
Model: <SOS> when you the the , , , , ,
Attention Weights: tensor([[0.0000, 0.0095, 0.1208, 0.5915, 0.2517, 0.0258, 0.0007, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0179, 0.0941, 0.3158, 0.3036, 0.1972, 0.0705, 0.0009, 0.0000,
         0.0000],
        [0.0000, 0.0047, 0.0435, 0.2528, 0.3274, 0.2356, 0.1319, 0.0040, 0.0000,
         0.0000],
        [0.0000, 0.0029, 0.0492, 0.1973, 0.2964, 0.2804, 0.1655, 0.0083, 0.0000,
         0.0000],
        [0.0000, 0.0025, 0.0521, 0.1839, 0.2806, 0.2834, 0.1865, 0.0110, 0.0000,
         0.0000],
        [0.0000, 0.0045, 0.0524, 0.1804, 0.2563, 0.2561, 0.2279, 0.0224, 0.0000,
         0.0000],
        [0.0000, 0.0054, 0.0581, 0.1761, 0.2370, 0.2497, 0.2420, 0.0317, 0.0001,
         0.0000],
        [0.0000,

Epoch: 139.00, Train Loss: 2.69, Val Loss: 11.43, Train BLEU: 14.36, Val BLEU: 0.24, Minutes Elapsed: 5.70
Sampling from training predictions...
Source: 它 可以 伸展 <UNK> 150 英尺 长 <EOS> <PAD> <PAD>
Reference: it gets up to about 150 feet long .
Model: <SOS> it gets a to about . . . .
Attention Weights: tensor([[0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0007, 0.9993, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0026, 0.9974, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0042, 0.9958, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0073, 0.9926, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0074, 0.9308, 0.0002, 0.0000, 0.0000, 0.0000, 0.0000, 0.0616, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000,
         0.0000],
       

Epoch: 143.00, Train Loss: 2.61, Val Loss: 11.47, Train BLEU: 17.37, Val BLEU: 0.25, Minutes Elapsed: 5.88
Sampling from training predictions...
Source: 原因 在于 我们 一直 没 把 海洋 当回事 回事 回事儿
Reference: and the problem , i think , is that
Model: <SOS> and the the , , , , , ,
Attention Weights: tensor([[0.0000, 0.0005, 0.2211, 0.7381, 0.0386, 0.0018, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0890, 0.7629, 0.1035, 0.0440, 0.0006, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0802, 0.3436, 0.2920, 0.2810, 0.0031, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0625, 0.2787, 0.3696, 0.2678, 0.0213, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0265, 0.1690, 0.3959, 0.3456, 0.0629, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0196, 0.1275, 0.3962, 0.3602, 0.0960, 0.0005, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0191, 0.1227, 0.3964, 0.3596, 0.1012, 0.0009, 0.0000,
         0.0000],
        [0.0000, 0.00

Epoch: 147.00, Train Loss: 2.54, Val Loss: 11.48, Train BLEU: 19.79, Val BLEU: 0.27, Minutes Elapsed: 6.06
Sampling from training predictions...
Source: 我们 用 的 是 深海 潜水 潜水艇 <UNK> 号 和
Reference: we use the submarine alvin and we use cameras
Model: <SOS> we use the the the the the the the
Attention Weights: tensor([[0.0000, 0.9996, 0.0004, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.9915, 0.0085, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.8922, 0.1078, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.3714, 0.6267, 0.0019, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.1550, 0.8354, 0.0096, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0980, 0.8802, 0.0218, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0929, 0.8685, 0.0386, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
 

Epoch: 151.00, Train Loss: 2.46, Val Loss: 11.52, Train BLEU: 22.46, Val BLEU: 0.28, Minutes Elapsed: 6.23
Sampling from training predictions...
Source: 大多 大多数 多数 地震 和 火山 喷发 也 都 发生
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> most of the the , the the the the
Attention Weights: tensor([[2.5749e-39, 1.4151e-25, 1.8390e-08, 5.3298e-03, 4.5092e-02, 8.9066e-01,
         5.8806e-02, 1.1636e-04, 7.0873e-09, 3.5592e-26],
        [1.5052e-24, 3.9783e-08, 1.1854e-07, 1.8136e-03, 4.9409e-03, 7.8039e-01,
         2.1116e-01, 1.6839e-03, 1.7073e-05, 2.8417e-14],
        [3.6575e-17, 1.1826e-07, 2.6081e-04, 2.7646e-02, 1.4009e-01, 5.4618e-01,
         2.1564e-01, 6.7882e-02, 2.2966e-03, 1.4958e-12],
        [3.7656e-15, 7.2665e-05, 3.0775e-03, 7.3054e-02, 2.1045e-01, 3.1901e-01,
         1.9190e-01, 1.8696e-01, 1.5478e-02, 1.2300e-09],
        [2.3066e-14, 1.1838e-04, 6.2879e-03, 1.3942e-01, 2.6115e-01, 2.5753e-01,
         1.8450e-01, 1.4750e-01, 3.4920e-03, 9.3879e-09]

Epoch: 155.00, Train Loss: 2.39, Val Loss: 11.56, Train BLEU: 24.52, Val BLEU: 0.27, Minutes Elapsed: 6.40
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> most of the the , , the the the
Attention Weights: tensor([[1.0993e-38, 2.7316e-12, 5.8990e-05, 1.7726e-01, 6.3219e-02, 7.5010e-01,
         9.2541e-03, 1.0271e-04, 1.5702e-09, 4.3557e-26],
        [6.8302e-27, 2.0270e-10, 1.7525e-03, 4.2721e-02, 1.8985e-01, 5.2385e-01,
         2.1200e-01, 2.9769e-02, 5.9372e-05, 1.6186e-15],
        [5.7337e-21, 2.4335e-07, 4.5460e-04, 1.8284e-01, 5.1836e-02, 7.1251e-01,
         3.9906e-02, 1.2374e-02, 8.4290e-05, 2.5632e-13],
        [2.0018e-19, 8.0585e-07, 1.8834e-03, 1.5036e-01, 9.6953e-02, 6.5534e-01,
         7.4097e-02, 2.1145e-02, 2.2670e-04, 2.1834e-10],
        [2.4598e-17, 1.4756e-06, 3.2589e-03, 1.3438e-01, 1.3359e-01, 6.1338e-01,
         9.0848e-02, 2.4262e-02, 2.8718e-04, 4.0665e-10],


Epoch: 159.00, Train Loss: 2.30, Val Loss: 11.55, Train BLEU: 25.20, Val BLEU: 0.28, Minutes Elapsed: 6.55
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> we have the the the , the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.9489, 0.0383, 0.0129, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0065, 0.7857, 0.0150, 0.0053, 0.0000, 0.0000, 0.0000, 0.1875,
         0.0000],
        [0.0000, 0.0001, 0.7759, 0.0904, 0.1163, 0.0007, 0.0000, 0.0000, 0.0166,
         0.0000],
        [0.0000, 0.0085, 0.6398, 0.1912, 0.1572, 0.0033, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0096, 0.6149, 0.3076, 0.0654, 0.0026, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0045, 0.5571, 0.3674, 0.0686, 0.0024, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0045, 0.5221, 0.3936, 0.0769, 0.0028, 0.0000, 0.0000, 0.0000,
         0.0000],
   

Epoch: 163.00, Train Loss: 2.23, Val Loss: 11.59, Train BLEU: 27.12, Val BLEU: 0.29, Minutes Elapsed: 6.88
Sampling from training predictions...
Source: 大卫 <UNK> 这位 是 比尔 <UNK> 我 是 大卫 <UNK>
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> this is bill lange . . . . gallo
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0004, 0.0000, 0.0000, 0.9996, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0001, 0.9999, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0002, 0.9998, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0008, 0.9992, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0047, 0.0000, 0.0018, 0.9935, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0383, 0.0000, 0.0005, 0.9612, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],


Epoch: 167.00, Train Loss: 2.14, Val Loss: 11.60, Train BLEU: 28.95, Val BLEU: 0.30, Minutes Elapsed: 7.08
Sampling from training predictions...
Source: 但 我 想 告诉 你 的 是 当 你 站
Reference: but when you &apos;re standing at the beach ,
Model: <SOS> and of the the the , the the the
Attention Weights: tensor([[5.8877e-36, 1.0812e-12, 1.2268e-04, 9.3823e-03, 6.7259e-01, 2.7350e-01,
         4.3415e-02, 9.9801e-04, 4.5386e-08, 1.7142e-25],
        [2.1126e-26, 1.0299e-11, 7.2045e-05, 6.1431e-02, 2.4509e-01, 4.2382e-01,
         2.3727e-01, 3.2188e-02, 1.2959e-04, 3.0736e-15],
        [4.7488e-20, 1.1125e-08, 1.0848e-03, 1.3253e-02, 4.2200e-01, 2.9734e-01,
         2.1084e-01, 5.5035e-02, 4.4887e-04, 3.0480e-13],
        [3.9282e-19, 2.2077e-09, 3.9222e-04, 4.3067e-02, 2.9153e-01, 4.0141e-01,
         2.3064e-01, 3.2611e-02, 3.4702e-04, 1.4068e-10],
        [2.3115e-19, 2.0345e-10, 5.8636e-05, 6.7375e-02, 2.7160e-01, 4.6396e-01,
         1.9013e-01, 6.8489e-03, 2.3608e-05, 3.8160e-11],
        [

Epoch: 171.00, Train Loss: 2.06, Val Loss: 11.69, Train BLEU: 29.38, Val BLEU: 0.28, Minutes Elapsed: 7.24
Sampling from training predictions...
Source: 原来 它 是 海洋 洋中 最长 的 生物 <EOS> <PAD>
Reference: this turns out to be the longest creature in
Model: <SOS> it &apos;s a to about is . . .
Attention Weights: tensor([[0.0000, 0.0000, 0.3950, 0.6050, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9995, 0.0005, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0018, 0.9982, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0084, 0.9916, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0017, 0.9983, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0002, 0.9998, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
  

Epoch: 175.00, Train Loss: 2.00, Val Loss: 11.70, Train BLEU: 28.86, Val BLEU: 0.28, Minutes Elapsed: 7.42
Sampling from training predictions...
Source: 看到 这些 在 动 的 东西 了 吗 <EOS> <PAD>
Reference: but see all those different working things ? <EOS>
Model: <SOS> it &apos;s a planet about working . <EOS> <EOS>
Attention Weights: tensor([[0.0000, 0.0000, 0.2979, 0.6510, 0.0459, 0.0053, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.1387, 0.3283, 0.1451, 0.3879, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0017, 0.2597, 0.4795, 0.1919, 0.0673, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0130, 0.5217, 0.4277, 0.0376, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0668, 0.6726, 0.2479, 0.0127, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0606, 0.6518, 0.2701, 0.0175, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0240, 0.2622, 0.1521, 0.0158, 0.0001, 0.0000, 0.0000, 0.5458,

Epoch: 179.00, Train Loss: 1.96, Val Loss: 11.73, Train BLEU: 30.76, Val BLEU: 0.28, Minutes Elapsed: 7.64
Sampling from training predictions...
Source: 这 是 一只 水母 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: here &apos;s a jelly . <EOS> <PAD> <PAD> <PAD>
Model: <SOS> here &apos;s a jelly . <EOS> <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0000, 0.0017, 0.9983, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.9996, 0.0003, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0025, 0.9974, 0.0001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0018, 0.0254, 0.8118, 0.1610, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.000

Epoch: 183.00, Train Loss: 1.86, Val Loss: 11.82, Train BLEU: 33.55, Val BLEU: 0.29, Minutes Elapsed: 7.81
Sampling from training predictions...
Source: 当 你 站 在 海滩 上 或是 当 你 看到
Reference: part of the problem , i think , is
Model: <SOS> and of the the , the the the the
Attention Weights: tensor([[9.7466e-38, 2.0436e-14, 3.5897e-06, 5.9468e-03, 4.2955e-01, 3.5022e-01,
         2.1424e-01, 4.0727e-05, 1.7723e-10, 8.9372e-29],
        [2.8131e-29, 3.6409e-15, 3.7872e-08, 9.1433e-05, 6.3061e-01, 8.9605e-03,
         3.6032e-01, 1.9842e-05, 4.9465e-09, 4.9278e-18],
        [8.4919e-21, 5.0245e-10, 7.4254e-05, 4.1134e-03, 1.5336e-01, 3.8488e-01,
         4.5169e-01, 5.8685e-03, 4.0081e-06, 6.0032e-16],
        [2.0455e-19, 2.2030e-10, 8.7713e-05, 1.5136e-02, 1.5288e-01, 6.1650e-01,
         2.0715e-01, 8.2271e-03, 1.9209e-05, 2.7092e-13],
        [7.7795e-19, 4.4659e-11, 1.4187e-05, 1.4250e-02, 2.1696e-01, 5.9447e-01,
         1.7333e-01, 9.7642e-04, 3.9815e-07, 4.9449e-12],
        [2.0169e-1

Epoch: 186.00, Train Loss: 1.80, Val Loss: 11.67, Train BLEU: 33.95, Val BLEU: 0.28, Minutes Elapsed: 7.98
Sampling from training predictions...
Source: <UNK> 塞尔 <UNK> <UNK> 斯特 说 过 真正 的 探索
Reference: marcel proust said , &quot; the true voyage of
Model: <SOS> marcel proust said , &quot; the true voyage voyage
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0001,
         0.9999],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0

Epoch: 190.00, Train Loss: 1.71, Val Loss: 11.78, Train BLEU: 38.49, Val BLEU: 0.28, Minutes Elapsed: 8.19
Sampling from training predictions...
Source: 我 真 喜欢 这些 东西 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: i love that kind of stuff . <EOS> <PAD>
Model: <SOS> here &apos;s a kind about . . <EOS> <EOS>
Attention Weights: tensor([[0.0000, 0.0000, 0.9513, 0.0486, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.1160, 0.5387, 0.0131, 0.3322, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0017, 0.6693, 0.3264, 0.0025, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0017, 0.0007, 0.9966, 0.0010, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
    

Epoch: 194.00, Train Loss: 1.63, Val Loss: 11.76, Train BLEU: 43.37, Val BLEU: 0.29, Minutes Elapsed: 8.40
Sampling from training predictions...
Source: 这 是 一种 种群 栖 动物 <EOS> <PAD> <PAD> <PAD>
Reference: it &apos;s a colonial animal . <EOS> <PAD> <PAD>
Model: <SOS> here &apos;s a colonial animal . <EOS> <EOS> <EOS>
Attention Weights: tensor([[0.0000, 0.0000, 0.2318, 0.7682, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.1956, 0.8039, 0.0000, 0.0000, 0.0004, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0005, 0.6495, 0.3500, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0037, 0.9739, 0.0200, 0.0000, 0.0024, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000

Epoch: 198.00, Train Loss: 1.56, Val Loss: 11.79, Train BLEU: 46.07, Val BLEU: 0.28, Minutes Elapsed: 8.58
Sampling from training predictions...
Source: 其实 它们 都 是 由 单独 的 动物 结合 合在
Reference: these are all individual animals banding together to make
Model: <SOS> these are all individual animals banding together to make
Attention Weights: tensor([[0.0000, 0.0002, 0.0002, 0.0034, 0.0031, 0.9931, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         1.0000],
        [0.0000, 0.0001, 0.0002, 0.0068, 0.0121, 0.9807, 0.0001, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0091, 0.0015, 0.0168, 0.0222, 0.9504, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0395, 0.0021, 0.0530, 0.0863, 0.8190, 0.0001, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0001, 0.0000, 0.0002, 0.0003, 0.0018, 0.0000, 0.0000, 0.0000,
         0.9976],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0

In [None]:
# Show the first rows of the experiment log summary, restricted to the columns of interest.
summary_columns = ['dt_created', 'num_epochs', 'learning_rate', 'clip_grad_max_norm', 'val_loss']
experiment_summary = summarize_results(load_experiment_log())
experiment_summary[summary_columns].head()

In [None]:
# Plot the learning curve for `results`.
# NOTE(review): `results` is not defined in this cell — presumably returned by the training run above; confirm.
plot_single_learning_curve(results)

In [None]:
# Metrics noted by hand for reference (presumably the last epoch of the run plotted here — confirm):
# Epoch: 199.00, Train Loss: 0.32, Val Loss: 13.19, Train BLEU: 98.94, Val BLEU: 0.27
plot_single_learning_curve(results)

In [None]:
# Metrics noted by hand for a run with the attention energies computed as:
# with attention energies = v_broadcast.bmm(torch.tanh(self.attn(concat)).transpose(1, 2)) # switched order  
# Epoch: 199.00, Train Loss: 0.63, Val Loss: 12.82, Train BLEU: 92.05, Val BLEU: 0.38
# NOTE(review): presumably the last epoch of the run plotted below — confirm.
plot_single_learning_curve(results)

In [None]:
# Print the first 20 entries of vocab[SRC_LANG]['id2token'] as "index: token" lines.
# Stop once 20 entries have been shown instead of iterating over the whole vocabulary.
for i, token in enumerate(vocab[SRC_LANG]['id2token']):
    if i >= 20:
        break
    print("{}: {}".format(i, token))

In [None]:
# Print the first 20 entries of vocab[TARG_LANG]['id2token'] as "index: token" lines.
# Stop once 20 entries have been shown instead of iterating over the whole vocabulary.
for i, token in enumerate(vocab[TARG_LANG]['id2token']):
    if i >= 20:
        break
    print("{}: {}".format(i, token))

In [None]:
import torch

# Scratch experiment: how slicing and view() interact on a 3-d tensor.
# x holds the integers 0..149 laid out as 3 x 5 x 10.
x = torch.arange(150).view(3, 5, 10)
print(x)

# Drop the first slice along dim 0, leaving a 2 x 5 x 10 tensor.
y = x[1:]
print(y)

# Merge the two leading dims so every row keeps the last dim's 10 values.
z = y.view(-1, y.size(-1))
print(z)

In [None]:
# Scratch experiment: flattening a tensor before and after swapping its two dims.
# t holds the integers 0..9 laid out as 5 x 2.
t = torch.arange(10).view(5, 2)
print(t)

# Flatten t in its original (row-major) order.
u = t.contiguous().view(t.numel())
print(u)

# Swap the two dims to get a 2 x 5 tensor.
v = t.transpose(0, 1)
print(v)

# Flatten the swapped tensor; contiguous() is called first, then the result is viewed as 1-d.
w = v.contiguous().view(v.numel())
print(w)

In [None]:
# Scratch experiment: view a flat tensor of 2*1*300 values as (-1, 1, 300) and check the inferred size.
a = torch.arange(2 * 1 * 300)
print(a)

b = a.view(-1, 1, 300)
print(b.shape)

In [None]:
# Take only the first batch from the training loader (the loop breaks after one pass)
# and turn its source index rows back into space-joined token strings.
for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(loaders_full['train']):
    id2token = vocab[SRC_LANG]['id2token']

    def to_token(l):
        """Join the tokens looked up in id2token for every index in l."""
        return ' '.join(id2token[idx] for idx in l)

    # Keep a handle on the batch so later scratch cells can use it.
    test_tensor = src_idxs
    list_of_lists = test_tensor.numpy().astype(int).tolist()
    list_of_lists_tokens = list(map(to_token, list_of_lists))
    break

In [None]:
# Scratch check of the masking call used in the attention code:
#attn.data.masked_fill_(self.mask, -float('inf'))
# The fill runs on a float copy for two reasons: inf cannot be stored in an integer tensor,
# and test_tensor itself stays unchanged, so the cell gives the same result on every run.
# NOTE(review): 2 is presumably a special-token id (e.g. padding) — confirm against the vocab listing.
masked_demo = test_tensor.clone().float()
masked_demo.masked_fill_(test_tensor == 2, float('inf'))