<a href="https://colab.research.google.com/github/mausamsion/playing_with_Sequences/blob/master/english_to_hindi_machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Runtime setup: install the Indic NLP Library (provides the Hindi tokenizer imported
# below) and torchtext 0.6.0.
# NOTE(review): torchtext is pinned, presumably because the notebook relies on the
# Field / TabularDataset / BucketIterator API; indic-nlp-library is not pinned.
!pip install indic-nlp-library
!pip install torchtext==0.6.0

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Tue Sep  8 09:22:56 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Download dataset

In [None]:
!mkdir mtdata
!tar -C mtdata -xvzf drive/My\ Drive/datasets/hindi_english_mt/dev_test.tgz
# !tar -C mtdata -xvzf drive/My\ Drive/datasets/hindi_english_mt/monolingual.hi.tgz
!tar -C mtdata -xvzf drive/My\ Drive/datasets/hindi_english_mt/parallel.tgz
# !tar -C mtdata -xvzf drive/My\ Drive/datasets/hindi_english_mt/prunedCorpus.tar.gz
# !tar -C mtdata -xvzf drive/My\ Drive/datasets/hindi_english_mt/xlit-iitb-par.tgz

dev_test/
dev_test/dev.hi
dev_test/test.hi
dev_test/dev.en
dev_test/test.en
parallel/
parallel/IITB.en-hi.hi
parallel/IITB.en-hi.en


## Preparing CSVs

In [None]:
# # Reading train data
# with open('mtdata/parallel/IITB.en-hi.hi', 'rb') as f:
#     train_hi = f.readlines()
# train_hi = [i[:-1].decode() for i in train_hi]

# with open('mtdata/parallel/IITB.en-hi.en', 'rb') as f:
#     train_en = f.readlines()
# train_en = [i[:-1].decode() for i in train_en]

# # Reading dev data
# with open('mtdata/dev_test/dev.hi', 'rb') as f:
#     val_hi = f.readlines()
# val_hi = [i[:-1].decode() for i in val_hi]

# with open('mtdata/dev_test/dev.en', 'rb') as f:
#     val_en = f.readlines()
# val_en = [i[:-1].decode() for i in val_en]

# # Reading test data
# with open('mtdata/dev_test/test.hi', 'rb') as f:
#     test_hi = f.readlines()
# test_hi = [i[:-1].decode() for i in test_hi]

# with open('mtdata/dev_test/test.en', 'rb') as f:
#     test_en = f.readlines()
# test_en = [i[:-1].decode() for i in test_en]

# # ------------------------------------------

# import pandas as pd

# train_pairs = pd.DataFrame()
# train_pairs['ENGLISH'] = train_en
# train_pairs['HINDI'] = train_hi
# train_pairs.to_csv('drive/My Drive/datasets/hindi_english_mt/train_pairs.csv', index=None)
# train_pairs_small = train_pairs.sample(frac=0.4, replace=False)
# train_pairs_small.to_csv('drive/My Drive/datasets/hindi_english_mt/train_pairs_small.csv', index=None)

# val_pairs = pd.DataFrame()
# val_pairs['ENGLISH'] = val_en
# val_pairs['HINDI'] = val_hi
# val_pairs.to_csv('drive/My Drive/datasets/hindi_english_mt/val_pairs.csv', index=None)

# test_pairs = pd.DataFrame()
# test_pairs['ENGLISH'] = test_en
# test_pairs['HINDI'] = test_hi
# test_pairs.to_csv('drive/My Drive/datasets/hindi_english_mt/test_pairs.csv', index=None)

# del train_en, train_hi, val_en, val_hi, test_en, test_hi, train_pairs, val_pairs, test_pairs

# !wc -l drive/My\ Drive/datasets/hindi_english_mt/*csv

In [None]:
# Line counts of the prepared CSV files on Drive.
!wc -l drive/My\ Drive/datasets/hindi_english_mt/*csv

     2508 drive/My Drive/datasets/hindi_english_mt/test_pairs.csv
  1561841 drive/My Drive/datasets/hindi_english_mt/train_pairs.csv
   624737 drive/My Drive/datasets/hindi_english_mt/train_pairs_small.csv
      521 drive/My Drive/datasets/hindi_english_mt/val_pairs.csv
  2189607 total


## Imports

In [None]:
import numpy as np
import pandas as pd
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from torchtext.data.metrics import bleu_score as BLEU_SCORE

from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from indicnlp.tokenize import indic_tokenize

In [None]:
# One seed shared by every random number generator the notebook uses.
SEED = 1234
random.seed(SEED)            # Python's built-in RNG
np.random.seed(SEED)         # NumPy's global RNG
torch.manual_seed(SEED)      # PyTorch default generator
torch.cuda.manual_seed(SEED) # PyTorch CUDA generator
# Request deterministic cuDNN algorithms.
# NOTE(review): cudnn.benchmark is not set here — confirm whether that matters for
# the reproducibility you need.
torch.backends.cudnn.deterministic = True

## Tokenizer, Dataset, Iterator

In [None]:
# English side: spaCy pipeline loaded through the 'en' shortcut name; in the code
# shown here only its `.tokenizer` is used.
spacy_en = spacy.load('en') 
# Hindi side: the Indic NLP Library's trivial tokenizer function.
indic_hi = indic_tokenize.trivial_tokenize_indic

def tokenize_en(text):
    """Tokenize English `text` with spaCy and return the token strings.

    Tokens that spaCy flags as quotation marks (`is_quote`) are left out.
    """
    kept = []
    for token in spacy_en.tokenizer(text):
        if token.is_quote:
            continue
        kept.append(token.text)
    return kept

def tokenize_hi(text):
    """Tokenize Hindi `text` with the Indic tokenizer, dropping bare quote characters.

    Only tokens that are exactly a double or single ASCII quote are removed.
    """
    quote_marks = ('"', "'")
    return [piece for piece in indic_hi(text) if piece not in quote_marks]

In [None]:
# Source (English) field: tokenized by tokenize_en, lower-cased, and wrapped in
# <sos> ... <eos> markers.
SRC = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
# Target (Hindi) field: tokenized by tokenize_hi and wrapped in the same markers;
# `lower` is not passed here.
TRG = Field(tokenize=tokenize_hi,
            init_token='<sos>',
            eos_token='<eos>')

In [None]:
# Load the three CSV splits from Drive. The header row is skipped and the two CSV
# columns are bound, in order, to the attributes `English` (SRC) and `Hindi` (TRG).
# Training uses the sub-sampled file train_pairs_small.csv, not the full train_pairs.csv.
train_dataset, val_dataset, test_dataset = TabularDataset.splits(
    path='drive/My Drive/datasets/hindi_english_mt', 
    train='train_pairs_small.csv', validation='val_pairs.csv', test='test_pairs.csv', format='csv',
    fields=[('English', SRC), ('Hindi', TRG)], 
    skip_header=True
)
# Report the number of examples in each split.
print('train:', len(train_dataset), 
      ', val:', len(val_dataset), 
      ', test:', len(test_dataset))

train: 624736 , val: 520 , test: 2507


In [None]:
# Sanity check: show the tokenized English and Hindi sides of the first test example.
print(test_dataset[0].English)
print(test_dataset[0].Hindi)

In [None]:
# Build the source and target vocabularies, each capped at 70000 entries via max_size.
# NOTE(review): validation and test data are passed in alongside the training data, so
# held-out tokens can enter the vocabulary. Building from train_dataset only would
# avoid that, but may change token ids relative to any checkpoint trained with this
# vocabulary — confirm before changing.
SRC.build_vocab(train_dataset, val_dataset, test_dataset, max_size=70000)
TRG.build_vocab(train_dataset, val_dataset, test_dataset, max_size=70000)

In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')
    
def to_device(data, device):
    """Move `data` to `device`.

    A list or tuple is handled element by element (recursively) and always comes
    back as a list; anything else is moved with `.to(device, non_blocking=True)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Wrap an iterable of batches so each batch is moved to `device` when yielded."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        # Same length as the wrapped loader.
        return len(self.dl)

    def __iter__(self):
        # Lazily move batches one at a time as the consumer asks for them.
        yield from map(lambda batch: to_device(batch, self.device), self.dl)
    
# Device shared by the iterators and the model below; the bare name displays it.
device = get_default_device()
device

device(type='cuda')

In [None]:
BATCH_SIZE = 8
# Batch size for validation/test. evaluate() averages per batch, so this value
# affects the reported metrics; it is named once instead of repeating a bare 8.
EVAL_BATCH_SIZE = 8

# Training batches are bucketed by English sentence length (sort_key) so that
# sentences of similar length end up in the same batch.
train_iter = BucketIterator(train_dataset, batch_size=BATCH_SIZE, device=device, 
                            sort_key=lambda x: len(x.English), sort_within_batch=False, 
                            repeat=False)

# Validation and test data are iterated without sorting.
val_iter = Iterator(val_dataset, batch_size=EVAL_BATCH_SIZE, train=False, device=device, 
                    sort=False, sort_within_batch=False, repeat=False)

test_iter = Iterator(test_dataset, batch_size=EVAL_BATCH_SIZE, train=False, device=device, 
                     sort=False, sort_within_batch=False, repeat=False)

In [None]:
# What's inside batch
# Print the first training batch object, then decode its first sentence pair back
# to tokens. The loop variables are named `batch` / `tok_id` so the builtin `id`
# is not overwritten in the notebook's global namespace.
for batch in train_iter:
    print(batch, '\n')
    # .T turns [seq len, batch size] into one row of token ids per sentence.
    for src_ids, trg_ids in zip(batch.English.T, batch.Hindi.T):
        for tok_id in src_ids:
            print(SRC.vocab.itos[tok_id], end=' ')
        print('\n')
        for tok_id in trg_ids:
            print(TRG.vocab.itos[tok_id], end=' ')
        print('\n\n')
        break  # first sentence pair only
    break  # first batch only

## Encoder-Decoder Model

In [None]:
# Vanilla encoder-decoder architecture from the following paper:
# "Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation"
# by Cho et al. (https://arxiv.org/pdf/1406.1078.pdf)

class Encoder(nn.Module):
    """Embed source token ids, run them through a GRU and project the final state.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding width.
        enc_hid_dim: hidden size of the encoder GRU.
        dec_hid_dim: output width of the linear projection applied to the final state.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        # Attribute names and creation order are unchanged, so state_dict keys
        # (and therefore saved checkpoints) stay compatible.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=enc_hid_dim)
        self.linear = nn.Linear(in_features=enc_hid_dim, out_features=dec_hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input):
        """Return `(output, hidden)`: all GRU outputs and tanh(linear(final state))."""
        embedded = self.embedding(input)
        rnn_outputs, final_state = self.rnn(self.dropout(embedded))
        projected = self.linear(final_state).tanh()
        return rnn_outputs, projected

class Decoder(nn.Module):
    """Single-step GRU decoder that scores the next target token.

    Args:
        output_dim: size of the target vocabulary; also kept as `self.output_dim`.
        emb_dim: embedding width.
        enc_hid_dim: accepted for signature symmetry but not used by this class.
        dec_hid_dim: hidden size of the decoder GRU; `hidden` and `context` passed to
            `forward` must each have this width, since the GRU input width is
            `emb_dim + dec_hid_dim*2`.
        dropout: dropout probability applied to the embeddings.

    NOTE(review): the GRU is called without an explicit initial state, so on every
    step it starts from PyTorch's default all-zero state; the previous hidden state
    reaches it only through the concatenated input. Confirm this is intended —
    switching to `self.rnn(x, hidden)` would change what existing weights compute.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.output_dim = output_dim
        # Attribute names and creation order are unchanged for checkpoint compatibility.
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim + dec_hid_dim*2, hidden_size=dec_hid_dim)
        self.linear = nn.Linear(in_features=dec_hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, context):
        """Run one decoding step.

        `input` holds one token id per batch element; its embedding gets a leading
        length-1 axis and is concatenated with `hidden` and `context` along dim 2.
        Returns `(scores, new_hidden)` where `scores` is the linear layer applied to
        the GRU output.
        """
        token_emb = self.dropout(self.embedding(input)).unsqueeze(0)
        step_features = torch.cat((token_emb, hidden, context), dim=2)
        rnn_out, next_hidden = self.rnn(step_features)
        return self.linear(rnn_out), next_hidden


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes with teacher forcing.

    The encoder's projected final state is used both as the decoder's first hidden
    state and as the fixed `context` passed to every decoding step.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source_sent, target_sent='', custom_test='False'):
        """Score every target position given the reference prefix.

        Args:
            source_sent: source token ids, [src len, batch size].
            target_sent: reference target token ids, [trg len, batch size]. The
                default `''` is only a placeholder: a tensor is required, because
                every step feeds the reference token (teacher forcing).
            custom_test: currently ignored. Free-running (greedy) inference without
                a reference is not implemented; note the default is the *string*
                'False', which is truthy.

        Returns:
            Tensor [trg len, batch size, output dim]. Row 0 stays all zeros; row t
            (t >= 1) holds the scores produced after feeding target_sent[t-1].
        """
        target_len, batch_size = target_sent.shape[0], target_sent.shape[1]
        target_vocab_size = self.decoder.output_dim
        # Allocate on the input's device instead of a notebook-global `device`, so
        # the module works wherever the model and its inputs live.
        decoder_outputs = torch.zeros(target_len, batch_size, target_vocab_size,
                                      device=source_sent.device)
        _, encoder_hidden = self.encoder(source_sent)
        decoder_hidden = encoder_hidden
        decoder_input = target_sent[0, :]  # the <sos> row
        for t in range(1, target_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_hidden)
            decoder_outputs[t] = decoder_output
            # Teacher forcing: the next input is the reference token, not the prediction.
            decoder_input = target_sent[t]
        return decoder_outputs

In [None]:
smoothing = SmoothingFunction().method3
def get_bleu(output, target, print_seqs=False):
    """Mean smoothed sentence-level BLEU of greedy predictions against references.

    Args:
        output: model scores, [trg len, batch size, output dim]. The highest-scoring
            token id is taken at each position.
        target: reference token ids, [trg len, batch size].
        print_seqs: when True, print each (reference, prediction) id list.

    For both references and predictions the first position is skipped and the
    sequence is cut at the first '<eos>'. BLEU is computed per sentence on token
    ids with NLTK's `corpus_bleu` and averaged over the batch.

    Returns:
        The batch-average BLEU as a float.
    """
    eos_idx = TRG.vocab.stoi['<eos>']

    def trim_pad(sequences):
        # Drop the leading position and everything from the first <eos> onwards.
        trimmed = []
        for seq in sequences:
            body = seq[1:]
            if eos_idx in body:
                body = body[:body.index(eos_idx)]
            trimmed.append(body)
        return trimmed

    # argmax over the vocabulary axis directly: softmax is monotonic, so applying it
    # first cannot change the winner. Converting once with tolist() avoids a
    # tensor -> Python conversion per token.
    predicted_ids = output.argmax(dim=2).T.tolist()
    target = trim_pad(target.T.tolist())
    output = trim_pad(predicted_ids)

    tmp_bleu = 0
    assert len(target) == len(output)
    for tar, out in zip(target, output):
        tmp_bleu += corpus_bleu([[tar]], [out], smoothing_function=smoothing)
    tmp_bleu = tmp_bleu / len(target)

    if print_seqs:
        for tar, out in zip(target, output):
            print('----------------------')
            print(tar)
            print(out)
        print('----------------------')

    return tmp_bleu


def get_acc(references, hypotheses, acc_type='word'):
    """Average word-level or sentence-level accuracy over paired sequences.

    Args:
        references: reference sequences; the first and last element of each
            (the boundary markers) are ignored.
        hypotheses: predicted sequences, paired with `references` by position and
            trimmed the same way.
        acc_type: 'word_acc' (alias 'word', the default) or 'sent_acc' (alias 'sent').

    Returns:
        Mean accuracy over `hypotheses`; 0.0 when `hypotheses` is empty.
        * word accuracy: position-wise matches divided by the hypothesis length.
        * sentence accuracy: 1.0 only when both sequences have the same length and
          agree at every position.

    Raises:
        AssertionError: if `acc_type` is not one of the accepted values.
    """
    # The short names are accepted so that the default argument is actually valid;
    # previously the default 'word' always tripped the assertion below.
    if acc_type == 'word':
        acc_type = 'word_acc'
    elif acc_type == 'sent':
        acc_type = 'sent_acc'
    assert acc_type == 'word_acc' or acc_type == 'sent_acc'

    if len(hypotheses) == 0:
        return 0.

    cum_acc = 0.
    for ref, hyp in zip(references, hypotheses):
        ref = ref[1:-1]
        hyp = hyp[1:-1]
        if acc_type == 'word_acc':
            # 1e-6 guards against an empty hypothesis.
            matches = sum(1 for ref_w, hyp_w in zip(ref, hyp) if ref_w == hyp_w)
            acc = matches / float(len(hyp) + 1e-6)
        else:
            # zip() stops at the shorter sequence, so lengths must be compared too;
            # otherwise a truncated (or empty) hypothesis would count as exact.
            same_length = len(ref) == len(hyp)
            acc = 1. if same_length and all(ref_w == hyp_w for ref_w, hyp_w in zip(ref, hyp)) else 0.
        cum_acc += acc

    return cum_acc / len(hypotheses)

In [None]:
# Vocabulary sizes drive the embedding tables and the decoder's output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding widths.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# GRU hidden sizes. The Decoder accepts ENC_HID_DIM but does not use it.
ENC_HID_DIM = 512
DEC_HID_DIM = 512
# Dropout probability applied to the embeddings on each side.
ENC_DROPOUT = 0.6
DEC_DROPOUT = 0.6
ENC = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
DEC = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT)

In [None]:
def init_weights(m):
    """Initialise the parameters of module `m` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2); every
    other parameter (e.g. biases) is set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)

In [None]:
# Path of the checkpoint to warm-start from; set to '' to train from scratch.
checkpoint = 'drive/My Drive/datasets/hindi_english_mt/best_model_1.pt'
ckp = None
if checkpoint != '':
    # map_location remaps stored tensors onto the current `device`, so a checkpoint
    # saved on a GPU still loads on a CPU-only runtime.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    ckp = torch.load(checkpoint, map_location=device)

model = Seq2Seq(ENC, DEC).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

if ckp is not None:
    # Only the model weights are restored; the optimizer starts fresh.
    model.load_state_dict(ckp['state_dict'])
    # optimizer.load_state_dict(ckp['optimizer'])
else:
    model.apply(init_weights)


In [None]:
def count_parameters(model):
    """Return the total number of elements in `model`'s parameters that require grad."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 75,955,060 trainable parameters


In [None]:
# Index of the target padding token; positions holding it are excluded from the loss.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator`.

    Args:
        model: called as `model(src, trg)`.
        iterator: yields batches exposing `.English` (source) and `.Hindi` (target).
        optimizer: stepped once per batch.
        criterion: loss applied to the flattened scores and targets.
        clip: maximum gradient norm passed to `clip_grad_norm_`.

    Returns:
        `(mean loss per batch, mean BLEU per batch)`.
    """
    model.train()
    loss_total, bleu_total = 0, 0
    for batch in iterator:
        src, trg = batch.English, batch.Hindi
        optimizer.zero_grad()
        output = model(src, trg)
        # trg    = [trg len, batch size]
        # output = [trg len, batch size, output dim]
        bleu_total += get_bleu(output, trg)
        # Skip position 0 and flatten time and batch together for the loss:
        # flat_trg    = [(trg len - 1) * batch size]
        # flat_output = [(trg len - 1) * batch size, output dim]
        flat_output = output[1:].view(-1, output.shape[-1])
        flat_trg = trg[1:].view(-1)
        loss = criterion(flat_output, flat_trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        loss_total += loss.item()
    n_batches = len(iterator)
    return loss_total / n_batches, bleu_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Compute mean loss and BLEU over `iterator` without updating the model.

    The model is switched to eval mode (and left there) and called as
    `model(src, trg)`, i.e. it is given the reference target just as in training.

    Returns:
        `(mean loss per batch, mean BLEU per batch)`.
    """
    model.eval()
    loss_total, bleu_total = 0, 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.English, batch.Hindi
            output = model(src, trg)
            # trg    = [trg len, batch size]
            # output = [trg len, batch size, output dim]
            bleu_total += get_bleu(output, trg)
            # Skip position 0 and flatten time and batch together for the loss:
            # flat_trg    = [(trg len - 1) * batch size]
            # flat_output = [(trg len - 1) * batch size, output dim]
            flat_output = output[1:].view(-1, output.shape[-1])
            flat_trg = trg[1:].view(-1)
            loss_total += criterion(flat_output, flat_trg).item()
    n_batches = len(iterator)
    return loss_total / n_batches, bleu_total / n_batches

In [None]:
def epoch_time(start_time, end_time):
    """Split the time between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [None]:
# Step-based training driver: trains until STEPS optimizer updates have been made,
# validating, logging and checkpointing every `log_steps` updates.
STEPS = 200000      # stop once global_steps reaches this value
N_EPOCHS = 1000     # upper bound on passes over train_iter
CLIP = 1            # max gradient norm passed to clip_grad_norm_
log_steps = 2000    # validate / log / checkpoint every this many steps
# NOTE(review): starts at +inf even when weights were restored from a checkpoint,
# so the first validation always counts as "best" and writes a checkpoint.
best_val_loss = float('inf')

# NOTE(review): the step counter starts at 0 on every run; it is not read back
# from a loaded checkpoint.
global_steps = 0
# Running sums over the current logging window; reset after each log line.
step_time = 0
train_loss = 0
bscore = 0

for epoch in range(N_EPOCHS):
    model.train()
    for i, batch in enumerate(train_iter):
        # Timing starts after the batch has been produced, so it covers the
        # forward/backward/update work (and the BLEU computation) only.
        start_time = time.time()
        
        src = batch.English
        trg = batch.Hindi
        optimizer.zero_grad()
        output = model(src, trg)
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        bscore += get_bleu(output, trg)
        output_dim = output.shape[-1]
        # Skip position 0 and flatten for the loss.
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        train_loss += loss.item()
        
        end_time = time.time()
        step_time += end_time - start_time
        
        global_steps += 1
        if global_steps % log_steps == 0:
            # Averages over the last `log_steps` training steps.
            avg_train_loss = train_loss / log_steps
            avg_train_bleu = bscore / log_steps
            avg_step_time = step_time / log_steps
            val_loss, val_bleu = evaluate(model, val_iter, criterion)
            # Keep the checkpoint with the lowest validation loss seen in this run.
            # It is written to best_model_2.pt, a different file from the
            # best_model_1.pt that the setup cell loads.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                cp = {'global_steps': global_steps, 
                      'state_dict': model.state_dict(),
                      'optimizer': optimizer.state_dict()}
                torch.save(cp, 'drive/My Drive/datasets/hindi_english_mt/best_model_2.pt')
            print(f'Global step: {global_steps} | '
                  f'Avg_step_time: {avg_step_time:.2f}s | '
                  f'Train Loss: {avg_train_loss:.4f} | '
                  f'Val. Loss: {val_loss:.4f} | '
                  f'Train Bleu: {avg_train_bleu:.4f} | '
                  f'Val. Bleu: {val_bleu:.4f}')
            # Start a fresh logging window.
            train_loss = 0
            bscore = 0
            step_time = 0
            # evaluate() left the model in eval mode; switch back before training on.
            model.train()
        
        # Leave the batch loop once the step budget is used up ...
        if global_steps == STEPS:
            break
    # ... and leave the epoch loop as well.
    if global_steps == STEPS:
            break

Global step: 2000 | Avg_step_time: 0.28s | Train Loss: 3.9194 | Val. Loss: 5.3395 | Train Bleu: 0.1155 | Val. Bleu: 0.0692
Global step: 4000 | Avg_step_time: 0.28s | Train Loss: 3.8808 | Val. Loss: 5.3391 | Train Bleu: 0.1212 | Val. Bleu: 0.0694
Global step: 6000 | Avg_step_time: 0.28s | Train Loss: 3.8903 | Val. Loss: 5.3374 | Train Bleu: 0.1222 | Val. Bleu: 0.0687
Global step: 8000 | Avg_step_time: 0.28s | Train Loss: 3.8852 | Val. Loss: 5.3318 | Train Bleu: 0.1198 | Val. Bleu: 0.0688
Global step: 10000 | Avg_step_time: 0.27s | Train Loss: 3.9907 | Val. Loss: 5.2981 | Train Bleu: 0.1200 | Val. Bleu: 0.0698
Global step: 12000 | Avg_step_time: 0.28s | Train Loss: 4.0364 | Val. Loss: 5.2789 | Train Bleu: 0.1161 | Val. Bleu: 0.0687
Global step: 14000 | Avg_step_time: 0.28s | Train Loss: 3.8710 | Val. Loss: 5.2878 | Train Bleu: 0.1236 | Val. Bleu: 0.0692
Global step: 16000 | Avg_step_time: 0.28s | Train Loss: 3.8140 | Val. Loss: 5.2810 | Train Bleu: 0.1281 | Val. Bleu: 0.0688
Global step:

In [None]:
# Global step: 1000 | Avg_step_time: 0.45s | Train Loss: 7.2874 | Val. Loss: 7.2485 | Train Bleu: 0.0136 | Val. Bleu: 0.0000
# Global step: 2000 | Avg_step_time: 0.45s | Train Loss: 6.8569 | Val. Loss: 7.0490 | Train Bleu: 0.0185 | Val. Bleu: 0.0000
# Global step: 3000 | Avg_step_time: 0.44s | Train Loss: 6.6827 | Val. Loss: 6.9799 | Train Bleu: 0.0202 | Val. Bleu: 0.0027
# Global step: 4000 | Avg_step_time: 0.43s | Train Loss: 6.6085 | Val. Loss: 6.8838 | Train Bleu: 0.0190 | Val. Bleu: 0.0017
# Global step: 5000 | Avg_step_time: 0.43s | Train Loss: 6.4932 | Val. Loss: 6.7985 | Train Bleu: 0.0221 | Val. Bleu: 0.0094
# Global step: 6000 | Avg_step_time: 0.45s | Train Loss: 6.4123 | Val. Loss: 6.7590 | Train Bleu: 0.0241 | Val. Bleu: 0.0070
# Global step: 7000 | Avg_step_time: 0.45s | Train Loss: 6.3443 | Val. Loss: 6.7281 | Train Bleu: 0.0237 | Val. Bleu: 0.0081
# Global step: 8000 | Avg_step_time: 0.45s | Train Loss: 6.2755 | Val. Loss: 6.6502 | Train Bleu: 0.0239 | Val. Bleu: 0.0091
# Global step: 9000 | Avg_step_time: 0.43s | Train Loss: 6.2054 | Val. Loss: 6.6247 | Train Bleu: 0.0269 | Val. Bleu: 0.0101
# Global step: 10000 | Avg_step_time: 0.44s | Train Loss: 6.1481 | Val. Loss: 6.5855 | Train Bleu: 0.0258 | Val. Bleu: 0.0064
# Global step: 11000 | Avg_step_time: 0.44s | Train Loss: 6.1050 | Val. Loss: 6.5562 | Train Bleu: 0.0263 | Val. Bleu: 0.0071
# Global step: 12000 | Avg_step_time: 0.44s | Train Loss: 6.0624 | Val. Loss: 6.5343 | Train Bleu: 0.0244 | Val. Bleu: 0.0087
# Global step: 13000 | Avg_step_time: 0.44s | Train Loss: 5.9604 | Val. Loss: 6.4796 | Train Bleu: 0.0317 | Val. Bleu: 0.0326
# Global step: 14000 | Avg_step_time: 0.44s | Train Loss: 5.8751 | Val. Loss: 6.4408 | Train Bleu: 0.0368 | Val. Bleu: 0.0393
# Global step: 15000 | Avg_step_time: 0.47s | Train Loss: 5.8402 | Val. Loss: 6.4061 | Train Bleu: 0.0375 | Val. Bleu: 0.0372
# Global step: 16000 | Avg_step_time: 0.44s | Train Loss: 5.8188 | Val. Loss: 6.3704 | Train Bleu: 0.0406 | Val. Bleu: 0.0395
# Global step: 17000 | Avg_step_time: 0.45s | Train Loss: 5.7751 | Val. Loss: 6.3449 | Train Bleu: 0.0399 | Val. Bleu: 0.0409
# Global step: 18000 | Avg_step_time: 0.45s | Train Loss: 5.6975 | Val. Loss: 6.3040 | Train Bleu: 0.0412 | Val. Bleu: 0.0422
# Global step: 19000 | Avg_step_time: 0.44s | Train Loss: 5.6821 | Val. Loss: 6.3286 | Train Bleu: 0.0435 | Val. Bleu: 0.0423
# Global step: 20000 | Avg_step_time: 0.44s | Train Loss: 5.6444 | Val. Loss: 6.2735 | Train Bleu: 0.0445 | Val. Bleu: 0.0414
# Global step: 21000 | Avg_step_time: 0.45s | Train Loss: 5.6347 | Val. Loss: 6.2511 | Train Bleu: 0.0451 | Val. Bleu: 0.0426
# Global step: 22000 | Avg_step_time: 0.44s | Train Loss: 5.5880 | Val. Loss: 6.2053 | Train Bleu: 0.0461 | Val. Bleu: 0.0432
# Global step: 23000 | Avg_step_time: 0.44s | Train Loss: 5.5875 | Val. Loss: 6.2302 | Train Bleu: 0.0450 | Val. Bleu: 0.0427
# Global step: 24000 | Avg_step_time: 0.44s | Train Loss: 5.5366 | Val. Loss: 6.2192 | Train Bleu: 0.0471 | Val. Bleu: 0.0449
# Global step: 25000 | Avg_step_time: 0.45s | Train Loss: 5.5287 | Val. Loss: 6.2073 | Train Bleu: 0.0474 | Val. Bleu: 0.0443
# Global step: 26000 | Avg_step_time: 0.45s | Train Loss: 5.5349 | Val. Loss: 6.1741 | Train Bleu: 0.0507 | Val. Bleu: 0.0444
# Global step: 27000 | Avg_step_time: 0.44s | Train Loss: 5.4930 | Val. Loss: 6.1724 | Train Bleu: 0.0503 | Val. Bleu: 0.0447
# Global step: 28000 | Avg_step_time: 0.48s | Train Loss: 5.4818 | Val. Loss: 6.1400 | Train Bleu: 0.0516 | Val. Bleu: 0.0442
# Global step: 29000 | Avg_step_time: 0.45s | Train Loss: 5.4495 | Val. Loss: 6.1142 | Train Bleu: 0.0513 | Val. Bleu: 0.0454
# Global step: 30000 | Avg_step_time: 0.45s | Train Loss: 5.4226 | Val. Loss: 6.1115 | Train Bleu: 0.0523 | Val. Bleu: 0.0450
# Global step: 31000 | Avg_step_time: 0.44s | Train Loss: 5.4017 | Val. Loss: 6.1262 | Train Bleu: 0.0529 | Val. Bleu: 0.0455
# Global step: 32000 | Avg_step_time: 0.45s | Train Loss: 5.4129 | Val. Loss: 6.0849 | Train Bleu: 0.0522 | Val. Bleu: 0.0470
# Global step: 33000 | Avg_step_time: 0.45s | Train Loss: 5.3834 | Val. Loss: 6.0847 | Train Bleu: 0.0546 | Val. Bleu: 0.0451
# Global step: 34000 | Avg_step_time: 0.46s | Train Loss: 5.3263 | Val. Loss: 6.0765 | Train Bleu: 0.0569 | Val. Bleu: 0.0465
# Global step: 35000 | Avg_step_time: 0.46s | Train Loss: 5.3109 | Val. Loss: 6.0739 | Train Bleu: 0.0549 | Val. Bleu: 0.0477
# Global step: 36000 | Avg_step_time: 0.47s | Train Loss: 5.2960 | Val. Loss: 6.0581 | Train Bleu: 0.0554 | Val. Bleu: 0.0472
# Global step: 37000 | Avg_step_time: 0.47s | Train Loss: 5.2860 | Val. Loss: 6.0518 | Train Bleu: 0.0568 | Val. Bleu: 0.0483
# Global step: 38000 | Avg_step_time: 0.44s | Train Loss: 5.2539 | Val. Loss: 6.0308 | Train Bleu: 0.0567 | Val. Bleu: 0.0460
# Global step: 39000 | Avg_step_time: 0.45s | Train Loss: 5.2928 | Val. Loss: 6.0032 | Train Bleu: 0.0595 | Val. Bleu: 0.0466
# Global step: 40000 | Avg_step_time: 0.46s | Train Loss: 5.2489 | Val. Loss: 6.0163 | Train Bleu: 0.0569 | Val. Bleu: 0.0478
# Global step: 41000 | Avg_step_time: 0.45s | Train Loss: 5.2265 | Val. Loss: 5.9990 | Train Bleu: 0.0595 | Val. Bleu: 0.0475
# Global step: 42000 | Avg_step_time: 0.45s | Train Loss: 5.2154 | Val. Loss: 5.9700 | Train Bleu: 0.0595 | Val. Bleu: 0.0493
# Global step: 43000 | Avg_step_time: 0.44s | Train Loss: 5.1963 | Val. Loss: 5.9610 | Train Bleu: 0.0605 | Val. Bleu: 0.0482
# Global step: 44000 | Avg_step_time: 0.44s | Train Loss: 5.1567 | Val. Loss: 5.9695 | Train Bleu: 0.0595 | Val. Bleu: 0.0483
# Global step: 45000 | Avg_step_time: 0.46s | Train Loss: 5.1188 | Val. Loss: 5.9379 | Train Bleu: 0.0616 | Val. Bleu: 0.0489
# Global step: 46000 | Avg_step_time: 0.45s | Train Loss: 5.1374 | Val. Loss: 5.9547 | Train Bleu: 0.0616 | Val. Bleu: 0.0489
# Global step: 47000 | Avg_step_time: 0.47s | Train Loss: 5.1879 | Val. Loss: 5.9247 | Train Bleu: 0.0608 | Val. Bleu: 0.0486
# Global step: 48000 | Avg_step_time: 0.44s | Train Loss: 5.0934 | Val. Loss: 5.9262 | Train Bleu: 0.0639 | Val. Bleu: 0.0479
# Global step: 49000 | Avg_step_time: 0.46s | Train Loss: 5.0855 | Val. Loss: 5.8980 | Train Bleu: 0.0650 | Val. Bleu: 0.0497
# Global step: 50000 | Avg_step_time: 0.45s | Train Loss: 5.0911 | Val. Loss: 5.8933 | Train Bleu: 0.0636 | Val. Bleu: 0.0485
# Global step: 51000 | Avg_step_time: 0.45s | Train Loss: 5.0742 | Val. Loss: 5.8865 | Train Bleu: 0.0631 | Val. Bleu: 0.0495
# Global step: 52000 | Avg_step_time: 0.45s | Train Loss: 5.0412 | Val. Loss: 5.8828 | Train Bleu: 0.0646 | Val. Bleu: 0.0499
# Global step: 53000 | Avg_step_time: 0.45s | Train Loss: 5.0398 | Val. Loss: 5.8482 | Train Bleu: 0.0642 | Val. Bleu: 0.0496
# Global step: 54000 | Avg_step_time: 0.45s | Train Loss: 5.0180 | Val. Loss: 5.8708 | Train Bleu: 0.0646 | Val. Bleu: 0.0485

# Global step: 2000 | Avg_step_time: 0.30s | Train Loss: 5.3315 | Val. Loss: 6.2003 | Train Bleu: 0.0558 | Val. Bleu: 0.0462
# Global step: 4000 | Avg_step_time: 0.29s | Train Loss: 5.3448 | Val. Loss: 6.1103 | Train Bleu: 0.0574 | Val. Bleu: 0.0484
# Global step: 6000 | Avg_step_time: 0.29s | Train Loss: 5.3109 | Val. Loss: 6.0868 | Train Bleu: 0.0589 | Val. Bleu: 0.0477
# Global step: 8000 | Avg_step_time: 0.29s | Train Loss: 5.2975 | Val. Loss: 6.0407 | Train Bleu: 0.0589 | Val. Bleu: 0.0495
# Global step: 10000 | Avg_step_time: 0.29s | Train Loss: 5.2201 | Val. Loss: 6.0197 | Train Bleu: 0.0615 | Val. Bleu: 0.0497
# Global step: 12000 | Avg_step_time: 0.29s | Train Loss: 5.2164 | Val. Loss: 5.9907 | Train Bleu: 0.0614 | Val. Bleu: 0.0493
# Global step: 14000 | Avg_step_time: 0.29s | Train Loss: 5.1898 | Val. Loss: 5.9506 | Train Bleu: 0.0626 | Val. Bleu: 0.0503
# Global step: 16000 | Avg_step_time: 0.30s | Train Loss: 5.1479 | Val. Loss: 5.9502 | Train Bleu: 0.0641 | Val. Bleu: 0.0528
# Global step: 18000 | Avg_step_time: 0.29s | Train Loss: 5.1007 | Val. Loss: 5.9500 | Train Bleu: 0.0645 | Val. Bleu: 0.0525
# Global step: 20000 | Avg_step_time: 0.29s | Train Loss: 5.0753 | Val. Loss: 5.9107 | Train Bleu: 0.0671 | Val. Bleu: 0.0517
# Global step: 22000 | Avg_step_time: 0.29s | Train Loss: 5.0850 | Val. Loss: 5.8950 | Train Bleu: 0.0676 | Val. Bleu: 0.0518
# Global step: 24000 | Avg_step_time: 0.29s | Train Loss: 5.0563 | Val. Loss: 5.8839 | Train Bleu: 0.0662 | Val. Bleu: 0.0529
# Global step: 26000 | Avg_step_time: 0.29s | Train Loss: 5.0401 | Val. Loss: 5.9325 | Train Bleu: 0.0678 | Val. Bleu: 0.0515
# Global step: 28000 | Avg_step_time: 0.29s | Train Loss: 4.9876 | Val. Loss: 5.8915 | Train Bleu: 0.0701 | Val. Bleu: 0.0536
# Global step: 30000 | Avg_step_time: 0.31s | Train Loss: 4.9898 | Val. Loss: 5.8922 | Train Bleu: 0.0706 | Val. Bleu: 0.0551
# Global step: 32000 | Avg_step_time: 0.30s | Train Loss: 4.9994 | Val. Loss: 5.9162 | Train Bleu: 0.0720 | Val. Bleu: 0.0537
# Global step: 34000 | Avg_step_time: 0.30s | Train Loss: 4.9680 | Val. Loss: 5.9021 | Train Bleu: 0.0722 | Val. Bleu: 0.0539
# Global step: 36000 | Avg_step_time: 0.30s | Train Loss: 4.9362 | Val. Loss: 5.8346 | Train Bleu: 0.0713 | Val. Bleu: 0.0549
# Global step: 38000 | Avg_step_time: 0.29s | Train Loss: 4.9276 | Val. Loss: 5.8316 | Train Bleu: 0.0758 | Val. Bleu: 0.0550
# Global step: 40000 | Avg_step_time: 0.29s | Train Loss: 4.9058 | Val. Loss: 5.8548 | Train Bleu: 0.0763 | Val. Bleu: 0.0538
# Global step: 42000 | Avg_step_time: 0.29s | Train Loss: 4.9413 | Val. Loss: 5.8239 | Train Bleu: 0.0735 | Val. Bleu: 0.0544
# Global step: 44000 | Avg_step_time: 0.29s | Train Loss: 4.8981 | Val. Loss: 5.8157 | Train Bleu: 0.0768 | Val. Bleu: 0.0548
# Global step: 46000 | Avg_step_time: 0.29s | Train Loss: 4.9041 | Val. Loss: 5.8420 | Train Bleu: 0.0763 | Val. Bleu: 0.0546
# Global step: 48000 | Avg_step_time: 0.29s | Train Loss: 4.8608 | Val. Loss: 5.7974 | Train Bleu: 0.0776 | Val. Bleu: 0.0557
# Global step: 50000 | Avg_step_time: 0.30s | Train Loss: 4.8674 | Val. Loss: 5.8028 | Train Bleu: 0.0779 | Val. Bleu: 0.0559
# Global step: 52000 | Avg_step_time: 0.30s | Train Loss: 4.8790 | Val. Loss: 5.8420 | Train Bleu: 0.0783 | Val. Bleu: 0.0553
# Global step: 54000 | Avg_step_time: 0.29s | Train Loss: 4.8392 | Val. Loss: 5.8121 | Train Bleu: 0.0783 | Val. Bleu: 0.0568
# Global step: 56000 | Avg_step_time: 0.31s | Train Loss: 4.8577 | Val. Loss: 5.7938 | Train Bleu: 0.0802 | Val. Bleu: 0.0565
# Global step: 58000 | Avg_step_time: 0.29s | Train Loss: 4.8316 | Val. Loss: 5.8004 | Train Bleu: 0.0793 | Val. Bleu: 0.0563
# Global step: 60000 | Avg_step_time: 0.29s | Train Loss: 4.8270 | Val. Loss: 5.7785 | Train Bleu: 0.0804 | Val. Bleu: 0.0557
# Global step: 62000 | Avg_step_time: 0.29s | Train Loss: 4.8003 | Val. Loss: 5.7962 | Train Bleu: 0.0806 | Val. Bleu: 0.0567
# Global step: 64000 | Avg_step_time: 0.30s | Train Loss: 4.8393 | Val. Loss: 5.7628 | Train Bleu: 0.0805 | Val. Bleu: 0.0568
# Global step: 66000 | Avg_step_time: 0.29s | Train Loss: 4.7920 | Val. Loss: 5.7719 | Train Bleu: 0.0821 | Val. Bleu: 0.0560
# Global step: 68000 | Avg_step_time: 0.30s | Train Loss: 4.7609 | Val. Loss: 5.7561 | Train Bleu: 0.0829 | Val. Bleu: 0.0578
# Global step: 70000 | Avg_step_time: 0.30s | Train Loss: 4.7515 | Val. Loss: 5.7504 | Train Bleu: 0.0847 | Val. Bleu: 0.0585
# Global step: 72000 | Avg_step_time: 0.31s | Train Loss: 4.7548 | Val. Loss: 5.7666 | Train Bleu: 0.0818 | Val. Bleu: 0.0581
# Global step: 74000 | Avg_step_time: 0.30s | Train Loss: 4.7479 | Val. Loss: 5.7937 | Train Bleu: 0.0838 | Val. Bleu: 0.0556
# Global step: 76000 | Avg_step_time: 0.29s | Train Loss: 4.7218 | Val. Loss: 5.7392 | Train Bleu: 0.0833 | Val. Bleu: 0.0589
# Global step: 78000 | Avg_step_time: 0.29s | Train Loss: 4.7956 | Val. Loss: 5.7209 | Train Bleu: 0.0839 | Val. Bleu: 0.0584
# Global step: 80000 | Avg_step_time: 0.30s | Train Loss: 4.7553 | Val. Loss: 5.7246 | Train Bleu: 0.0822 | Val. Bleu: 0.0574
# Global step: 82000 | Avg_step_time: 0.29s | Train Loss: 4.7506 | Val. Loss: 5.7096 | Train Bleu: 0.0849 | Val. Bleu: 0.0579
# Global step: 84000 | Avg_step_time: 0.29s | Train Loss: 4.7412 | Val. Loss: 5.7048 | Train Bleu: 0.0840 | Val. Bleu: 0.0589
# Global step: 86000 | Avg_step_time: 0.29s | Train Loss: 4.7324 | Val. Loss: 5.7003 | Train Bleu: 0.0856 | Val. Bleu: 0.0582
# Global step: 88000 | Avg_step_time: 0.29s | Train Loss: 4.7125 | Val. Loss: 5.6950 | Train Bleu: 0.0866 | Val. Bleu: 0.0577
# Global step: 90000 | Avg_step_time: 0.30s | Train Loss: 4.6858 | Val. Loss: 5.7063 | Train Bleu: 0.0864 | Val. Bleu: 0.0578
# Global step: 92000 | Avg_step_time: 0.30s | Train Loss: 4.7028 | Val. Loss: 5.7069 | Train Bleu: 0.0856 | Val. Bleu: 0.0582
# Global step: 94000 | Avg_step_time: 0.30s | Train Loss: 4.7323 | Val. Loss: 5.6757 | Train Bleu: 0.0851 | Val. Bleu: 0.0592
# Global step: 96000 | Avg_step_time: 0.29s | Train Loss: 4.6713 | Val. Loss: 5.6982 | Train Bleu: 0.0873 | Val. Bleu: 0.0579
# Global step: 98000 | Avg_step_time: 0.30s | Train Loss: 4.6876 | Val. Loss: 5.6984 | Train Bleu: 0.0892 | Val. Bleu: 0.0604
# Global step: 100000 | Avg_step_time: 0.30s | Train Loss: 4.7205 | Val. Loss: 5.6711 | Train Bleu: 0.0891 | Val. Bleu: 0.0569
# Global step: 102000 | Avg_step_time: 0.29s | Train Loss: 4.7184 | Val. Loss: 5.6771 | Train Bleu: 0.0847 | Val. Bleu: 0.0591
# Global step: 104000 | Avg_step_time: 0.30s | Train Loss: 4.6883 | Val. Loss: 5.6759 | Train Bleu: 0.0869 | Val. Bleu: 0.0603
# Global step: 106000 | Avg_step_time: 0.30s | Train Loss: 4.7064 | Val. Loss: 5.6517 | Train Bleu: 0.0878 | Val. Bleu: 0.0609
# Global step: 108000 | Avg_step_time: 0.29s | Train Loss: 4.6729 | Val. Loss: 5.6762 | Train Bleu: 0.0889 | Val. Bleu: 0.0615
# Global step: 110000 | Avg_step_time: 0.31s | Train Loss: 4.6739 | Val. Loss: 5.6413 | Train Bleu: 0.0903 | Val. Bleu: 0.0605
# Global step: 112000 | Avg_step_time: 0.30s | Train Loss: 4.6816 | Val. Loss: 5.6564 | Train Bleu: 0.0907 | Val. Bleu: 0.0589
# Global step: 114000 | Avg_step_time: 0.30s | Train Loss: 4.6387 | Val. Loss: 5.6276 | Train Bleu: 0.0914 | Val. Bleu: 0.0598
# Global step: 116000 | Avg_step_time: 0.29s | Train Loss: 4.6749 | Val. Loss: 5.6465 | Train Bleu: 0.0907 | Val. Bleu: 0.0583

# Global step: 2000 | Avg_step_time: 0.18s | Train Loss: 4.7019 | Val. Loss: 5.8130 | Train Bleu: 0.0897 | Val. Bleu: 0.0612
# Global step: 4000 | Avg_step_time: 0.18s | Train Loss: 4.7334 | Val. Loss: 5.7931 | Train Bleu: 0.0894 | Val. Bleu: 0.0613
# Global step: 6000 | Avg_step_time: 0.18s | Train Loss: 4.7022 | Val. Loss: 5.7723 | Train Bleu: 0.0900 | Val. Bleu: 0.0607
# Global step: 8000 | Avg_step_time: 0.17s | Train Loss: 4.6688 | Val. Loss: 5.7482 | Train Bleu: 0.0927 | Val. Bleu: 0.0608
# Global step: 10000 | Avg_step_time: 0.18s | Train Loss: 4.6702 | Val. Loss: 5.7098 | Train Bleu: 0.0906 | Val. Bleu: 0.0611
# Global step: 12000 | Avg_step_time: 0.18s | Train Loss: 4.6160 | Val. Loss: 5.6928 | Train Bleu: 0.0913 | Val. Bleu: 0.0621
# Global step: 14000 | Avg_step_time: 0.17s | Train Loss: 4.6287 | Val. Loss: 5.6837 | Train Bleu: 0.0893 | Val. Bleu: 0.0596
# Global step: 16000 | Avg_step_time: 0.17s | Train Loss: 4.5639 | Val. Loss: 5.6355 | Train Bleu: 0.0950 | Val. Bleu: 0.0606
# Global step: 18000 | Avg_step_time: 0.18s | Train Loss: 4.5463 | Val. Loss: 5.6251 | Train Bleu: 0.0939 | Val. Bleu: 0.0606
# Global step: 20000 | Avg_step_time: 0.18s | Train Loss: 4.5623 | Val. Loss: 5.6140 | Train Bleu: 0.0962 | Val. Bleu: 0.0608
# Global step: 22000 | Avg_step_time: 0.18s | Train Loss: 4.5380 | Val. Loss: 5.6025 | Train Bleu: 0.0934 | Val. Bleu: 0.0606
# Global step: 24000 | Avg_step_time: 0.18s | Train Loss: 4.5779 | Val. Loss: 5.6119 | Train Bleu: 0.0950 | Val. Bleu: 0.0609
# Global step: 26000 | Avg_step_time: 0.18s | Train Loss: 4.4960 | Val. Loss: 5.6009 | Train Bleu: 0.0936 | Val. Bleu: 0.0621
# Global step: 28000 | Avg_step_time: 0.18s | Train Loss: 4.5105 | Val. Loss: 5.5870 | Train Bleu: 0.0972 | Val. Bleu: 0.0628
# Global step: 30000 | Avg_step_time: 0.18s | Train Loss: 4.5316 | Val. Loss: 5.5717 | Train Bleu: 0.0946 | Val. Bleu: 0.0610
# Global step: 32000 | Avg_step_time: 0.17s | Train Loss: 4.4818 | Val. Loss: 5.5671 | Train Bleu: 0.0940 | Val. Bleu: 0.0618
# Global step: 34000 | Avg_step_time: 0.18s | Train Loss: 4.5132 | Val. Loss: 5.5703 | Train Bleu: 0.0958 | Val. Bleu: 0.0632
# Global step: 36000 | Avg_step_time: 0.18s | Train Loss: 4.4833 | Val. Loss: 5.5677 | Train Bleu: 0.0969 | Val. Bleu: 0.0636
# Global step: 38000 | Avg_step_time: 0.18s | Train Loss: 4.4812 | Val. Loss: 5.5862 | Train Bleu: 0.0966 | Val. Bleu: 0.0621
# Global step: 40000 | Avg_step_time: 0.18s | Train Loss: 4.4966 | Val. Loss: 5.6195 | Train Bleu: 0.0967 | Val. Bleu: 0.0626
# Global step: 42000 | Avg_step_time: 0.18s | Train Loss: 4.4808 | Val. Loss: 5.5646 | Train Bleu: 0.0971 | Val. Bleu: 0.0624
# Global step: 44000 | Avg_step_time: 0.18s | Train Loss: 4.4756 | Val. Loss: 5.5635 | Train Bleu: 0.0978 | Val. Bleu: 0.0632
# Global step: 46000 | Avg_step_time: 0.18s | Train Loss: 4.4644 | Val. Loss: 5.5422 | Train Bleu: 0.0973 | Val. Bleu: 0.0628
# Global step: 48000 | Avg_step_time: 0.17s | Train Loss: 4.4838 | Val. Loss: 5.5662 | Train Bleu: 0.1004 | Val. Bleu: 0.0629
# Global step: 50000 | Avg_step_time: 0.18s | Train Loss: 4.4797 | Val. Loss: 5.5397 | Train Bleu: 0.0987 | Val. Bleu: 0.0635
# Global step: 52000 | Avg_step_time: 0.18s | Train Loss: 4.4823 | Val. Loss: 5.5637 | Train Bleu: 0.0979 | Val. Bleu: 0.0626
# Global step: 54000 | Avg_step_time: 0.18s | Train Loss: 4.4756 | Val. Loss: 5.5191 | Train Bleu: 0.0978 | Val. Bleu: 0.0627
# Global step: 56000 | Avg_step_time: 0.17s | Train Loss: 4.4530 | Val. Loss: 5.5405 | Train Bleu: 0.0994 | Val. Bleu: 0.0638
# Global step: 58000 | Avg_step_time: 0.18s | Train Loss: 4.4519 | Val. Loss: 5.5148 | Train Bleu: 0.0997 | Val. Bleu: 0.0633
# Global step: 60000 | Avg_step_time: 0.18s | Train Loss: 4.4327 | Val. Loss: 5.5157 | Train Bleu: 0.0994 | Val. Bleu: 0.0625
# Global step: 62000 | Avg_step_time: 0.18s | Train Loss: 4.4546 | Val. Loss: 5.5269 | Train Bleu: 0.0982 | Val. Bleu: 0.0642
# Global step: 64000 | Avg_step_time: 0.18s | Train Loss: 4.4754 | Val. Loss: 5.4940 | Train Bleu: 0.0979 | Val. Bleu: 0.0632
# Global step: 66000 | Avg_step_time: 0.18s | Train Loss: 4.4336 | Val. Loss: 5.4972 | Train Bleu: 0.0979 | Val. Bleu: 0.0653
# Global step: 68000 | Avg_step_time: 0.18s | Train Loss: 4.4126 | Val. Loss: 5.5228 | Train Bleu: 0.0999 | Val. Bleu: 0.0654
# Global step: 70000 | Avg_step_time: 0.18s | Train Loss: 4.4254 | Val. Loss: 5.5273 | Train Bleu: 0.0991 | Val. Bleu: 0.0645
# Global step: 72000 | Avg_step_time: 0.18s | Train Loss: 4.4439 | Val. Loss: 5.5034 | Train Bleu: 0.1011 | Val. Bleu: 0.0638
# Global step: 74000 | Avg_step_time: 0.19s | Train Loss: 4.4052 | Val. Loss: 5.4969 | Train Bleu: 0.1004 | Val. Bleu: 0.0644
# Global step: 76000 | Avg_step_time: 0.18s | Train Loss: 4.4013 | Val. Loss: 5.5243 | Train Bleu: 0.1008 | Val. Bleu: 0.0632
# Global step: 78000 | Avg_step_time: 0.18s | Train Loss: 4.4312 | Val. Loss: 5.5218 | Train Bleu: 0.0982 | Val. Bleu: 0.0630
# Global step: 80000 | Avg_step_time: 0.18s | Train Loss: 4.4224 | Val. Loss: 5.5492 | Train Bleu: 0.1021 | Val. Bleu: 0.0635
# Global step: 82000 | Avg_step_time: 0.19s | Train Loss: 4.4201 | Val. Loss: 5.5193 | Train Bleu: 0.0991 | Val. Bleu: 0.0630
# Global step: 84000 | Avg_step_time: 0.19s | Train Loss: 4.3809 | Val. Loss: 5.5282 | Train Bleu: 0.1016 | Val. Bleu: 0.0655
# Global step: 86000 | Avg_step_time: 0.18s | Train Loss: 4.4216 | Val. Loss: 5.5245 | Train Bleu: 0.0994 | Val. Bleu: 0.0655
# Global step: 88000 | Avg_step_time: 0.18s | Train Loss: 4.4089 | Val. Loss: 5.5336 | Train Bleu: 0.1030 | Val. Bleu: 0.0640

# Global step: 2000 | Avg_step_time: 0.46s | Train Loss: 4.2803 | Val. Loss: 5.4556 | Train Bleu: 0.1040 | Val. Bleu: 0.0660
# Global step: 4000 | Avg_step_time: 0.44s | Train Loss: 4.2345 | Val. Loss: 5.4405 | Train Bleu: 0.1049 | Val. Bleu: 0.0652
# Global step: 6000 | Avg_step_time: 0.46s | Train Loss: 4.2340 | Val. Loss: 5.4303 | Train Bleu: 0.1035 | Val. Bleu: 0.0660
# Global step: 8000 | Avg_step_time: 0.45s | Train Loss: 4.1643 | Val. Loss: 5.3716 | Train Bleu: 0.1073 | Val. Bleu: 0.0653
# Global step: 10000 | Avg_step_time: 0.45s | Train Loss: 4.1589 | Val. Loss: 5.3680 | Train Bleu: 0.1085 | Val. Bleu: 0.0672
# Global step: 12000 | Avg_step_time: 0.44s | Train Loss: 4.1662 | Val. Loss: 5.3446 | Train Bleu: 0.1065 | Val. Bleu: 0.0675
# Global step: 14000 | Avg_step_time: 0.43s | Train Loss: 4.1633 | Val. Loss: 5.3454 | Train Bleu: 0.1066 | Val. Bleu: 0.0677
# Global step: 16000 | Avg_step_time: 0.44s | Train Loss: 4.1312 | Val. Loss: 5.3118 | Train Bleu: 0.1067 | Val. Bleu: 0.0678
# Global step: 18000 | Avg_step_time: 0.44s | Train Loss: 4.1144 | Val. Loss: 5.3314 | Train Bleu: 0.1098 | Val. Bleu: 0.0667
# Global step: 20000 | Avg_step_time: 0.45s | Train Loss: 4.0986 | Val. Loss: 5.3075 | Train Bleu: 0.1106 | Val. Bleu: 0.0660
# Global step: 22000 | Avg_step_time: 0.45s | Train Loss: 4.0948 | Val. Loss: 5.3167 | Train Bleu: 0.1095 | Val. Bleu: 0.0680
# Global step: 24000 | Avg_step_time: 0.45s | Train Loss: 4.0968 | Val. Loss: 5.3086 | Train Bleu: 0.1096 | Val. Bleu: 0.0671
# Global step: 26000 | Avg_step_time: 0.45s | Train Loss: 4.1153 | Val. Loss: 5.3092 | Train Bleu: 0.1089 | Val. Bleu: 0.0646
# Global step: 28000 | Avg_step_time: 0.45s | Train Loss: 4.0759 | Val. Loss: 5.2931 | Train Bleu: 0.1114 | Val. Bleu: 0.0682
# Global step: 30000 | Avg_step_time: 0.44s | Train Loss: 4.0966 | Val. Loss: 5.2990 | Train Bleu: 0.1127 | Val. Bleu: 0.0663

# Global step: 2000 | Avg_step_time: 0.28s | Train Loss: 3.9194 | Val. Loss: 5.3395 | Train Bleu: 0.1155 | Val. Bleu: 0.0692
# Global step: 4000 | Avg_step_time: 0.28s | Train Loss: 3.8808 | Val. Loss: 5.3391 | Train Bleu: 0.1212 | Val. Bleu: 0.0694
# Global step: 6000 | Avg_step_time: 0.28s | Train Loss: 3.8903 | Val. Loss: 5.3374 | Train Bleu: 0.1222 | Val. Bleu: 0.0687
# Global step: 8000 | Avg_step_time: 0.28s | Train Loss: 3.8852 | Val. Loss: 5.3318 | Train Bleu: 0.1198 | Val. Bleu: 0.0688
# Global step: 10000 | Avg_step_time: 0.27s | Train Loss: 3.9907 | Val. Loss: 5.2981 | Train Bleu: 0.1200 | Val. Bleu: 0.0698
# Global step: 12000 | Avg_step_time: 0.28s | Train Loss: 4.0364 | Val. Loss: 5.2789 | Train Bleu: 0.1161 | Val. Bleu: 0.0687
# Global step: 14000 | Avg_step_time: 0.28s | Train Loss: 3.8710 | Val. Loss: 5.2878 | Train Bleu: 0.1236 | Val. Bleu: 0.0692
# Global step: 16000 | Avg_step_time: 0.28s | Train Loss: 3.8140 | Val. Loss: 5.2810 | Train Bleu: 0.1281 | Val. Bleu: 0.0688
# Global step: 18000 | Avg_step_time: 0.29s | Train Loss: 3.8394 | Val. Loss: 5.2747 | Train Bleu: 0.1233 | Val. Bleu: 0.0700
# Global step: 20000 | Avg_step_time: 0.28s | Train Loss: 3.8381 | Val. Loss: 5.2757 | Train Bleu: 0.1238 | Val. Bleu: 0.0682
# Global step: 22000 | Avg_step_time: 0.28s | Train Loss: 3.8299 | Val. Loss: 5.2721 | Train Bleu: 0.1273 | Val. Bleu: 0.0695
# Global step: 24000 | Avg_step_time: 0.28s | Train Loss: 3.9572 | Val. Loss: 5.2678 | Train Bleu: 0.1200 | Val. Bleu: 0.0681
# Global step: 26000 | Avg_step_time: 0.28s | Train Loss: 3.9838 | Val. Loss: 5.2476 | Train Bleu: 0.1192 | Val. Bleu: 0.0684
# Global step: 28000 | Avg_step_time: 0.28s | Train Loss: 4.0655 | Val. Loss: 5.2373 | Train Bleu: 0.1183 | Val. Bleu: 0.0694
# Global step: 30000 | Avg_step_time: 0.28s | Train Loss: 4.0504 | Val. Loss: 5.2422 | Train Bleu: 0.1183 | Val. Bleu: 0.0699
# Global step: 32000 | Avg_step_time: 0.29s | Train Loss: 4.0274 | Val. Loss: 5.2453 | Train Bleu: 0.1207 | Val. Bleu: 0.0691
# Global step: 34000 | Avg_step_time: 0.28s | Train Loss: 4.0466 | Val. Loss: 5.2354 | Train Bleu: 0.1168 | Val. Bleu: 0.0697
# Global step: 36000 | Avg_step_time: 0.28s | Train Loss: 4.0324 | Val. Loss: 5.2432 | Train Bleu: 0.1193 | Val. Bleu: 0.0692
# Global step: 38000 | Avg_step_time: 0.29s | Train Loss: 4.0241 | Val. Loss: 5.2337 | Train Bleu: 0.1181 | Val. Bleu: 0.0696
# Global step: 40000 | Avg_step_time: 0.28s | Train Loss: 4.0092 | Val. Loss: 5.2411 | Train Bleu: 0.1213 | Val. Bleu: 0.0699
# Global step: 42000 | Avg_step_time: 0.28s | Train Loss: 4.0125 | Val. Loss: 5.2304 | Train Bleu: 0.1186 | Val. Bleu: 0.0690
# Global step: 44000 | Avg_step_time: 0.28s | Train Loss: 4.0192 | Val. Loss: 5.2181 | Train Bleu: 0.1171 | Val. Bleu: 0.0702
# Global step: 46000 | Avg_step_time: 0.27s | Train Loss: 4.0026 | Val. Loss: 5.2218 | Train Bleu: 0.1184 | Val. Bleu: 0.0698
# Global step: 48000 | Avg_step_time: 0.28s | Train Loss: 4.0251 | Val. Loss: 5.2108 | Train Bleu: 0.1183 | Val. Bleu: 0.0706
# Global step: 50000 | Avg_step_time: 0.28s | Train Loss: 4.0070 | Val. Loss: 5.2108 | Train Bleu: 0.1212 | Val. Bleu: 0.0698
# Global step: 52000 | Avg_step_time: 0.28s | Train Loss: 3.9756 | Val. Loss: 5.2099 | Train Bleu: 0.1225 | Val. Bleu: 0.0696
# Global step: 54000 | Avg_step_time: 0.28s | Train Loss: 4.0155 | Val. Loss: 5.2199 | Train Bleu: 0.1197 | Val. Bleu: 0.0700
# Global step: 56000 | Avg_step_time: 0.28s | Train Loss: 3.9853 | Val. Loss: 5.2032 | Train Bleu: 0.1194 | Val. Bleu: 0.0707
# Global step: 58000 | Avg_step_time: 0.28s | Train Loss: 3.9775 | Val. Loss: 5.2087 | Train Bleu: 0.1219 | Val. Bleu: 0.0701