# Transformer

* Reference: https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [1]:
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
from torch.utils.data import dataset

def generate_square_subsequent_mask(sz: int) -> Tensor:
    """
    Build a square mask of shape [sz, sz] for hiding later positions.

    Entries strictly above the main diagonal are ``-inf``; entries on and
    below the diagonal are ``0.0``.
    See https://pytorch.org/docs/stable/generated/torch.triu.html
    """
    blocked = torch.full((sz, sz), float('-inf'))
    # triu keeps the part above the diagonal and zero-fills everything else
    return torch.triu(blocked, diagonal=1)

class PositionalEncoding(nn.Module):
    """
    Adds a fixed sinusoidal position signal to its input and then applies dropout.

    The table ``pe`` has shape [max_len, 1, d_model]: even feature indices hold
    ``sin(position * freq)`` and odd feature indices hold ``cos(position * freq)``.
    It is registered as a buffer, so it follows the module across devices but is
    not a trainable parameter.

    Args:
        d_model: size of the feature (embedding) dimension; may be even or odd
        dropout: dropout probability applied after the position signal is added
        max_len: number of positions precomputed in the table
    """
    def __init__(self, d_model: int, dropout: float=0.1, max_len: int=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.) / d_model))
        angles = position * div_term  # shape [max_len, ceil(d_model / 2)]
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd-indexed slot than even-indexed slots,
        # so the last frequency is dropped for the cosine half (no-op when even).
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape as ``x``
        """
        # the size-1 middle axis of pe broadcasts over the batch dimension
        x = x + self.pe[:x.size(0)]
        # here we also add a dropout layer for position encoding
        return self.dropout(x)

class TransformerModel(nn.Module):
    """
    Language model made of a token embedding, positional encoding, a stack of
    ``TransformerEncoderLayer`` blocks and a linear projection to vocabulary logits.

    Args:
        ntoken: vocabulary size (rows of the embedding, width of the output)
        d_model: embedding / model feature dimension
        nhead: number of attention heads per encoder layer
        d_hid: hidden size of the feedforward sub-layer in each encoder layer
        nlayers: number of stacked encoder layers
        dropout: dropout probability for the positional encoding and encoder layers
    """
    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int, nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        # "encoder" here is the token embedding table
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        # NOTE: "decoder" is only a linear projection from features to vocabulary
        # logits; it is not an nn.TransformerDecoder.
        self.decoder = nn.Linear(d_model, ntoken)
        self.init_weights()

    def init_weights(self):
        """Re-initialize embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        # scale embeddings by sqrt(d_model) before the position signal is added
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [2]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# This first pass over the training split is used only to build the vocabulary.
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
# The special token must be spelled '<unk>' (with the closing bracket). With the
# truncated '<unk' spelling, out-of-vocabulary words were mapped to a separate
# entry rather than the conventional unknown-word token.
# NOTE(review): WikiText-2 is understood to mark rare words as '<unk>' in its
# text, in which case this entry now coincides with that token - confirm.
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# any token not in the vocabulary resolves to the '<unk>' index
vocab.set_default_index(vocab['<unk>'])

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """
    Tokenize each item of ``raw_text_iter``, map the tokens to vocabulary ids,
    and join all non-empty results into a single flat ``torch.long`` tensor.

    Uses the module-level ``tokenizer`` and ``vocab``.
    """
    encoded = []
    for line in raw_text_iter:
        ids = torch.tensor(vocab(tokenizer(line)), dtype=torch.long)
        # items that produce no tokens are skipped
        if ids.numel() > 0:
            encoded.append(ids)
    return torch.cat(tuple(encoded))

# train_iter was "consumed" by the process of building the vocab, so we have to create it again
# (WikiText2() without a split argument is unpacked here into three iterators)
train_iter, val_iter, test_iter = WikiText2()
# each split is converted by data_process into one flat tensor of token ids
train_data = data_process(train_iter)
val_data = data_process(val_iter)
test_data = data_process(test_iter)

# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batchify(data: Tensor, bsz: int) -> Tensor:
    """
    Lay a flat token stream out as ``bsz`` side-by-side columns.

    Trailing elements that do not fill a complete row across all columns are
    dropped. The result is placed on the module-level ``device``.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    # floor division: leftover tokens beyond rows * bsz are discarded
    rows = data.size(0) // bsz
    trimmed = data[:rows * bsz]
    # view as [bsz, rows] so each row is one contiguous stretch of the stream,
    # then transpose to [rows, bsz]; contiguous() materializes the transposed
    # layout in memory, see
    # https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html
    columns = trimmed.view(bsz, rows)
    return columns.t().contiguous().to(device)

# number of parallel columns for the training split and for the evaluation splits
batch_size = 20
eval_batch_size = 10
# each flat split tensor is replaced by its batchified form, already on `device`
train_data = batchify(train_data, batch_size) # shape [seq_len, batch_size]
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)

In [3]:
bptt = 35  # maximum number of rows (time steps) in one chunk
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Cut one input chunk and its next-token targets out of ``source``.

    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int, index of the first row of the chunk

    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]; seq_len is at most ``bptt``
    """
    # leave one row at the end so every input row has a following target row
    steps = min(bptt, len(source) - 1 - i)
    stop = i + steps
    inputs = source[i:stop]
    # targets are the same window shifted forward by one row, then flattened
    labels = source[i + 1:stop + 1]
    return inputs, labels.reshape(-1)

In [4]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension (passed to TransformerModel as d_model)
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoderLayer
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
# build the model and move it to the selected device
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [5]:
import copy
import time

# cross-entropy between the flattened vocabulary logits and the target token ids
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# step_size 1.0 with gamma 0.95: the learning rate is multiplied by 0.95 on every
# scheduler.step() call (the epoch loop calls it once per epoch)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    """
    Run one pass over the module-level ``train_data``, updating ``model`` in place.

    Besides ``model`` this reads module-level state: ``train_data``, ``bptt``,
    ``ntokens``, ``criterion``, ``optimizer``, ``scheduler``, ``device`` and the
    epoch-loop variable ``epoch`` (used only in the progress line), so it is
    meant to be called from inside that loop.
    """
    model.train()  # turn on train mode
    log_interval = 200
    running_loss = 0.
    tick = time.time()
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(train_data) // bptt
    offsets = range(0, train_data.size(0) - 1, bptt)
    for batch, i in enumerate(offsets):
        data, targets = get_batch(train_data, i)
        seq_len = data.size(0)
        if seq_len != bptt:  # only on last batch
            # shrink the mask to match the shorter final chunk
            src_mask = src_mask[:seq_len, :seq_len]
        logits = model(data, src_mask)
        loss = criterion(logits.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        # cap the global gradient norm at 0.5 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        running_loss += loss.item()
        if batch == 0 or batch % log_interval != 0:
            continue

        # periodic progress report, then restart the running statistics
        cur_lr = scheduler.get_last_lr()[0]
        ms_per_batch = (time.time() - tick) * 1000 / log_interval
        cur_loss = running_loss / log_interval
        ppl = math.exp(cur_loss)
        print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
              f'lr {cur_lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
              f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
        running_loss = 0
        tick = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    """
    Compute the average loss of ``model`` over ``eval_data`` without gradients.

    Each chunk's loss is weighted by its number of rows and the total is divided
    by ``len(eval_data) - 1``. Reads the module-level ``bptt``, ``ntokens``,
    ``criterion`` and ``device``.

    Args:
        model: the network to score
        eval_data: Tensor, shape [full_seq_len, batch_size]

    Returns:
        the weighted average loss as a Python float
    """
    model.eval()  # turn on evaluation mode
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    weighted_loss = 0.
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            if seq_len != bptt:
                # shorter final chunk: crop the mask to the same size
                src_mask = src_mask[:seq_len, :seq_len]
            logits = model(data, src_mask).view(-1, ntokens)
            weighted_loss += seq_len * criterion(logits, targets).item()
    return weighted_loss / (len(eval_data) - 1)

In [6]:
%%time
# Epoch loop: train, validate, keep a copy of the best model, decay the learning rate.
best_val_loss = float('inf')
# NOTE(review): with the learning rate multiplied by 0.95 every epoch it becomes
# very small long before 1000 epochs - confirm this epoch count is intended.
epochs = 1000
best_model = None

# `epoch` must keep this name: train() reads it as a global for its progress line
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)  # perplexity = exp(average loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    # snapshot the whole model whenever validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    # one scheduler step per epoch
    scheduler.step()

| epoch   1 |   200/ 2928 batches | lr 5.00 | ms/batch 10.59 | loss  8.19 | ppl  3596.99
| epoch   1 |   400/ 2928 batches | lr 5.00 | ms/batch  9.26 | loss  6.92 | ppl  1009.33
| epoch   1 |   600/ 2928 batches | lr 5.00 | ms/batch  9.20 | loss  6.46 | ppl   640.37
| epoch   1 |   800/ 2928 batches | lr 5.00 | ms/batch  9.23 | loss  6.31 | ppl   550.80
| epoch   1 |  1000/ 2928 batches | lr 5.00 | ms/batch  9.22 | loss  6.19 | ppl   488.88
| epoch   1 |  1200/ 2928 batches | lr 5.00 | ms/batch  9.21 | loss  6.17 | ppl   475.94
| epoch   1 |  1400/ 2928 batches | lr 5.00 | ms/batch  9.22 | loss  6.12 | ppl   453.07
| epoch   1 |  1600/ 2928 batches | lr 5.00 | ms/batch  9.20 | loss  6.12 | ppl   455.65
| epoch   1 |  1800/ 2928 batches | lr 5.00 | ms/batch  9.25 | loss  6.03 | ppl   415.84
| epoch   1 |  2000/ 2928 batches | lr 5.00 | ms/batch  9.21 | loss  6.04 | ppl   418.11
| epoch   1 |  2200/ 2928 batches | lr 5.00 | ms/batch  9.21 | loss  5.90 | ppl   365.06
| epoch   1 |  2400/ 

| epoch   6 |  1800/ 2928 batches | lr 3.87 | ms/batch  8.68 | loss  5.12 | ppl   168.16
| epoch   6 |  2000/ 2928 batches | lr 3.87 | ms/batch  8.67 | loss  5.14 | ppl   170.44
| epoch   6 |  2200/ 2928 batches | lr 3.87 | ms/batch  8.59 | loss  4.99 | ppl   147.45
| epoch   6 |  2400/ 2928 batches | lr 3.87 | ms/batch  8.73 | loss  5.11 | ppl   165.40
| epoch   6 |  2600/ 2928 batches | lr 3.87 | ms/batch  8.84 | loss  5.13 | ppl   168.18
| epoch   6 |  2800/ 2928 batches | lr 3.87 | ms/batch  8.77 | loss  5.05 | ppl   156.28
-----------------------------------------------------------------------------------------
| end of epoch   6 | time: 26.35s | valid loss  5.53 | valid ppl   250.91
-----------------------------------------------------------------------------------------
| epoch   7 |   200/ 2928 batches | lr 3.68 | ms/batch  9.50 | loss  5.10 | ppl   164.41
| epoch   7 |   400/ 2928 batches | lr 3.68 | ms/batch  8.68 | loss  5.14 | ppl   169.92
| epoch   7 |   600/ 2928 batches 

| epoch  12 |   200/ 2928 batches | lr 2.84 | ms/batch  8.88 | loss  4.77 | ppl   117.36
| epoch  12 |   400/ 2928 batches | lr 2.84 | ms/batch  8.41 | loss  4.79 | ppl   120.46
| epoch  12 |   600/ 2928 batches | lr 2.84 | ms/batch  8.40 | loss  4.61 | ppl   100.97
| epoch  12 |   800/ 2928 batches | lr 2.84 | ms/batch  8.40 | loss  4.67 | ppl   107.13
| epoch  12 |  1000/ 2928 batches | lr 2.84 | ms/batch  8.44 | loss  4.67 | ppl   106.50
| epoch  12 |  1200/ 2928 batches | lr 2.84 | ms/batch  8.43 | loss  4.71 | ppl   110.84
| epoch  12 |  1400/ 2928 batches | lr 2.84 | ms/batch  8.49 | loss  4.72 | ppl   111.71
| epoch  12 |  1600/ 2928 batches | lr 2.84 | ms/batch  8.49 | loss  4.77 | ppl   118.38
| epoch  12 |  1800/ 2928 batches | lr 2.84 | ms/batch  8.52 | loss  4.72 | ppl   112.59
| epoch  12 |  2000/ 2928 batches | lr 2.84 | ms/batch 11.44 | loss  4.73 | ppl   113.00
| epoch  12 |  2200/ 2928 batches | lr 2.84 | ms/batch 11.46 | loss  4.59 | ppl    98.01
| epoch  12 |  2400/ 

| epoch  17 |  1800/ 2928 batches | lr 2.20 | ms/batch  8.35 | loss  4.52 | ppl    91.60
| epoch  17 |  2000/ 2928 batches | lr 2.20 | ms/batch  8.35 | loss  4.51 | ppl    90.66
| epoch  17 |  2200/ 2928 batches | lr 2.20 | ms/batch  8.33 | loss  4.37 | ppl    78.94
| epoch  17 |  2400/ 2928 batches | lr 2.20 | ms/batch  8.33 | loss  4.47 | ppl    87.54
| epoch  17 |  2600/ 2928 batches | lr 2.20 | ms/batch  8.40 | loss  4.49 | ppl    89.47
| epoch  17 |  2800/ 2928 batches | lr 2.20 | ms/batch  8.33 | loss  4.45 | ppl    85.20
-----------------------------------------------------------------------------------------
| end of epoch  17 | time: 25.79s | valid loss  5.56 | valid ppl   260.34
-----------------------------------------------------------------------------------------
| epoch  18 |   200/ 2928 batches | lr 2.09 | ms/batch  8.46 | loss  4.50 | ppl    90.27
| epoch  18 |   400/ 2928 batches | lr 2.09 | ms/batch  8.32 | loss  4.52 | ppl    91.99
| epoch  18 |   600/ 2928 batches 

| epoch  23 |   200/ 2928 batches | lr 1.62 | ms/batch  8.51 | loss  4.35 | ppl    77.42
| epoch  23 |   400/ 2928 batches | lr 1.62 | ms/batch  8.33 | loss  4.37 | ppl    78.90
| epoch  23 |   600/ 2928 batches | lr 1.62 | ms/batch  8.34 | loss  4.21 | ppl    67.41
| epoch  23 |   800/ 2928 batches | lr 1.62 | ms/batch  8.44 | loss  4.27 | ppl    71.74
| epoch  23 |  1000/ 2928 batches | lr 1.62 | ms/batch  8.46 | loss  4.28 | ppl    72.19
| epoch  23 |  1200/ 2928 batches | lr 1.62 | ms/batch  8.37 | loss  4.31 | ppl    74.69
| epoch  23 |  1400/ 2928 batches | lr 1.62 | ms/batch  8.34 | loss  4.31 | ppl    74.32
| epoch  23 |  1600/ 2928 batches | lr 1.62 | ms/batch  8.34 | loss  4.36 | ppl    77.99
| epoch  23 |  1800/ 2928 batches | lr 1.62 | ms/batch  8.34 | loss  4.34 | ppl    76.53
| epoch  23 |  2000/ 2928 batches | lr 1.62 | ms/batch  8.36 | loss  4.32 | ppl    75.50
| epoch  23 |  2200/ 2928 batches | lr 1.62 | ms/batch  8.35 | loss  4.18 | ppl    65.39
| epoch  23 |  2400/ 

| epoch  28 |  1800/ 2928 batches | lr 1.25 | ms/batch  8.43 | loss  4.23 | ppl    68.91
| epoch  28 |  2000/ 2928 batches | lr 1.25 | ms/batch  8.34 | loss  4.22 | ppl    67.87
| epoch  28 |  2200/ 2928 batches | lr 1.25 | ms/batch  8.36 | loss  4.08 | ppl    58.86
| epoch  28 |  2400/ 2928 batches | lr 1.25 | ms/batch  8.36 | loss  4.18 | ppl    65.28
| epoch  28 |  2600/ 2928 batches | lr 1.25 | ms/batch  8.33 | loss  4.19 | ppl    66.25
| epoch  28 |  2800/ 2928 batches | lr 1.25 | ms/batch  8.35 | loss  4.15 | ppl    63.44
-----------------------------------------------------------------------------------------
| end of epoch  28 | time: 25.86s | valid loss  5.62 | valid ppl   275.99
-----------------------------------------------------------------------------------------
| epoch  29 |   200/ 2928 batches | lr 1.19 | ms/batch  8.45 | loss  4.22 | ppl    68.24
| epoch  29 |   400/ 2928 batches | lr 1.19 | ms/batch  8.30 | loss  4.24 | ppl    69.17
| epoch  29 |   600/ 2928 batches 

| epoch  34 |   200/ 2928 batches | lr 0.92 | ms/batch  8.51 | loss  4.15 | ppl    63.29
| epoch  34 |   400/ 2928 batches | lr 0.92 | ms/batch  8.31 | loss  4.16 | ppl    63.89
| epoch  34 |   600/ 2928 batches | lr 0.92 | ms/batch  8.31 | loss  4.02 | ppl    55.67
| epoch  34 |   800/ 2928 batches | lr 0.92 | ms/batch  8.34 | loss  4.08 | ppl    59.10
| epoch  34 |  1000/ 2928 batches | lr 0.92 | ms/batch  8.39 | loss  4.09 | ppl    59.84
| epoch  34 |  1200/ 2928 batches | lr 0.92 | ms/batch  8.35 | loss  4.12 | ppl    61.71
| epoch  34 |  1400/ 2928 batches | lr 0.92 | ms/batch  8.33 | loss  4.10 | ppl    60.27
| epoch  34 |  1600/ 2928 batches | lr 0.92 | ms/batch  8.35 | loss  4.16 | ppl    63.99
| epoch  34 |  1800/ 2928 batches | lr 0.92 | ms/batch  8.35 | loss  4.15 | ppl    63.12
| epoch  34 |  2000/ 2928 batches | lr 0.92 | ms/batch  8.31 | loss  4.13 | ppl    62.10
| epoch  34 |  2200/ 2928 batches | lr 0.92 | ms/batch  8.41 | loss  3.99 | ppl    53.89
| epoch  34 |  2400/ 

| epoch  39 |  1800/ 2928 batches | lr 0.71 | ms/batch  8.40 | loss  4.09 | ppl    59.85
| epoch  39 |  2000/ 2928 batches | lr 0.71 | ms/batch  8.36 | loss  4.08 | ppl    59.05
| epoch  39 |  2200/ 2928 batches | lr 0.71 | ms/batch  8.37 | loss  3.93 | ppl    50.98
| epoch  39 |  2400/ 2928 batches | lr 0.71 | ms/batch  8.43 | loss  4.03 | ppl    56.32
| epoch  39 |  2600/ 2928 batches | lr 0.71 | ms/batch  8.45 | loss  4.05 | ppl    57.37
| epoch  39 |  2800/ 2928 batches | lr 0.71 | ms/batch  8.38 | loss  4.01 | ppl    55.17
-----------------------------------------------------------------------------------------
| end of epoch  39 | time: 25.94s | valid loss  5.64 | valid ppl   280.22
-----------------------------------------------------------------------------------------
| epoch  40 |   200/ 2928 batches | lr 0.68 | ms/batch  8.52 | loss  4.08 | ppl    59.42
| epoch  40 |   400/ 2928 batches | lr 0.68 | ms/batch  8.40 | loss  4.09 | ppl    59.81
| epoch  40 |   600/ 2928 batches 

| epoch  45 |   200/ 2928 batches | lr 0.52 | ms/batch  8.45 | loss  4.05 | ppl    57.14
| epoch  45 |   400/ 2928 batches | lr 0.52 | ms/batch  8.33 | loss  4.05 | ppl    57.64
| epoch  45 |   600/ 2928 batches | lr 0.52 | ms/batch  8.33 | loss  3.93 | ppl    50.91
| epoch  45 |   800/ 2928 batches | lr 0.52 | ms/batch  8.31 | loss  3.98 | ppl    53.59
| epoch  45 |  1000/ 2928 batches | lr 0.52 | ms/batch  8.32 | loss  4.00 | ppl    54.41
| epoch  45 |  1200/ 2928 batches | lr 0.52 | ms/batch  8.33 | loss  4.03 | ppl    56.17
| epoch  45 |  1400/ 2928 batches | lr 0.52 | ms/batch  8.32 | loss  4.00 | ppl    54.55
| epoch  45 |  1600/ 2928 batches | lr 0.52 | ms/batch  8.37 | loss  4.06 | ppl    57.87
| epoch  45 |  1800/ 2928 batches | lr 0.52 | ms/batch  8.36 | loss  4.05 | ppl    57.16
| epoch  45 |  2000/ 2928 batches | lr 0.52 | ms/batch  8.33 | loss  4.04 | ppl    56.63
| epoch  45 |  2200/ 2928 batches | lr 0.52 | ms/batch  8.33 | loss  3.89 | ppl    49.12
| epoch  45 |  2400/ 

| epoch  50 |  1800/ 2928 batches | lr 0.40 | ms/batch  8.32 | loss  4.02 | ppl    55.76
| epoch  50 |  2000/ 2928 batches | lr 0.40 | ms/batch  8.33 | loss  4.01 | ppl    55.27
| epoch  50 |  2200/ 2928 batches | lr 0.40 | ms/batch  8.33 | loss  3.87 | ppl    47.77
| epoch  50 |  2400/ 2928 batches | lr 0.40 | ms/batch  8.35 | loss  3.96 | ppl    52.24
| epoch  50 |  2600/ 2928 batches | lr 0.40 | ms/batch  8.35 | loss  3.98 | ppl    53.34
| epoch  50 |  2800/ 2928 batches | lr 0.40 | ms/batch  8.33 | loss  3.94 | ppl    51.47
-----------------------------------------------------------------------------------------
| end of epoch  50 | time: 25.74s | valid loss  5.66 | valid ppl   287.25
-----------------------------------------------------------------------------------------
| epoch  51 |   200/ 2928 batches | lr 0.38 | ms/batch  8.39 | loss  4.02 | ppl    55.45
| epoch  51 |   400/ 2928 batches | lr 0.38 | ms/batch  8.32 | loss  4.02 | ppl    55.88
| epoch  51 |   600/ 2928 batches 

| epoch  56 |   200/ 2928 batches | lr 0.30 | ms/batch  8.78 | loss  4.00 | ppl    54.56
| epoch  56 |   400/ 2928 batches | lr 0.30 | ms/batch  8.55 | loss  4.01 | ppl    54.92
| epoch  56 |   600/ 2928 batches | lr 0.30 | ms/batch  8.86 | loss  3.88 | ppl    48.54
| epoch  56 |   800/ 2928 batches | lr 0.30 | ms/batch  8.91 | loss  3.94 | ppl    51.26
| epoch  56 |  1000/ 2928 batches | lr 0.30 | ms/batch  8.99 | loss  3.96 | ppl    52.23
| epoch  56 |  1200/ 2928 batches | lr 0.30 | ms/batch  8.95 | loss  3.98 | ppl    53.74
| epoch  56 |  1400/ 2928 batches | lr 0.30 | ms/batch  8.96 | loss  3.95 | ppl    51.77
| epoch  56 |  1600/ 2928 batches | lr 0.30 | ms/batch  9.00 | loss  4.00 | ppl    54.72
| epoch  56 |  1800/ 2928 batches | lr 0.30 | ms/batch  8.96 | loss  4.00 | ppl    54.70
| epoch  56 |  2000/ 2928 batches | lr 0.30 | ms/batch  9.00 | loss  3.99 | ppl    54.08
| epoch  56 |  2200/ 2928 batches | lr 0.30 | ms/batch  9.00 | loss  3.85 | ppl    46.82
| epoch  56 |  2400/ 

| epoch  61 |  1800/ 2928 batches | lr 0.23 | ms/batch  8.46 | loss  3.98 | ppl    53.70
| epoch  61 |  2000/ 2928 batches | lr 0.23 | ms/batch  8.44 | loss  3.98 | ppl    53.63
| epoch  61 |  2200/ 2928 batches | lr 0.23 | ms/batch  8.46 | loss  3.83 | ppl    46.25
| epoch  61 |  2400/ 2928 batches | lr 0.23 | ms/batch  8.43 | loss  3.92 | ppl    50.44
| epoch  61 |  2600/ 2928 batches | lr 0.23 | ms/batch  8.45 | loss  3.95 | ppl    51.87
| epoch  61 |  2800/ 2928 batches | lr 0.23 | ms/batch  8.45 | loss  3.91 | ppl    50.02
-----------------------------------------------------------------------------------------
| end of epoch  61 | time: 26.17s | valid loss  5.64 | valid ppl   281.20
-----------------------------------------------------------------------------------------
| epoch  62 |   200/ 2928 batches | lr 0.22 | ms/batch  9.85 | loss  3.99 | ppl    54.15
| epoch  62 |   400/ 2928 batches | lr 0.22 | ms/batch 10.66 | loss  3.99 | ppl    54.05
| epoch  62 |   600/ 2928 batches 

| epoch  67 |   200/ 2928 batches | lr 0.17 | ms/batch  8.96 | loss  3.98 | ppl    53.51
| epoch  67 |   400/ 2928 batches | lr 0.17 | ms/batch  8.66 | loss  3.98 | ppl    53.62
| epoch  67 |   600/ 2928 batches | lr 0.17 | ms/batch  8.41 | loss  3.86 | ppl    47.51
| epoch  67 |   800/ 2928 batches | lr 0.17 | ms/batch  8.46 | loss  3.93 | ppl    50.89
| epoch  67 |  1000/ 2928 batches | lr 0.17 | ms/batch  8.42 | loss  3.95 | ppl    51.69
| epoch  67 |  1200/ 2928 batches | lr 0.17 | ms/batch  8.43 | loss  3.97 | ppl    52.88
| epoch  67 |  1400/ 2928 batches | lr 0.17 | ms/batch  8.43 | loss  3.93 | ppl    51.06
| epoch  67 |  1600/ 2928 batches | lr 0.17 | ms/batch  8.44 | loss  3.99 | ppl    54.17
| epoch  67 |  1800/ 2928 batches | lr 0.17 | ms/batch  8.41 | loss  3.98 | ppl    53.47
| epoch  67 |  2000/ 2928 batches | lr 0.17 | ms/batch  8.41 | loss  3.98 | ppl    53.38
| epoch  67 |  2200/ 2928 batches | lr 0.17 | ms/batch  8.45 | loss  3.83 | ppl    46.02
| epoch  67 |  2400/ 

| epoch  72 |  1800/ 2928 batches | lr 0.13 | ms/batch  8.42 | loss  3.98 | ppl    53.62
| epoch  72 |  2000/ 2928 batches | lr 0.13 | ms/batch  8.44 | loss  3.98 | ppl    53.35
| epoch  72 |  2200/ 2928 batches | lr 0.13 | ms/batch  8.43 | loss  3.82 | ppl    45.76
| epoch  72 |  2400/ 2928 batches | lr 0.13 | ms/batch  8.43 | loss  3.91 | ppl    49.89
| epoch  72 |  2600/ 2928 batches | lr 0.13 | ms/batch  8.42 | loss  3.94 | ppl    51.20
| epoch  72 |  2800/ 2928 batches | lr 0.13 | ms/batch  8.45 | loss  3.91 | ppl    49.66
-----------------------------------------------------------------------------------------
| end of epoch  72 | time: 26.26s | valid loss  5.63 | valid ppl   277.67
-----------------------------------------------------------------------------------------
| epoch  73 |   200/ 2928 batches | lr 0.12 | ms/batch  9.32 | loss  3.98 | ppl    53.72
| epoch  73 |   400/ 2928 batches | lr 0.12 | ms/batch  9.54 | loss  3.98 | ppl    53.71
| epoch  73 |   600/ 2928 batches 

| epoch  78 |   200/ 2928 batches | lr 0.10 | ms/batch  8.47 | loss  3.98 | ppl    53.58
| epoch  78 |   400/ 2928 batches | lr 0.10 | ms/batch  8.33 | loss  3.99 | ppl    53.87
| epoch  78 |   600/ 2928 batches | lr 0.10 | ms/batch  8.36 | loss  3.86 | ppl    47.47
| epoch  78 |   800/ 2928 batches | lr 0.10 | ms/batch  8.34 | loss  3.93 | ppl    50.71
| epoch  78 |  1000/ 2928 batches | lr 0.10 | ms/batch  8.43 | loss  3.94 | ppl    51.55
| epoch  78 |  1200/ 2928 batches | lr 0.10 | ms/batch  8.39 | loss  3.97 | ppl    52.89
| epoch  78 |  1400/ 2928 batches | lr 0.10 | ms/batch  8.31 | loss  3.93 | ppl    51.03
| epoch  78 |  1600/ 2928 batches | lr 0.10 | ms/batch  8.38 | loss  3.99 | ppl    53.91
| epoch  78 |  1800/ 2928 batches | lr 0.10 | ms/batch  8.34 | loss  3.98 | ppl    53.31
| epoch  78 |  2000/ 2928 batches | lr 0.10 | ms/batch  8.37 | loss  3.98 | ppl    53.27
| epoch  78 |  2200/ 2928 batches | lr 0.10 | ms/batch  8.35 | loss  3.83 | ppl    45.89
| epoch  78 |  2400/ 

| epoch  83 |  1800/ 2928 batches | lr 0.07 | ms/batch  8.33 | loss  3.98 | ppl    53.67
| epoch  83 |  2000/ 2928 batches | lr 0.07 | ms/batch  8.34 | loss  3.99 | ppl    53.86
| epoch  83 |  2200/ 2928 batches | lr 0.07 | ms/batch  8.33 | loss  3.83 | ppl    46.16
| epoch  83 |  2400/ 2928 batches | lr 0.07 | ms/batch  8.35 | loss  3.92 | ppl    50.38
| epoch  83 |  2600/ 2928 batches | lr 0.07 | ms/batch  8.35 | loss  3.94 | ppl    51.52
| epoch  83 |  2800/ 2928 batches | lr 0.07 | ms/batch  8.42 | loss  3.91 | ppl    49.96
-----------------------------------------------------------------------------------------
| end of epoch  83 | time: 25.84s | valid loss  5.63 | valid ppl   277.29
-----------------------------------------------------------------------------------------
| epoch  84 |   200/ 2928 batches | lr 0.07 | ms/batch  8.49 | loss  3.99 | ppl    54.00
| epoch  84 |   400/ 2928 batches | lr 0.07 | ms/batch  8.35 | loss  3.99 | ppl    53.88
| epoch  84 |   600/ 2928 batches 

| epoch  89 |   200/ 2928 batches | lr 0.05 | ms/batch  8.48 | loss  4.00 | ppl    54.35
| epoch  89 |   400/ 2928 batches | lr 0.05 | ms/batch  8.36 | loss  3.99 | ppl    54.16
| epoch  89 |   600/ 2928 batches | lr 0.05 | ms/batch  8.33 | loss  3.87 | ppl    48.15
| epoch  89 |   800/ 2928 batches | lr 0.05 | ms/batch  8.32 | loss  3.94 | ppl    51.36
| epoch  89 |  1000/ 2928 batches | lr 0.05 | ms/batch  8.34 | loss  3.95 | ppl    51.99
| epoch  89 |  1200/ 2928 batches | lr 0.05 | ms/batch  8.35 | loss  3.98 | ppl    53.60
| epoch  89 |  1400/ 2928 batches | lr 0.05 | ms/batch  8.35 | loss  3.94 | ppl    51.59
| epoch  89 |  1600/ 2928 batches | lr 0.05 | ms/batch  8.36 | loss  4.00 | ppl    54.63
| epoch  89 |  1800/ 2928 batches | lr 0.05 | ms/batch  8.35 | loss  4.00 | ppl    54.44
| epoch  89 |  2000/ 2928 batches | lr 0.05 | ms/batch  8.31 | loss  3.99 | ppl    54.31
| epoch  89 |  2200/ 2928 batches | lr 0.05 | ms/batch  8.36 | loss  3.84 | ppl    46.40
| epoch  89 |  2400/ 

| epoch  94 |  1800/ 2928 batches | lr 0.04 | ms/batch  8.37 | loss  4.00 | ppl    54.61
| epoch  94 |  2000/ 2928 batches | lr 0.04 | ms/batch  8.36 | loss  3.99 | ppl    54.13
| epoch  94 |  2200/ 2928 batches | lr 0.04 | ms/batch  8.35 | loss  3.85 | ppl    46.77
| epoch  94 |  2400/ 2928 batches | lr 0.04 | ms/batch  8.35 | loss  3.93 | ppl    51.03
| epoch  94 |  2600/ 2928 batches | lr 0.04 | ms/batch  8.39 | loss  3.96 | ppl    52.46
| epoch  94 |  2800/ 2928 batches | lr 0.04 | ms/batch  8.36 | loss  3.93 | ppl    50.70
-----------------------------------------------------------------------------------------
| end of epoch  94 | time: 25.85s | valid loss  5.59 | valid ppl   268.48
-----------------------------------------------------------------------------------------
| epoch  95 |   200/ 2928 batches | lr 0.04 | ms/batch  8.47 | loss  4.00 | ppl    54.66
| epoch  95 |   400/ 2928 batches | lr 0.04 | ms/batch  8.36 | loss  4.00 | ppl    54.78
| epoch  95 |   600/ 2928 batches 

| epoch 100 |   200/ 2928 batches | lr 0.03 | ms/batch  8.51 | loss  4.02 | ppl    55.76
| epoch 100 |   400/ 2928 batches | lr 0.03 | ms/batch  8.43 | loss  4.02 | ppl    55.49
| epoch 100 |   600/ 2928 batches | lr 0.03 | ms/batch  8.45 | loss  3.89 | ppl    49.00
| epoch 100 |   800/ 2928 batches | lr 0.03 | ms/batch  8.50 | loss  3.96 | ppl    52.57
| epoch 100 |  1000/ 2928 batches | lr 0.03 | ms/batch  8.47 | loss  3.96 | ppl    52.71
| epoch 100 |  1200/ 2928 batches | lr 0.03 | ms/batch  8.43 | loss  4.00 | ppl    54.42
| epoch 100 |  1400/ 2928 batches | lr 0.03 | ms/batch  8.42 | loss  3.97 | ppl    52.79
| epoch 100 |  1600/ 2928 batches | lr 0.03 | ms/batch  8.46 | loss  4.02 | ppl    55.86
| epoch 100 |  1800/ 2928 batches | lr 0.03 | ms/batch  8.43 | loss  4.01 | ppl    55.25
| epoch 100 |  2000/ 2928 batches | lr 0.03 | ms/batch  8.43 | loss  4.00 | ppl    54.76
| epoch 100 |  2200/ 2928 batches | lr 0.03 | ms/batch  8.42 | loss  3.85 | ppl    47.03
| epoch 100 |  2400/ 

| epoch 105 |  1800/ 2928 batches | lr 0.02 | ms/batch  8.43 | loss  4.02 | ppl    55.68
| epoch 105 |  2000/ 2928 batches | lr 0.02 | ms/batch  8.44 | loss  4.02 | ppl    55.52
| epoch 105 |  2200/ 2928 batches | lr 0.02 | ms/batch  8.43 | loss  3.85 | ppl    46.94
| epoch 105 |  2400/ 2928 batches | lr 0.02 | ms/batch  8.49 | loss  3.96 | ppl    52.26
| epoch 105 |  2600/ 2928 batches | lr 0.02 | ms/batch  8.49 | loss  3.97 | ppl    52.81
| epoch 105 |  2800/ 2928 batches | lr 0.02 | ms/batch  8.42 | loss  3.94 | ppl    51.59
-----------------------------------------------------------------------------------------
| end of epoch 105 | time: 26.12s | valid loss  5.60 | valid ppl   271.11
-----------------------------------------------------------------------------------------
| epoch 106 |   200/ 2928 batches | lr 0.02 | ms/batch  8.49 | loss  4.02 | ppl    55.95
| epoch 106 |   400/ 2928 batches | lr 0.02 | ms/batch  8.44 | loss  4.03 | ppl    56.19
| epoch 106 |   600/ 2928 batches 

| epoch 111 |   200/ 2928 batches | lr 0.02 | ms/batch  8.47 | loss  4.03 | ppl    56.18
| epoch 111 |   400/ 2928 batches | lr 0.02 | ms/batch  8.42 | loss  4.04 | ppl    56.76
| epoch 111 |   600/ 2928 batches | lr 0.02 | ms/batch  8.45 | loss  3.91 | ppl    49.87
| epoch 111 |   800/ 2928 batches | lr 0.02 | ms/batch  8.45 | loss  3.97 | ppl    53.02
| epoch 111 |  1000/ 2928 batches | lr 0.02 | ms/batch  8.43 | loss  3.99 | ppl    53.98
| epoch 111 |  1200/ 2928 batches | lr 0.02 | ms/batch  8.42 | loss  4.02 | ppl    55.92
| epoch 111 |  1400/ 2928 batches | lr 0.02 | ms/batch  8.44 | loss  3.98 | ppl    53.54
| epoch 111 |  1600/ 2928 batches | lr 0.02 | ms/batch  8.46 | loss  4.04 | ppl    56.94
| epoch 111 |  1800/ 2928 batches | lr 0.02 | ms/batch  8.44 | loss  4.03 | ppl    56.08
| epoch 111 |  2000/ 2928 batches | lr 0.02 | ms/batch  8.48 | loss  4.02 | ppl    55.62
| epoch 111 |  2200/ 2928 batches | lr 0.02 | ms/batch  8.53 | loss  3.86 | ppl    47.46
| epoch 111 |  2400/ 

| epoch 116 |  1800/ 2928 batches | lr 0.01 | ms/batch  8.43 | loss  4.03 | ppl    56.18
| epoch 116 |  2000/ 2928 batches | lr 0.01 | ms/batch  8.42 | loss  4.03 | ppl    56.26
| epoch 116 |  2200/ 2928 batches | lr 0.01 | ms/batch  8.45 | loss  3.87 | ppl    47.86
| epoch 116 |  2400/ 2928 batches | lr 0.01 | ms/batch  8.42 | loss  3.96 | ppl    52.46
| epoch 116 |  2600/ 2928 batches | lr 0.01 | ms/batch  8.42 | loss  4.00 | ppl    54.53
| epoch 116 |  2800/ 2928 batches | lr 0.01 | ms/batch  8.44 | loss  3.96 | ppl    52.64
-----------------------------------------------------------------------------------------
| end of epoch 116 | time: 26.06s | valid loss  5.57 | valid ppl   262.08
-----------------------------------------------------------------------------------------
| epoch 117 |   200/ 2928 batches | lr 0.01 | ms/batch  8.46 | loss  4.04 | ppl    57.08
| epoch 117 |   400/ 2928 batches | lr 0.01 | ms/batch  8.41 | loss  4.05 | ppl    57.68
| epoch 117 |   600/ 2928 batches 

| epoch 122 |   200/ 2928 batches | lr 0.01 | ms/batch  8.61 | loss  4.06 | ppl    57.89
| epoch 122 |   400/ 2928 batches | lr 0.01 | ms/batch  8.43 | loss  4.05 | ppl    57.53
| epoch 122 |   600/ 2928 batches | lr 0.01 | ms/batch  8.41 | loss  3.92 | ppl    50.62
| epoch 122 |   800/ 2928 batches | lr 0.01 | ms/batch  8.46 | loss  3.99 | ppl    53.94
| epoch 122 |  1000/ 2928 batches | lr 0.01 | ms/batch  8.43 | loss  4.01 | ppl    55.14
| epoch 122 |  1200/ 2928 batches | lr 0.01 | ms/batch  8.45 | loss  4.03 | ppl    56.25
| epoch 122 |  1400/ 2928 batches | lr 0.01 | ms/batch  8.44 | loss  3.99 | ppl    54.14
| epoch 122 |  1600/ 2928 batches | lr 0.01 | ms/batch  8.42 | loss  4.05 | ppl    57.30
| epoch 122 |  1800/ 2928 batches | lr 0.01 | ms/batch  8.44 | loss  4.03 | ppl    56.36
| epoch 122 |  2000/ 2928 batches | lr 0.01 | ms/batch  8.44 | loss  4.03 | ppl    56.10
| epoch 122 |  2200/ 2928 batches | lr 0.01 | ms/batch  8.44 | loss  3.88 | ppl    48.50
| epoch 122 |  2400/ 

| epoch 127 |  1800/ 2928 batches | lr 0.01 | ms/batch  9.25 | loss  4.04 | ppl    56.87
| epoch 127 |  2000/ 2928 batches | lr 0.01 | ms/batch  9.28 | loss  4.03 | ppl    56.09
| epoch 127 |  2200/ 2928 batches | lr 0.01 | ms/batch  9.27 | loss  3.89 | ppl    48.95
| epoch 127 |  2400/ 2928 batches | lr 0.01 | ms/batch  9.28 | loss  3.98 | ppl    53.28
| epoch 127 |  2600/ 2928 batches | lr 0.01 | ms/batch  9.25 | loss  4.00 | ppl    54.72
| epoch 127 |  2800/ 2928 batches | lr 0.01 | ms/batch  9.25 | loss  3.99 | ppl    53.80
-----------------------------------------------------------------------------------------
| end of epoch 127 | time: 28.70s | valid loss  5.56 | valid ppl   260.62
-----------------------------------------------------------------------------------------
| epoch 128 |   200/ 2928 batches | lr 0.01 | ms/batch  8.55 | loss  4.07 | ppl    58.63
| epoch 128 |   400/ 2928 batches | lr 0.01 | ms/batch  8.39 | loss  4.07 | ppl    58.27
| epoch 128 |   600/ 2928 batches 

| epoch 133 |   200/ 2928 batches | lr 0.01 | ms/batch  8.55 | loss  4.07 | ppl    58.64
| epoch 133 |   400/ 2928 batches | lr 0.01 | ms/batch  8.42 | loss  4.07 | ppl    58.57
| epoch 133 |   600/ 2928 batches | lr 0.01 | ms/batch  8.34 | loss  3.93 | ppl    51.12
| epoch 133 |   800/ 2928 batches | lr 0.01 | ms/batch  8.38 | loss  4.01 | ppl    54.93
| epoch 133 |  1000/ 2928 batches | lr 0.01 | ms/batch  8.42 | loss  4.01 | ppl    54.94
| epoch 133 |  1200/ 2928 batches | lr 0.01 | ms/batch  8.67 | loss  4.04 | ppl    56.93
| epoch 133 |  1400/ 2928 batches | lr 0.01 | ms/batch  8.69 | loss  4.01 | ppl    55.33
| epoch 133 |  1600/ 2928 batches | lr 0.01 | ms/batch  8.77 | loss  4.07 | ppl    58.38
| epoch 133 |  1800/ 2928 batches | lr 0.01 | ms/batch  8.76 | loss  4.05 | ppl    57.59
| epoch 133 |  2000/ 2928 batches | lr 0.01 | ms/batch  8.78 | loss  4.03 | ppl    56.50
| epoch 133 |  2200/ 2928 batches | lr 0.01 | ms/batch  8.79 | loss  3.90 | ppl    49.39
| epoch 133 |  2400/ 

| epoch 138 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.67
| epoch 138 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.03 | ppl    56.40
| epoch 138 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.33
| epoch 138 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.00 | ppl    54.46
| epoch 138 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.53 | loss  4.00 | ppl    54.62
| epoch 138 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.63 | loss  4.00 | ppl    54.41
-----------------------------------------------------------------------------------------
| end of epoch 138 | time: 26.35s | valid loss  5.54 | valid ppl   255.52
-----------------------------------------------------------------------------------------
| epoch 139 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.27
| epoch 139 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.81
| epoch 139 |   600/ 2928 batches 

| epoch 144 |   200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.51
| epoch 144 |   400/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.08 | ppl    58.91
| epoch 144 |   600/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.94 | ppl    51.64
| epoch 144 |   800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.01 | ppl    54.99
| epoch 144 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  4.01 | ppl    55.27
| epoch 144 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.04 | ppl    56.95
| epoch 144 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.36
| epoch 144 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.07 | ppl    58.70
| epoch 144 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.58
| epoch 144 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.03 | ppl    56.51
| epoch 144 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.90 | ppl    49.41
| epoch 144 |  2400/ 

| epoch 149 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.03 | loss  4.05 | ppl    57.37
| epoch 149 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.00 | loss  4.03 | ppl    56.52
| epoch 149 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  3.90 | ppl    49.35
| epoch 149 |  2400/ 2928 batches | lr 0.00 | ms/batch 10.04 | loss  4.01 | ppl    55.37
| epoch 149 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  4.01 | ppl    55.20
| epoch 149 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.04 | loss  3.99 | ppl    54.20
-----------------------------------------------------------------------------------------
| end of epoch 149 | time: 30.33s | valid loss  5.54 | valid ppl   253.69
-----------------------------------------------------------------------------------------
| epoch 150 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.49
| epoch 150 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.81
| epoch 150 |   600/ 2928 batches 

| epoch 155 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.44
| epoch 155 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.79
| epoch 155 |   600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  3.94 | ppl    51.31
| epoch 155 |   800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.00 | ppl    54.78
| epoch 155 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.02 | ppl    55.43
| epoch 155 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.04 | ppl    56.92
| epoch 155 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.02 | ppl    55.49
| epoch 155 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.08 | ppl    59.03
| epoch 155 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.06 | ppl    57.82
| epoch 155 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.86
| epoch 155 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  3.90 | ppl    49.36
| epoch 155 |  2400/ 

| epoch 160 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.05 | ppl    57.66
| epoch 160 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.04 | ppl    57.10
| epoch 160 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.42
| epoch 160 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.01 | ppl    55.35
| epoch 160 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.83 | loss  4.02 | ppl    55.72
| epoch 160 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.99 | ppl    54.04
-----------------------------------------------------------------------------------------
| end of epoch 160 | time: 26.25s | valid loss  5.54 | valid ppl   253.64
-----------------------------------------------------------------------------------------
| epoch 161 |   200/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.07 | ppl    58.45
| epoch 161 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.08 | ppl    58.86
| epoch 161 |   600/ 2928 batches 

| epoch 166 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.53
| epoch 166 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.68
| epoch 166 |   600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.94 | ppl    51.28
| epoch 166 |   800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.00 | ppl    54.83
| epoch 166 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    55.29
| epoch 166 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.04 | ppl    56.81
| epoch 166 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.28
| epoch 166 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.08 | ppl    58.99
| epoch 166 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.06 | ppl    57.75
| epoch 166 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.34
| epoch 166 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.46
| epoch 166 |  2400/ 

| epoch 171 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.06 | ppl    57.80
| epoch 171 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.06 | ppl    57.70
| epoch 171 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.90 | ppl    49.44
| epoch 171 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.01 | ppl    55.11
| epoch 171 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.88
| epoch 171 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.99 | ppl    54.28
-----------------------------------------------------------------------------------------
| end of epoch 171 | time: 25.72s | valid loss  5.54 | valid ppl   253.83
-----------------------------------------------------------------------------------------
| epoch 172 |   200/ 2928 batches | lr 0.00 | ms/batch  8.53 | loss  4.07 | ppl    58.53
| epoch 172 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.08 | ppl    58.91
| epoch 172 |   600/ 2928 batches 

| epoch 177 |   200/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.07 | ppl    58.35
| epoch 177 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.82
| epoch 177 |   600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.94 | ppl    51.48
| epoch 177 |   800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.00 | ppl    54.65
| epoch 177 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.39
| epoch 177 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.04 | ppl    56.93
| epoch 177 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.84 | loss  4.01 | ppl    55.35
| epoch 177 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.88 | loss  4.07 | ppl    58.69
| epoch 177 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.75 | loss  4.05 | ppl    57.68
| epoch 177 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.90 | loss  4.05 | ppl    57.30
| epoch 177 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.65 | loss  3.90 | ppl    49.42
| epoch 177 |  2400/ 

| epoch 182 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.06 | ppl    58.04
| epoch 182 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.66 | loss  4.06 | ppl    57.71
| epoch 182 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.64 | loss  3.90 | ppl    49.35
| epoch 182 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.00 | ppl    54.78
| epoch 182 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.02 | ppl    55.97
| epoch 182 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.99 | ppl    53.88
-----------------------------------------------------------------------------------------
| end of epoch 182 | time: 26.18s | valid loss  5.54 | valid ppl   253.69
-----------------------------------------------------------------------------------------
| epoch 183 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.69
| epoch 183 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.74
| epoch 183 |   600/ 2928 batches 

| epoch 188 |   200/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.07 | ppl    58.47
| epoch 188 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.08 | ppl    58.86
| epoch 188 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.59
| epoch 188 |   800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.00 | ppl    54.81
| epoch 188 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.01 | ppl    55.30
| epoch 188 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.04 | ppl    56.95
| epoch 188 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.01 | ppl    55.30
| epoch 188 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.07 | ppl    58.44
| epoch 188 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.57
| epoch 188 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.24
| epoch 188 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.90 | ppl    49.48
| epoch 188 |  2400/ 

| epoch 193 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  4.05 | ppl    57.35
| epoch 193 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.16 | loss  4.05 | ppl    57.65
| epoch 193 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  3.90 | ppl    49.52
| epoch 193 |  2400/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  4.00 | ppl    54.68
| epoch 193 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.04 | loss  4.02 | ppl    55.60
| epoch 193 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  3.99 | ppl    54.02
-----------------------------------------------------------------------------------------
| end of epoch 193 | time: 29.35s | valid loss  5.54 | valid ppl   253.76
-----------------------------------------------------------------------------------------
| epoch 194 |   200/ 2928 batches | lr 0.00 | ms/batch  9.89 | loss  4.07 | ppl    58.47
| epoch 194 |   400/ 2928 batches | lr 0.00 | ms/batch 11.06 | loss  4.08 | ppl    59.05
| epoch 194 |   600/ 2928 batches 

| epoch 199 |   200/ 2928 batches | lr 0.00 | ms/batch  9.44 | loss  4.07 | ppl    58.46
| epoch 199 |   400/ 2928 batches | lr 0.00 | ms/batch  8.66 | loss  4.07 | ppl    58.77
| epoch 199 |   600/ 2928 batches | lr 0.00 | ms/batch  9.81 | loss  3.94 | ppl    51.45
| epoch 199 |   800/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.00 | ppl    54.77
| epoch 199 |  1000/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  4.02 | ppl    55.45
| epoch 199 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.03 | loss  4.04 | ppl    57.06
| epoch 199 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.04 | loss  4.02 | ppl    55.43
| epoch 199 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.03 | loss  4.07 | ppl    58.85
| epoch 199 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.02 | loss  4.06 | ppl    58.09
| epoch 199 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.96 | loss  4.05 | ppl    57.35
| epoch 199 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  3.90 | ppl    49.60
| epoch 199 |  2400/ 

| epoch 204 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.06 | ppl    57.91
| epoch 204 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.06 | ppl    57.69
| epoch 204 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.90 | ppl    49.48
| epoch 204 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.00 | ppl    54.51
| epoch 204 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.02 | ppl    55.81
| epoch 204 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.99 | ppl    53.97
-----------------------------------------------------------------------------------------
| end of epoch 204 | time: 25.82s | valid loss  5.54 | valid ppl   253.66
-----------------------------------------------------------------------------------------
| epoch 205 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.56
| epoch 205 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.08 | ppl    58.96
| epoch 205 |   600/ 2928 batches 

| epoch 210 |   200/ 2928 batches | lr 0.00 | ms/batch  9.29 | loss  4.07 | ppl    58.75
| epoch 210 |   400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.75
| epoch 210 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.47
| epoch 210 |   800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.00 | ppl    54.64
| epoch 210 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.02 | ppl    55.58
| epoch 210 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    56.75
| epoch 210 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.38
| epoch 210 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.77
| epoch 210 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.82
| epoch 210 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.37
| epoch 210 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.55
| epoch 210 |  2400/ 

| epoch 215 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.67
| epoch 215 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.05 | ppl    57.54
| epoch 215 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.90 | ppl    49.30
| epoch 215 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.00 | ppl    54.65
| epoch 215 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.02 | ppl    55.57
| epoch 215 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    54.18
-----------------------------------------------------------------------------------------
| end of epoch 215 | time: 25.80s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 216 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.06 | ppl    58.24
| epoch 216 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.08 | ppl    59.06
| epoch 216 |   600/ 2928 batches 

| epoch 221 |   200/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  4.07 | ppl    58.60
| epoch 221 |   400/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  4.08 | ppl    58.97
| epoch 221 |   600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.94 | ppl    51.34
| epoch 221 |   800/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    54.93
| epoch 221 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    55.32
| epoch 221 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.04 | ppl    56.75
| epoch 221 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.01 | ppl    55.11
| epoch 221 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.08 | ppl    58.86
| epoch 221 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.58
| epoch 221 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.66
| epoch 221 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.29
| epoch 221 |  2400/ 

| epoch 226 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.63
| epoch 226 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.65
| epoch 226 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  3.90 | ppl    49.59
| epoch 226 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.00 | ppl    54.72
| epoch 226 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.81
| epoch 226 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.99 | ppl    54.06
-----------------------------------------------------------------------------------------
| end of epoch 226 | time: 26.08s | valid loss  5.54 | valid ppl   253.61
-----------------------------------------------------------------------------------------
| epoch 227 |   200/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.07 | ppl    58.33
| epoch 227 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.08 | ppl    59.08
| epoch 227 |   600/ 2928 batches 

| epoch 232 |   200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.45
| epoch 232 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.73
| epoch 232 |   600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.94 | ppl    51.28
| epoch 232 |   800/ 2928 batches | lr 0.00 | ms/batch  8.53 | loss  4.01 | ppl    54.92
| epoch 232 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.01 | ppl    55.41
| epoch 232 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.04 | ppl    56.78
| epoch 232 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  4.02 | ppl    55.43
| epoch 232 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.13 | loss  4.07 | ppl    58.43
| epoch 232 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.14 | loss  4.06 | ppl    57.99
| epoch 232 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.17 | loss  4.05 | ppl    57.45
| epoch 232 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  3.90 | ppl    49.60
| epoch 232 |  2400/ 

| epoch 237 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.05 | ppl    57.48
| epoch 237 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.25 | loss  4.05 | ppl    57.66
| epoch 237 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  3.90 | ppl    49.49
| epoch 237 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.29 | loss  4.00 | ppl    54.50
| epoch 237 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  4.02 | ppl    55.83
| epoch 237 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.27 | loss  3.98 | ppl    53.76
-----------------------------------------------------------------------------------------
| end of epoch 237 | time: 28.65s | valid loss  5.54 | valid ppl   253.61
-----------------------------------------------------------------------------------------
| epoch 238 |   200/ 2928 batches | lr 0.00 | ms/batch  9.15 | loss  4.07 | ppl    58.54
| epoch 238 |   400/ 2928 batches | lr 0.00 | ms/batch  9.14 | loss  4.08 | ppl    59.00
| epoch 238 |   600/ 2928 batches 

| epoch 243 |   200/ 2928 batches | lr 0.00 | ms/batch  8.53 | loss  4.07 | ppl    58.37
| epoch 243 |   400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.08 | ppl    58.86
| epoch 243 |   600/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.94 | ppl    51.41
| epoch 243 |   800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.01 | ppl    55.05
| epoch 243 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.09
| epoch 243 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.94
| epoch 243 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.02 | ppl    55.44
| epoch 243 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.53
| epoch 243 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.66
| epoch 243 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.69
| epoch 243 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.54
| epoch 243 |  2400/ 

| epoch 248 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.06 | ppl    57.74
| epoch 248 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.05 | ppl    57.60
| epoch 248 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.64
| epoch 248 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.00 | ppl    54.87
| epoch 248 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.53
| epoch 248 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    54.09
-----------------------------------------------------------------------------------------
| end of epoch 248 | time: 25.80s | valid loss  5.54 | valid ppl   253.62
-----------------------------------------------------------------------------------------
| epoch 249 |   200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.32
| epoch 249 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.08 | ppl    58.98
| epoch 249 |   600/ 2928 batches 

| epoch 254 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.29
| epoch 254 |   400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.08 | ppl    58.89
| epoch 254 |   600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.94 | ppl    51.31
| epoch 254 |   800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.00 | ppl    54.68
| epoch 254 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.08
| epoch 254 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.04 | ppl    57.03
| epoch 254 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    55.27
| epoch 254 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.07 | ppl    58.54
| epoch 254 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.69
| epoch 254 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.06 | ppl    57.69
| epoch 254 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.90 | ppl    49.41
| epoch 254 |  2400/ 

| epoch 259 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.64
| epoch 259 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.75
| epoch 259 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.90 | ppl    49.53
| epoch 259 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.00 | ppl    54.68
| epoch 259 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.02 | ppl    55.64
| epoch 259 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.99 | ppl    54.14
-----------------------------------------------------------------------------------------
| end of epoch 259 | time: 26.06s | valid loss  5.54 | valid ppl   253.62
-----------------------------------------------------------------------------------------
| epoch 260 |   200/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.06 | ppl    58.26
| epoch 260 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    59.08
| epoch 260 |   600/ 2928 batches 

| epoch 265 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.72
| epoch 265 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.68
| epoch 265 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.37
| epoch 265 |   800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.00 | ppl    54.70
| epoch 265 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.33
| epoch 265 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    57.00
| epoch 265 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.01 | ppl    55.30
| epoch 265 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.58
| epoch 265 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.06 | ppl    57.90
| epoch 265 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.63
| epoch 265 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.54
| epoch 265 |  2400/ 

| epoch 270 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.68
| epoch 270 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.67
| epoch 270 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.90 | ppl    49.51
| epoch 270 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.00 | ppl    54.66
| epoch 270 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.02 | ppl    55.60
| epoch 270 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.99 | ppl    54.05
-----------------------------------------------------------------------------------------
| end of epoch 270 | time: 26.13s | valid loss  5.54 | valid ppl   253.62
-----------------------------------------------------------------------------------------
| epoch 271 |   200/ 2928 batches | lr 0.00 | ms/batch  9.37 | loss  4.07 | ppl    58.52
| epoch 271 |   400/ 2928 batches | lr 0.00 | ms/batch  9.24 | loss  4.07 | ppl    58.67
| epoch 271 |   600/ 2928 batches 

| epoch 276 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.28
| epoch 276 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.71
| epoch 276 |   600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.94 | ppl    51.52
| epoch 276 |   800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.01 | ppl    54.99
| epoch 276 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.01 | ppl    55.32
| epoch 276 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    57.10
| epoch 276 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.19
| epoch 276 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.08 | ppl    58.95
| epoch 276 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.05 | ppl    57.50
| epoch 276 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.66
| epoch 276 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.46
| epoch 276 |  2400/ 

| epoch 281 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.55
| epoch 281 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.63
| epoch 281 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.61
| epoch 281 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.00 | ppl    54.67
| epoch 281 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.02 | ppl    55.82
| epoch 281 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.99 | ppl    54.20
-----------------------------------------------------------------------------------------
| end of epoch 281 | time: 25.79s | valid loss  5.54 | valid ppl   253.62
-----------------------------------------------------------------------------------------
| epoch 282 |   200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.40
| epoch 282 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.82
| epoch 282 |   600/ 2928 batches 

| epoch 287 |   200/ 2928 batches | lr 0.00 | ms/batch  8.74 | loss  4.07 | ppl    58.65
| epoch 287 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.72
| epoch 287 |   600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.94 | ppl    51.17
| epoch 287 |   800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.00 | ppl    54.79
| epoch 287 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.32
| epoch 287 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.04 | ppl    56.94
| epoch 287 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.01 | ppl    55.27
| epoch 287 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.08 | ppl    58.86
| epoch 287 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.65 | loss  4.05 | ppl    57.68
| epoch 287 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.68 | loss  4.06 | ppl    57.84
| epoch 287 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.83 | loss  3.90 | ppl    49.64
| epoch 287 |  2400/ 

| epoch 292 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.86 | loss  4.05 | ppl    57.63
| epoch 292 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.90 | loss  4.05 | ppl    57.64
| epoch 292 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.98 | loss  3.90 | ppl    49.56
| epoch 292 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.81 | loss  4.00 | ppl    54.59
| epoch 292 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.96 | loss  4.02 | ppl    55.47
| epoch 292 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.85 | loss  3.99 | ppl    54.18
-----------------------------------------------------------------------------------------
| end of epoch 292 | time: 29.08s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 293 |   200/ 2928 batches | lr 0.00 | ms/batch  8.82 | loss  4.07 | ppl    58.35
| epoch 293 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.78
| epoch 293 |   600/ 2928 batches 

| epoch 298 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.54
| epoch 298 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.07 | ppl    58.84
| epoch 298 |   600/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.94 | ppl    51.64
| epoch 298 |   800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.00 | ppl    54.53
| epoch 298 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.22
| epoch 298 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.04 | ppl    56.80
| epoch 298 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.27
| epoch 298 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.78
| epoch 298 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.06 | ppl    57.75
| epoch 298 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.06 | ppl    57.72
| epoch 298 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.56
| epoch 298 |  2400/ 

| epoch 303 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.05 | ppl    57.67
| epoch 303 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.43
| epoch 303 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.90 | ppl    49.47
| epoch 303 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.00 | ppl    54.61
| epoch 303 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.03 | ppl    56.07
| epoch 303 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.99 | ppl    53.95
-----------------------------------------------------------------------------------------
| end of epoch 303 | time: 26.12s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 304 |   200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.07 | ppl    58.49
| epoch 304 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.08 | ppl    58.93
| epoch 304 |   600/ 2928 batches 

| epoch 309 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.34
| epoch 309 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.84
| epoch 309 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.47
| epoch 309 |   800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.86
| epoch 309 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.01
| epoch 309 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.04 | ppl    56.81
| epoch 309 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.26
| epoch 309 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.58
| epoch 309 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.62
| epoch 309 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.45
| epoch 309 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.91 | ppl    49.71
| epoch 309 |  2400/ 

| epoch 314 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.85
| epoch 314 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.56
| epoch 314 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.90 | ppl    49.52
| epoch 314 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.00 | ppl    54.75
| epoch 314 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.02 | ppl    55.60
| epoch 314 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.99 | ppl    54.03
-----------------------------------------------------------------------------------------
| end of epoch 314 | time: 26.02s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 315 |   200/ 2928 batches | lr 0.00 | ms/batch  8.60 | loss  4.07 | ppl    58.61
| epoch 315 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.81
| epoch 315 |   600/ 2928 batches 

| epoch 320 |   200/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.07 | ppl    58.64
| epoch 320 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.62
| epoch 320 |   600/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  3.94 | ppl    51.25
| epoch 320 |   800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.00 | ppl    54.85
| epoch 320 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.01 | ppl    55.39
| epoch 320 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    56.92
| epoch 320 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.01 | ppl    55.31
| epoch 320 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.07 | ppl    58.45
| epoch 320 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.52
| epoch 320 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.70
| epoch 320 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.41
| epoch 320 |  2400/ 

| epoch 325 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.87
| epoch 325 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.05 | ppl    57.48
| epoch 325 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.54
| epoch 325 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.00 | ppl    54.64
| epoch 325 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.02 | ppl    55.59
| epoch 325 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  3.99 | ppl    54.01
-----------------------------------------------------------------------------------------
| end of epoch 325 | time: 26.21s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 326 |   200/ 2928 batches | lr 0.00 | ms/batch  8.63 | loss  4.07 | ppl    58.77
| epoch 326 |   400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.08 | ppl    58.87
| epoch 326 |   600/ 2928 batches 

| epoch 331 |   200/ 2928 batches | lr 0.00 | ms/batch  8.60 | loss  4.07 | ppl    58.30
| epoch 331 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.08 | ppl    58.98
| epoch 331 |   600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.94 | ppl    51.31
| epoch 331 |   800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.00 | ppl    54.79
| epoch 331 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.11
| epoch 331 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.28
| epoch 331 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.02 | ppl    55.65
| epoch 331 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.69
| epoch 331 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.63
| epoch 331 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.63
| epoch 331 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.91 | ppl    49.70
| epoch 331 |  2400/ 

| epoch 336 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.72
| epoch 336 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.06 | ppl    57.81
| epoch 336 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.90 | ppl    49.60
| epoch 336 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.00 | ppl    54.54
| epoch 336 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.02 | ppl    55.53
| epoch 336 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.99 | ppl    54.01
-----------------------------------------------------------------------------------------
| end of epoch 336 | time: 25.95s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 337 |   200/ 2928 batches | lr 0.00 | ms/batch 10.18 | loss  4.07 | ppl    58.71
| epoch 337 |   400/ 2928 batches | lr 0.00 | ms/batch  8.66 | loss  4.07 | ppl    58.72
| epoch 337 |   600/ 2928 batches 

| epoch 342 |   200/ 2928 batches | lr 0.00 | ms/batch  9.40 | loss  4.07 | ppl    58.34
| epoch 342 |   400/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.74
| epoch 342 |   600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.94 | ppl    51.54
| epoch 342 |   800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.80
| epoch 342 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.32
| epoch 342 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.69
| epoch 342 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.01 | ppl    55.29
| epoch 342 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.07 | ppl    58.73
| epoch 342 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.65
| epoch 342 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.39
| epoch 342 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.90 | ppl    49.65
| epoch 342 |  2400/ 

| epoch 347 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  4.06 | ppl    58.04
| epoch 347 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.05 | loss  4.06 | ppl    57.79
| epoch 347 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  3.91 | ppl    49.79
| epoch 347 |  2400/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.00 | ppl    54.71
| epoch 347 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  4.02 | ppl    55.51
| epoch 347 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.96 | loss  3.99 | ppl    53.88
-----------------------------------------------------------------------------------------
| end of epoch 347 | time: 30.66s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 348 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.49
| epoch 348 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.56
| epoch 348 |   600/ 2928 batches 

| epoch 353 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.73
| epoch 353 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.72
| epoch 353 |   600/ 2928 batches | lr 0.00 | ms/batch  9.29 | loss  3.94 | ppl    51.46
| epoch 353 |   800/ 2928 batches | lr 0.00 | ms/batch  9.70 | loss  4.00 | ppl    54.81
| epoch 353 |  1000/ 2928 batches | lr 0.00 | ms/batch  9.72 | loss  4.02 | ppl    55.44
| epoch 353 |  1200/ 2928 batches | lr 0.00 | ms/batch  9.67 | loss  4.04 | ppl    56.86
| epoch 353 |  1400/ 2928 batches | lr 0.00 | ms/batch  9.70 | loss  4.01 | ppl    55.14
| epoch 353 |  1600/ 2928 batches | lr 0.00 | ms/batch  9.71 | loss  4.07 | ppl    58.78
| epoch 353 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.76 | loss  4.05 | ppl    57.54
| epoch 353 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.76 | loss  4.05 | ppl    57.53
| epoch 353 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.77 | loss  3.90 | ppl    49.53
| epoch 353 |  2400/ 

| epoch 358 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.06 | ppl    57.72
| epoch 358 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.53
| epoch 358 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.90 | ppl    49.49
| epoch 358 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.00 | ppl    54.74
| epoch 358 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.02 | ppl    55.63
| epoch 358 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    53.86
-----------------------------------------------------------------------------------------
| end of epoch 358 | time: 25.84s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 359 |   200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.57
| epoch 359 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.08 | ppl    58.92
| epoch 359 |   600/ 2928 batches 

| epoch 364 |   200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.31
| epoch 364 |   400/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.07 | ppl    58.75
| epoch 364 |   600/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  3.94 | ppl    51.42
| epoch 364 |   800/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.00 | ppl    54.70
| epoch 364 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.01 | ppl    55.38
| epoch 364 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.04 | ppl    56.96
| epoch 364 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.02 | ppl    55.55
| epoch 364 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.07 | ppl    58.76
| epoch 364 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.06 | ppl    57.70
| epoch 364 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.06 | ppl    57.83
| epoch 364 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  3.90 | ppl    49.39
| epoch 364 |  2400/ 

| epoch 369 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.67
| epoch 369 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.50
| epoch 369 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.90 | ppl    49.59
| epoch 369 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  4.00 | ppl    54.66
| epoch 369 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.02 | ppl    55.93
| epoch 369 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    54.18
-----------------------------------------------------------------------------------------
| end of epoch 369 | time: 25.79s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 370 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.33
| epoch 370 |   400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.07 | ppl    58.67
| epoch 370 |   600/ 2928 batches 

| epoch 375 |   200/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.52
| epoch 375 |   400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.08 | ppl    58.87
| epoch 375 |   600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.94 | ppl    51.39
| epoch 375 |   800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.01 | ppl    55.00
| epoch 375 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.01 | ppl    55.36
| epoch 375 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.04 | ppl    56.84
| epoch 375 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.02 | ppl    55.43
| epoch 375 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.08 | ppl    59.01
| epoch 375 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.77
| epoch 375 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.05 | ppl    57.49
| epoch 375 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.90 | ppl    49.55
| epoch 375 |  2400/ 

| epoch 380 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.05 | ppl    57.67
| epoch 380 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.05 | ppl    57.64
| epoch 380 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  3.90 | ppl    49.63
| epoch 380 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.00 | ppl    54.68
| epoch 380 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.02 | ppl    55.59
| epoch 380 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  3.99 | ppl    54.01
-----------------------------------------------------------------------------------------
| end of epoch 380 | time: 25.57s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 381 |   200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.54
| epoch 381 |   400/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.07 | ppl    58.62
| epoch 381 |   600/ 2928 batches 

| epoch 386 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.30
| epoch 386 |   400/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  4.08 | ppl    58.96
| epoch 386 |   600/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.94 | ppl    51.59
| epoch 386 |   800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.00 | ppl    54.85
| epoch 386 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    55.17
| epoch 386 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.04 | ppl    56.74
| epoch 386 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.02 | ppl    55.51
| epoch 386 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.68
| epoch 386 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.60
| epoch 386 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.52
| epoch 386 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.90 | ppl    49.48
| epoch 386 |  2400/ 

| epoch 391 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.73
| epoch 391 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.55
| epoch 391 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.90 | ppl    49.60
| epoch 391 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.50
| epoch 391 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.02 | ppl    55.68
| epoch 391 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.99 | ppl    54.00
-----------------------------------------------------------------------------------------
| end of epoch 391 | time: 26.06s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 392 |   200/ 2928 batches | lr 0.00 | ms/batch  8.94 | loss  4.07 | ppl    58.29
| epoch 392 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.08 | ppl    59.05
| epoch 392 |   600/ 2928 batches 

| epoch 397 |   200/ 2928 batches | lr 0.00 | ms/batch  8.53 | loss  4.07 | ppl    58.44
| epoch 397 |   400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.07 | ppl    58.78
| epoch 397 |   600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.94 | ppl    51.42
| epoch 397 |   800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.00 | ppl    54.86
| epoch 397 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    55.42
| epoch 397 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.04 | ppl    56.92
| epoch 397 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    55.41
| epoch 397 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.07 | ppl    58.82
| epoch 397 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.06 | ppl    57.69
| epoch 397 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.06 | ppl    57.73
| epoch 397 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.90 | ppl    49.40
| epoch 397 |  2400/ 

| epoch 402 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.78
| epoch 402 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.05 | ppl    57.58
| epoch 402 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  3.90 | ppl    49.58
| epoch 402 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.38
| epoch 402 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.02 | ppl    55.56
| epoch 402 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.99 | ppl    54.13
-----------------------------------------------------------------------------------------
| end of epoch 402 | time: 26.15s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 403 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.39
| epoch 403 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.08 | ppl    59.03
| epoch 403 |   600/ 2928 batches 

| epoch 408 |   200/ 2928 batches | lr 0.00 | ms/batch  9.01 | loss  4.06 | ppl    58.26
| epoch 408 |   400/ 2928 batches | lr 0.00 | ms/batch  9.24 | loss  4.08 | ppl    59.04
| epoch 408 |   600/ 2928 batches | lr 0.00 | ms/batch  9.12 | loss  3.94 | ppl    51.44
| epoch 408 |   800/ 2928 batches | lr 0.00 | ms/batch 10.65 | loss  4.00 | ppl    54.55
| epoch 408 |  1000/ 2928 batches | lr 0.00 | ms/batch 10.49 | loss  4.01 | ppl    55.11
| epoch 408 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  4.04 | ppl    56.63
| epoch 408 |  1400/ 2928 batches | lr 0.00 | ms/batch  9.97 | loss  4.01 | ppl    55.32
| epoch 408 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.00 | loss  4.07 | ppl    58.61
| epoch 408 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.09 | loss  4.05 | ppl    57.53
| epoch 408 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.05 | ppl    57.66
| epoch 408 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.02 | loss  3.90 | ppl    49.56
| epoch 408 |  2400/ 

| epoch 413 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.83
| epoch 413 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.56
| epoch 413 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.62
| epoch 413 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.00 | ppl    54.70
| epoch 413 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.02 | ppl    55.65
| epoch 413 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.99 | ppl    53.95
-----------------------------------------------------------------------------------------
| end of epoch 413 | time: 26.31s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 414 |   200/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.07 | ppl    58.33
| epoch 414 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.75
| epoch 414 |   600/ 2928 batches 

| epoch 419 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.28
| epoch 419 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.07 | ppl    58.75
| epoch 419 |   600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  3.94 | ppl    51.45
| epoch 419 |   800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.00 | ppl    54.79
| epoch 419 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.01 | ppl    55.17
| epoch 419 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.04 | ppl    57.06
| epoch 419 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.01 | ppl    55.24
| epoch 419 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.08 | ppl    59.02
| epoch 419 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.67
| epoch 419 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.56
| epoch 419 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.52
| epoch 419 |  2400/ 

| epoch 424 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.72 | loss  4.06 | ppl    57.80
| epoch 424 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.73 | loss  4.05 | ppl    57.64
| epoch 424 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.74 | loss  3.90 | ppl    49.42
| epoch 424 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.76 | loss  4.00 | ppl    54.61
| epoch 424 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.71 | loss  4.02 | ppl    55.83
| epoch 424 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.73 | loss  3.99 | ppl    53.90
-----------------------------------------------------------------------------------------
| end of epoch 424 | time: 28.18s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 425 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.78
| epoch 425 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.08 | ppl    58.91
| epoch 425 |   600/ 2928 batches 

| epoch 430 |   200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.45
| epoch 430 |   400/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.08 | ppl    58.88
| epoch 430 |   600/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  3.94 | ppl    51.29
| epoch 430 |   800/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.01 | ppl    55.05
| epoch 430 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.02 | ppl    55.46
| epoch 430 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.04 | ppl    57.06
| epoch 430 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.02 | ppl    55.45
| epoch 430 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.08 | ppl    58.85
| epoch 430 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.61
| epoch 430 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.55
| epoch 430 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.90 | ppl    49.39
| epoch 430 |  2400/ 

| epoch 435 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.92
| epoch 435 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.74
| epoch 435 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.90 | ppl    49.55
| epoch 435 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.79
| epoch 435 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.02 | ppl    55.75
| epoch 435 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.99 | ppl    53.86
-----------------------------------------------------------------------------------------
| end of epoch 435 | time: 25.96s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 436 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.52
| epoch 436 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.07 | ppl    58.83
| epoch 436 |   600/ 2928 batches 

| epoch 441 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.64
| epoch 441 |   400/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.75
| epoch 441 |   600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.94 | ppl    51.40
| epoch 441 |   800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.00 | ppl    54.84
| epoch 441 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.31
| epoch 441 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.03 | ppl    56.49
| epoch 441 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.50
| epoch 441 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.79
| epoch 441 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.57
| epoch 441 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.45
| epoch 441 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.91 | ppl    49.66
| epoch 441 |  2400/ 

| epoch 446 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.06 | ppl    57.78
| epoch 446 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.05 | ppl    57.61
| epoch 446 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.90 | ppl    49.43
| epoch 446 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.00 | ppl    54.72
| epoch 446 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.90
| epoch 446 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    54.24
-----------------------------------------------------------------------------------------
| end of epoch 446 | time: 25.88s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 447 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.49
| epoch 447 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    59.01
| epoch 447 |   600/ 2928 batches 

| epoch 452 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.51
| epoch 452 |   400/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.08 | ppl    58.91
| epoch 452 |   600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.94 | ppl    51.32
| epoch 452 |   800/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.01 | ppl    54.94
| epoch 452 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.01 | ppl    55.16
| epoch 452 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.04 | ppl    57.04
| epoch 452 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.01 | ppl    55.13
| epoch 452 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.07 | ppl    58.60
| epoch 452 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.64
| epoch 452 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.35
| epoch 452 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.90 | ppl    49.33
| epoch 452 |  2400/ 

| epoch 457 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.49
| epoch 457 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.06 | ppl    57.69
| epoch 457 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.49
| epoch 457 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.00 | ppl    54.58
| epoch 457 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.03 | ppl    56.06
| epoch 457 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.99 | ppl    53.88
-----------------------------------------------------------------------------------------
| end of epoch 457 | time: 26.15s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 458 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.36
| epoch 458 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.82
| epoch 458 |   600/ 2928 batches 

| epoch 463 |   200/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.07 | ppl    58.55
| epoch 463 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    59.04
| epoch 463 |   600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  3.94 | ppl    51.34
| epoch 463 |   800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.00 | ppl    54.72
| epoch 463 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.02 | ppl    55.50
| epoch 463 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.04 | ppl    56.82
| epoch 463 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.36
| epoch 463 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.07 | ppl    58.62
| epoch 463 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.05 | ppl    57.59
| epoch 463 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.71
| epoch 463 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.91 | ppl    49.71
| epoch 463 |  2400/ 

| epoch 468 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.06 | ppl    57.73
| epoch 468 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.05 | ppl    57.61
| epoch 468 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  3.90 | ppl    49.58
| epoch 468 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.00 | ppl    54.42
| epoch 468 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.28 | loss  4.02 | ppl    55.57
| epoch 468 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.29 | loss  3.99 | ppl    53.96
-----------------------------------------------------------------------------------------
| end of epoch 468 | time: 25.57s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 469 |   200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.56
| epoch 469 |   400/ 2928 batches | lr 0.00 | ms/batch  8.27 | loss  4.08 | ppl    58.97
| epoch 469 |   600/ 2928 batches 

| epoch 474 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.43
| epoch 474 |   400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.08 | ppl    58.97
| epoch 474 |   600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.94 | ppl    51.48
| epoch 474 |   800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.01 | ppl    54.91
| epoch 474 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.01 | ppl    55.22
| epoch 474 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.04 | ppl    56.76
| epoch 474 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.01 | ppl    55.20
| epoch 474 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.07 | ppl    58.67
| epoch 474 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.06 | ppl    57.76
| epoch 474 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.49
| epoch 474 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.90 | ppl    49.36
| epoch 474 |  2400/ 

| epoch 479 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.63
| epoch 479 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.52
| epoch 479 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.90 | ppl    49.41
| epoch 479 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.00 | ppl    54.46
| epoch 479 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.02 | ppl    55.85
| epoch 479 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    53.89
-----------------------------------------------------------------------------------------
| end of epoch 479 | time: 25.79s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 480 |   200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.36
| epoch 480 |   400/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.07 | ppl    58.65
| epoch 480 |   600/ 2928 batches 

| epoch 485 |   200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.48
| epoch 485 |   400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.07 | ppl    58.72
| epoch 485 |   600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.94 | ppl    51.32
| epoch 485 |   800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    54.97
| epoch 485 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.29
| epoch 485 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.04 | ppl    56.85
| epoch 485 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.27
| epoch 485 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.07 | ppl    58.70
| epoch 485 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.06 | ppl    57.70
| epoch 485 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.54
| epoch 485 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.90 | ppl    49.60
| epoch 485 |  2400/ 

| epoch 490 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.05 | ppl    57.56
| epoch 490 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.06 | ppl    57.72
| epoch 490 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.90 | ppl    49.43
| epoch 490 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.00 | ppl    54.69
| epoch 490 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.03 | ppl    56.01
| epoch 490 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    54.11
-----------------------------------------------------------------------------------------
| end of epoch 490 | time: 25.87s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 491 |   200/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  4.07 | ppl    58.69
| epoch 491 |   400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.07 | ppl    58.85
| epoch 491 |   600/ 2928 batches 

| epoch 496 |   200/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.56
| epoch 496 |   400/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.08 | ppl    59.05
| epoch 496 |   600/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.94 | ppl    51.22
| epoch 496 |   800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.00 | ppl    54.67
| epoch 496 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.34
| epoch 496 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.04 | ppl    56.72
| epoch 496 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.36
| epoch 496 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.62
| epoch 496 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.06 | ppl    57.69
| epoch 496 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.05 | ppl    57.67
| epoch 496 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.90 | ppl    49.60
| epoch 496 |  2400/ 

| epoch 501 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.05 | ppl    57.49
| epoch 501 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.61
| epoch 501 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.90 | ppl    49.57
| epoch 501 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.87
| epoch 501 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.02 | ppl    55.86
| epoch 501 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.99 | ppl    54.24
-----------------------------------------------------------------------------------------
| end of epoch 501 | time: 25.80s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 502 |   200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.55
| epoch 502 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.07 | ppl    58.76
| epoch 502 |   600/ 2928 batches 

| epoch 507 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.69
| epoch 507 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.78
| epoch 507 |   600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.94 | ppl    51.34
| epoch 507 |   800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.65
| epoch 507 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.24
| epoch 507 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.80
| epoch 507 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.33
| epoch 507 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.65
| epoch 507 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.66
| epoch 507 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.65
| epoch 507 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.44
| epoch 507 |  2400/ 

| epoch 512 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.14 | loss  4.06 | ppl    57.72
| epoch 512 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  4.05 | ppl    57.64
| epoch 512 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  3.90 | ppl    49.52
| epoch 512 |  2400/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  4.00 | ppl    54.54
| epoch 512 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  4.02 | ppl    55.61
| epoch 512 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  3.99 | ppl    54.05
-----------------------------------------------------------------------------------------
| end of epoch 512 | time: 30.44s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 513 |   200/ 2928 batches | lr 0.00 | ms/batch  8.86 | loss  4.07 | ppl    58.44
| epoch 513 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.79
| epoch 513 |   600/ 2928 batches 

| epoch 518 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.54
| epoch 518 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.07 | ppl    58.68
| epoch 518 |   600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.94 | ppl    51.36
| epoch 518 |   800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.00 | ppl    54.81
| epoch 518 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.31
| epoch 518 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.04 | ppl    56.73
| epoch 518 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.62
| epoch 518 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.64
| epoch 518 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.06 | ppl    57.75
| epoch 518 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.05 | ppl    57.37
| epoch 518 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.90 | ppl    49.45
| epoch 518 |  2400/ 

| epoch 523 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.60
| epoch 523 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.56
| epoch 523 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.91 | ppl    49.84
| epoch 523 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.00 | ppl    54.79
| epoch 523 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.02 | ppl    55.80
| epoch 523 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.99 | ppl    53.99
-----------------------------------------------------------------------------------------
| end of epoch 523 | time: 25.88s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 524 |   200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.07 | ppl    58.29
| epoch 524 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    59.01
| epoch 524 |   600/ 2928 batches 

| epoch 529 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.46
| epoch 529 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    58.88
| epoch 529 |   600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.94 | ppl    51.27
| epoch 529 |   800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.01 | ppl    55.02
| epoch 529 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.36
| epoch 529 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.22
| epoch 529 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.01 | ppl    55.18
| epoch 529 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.85
| epoch 529 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.06 | ppl    57.81
| epoch 529 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.56
| epoch 529 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.40
| epoch 529 |  2400/ 

| epoch 534 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.06 | ppl    57.69
| epoch 534 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.05 | ppl    57.55
| epoch 534 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.62
| epoch 534 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.00 | ppl    54.73
| epoch 534 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.02 | ppl    55.84
| epoch 534 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.99 | ppl    53.81
-----------------------------------------------------------------------------------------
| end of epoch 534 | time: 26.12s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 535 |   200/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.07 | ppl    58.48
| epoch 535 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.57
| epoch 535 |   600/ 2928 batches 

| epoch 540 |   200/ 2928 batches | lr 0.00 | ms/batch  8.91 | loss  4.07 | ppl    58.42
| epoch 540 |   400/ 2928 batches | lr 0.00 | ms/batch  9.05 | loss  4.07 | ppl    58.66
| epoch 540 |   600/ 2928 batches | lr 0.00 | ms/batch  8.97 | loss  3.94 | ppl    51.42
| epoch 540 |   800/ 2928 batches | lr 0.00 | ms/batch  8.95 | loss  4.00 | ppl    54.61
| epoch 540 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.63 | loss  4.01 | ppl    55.25
| epoch 540 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.04 | ppl    56.99
| epoch 540 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.56
| epoch 540 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.07 | ppl    58.60
| epoch 540 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.06 | ppl    57.71
| epoch 540 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.05 | ppl    57.59
| epoch 540 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.90 | ppl    49.48
| epoch 540 |  2400/ 

| epoch 545 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.47
| epoch 545 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.05 | ppl    57.53
| epoch 545 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.90 | ppl    49.54
| epoch 545 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.00 | ppl    54.49
| epoch 545 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.02 | ppl    55.72
| epoch 545 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.99 | ppl    54.07
-----------------------------------------------------------------------------------------
| end of epoch 545 | time: 25.85s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 546 |   200/ 2928 batches | lr 0.00 | ms/batch 10.29 | loss  4.07 | ppl    58.63
| epoch 546 |   400/ 2928 batches | lr 0.00 | ms/batch  8.97 | loss  4.07 | ppl    58.83
| epoch 546 |   600/ 2928 batches 

| epoch 551 |   200/ 2928 batches | lr 0.00 | ms/batch  9.42 | loss  4.07 | ppl    58.53
| epoch 551 |   400/ 2928 batches | lr 0.00 | ms/batch  9.74 | loss  4.07 | ppl    58.83
| epoch 551 |   600/ 2928 batches | lr 0.00 | ms/batch 10.79 | loss  3.94 | ppl    51.48
| epoch 551 |   800/ 2928 batches | lr 0.00 | ms/batch 10.77 | loss  4.00 | ppl    54.79
| epoch 551 |  1000/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  4.01 | ppl    55.35
| epoch 551 |  1200/ 2928 batches | lr 0.00 | ms/batch  9.96 | loss  4.03 | ppl    56.53
| epoch 551 |  1400/ 2928 batches | lr 0.00 | ms/batch  9.95 | loss  4.01 | ppl    55.36
| epoch 551 |  1600/ 2928 batches | lr 0.00 | ms/batch  9.95 | loss  4.07 | ppl    58.81
| epoch 551 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  4.05 | ppl    57.52
| epoch 551 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  4.05 | ppl    57.67
| epoch 551 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.09 | loss  3.90 | ppl    49.61
| epoch 551 |  2400/ 

| epoch 556 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.70
| epoch 556 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.64
| epoch 556 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.51
| epoch 556 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.00 | ppl    54.36
| epoch 556 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.80
| epoch 556 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.99 | ppl    53.95
-----------------------------------------------------------------------------------------
| end of epoch 556 | time: 26.03s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 557 |   200/ 2928 batches | lr 0.00 | ms/batch  8.70 | loss  4.07 | ppl    58.77
| epoch 557 |   400/ 2928 batches | lr 0.00 | ms/batch 10.05 | loss  4.07 | ppl    58.75
| epoch 557 |   600/ 2928 batches 

| epoch 562 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.82
| epoch 562 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.08 | ppl    58.88
| epoch 562 |   600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.94 | ppl    51.56
| epoch 562 |   800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    54.89
| epoch 562 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.01 | ppl    55.22
| epoch 562 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.92
| epoch 562 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.20
| epoch 562 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.33
| epoch 562 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.06 | ppl    57.83
| epoch 562 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.67
| epoch 562 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.45
| epoch 562 |  2400/ 

| epoch 567 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.51
| epoch 567 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.59
| epoch 567 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.67 | loss  3.91 | ppl    49.68
| epoch 567 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.99 | loss  4.00 | ppl    54.72
| epoch 567 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.60 | loss  4.02 | ppl    55.88
| epoch 567 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  3.99 | ppl    54.19
-----------------------------------------------------------------------------------------
| end of epoch 567 | time: 26.24s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 568 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.53
| epoch 568 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.08 | ppl    58.90
| epoch 568 |   600/ 2928 batches 

| epoch 573 |   200/ 2928 batches | lr 0.00 | ms/batch  9.47 | loss  4.07 | ppl    58.56
| epoch 573 |   400/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  4.07 | ppl    58.75
| epoch 573 |   600/ 2928 batches | lr 0.00 | ms/batch  9.30 | loss  3.94 | ppl    51.55
| epoch 573 |   800/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  4.00 | ppl    54.76
| epoch 573 |  1000/ 2928 batches | lr 0.00 | ms/batch  9.25 | loss  4.01 | ppl    55.23
| epoch 573 |  1200/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  4.04 | ppl    57.07
| epoch 573 |  1400/ 2928 batches | lr 0.00 | ms/batch  9.23 | loss  4.02 | ppl    55.54
| epoch 573 |  1600/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  4.07 | ppl    58.50
| epoch 573 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.05 | ppl    57.65
| epoch 573 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  4.06 | ppl    57.78
| epoch 573 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.27 | loss  3.90 | ppl    49.29
| epoch 573 |  2400/ 

| epoch 578 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.82
| epoch 578 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.50
| epoch 578 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.44
| epoch 578 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    54.90
| epoch 578 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.02 | ppl    55.56
| epoch 578 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.99 | ppl    54.11
-----------------------------------------------------------------------------------------
| end of epoch 578 | time: 26.06s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 579 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.36
| epoch 579 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.75
| epoch 579 |   600/ 2928 batches 

| epoch 584 |   200/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  4.07 | ppl    58.63
| epoch 584 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.08 | ppl    59.05
| epoch 584 |   600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.94 | ppl    51.46
| epoch 584 |   800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.00 | ppl    54.63
| epoch 584 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.01 | ppl    55.24
| epoch 584 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    56.61
| epoch 584 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.01 | ppl    55.19
| epoch 584 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.80
| epoch 584 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.67
| epoch 584 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.33
| epoch 584 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  3.90 | ppl    49.53
| epoch 584 |  2400/ 

| epoch 589 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.60
| epoch 589 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.06 | ppl    57.76
| epoch 589 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.63
| epoch 589 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.00 | ppl    54.76
| epoch 589 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.50
| epoch 589 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.99 | ppl    53.93
-----------------------------------------------------------------------------------------
| end of epoch 589 | time: 26.17s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 590 |   200/ 2928 batches | lr 0.00 | ms/batch  8.61 | loss  4.07 | ppl    58.50
| epoch 590 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.08 | ppl    58.89
| epoch 590 |   600/ 2928 batches 

| epoch 595 |   200/ 2928 batches | lr 0.00 | ms/batch  9.37 | loss  4.07 | ppl    58.68
| epoch 595 |   400/ 2928 batches | lr 0.00 | ms/batch  9.10 | loss  4.08 | ppl    59.13
| epoch 595 |   600/ 2928 batches | lr 0.00 | ms/batch  9.15 | loss  3.94 | ppl    51.30
| epoch 595 |   800/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.01 | ppl    54.91
| epoch 595 |  1000/ 2928 batches | lr 0.00 | ms/batch  9.79 | loss  4.01 | ppl    55.20
| epoch 595 |  1200/ 2928 batches | lr 0.00 | ms/batch  9.81 | loss  4.04 | ppl    56.60
| epoch 595 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.00 | loss  4.01 | ppl    55.39
| epoch 595 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.07 | ppl    58.79
| epoch 595 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.04 | loss  4.06 | ppl    57.80
| epoch 595 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.02 | loss  4.05 | ppl    57.66
| epoch 595 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.03 | loss  3.90 | ppl    49.42
| epoch 595 |  2400/ 

| epoch 600 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.06 | ppl    57.84
| epoch 600 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.30
| epoch 600 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.47
| epoch 600 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.75
| epoch 600 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.02 | ppl    55.63
| epoch 600 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.99 | ppl    53.95
-----------------------------------------------------------------------------------------
| end of epoch 600 | time: 26.02s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 601 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.32
| epoch 601 |   400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.08 | ppl    58.95
| epoch 601 |   600/ 2928 batches 

| epoch 606 |   200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.07 | ppl    58.70
| epoch 606 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.08 | ppl    58.98
| epoch 606 |   600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.94 | ppl    51.54
| epoch 606 |   800/ 2928 batches | lr 0.00 | ms/batch  9.16 | loss  4.00 | ppl    54.85
| epoch 606 |  1000/ 2928 batches | lr 0.00 | ms/batch  9.92 | loss  4.01 | ppl    55.35
| epoch 606 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  4.04 | ppl    56.89
| epoch 606 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.01 | ppl    55.37
| epoch 606 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.09 | loss  4.08 | ppl    59.01
| epoch 606 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  4.06 | ppl    57.70
| epoch 606 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  4.05 | ppl    57.65
| epoch 606 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.07 | loss  3.90 | ppl    49.48
| epoch 606 |  2400/ 

| epoch 611 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.38
| epoch 611 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.64
| epoch 611 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.39
| epoch 611 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.00 | ppl    54.64
| epoch 611 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.02 | ppl    55.75
| epoch 611 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.99 | ppl    53.91
-----------------------------------------------------------------------------------------
| end of epoch 611 | time: 25.91s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 612 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.54
| epoch 612 |   400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.08 | ppl    58.92
| epoch 612 |   600/ 2928 batches 

| epoch 617 |   200/ 2928 batches | lr 0.00 | ms/batch  9.42 | loss  4.07 | ppl    58.35
| epoch 617 |   400/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  4.07 | ppl    58.63
| epoch 617 |   600/ 2928 batches | lr 0.00 | ms/batch 10.04 | loss  3.94 | ppl    51.39
| epoch 617 |   800/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  4.01 | ppl    55.00
| epoch 617 |  1000/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  4.01 | ppl    55.29
| epoch 617 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.09 | loss  4.04 | ppl    56.64
| epoch 617 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  4.01 | ppl    55.31
| epoch 617 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  4.07 | ppl    58.85
| epoch 617 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.13 | loss  4.05 | ppl    57.68
| epoch 617 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  4.05 | ppl    57.62
| epoch 617 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.15 | loss  3.90 | ppl    49.53
| epoch 617 |  2400/ 

| epoch 622 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.05 | ppl    57.59
| epoch 622 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.56
| epoch 622 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.91 | ppl    49.72
| epoch 622 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.00 | ppl    54.72
| epoch 622 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.02 | ppl    55.68
| epoch 622 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.99 | ppl    54.05
-----------------------------------------------------------------------------------------
| end of epoch 622 | time: 25.92s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 623 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.06 | ppl    58.17
| epoch 623 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    58.96
| epoch 623 |   600/ 2928 batches 

| epoch 628 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.55
| epoch 628 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.81
| epoch 628 |   600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.94 | ppl    51.49
| epoch 628 |   800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.00 | ppl    54.87
| epoch 628 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.35
| epoch 628 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    56.89
| epoch 628 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.01 | ppl    55.24
| epoch 628 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.08 | ppl    58.89
| epoch 628 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.73
| epoch 628 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.05 | ppl    57.12
| epoch 628 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.90 | ppl    49.44
| epoch 628 |  2400/ 

| epoch 633 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.62
| epoch 633 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.06 | ppl    57.76
| epoch 633 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.57
| epoch 633 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.00 | ppl    54.51
| epoch 633 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.02 | ppl    55.95
| epoch 633 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.99 | ppl    54.02
-----------------------------------------------------------------------------------------
| end of epoch 633 | time: 26.02s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 634 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.51
| epoch 634 |   400/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.60
| epoch 634 |   600/ 2928 batches 

| epoch 639 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.50
| epoch 639 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.08 | ppl    59.08
| epoch 639 |   600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.94 | ppl    51.45
| epoch 639 |   800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.00 | ppl    54.73
| epoch 639 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.01 | ppl    55.32
| epoch 639 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.04 | ppl    56.88
| epoch 639 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.26
| epoch 639 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.71
| epoch 639 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.06 | ppl    57.79
| epoch 639 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.05 | ppl    57.49
| epoch 639 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.54
| epoch 639 |  2400/ 

| epoch 644 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.05 | ppl    57.66
| epoch 644 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.06 | ppl    57.79
| epoch 644 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.42
| epoch 644 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.00 | ppl    54.78
| epoch 644 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.02 | ppl    55.67
| epoch 644 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.99 | ppl    53.85
-----------------------------------------------------------------------------------------
| end of epoch 644 | time: 26.19s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 645 |   200/ 2928 batches | lr 0.00 | ms/batch  8.58 | loss  4.06 | ppl    58.22
| epoch 645 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.71
| epoch 645 |   600/ 2928 batches 

| epoch 650 |   200/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.07 | ppl    58.44
| epoch 650 |   400/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.69
| epoch 650 |   600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.94 | ppl    51.62
| epoch 650 |   800/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.00 | ppl    54.85
| epoch 650 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.12
| epoch 650 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.04 | ppl    56.74
| epoch 650 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.59
| epoch 650 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.45
| epoch 650 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.59
| epoch 650 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.65
| epoch 650 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.90 | ppl    49.59
| epoch 650 |  2400/ 

| epoch 655 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.06 | ppl    57.72
| epoch 655 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.61
| epoch 655 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.90 | ppl    49.42
| epoch 655 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.58
| epoch 655 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.37
| epoch 655 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  3.99 | ppl    54.01
-----------------------------------------------------------------------------------------
| end of epoch 655 | time: 25.87s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 656 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.50
| epoch 656 |   400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.08 | ppl    58.91
| epoch 656 |   600/ 2928 batches 

| epoch 661 |   200/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  4.07 | ppl    58.47
| epoch 661 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.74
| epoch 661 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.39
| epoch 661 |   800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.72
| epoch 661 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.02 | ppl    55.42
| epoch 661 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    57.04
| epoch 661 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.01 | ppl    55.22
| epoch 661 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.07 | ppl    58.76
| epoch 661 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.06 | ppl    57.82
| epoch 661 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.58
| epoch 661 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  3.91 | ppl    49.71
| epoch 661 |  2400/ 

| epoch 666 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.66
| epoch 666 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.38
| epoch 666 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.90 | ppl    49.46
| epoch 666 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.00 | ppl    54.55
| epoch 666 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.02 | ppl    55.65
| epoch 666 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.98 | ppl    53.72
-----------------------------------------------------------------------------------------
| end of epoch 666 | time: 26.00s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 667 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.45
| epoch 667 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.72
| epoch 667 |   600/ 2928 batches 

| epoch 672 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.58
| epoch 672 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    58.85
| epoch 672 |   600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  3.94 | ppl    51.45
| epoch 672 |   800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.00 | ppl    54.82
| epoch 672 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.33
| epoch 672 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.04 | ppl    56.90
| epoch 672 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.37
| epoch 672 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.68
| epoch 672 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.61
| epoch 672 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.37
| epoch 672 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.90 | ppl    49.57
| epoch 672 |  2400/ 

| epoch 677 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.63
| epoch 677 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.73
| epoch 677 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.90 | ppl    49.47
| epoch 677 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.00 | ppl    54.80
| epoch 677 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.02 | ppl    55.74
| epoch 677 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.99 | ppl    54.02
-----------------------------------------------------------------------------------------
| end of epoch 677 | time: 25.95s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 678 |   200/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  4.07 | ppl    58.38
| epoch 678 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.37
| epoch 678 |   600/ 2928 batches 

| epoch 683 |   200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.07 | ppl    58.85
| epoch 683 |   400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.08 | ppl    58.99
| epoch 683 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.44
| epoch 683 |   800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.00 | ppl    54.78
| epoch 683 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.30
| epoch 683 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.92
| epoch 683 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.01 | ppl    55.30
| epoch 683 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.69
| epoch 683 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.68
| epoch 683 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.05 | ppl    57.52
| epoch 683 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.90 | ppl    49.57
| epoch 683 |  2400/ 

| epoch 688 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.06 | ppl    57.91
| epoch 688 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.05 | ppl    57.65
| epoch 688 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.91 | ppl    49.71
| epoch 688 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.00 | ppl    54.47
| epoch 688 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.02 | ppl    55.94
| epoch 688 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.99 | ppl    54.26
-----------------------------------------------------------------------------------------
| end of epoch 688 | time: 25.75s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 689 |   200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.66
| epoch 689 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.08 | ppl    58.88
| epoch 689 |   600/ 2928 batches 

| epoch 694 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.62
| epoch 694 |   400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.07 | ppl    58.60
| epoch 694 |   600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.94 | ppl    51.58
| epoch 694 |   800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.00 | ppl    54.67
| epoch 694 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.01 | ppl    55.29
| epoch 694 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.04 | ppl    56.81
| epoch 694 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.33
| epoch 694 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.07 | ppl    58.67
| epoch 694 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.06 | ppl    57.82
| epoch 694 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.53
| epoch 694 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.91 | ppl    49.66
| epoch 694 |  2400/ 

| epoch 699 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.63
| epoch 699 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.05 | ppl    57.32
| epoch 699 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.73 | loss  3.90 | ppl    49.48
| epoch 699 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.81 | loss  4.00 | ppl    54.44
| epoch 699 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.74 | loss  4.02 | ppl    55.46
| epoch 699 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  3.99 | ppl    54.06
-----------------------------------------------------------------------------------------
| end of epoch 699 | time: 26.38s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 700 |   200/ 2928 batches | lr 0.00 | ms/batch  8.61 | loss  4.07 | ppl    58.51
| epoch 700 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.63
| epoch 700 |   600/ 2928 batches 

| epoch 705 |   200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.74
| epoch 705 |   400/ 2928 batches | lr 0.00 | ms/batch  8.75 | loss  4.08 | ppl    58.98
| epoch 705 |   600/ 2928 batches | lr 0.00 | ms/batch  9.32 | loss  3.94 | ppl    51.55
| epoch 705 |   800/ 2928 batches | lr 0.00 | ms/batch 10.69 | loss  4.00 | ppl    54.72
| epoch 705 |  1000/ 2928 batches | lr 0.00 | ms/batch 10.37 | loss  4.01 | ppl    55.38
| epoch 705 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  4.04 | ppl    56.87
| epoch 705 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.15 | loss  4.01 | ppl    55.36
| epoch 705 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.16 | loss  4.07 | ppl    58.78
| epoch 705 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.19 | loss  4.06 | ppl    57.74
| epoch 705 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  4.05 | ppl    57.63
| epoch 705 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.18 | loss  3.90 | ppl    49.42
| epoch 705 |  2400/ 

| epoch 710 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.06 | ppl    57.69
| epoch 710 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.68
| epoch 710 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.91 | ppl    49.69
| epoch 710 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.49
| epoch 710 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.02 | ppl    55.79
| epoch 710 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  3.99 | ppl    53.88
-----------------------------------------------------------------------------------------
| end of epoch 710 | time: 26.64s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 711 |   200/ 2928 batches | lr 0.00 | ms/batch  8.58 | loss  4.07 | ppl    58.41
| epoch 711 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.08 | ppl    58.85
| epoch 711 |   600/ 2928 batches 

| epoch 716 |   200/ 2928 batches | lr 0.00 | ms/batch  8.59 | loss  4.07 | ppl    58.61
| epoch 716 |   400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.75
| epoch 716 |   600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.94 | ppl    51.54
| epoch 716 |   800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    54.90
| epoch 716 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.02 | ppl    55.53
| epoch 716 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.81
| epoch 716 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.33
| epoch 716 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.08 | ppl    58.88
| epoch 716 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.76
| epoch 716 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.51
| epoch 716 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.31
| epoch 716 |  2400/ 

| epoch 721 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.96 | loss  4.06 | ppl    57.92
| epoch 721 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.96 | loss  4.05 | ppl    57.45
| epoch 721 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.03 | loss  3.90 | ppl    49.42
| epoch 721 |  2400/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  4.00 | ppl    54.78
| epoch 721 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  4.02 | ppl    55.60
| epoch 721 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  3.99 | ppl    54.09
-----------------------------------------------------------------------------------------
| end of epoch 721 | time: 28.68s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 722 |   200/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.07 | ppl    58.75
| epoch 722 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.75
| epoch 722 |   600/ 2928 batches 

| epoch 727 |   200/ 2928 batches | lr 0.00 | ms/batch  8.75 | loss  4.07 | ppl    58.52
| epoch 727 |   400/ 2928 batches | lr 0.00 | ms/batch  8.83 | loss  4.08 | ppl    58.96
| epoch 727 |   600/ 2928 batches | lr 0.00 | ms/batch  9.12 | loss  3.94 | ppl    51.64
| epoch 727 |   800/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.01 | ppl    54.91
| epoch 727 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.01 | ppl    55.22
| epoch 727 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.04 | ppl    56.96
| epoch 727 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.16
| epoch 727 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    58.86
| epoch 727 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.05 | ppl    57.64
| epoch 727 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.05 | ppl    57.36
| epoch 727 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.90 | ppl    49.49
| epoch 727 |  2400/ 

| epoch 732 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.86 | loss  4.05 | ppl    57.52
| epoch 732 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.93 | loss  4.05 | ppl    57.39
| epoch 732 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.91 | loss  3.90 | ppl    49.64
| epoch 732 |  2400/ 2928 batches | lr 0.00 | ms/batch 10.88 | loss  4.00 | ppl    54.67
| epoch 732 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.96 | loss  4.03 | ppl    56.02
| epoch 732 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.96 | loss  3.99 | ppl    54.17
-----------------------------------------------------------------------------------------
| end of epoch 732 | time: 33.20s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 733 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.55
| epoch 733 |   400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.07 | ppl    58.84
| epoch 733 |   600/ 2928 batches 

| epoch 738 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.54
| epoch 738 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.08 | ppl    58.93
| epoch 738 |   600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.94 | ppl    51.22
| epoch 738 |   800/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.00 | ppl    54.82
| epoch 738 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.01 | ppl    55.31
| epoch 738 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.04 | ppl    56.59
| epoch 738 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.03
| epoch 738 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.07 | ppl    58.67
| epoch 738 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.05 | ppl    57.28
| epoch 738 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.05 | ppl    57.54
| epoch 738 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.90 | ppl    49.51
| epoch 738 |  2400/ 

| epoch 743 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.06 | ppl    57.85
| epoch 743 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.66
| epoch 743 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.91 | ppl    49.65
| epoch 743 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.00 | ppl    54.48
| epoch 743 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.02 | ppl    55.71
| epoch 743 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.99 | ppl    54.10
-----------------------------------------------------------------------------------------
| end of epoch 743 | time: 26.12s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 744 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.06 | ppl    58.12
| epoch 744 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.07 | ppl    58.62
| epoch 744 |   600/ 2928 batches 

| epoch 749 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.66
| epoch 749 |   400/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.83
| epoch 749 |   600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.94 | ppl    51.23
| epoch 749 |   800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.74
| epoch 749 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.01 | ppl    55.24
| epoch 749 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.04 | ppl    56.72
| epoch 749 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.39
| epoch 749 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.79
| epoch 749 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.78
| epoch 749 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.55
| epoch 749 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.44
| epoch 749 |  2400/ 

| epoch 754 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.05 | ppl    57.63
| epoch 754 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.41
| epoch 754 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.46
| epoch 754 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.85 | loss  4.01 | ppl    54.98
| epoch 754 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.00 | loss  4.02 | ppl    55.69
| epoch 754 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.90 | loss  3.99 | ppl    54.26
-----------------------------------------------------------------------------------------
| end of epoch 754 | time: 26.37s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 755 |   200/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.08 | ppl    58.88
| epoch 755 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.07 | ppl    58.76
| epoch 755 |   600/ 2928 batches 

| epoch 760 |   200/ 2928 batches | lr 0.00 | ms/batch 10.24 | loss  4.07 | ppl    58.42
| epoch 760 |   400/ 2928 batches | lr 0.00 | ms/batch  9.05 | loss  4.08 | ppl    58.94
| epoch 760 |   600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.94 | ppl    51.35
| epoch 760 |   800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    54.89
| epoch 760 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.01 | ppl    55.42
| epoch 760 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.04 | ppl    56.82
| epoch 760 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.49
| epoch 760 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.67
| epoch 760 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.06 | ppl    57.69
| epoch 760 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.68
| epoch 760 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.90 | ppl    49.25
| epoch 760 |  2400/ 

| epoch 765 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.06 | ppl    57.85
| epoch 765 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.61
| epoch 765 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  3.91 | ppl    49.80
| epoch 765 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.40
| epoch 765 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.02 | ppl    55.78
| epoch 765 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.99 | ppl    54.10
-----------------------------------------------------------------------------------------
| end of epoch 765 | time: 26.10s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 766 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.46
| epoch 766 |   400/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.73
| epoch 766 |   600/ 2928 batches 

| epoch 771 |   200/ 2928 batches | lr 0.00 | ms/batch  8.65 | loss  4.07 | ppl    58.56
| epoch 771 |   400/ 2928 batches | lr 0.00 | ms/batch  8.53 | loss  4.08 | ppl    59.09
| epoch 771 |   600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.94 | ppl    51.38
| epoch 771 |   800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.00 | ppl    54.85
| epoch 771 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.38
| epoch 771 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.04 | ppl    57.01
| epoch 771 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.02 | ppl    55.60
| epoch 771 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.07 | ppl    58.63
| epoch 771 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.06 | ppl    57.74
| epoch 771 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.78
| epoch 771 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.90 | ppl    49.43
| epoch 771 |  2400/ 

| epoch 776 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.15 | loss  4.06 | ppl    57.73
| epoch 776 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  4.05 | ppl    57.57
| epoch 776 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.09 | loss  3.90 | ppl    49.55
| epoch 776 |  2400/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  4.00 | ppl    54.63
| epoch 776 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.17 | loss  4.02 | ppl    55.90
| epoch 776 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.03 | loss  3.99 | ppl    54.04
-----------------------------------------------------------------------------------------
| end of epoch 776 | time: 30.22s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 777 |   200/ 2928 batches | lr 0.00 | ms/batch  9.48 | loss  4.07 | ppl    58.44
| epoch 777 |   400/ 2928 batches | lr 0.00 | ms/batch  9.25 | loss  4.07 | ppl    58.80
| epoch 777 |   600/ 2928 batches 

| epoch 782 |   200/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.07 | ppl    58.48
| epoch 782 |   400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.08 | ppl    58.87
| epoch 782 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.43
| epoch 782 |   800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.01 | ppl    54.92
| epoch 782 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.30
| epoch 782 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.04 | ppl    56.69
| epoch 782 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.13
| epoch 782 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.65
| epoch 782 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.61
| epoch 782 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.33
| epoch 782 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.46
| epoch 782 |  2400/ 

| epoch 787 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.06 | ppl    57.70
| epoch 787 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.50 | loss  4.05 | ppl    57.56
| epoch 787 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.90 | ppl    49.59
| epoch 787 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.00 | ppl    54.59
| epoch 787 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.75
| epoch 787 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  3.99 | ppl    54.25
-----------------------------------------------------------------------------------------
| end of epoch 787 | time: 26.11s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 788 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.57
| epoch 788 |   400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.71
| epoch 788 |   600/ 2928 batches 

| epoch 793 |   200/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.07 | ppl    58.53
| epoch 793 |   400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.08 | ppl    58.92
| epoch 793 |   600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.94 | ppl    51.28
| epoch 793 |   800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.00 | ppl    54.81
| epoch 793 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.01 | ppl    55.32
| epoch 793 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.04 | ppl    56.99
| epoch 793 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.02 | ppl    55.58
| epoch 793 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.08 | ppl    58.86
| epoch 793 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.06 | ppl    57.88
| epoch 793 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.66
| epoch 793 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  3.90 | ppl    49.56
| epoch 793 |  2400/ 

| epoch 798 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.06 | ppl    57.93
| epoch 798 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.48
| epoch 798 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.65
| epoch 798 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.00 | ppl    54.41
| epoch 798 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.02 | ppl    55.64
| epoch 798 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.99 | ppl    53.93
-----------------------------------------------------------------------------------------
| end of epoch 798 | time: 25.89s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 799 |   200/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.07 | ppl    58.72
| epoch 799 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.08 | ppl    59.01
| epoch 799 |   600/ 2928 batches 

| epoch 804 |   200/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.07 | ppl    58.49
| epoch 804 |   400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.08 | ppl    58.90
| epoch 804 |   600/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  3.94 | ppl    51.38
| epoch 804 |   800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.01 | ppl    54.97
| epoch 804 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.01 | ppl    55.27
| epoch 804 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.04 | ppl    57.07
| epoch 804 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.16
| epoch 804 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.69
| epoch 804 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.78
| epoch 804 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.05 | ppl    57.45
| epoch 804 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  3.90 | ppl    49.45
| epoch 804 |  2400/ 

| epoch 809 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.35
| epoch 809 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.05 | ppl    57.51
| epoch 809 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.55
| epoch 809 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.39 | loss  4.00 | ppl    54.67
| epoch 809 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  4.02 | ppl    55.82
| epoch 809 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  3.99 | ppl    53.87
-----------------------------------------------------------------------------------------
| end of epoch 809 | time: 27.17s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 810 |   200/ 2928 batches | lr 0.00 | ms/batch  8.58 | loss  4.07 | ppl    58.69
| epoch 810 |   400/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  4.08 | ppl    59.06
| epoch 810 |   600/ 2928 batches 

| epoch 815 |   200/ 2928 batches | lr 0.00 | ms/batch  8.93 | loss  4.07 | ppl    58.74
| epoch 815 |   400/ 2928 batches | lr 0.00 | ms/batch  9.02 | loss  4.07 | ppl    58.70
| epoch 815 |   600/ 2928 batches | lr 0.00 | ms/batch  9.77 | loss  3.94 | ppl    51.35
| epoch 815 |   800/ 2928 batches | lr 0.00 | ms/batch  9.78 | loss  4.00 | ppl    54.67
| epoch 815 |  1000/ 2928 batches | lr 0.00 | ms/batch  9.76 | loss  4.01 | ppl    55.32
| epoch 815 |  1200/ 2928 batches | lr 0.00 | ms/batch  9.76 | loss  4.04 | ppl    56.79
| epoch 815 |  1400/ 2928 batches | lr 0.00 | ms/batch  9.77 | loss  4.01 | ppl    55.37
| epoch 815 |  1600/ 2928 batches | lr 0.00 | ms/batch  9.75 | loss  4.07 | ppl    58.65
| epoch 815 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.74 | loss  4.05 | ppl    57.55
| epoch 815 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.79 | loss  4.05 | ppl    57.58
| epoch 815 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.81 | loss  3.90 | ppl    49.54
| epoch 815 |  2400/ 

| epoch 820 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.78
| epoch 820 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.72
| epoch 820 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.90 | ppl    49.65
| epoch 820 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.52
| epoch 820 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.02 | ppl    55.52
| epoch 820 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.99 | ppl    53.92
-----------------------------------------------------------------------------------------
| end of epoch 820 | time: 26.03s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 821 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.55
| epoch 821 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.08 | ppl    59.07
| epoch 821 |   600/ 2928 batches 

| epoch 826 |   200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.07 | ppl    58.75
| epoch 826 |   400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.08 | ppl    58.95
| epoch 826 |   600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.94 | ppl    51.39
| epoch 826 |   800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.01 | ppl    54.92
| epoch 826 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.01 | ppl    55.41
| epoch 826 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.04 | ppl    56.91
| epoch 826 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.12
| epoch 826 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.73
| epoch 826 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.05 | ppl    57.64
| epoch 826 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.60
| epoch 826 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.90 | ppl    49.58
| epoch 826 |  2400/ 

| epoch 831 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.05 | ppl    57.66
| epoch 831 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.49
| epoch 831 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.91 | ppl    49.73
| epoch 831 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.00 | ppl    54.71
| epoch 831 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.65
| epoch 831 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.99 | ppl    54.15
-----------------------------------------------------------------------------------------
| end of epoch 831 | time: 25.78s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 832 |   200/ 2928 batches | lr 0.00 | ms/batch  9.39 | loss  4.07 | ppl    58.45
| epoch 832 |   400/ 2928 batches | lr 0.00 | ms/batch 10.36 | loss  4.08 | ppl    58.95
| epoch 832 |   600/ 2928 batches 

| epoch 837 |   200/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.07 | ppl    58.61
| epoch 837 |   400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.81
| epoch 837 |   600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.94 | ppl    51.34
| epoch 837 |   800/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.00 | ppl    54.69
| epoch 837 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.01 | ppl    55.24
| epoch 837 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.04 | ppl    56.88
| epoch 837 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.01 | ppl    55.02
| epoch 837 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.71
| epoch 837 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.06 | ppl    57.77
| epoch 837 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.06 | ppl    57.74
| epoch 837 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  3.90 | ppl    49.50
| epoch 837 |  2400/ 

| epoch 842 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    57.88
| epoch 842 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.38
| epoch 842 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  3.90 | ppl    49.40
| epoch 842 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.01 | ppl    55.18
| epoch 842 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.02 | ppl    55.68
| epoch 842 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.99 | ppl    53.96
-----------------------------------------------------------------------------------------
| end of epoch 842 | time: 26.15s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 843 |   200/ 2928 batches | lr 0.00 | ms/batch  8.58 | loss  4.07 | ppl    58.51
| epoch 843 |   400/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.07 | ppl    58.72
| epoch 843 |   600/ 2928 batches 

| epoch 848 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.51
| epoch 848 |   400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.08 | ppl    58.94
| epoch 848 |   600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.94 | ppl    51.49
| epoch 848 |   800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.00 | ppl    54.84
| epoch 848 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.63
| epoch 848 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.04 | ppl    56.83
| epoch 848 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.02 | ppl    55.44
| epoch 848 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.68
| epoch 848 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.06 | ppl    57.84
| epoch 848 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.05 | ppl    57.68
| epoch 848 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.44
| epoch 848 |  2400/ 

| epoch 853 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.64
| epoch 853 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.67
| epoch 853 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  3.90 | ppl    49.65
| epoch 853 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.00 | ppl    54.50
| epoch 853 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.02 | ppl    55.73
| epoch 853 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.99 | ppl    54.09
-----------------------------------------------------------------------------------------
| end of epoch 853 | time: 26.13s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 854 |   200/ 2928 batches | lr 0.00 | ms/batch  8.59 | loss  4.07 | ppl    58.54
| epoch 854 |   400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.67
| epoch 854 |   600/ 2928 batches 

| epoch 859 |   200/ 2928 batches | lr 0.00 | ms/batch  9.45 | loss  4.07 | ppl    58.79
| epoch 859 |   400/ 2928 batches | lr 0.00 | ms/batch  9.29 | loss  4.07 | ppl    58.62
| epoch 859 |   600/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  3.94 | ppl    51.45
| epoch 859 |   800/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.00 | ppl    54.69
| epoch 859 |  1000/ 2928 batches | lr 0.00 | ms/batch  9.30 | loss  4.02 | ppl    55.45
| epoch 859 |  1200/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.04 | ppl    56.74
| epoch 859 |  1400/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.01 | ppl    55.18
| epoch 859 |  1600/ 2928 batches | lr 0.00 | ms/batch  9.28 | loss  4.07 | ppl    58.73
| epoch 859 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.06 | ppl    57.82
| epoch 859 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.22 | loss  4.05 | ppl    57.64
| epoch 859 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.33 | loss  3.90 | ppl    49.57
| epoch 859 |  2400/ 

| epoch 864 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.57
| epoch 864 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.05 | ppl    57.54
| epoch 864 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.46 | loss  3.90 | ppl    49.33
| epoch 864 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.74 | loss  4.00 | ppl    54.62
| epoch 864 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.72 | loss  4.02 | ppl    55.71
| epoch 864 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.71 | loss  3.99 | ppl    53.92
-----------------------------------------------------------------------------------------
| end of epoch 864 | time: 27.03s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 865 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.44
| epoch 865 |   400/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.07 | ppl    58.76
| epoch 865 |   600/ 2928 batches 

| epoch 870 |   200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.06 | ppl    58.21
| epoch 870 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.66
| epoch 870 |   600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.94 | ppl    51.53
| epoch 870 |   800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    54.88
| epoch 870 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.38
| epoch 870 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.04 | ppl    56.79
| epoch 870 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.35
| epoch 870 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.07 | ppl    58.62
| epoch 870 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.68
| epoch 870 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.63
| epoch 870 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.50
| epoch 870 |  2400/ 

| epoch 875 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.96 | loss  4.06 | ppl    57.83
| epoch 875 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.92 | loss  4.05 | ppl    57.60
| epoch 875 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.91 | loss  3.90 | ppl    49.56
| epoch 875 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.93 | loss  4.00 | ppl    54.45
| epoch 875 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.05 | loss  4.02 | ppl    55.92
| epoch 875 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  3.99 | ppl    54.06
-----------------------------------------------------------------------------------------
| end of epoch 875 | time: 29.33s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 876 |   200/ 2928 batches | lr 0.00 | ms/batch  8.57 | loss  4.07 | ppl    58.57
| epoch 876 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.08 | ppl    58.93
| epoch 876 |   600/ 2928 batches 

| epoch 881 |   200/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  4.07 | ppl    58.43
| epoch 881 |   400/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.08 | ppl    58.99
| epoch 881 |   600/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  3.94 | ppl    51.36
| epoch 881 |   800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.01 | ppl    54.93
| epoch 881 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.02 | ppl    55.45
| epoch 881 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  4.04 | ppl    56.90
| epoch 881 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.01 | ppl    55.25
| epoch 881 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.62
| epoch 881 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.06 | ppl    57.70
| epoch 881 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.05 | ppl    57.66
| epoch 881 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  3.91 | ppl    49.71
| epoch 881 |  2400/ 

| epoch 886 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.68
| epoch 886 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.06 | ppl    57.76
| epoch 886 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.90 | ppl    49.65
| epoch 886 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.00 | ppl    54.75
| epoch 886 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.53
| epoch 886 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  3.99 | ppl    53.84
-----------------------------------------------------------------------------------------
| end of epoch 886 | time: 25.79s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 887 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.63
| epoch 887 |   400/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.07 | ppl    58.82
| epoch 887 |   600/ 2928 batches 

| epoch 892 |   200/ 2928 batches | lr 0.00 | ms/batch  8.55 | loss  4.07 | ppl    58.39
| epoch 892 |   400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.68
| epoch 892 |   600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.94 | ppl    51.52
| epoch 892 |   800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.00 | ppl    54.74
| epoch 892 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.01 | ppl    55.40
| epoch 892 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.04 | ppl    56.67
| epoch 892 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.01 | ppl    55.29
| epoch 892 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.08 | ppl    58.93
| epoch 892 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.06 | ppl    57.74
| epoch 892 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.05 | ppl    57.59
| epoch 892 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.91 | ppl    49.79
| epoch 892 |  2400/ 

| epoch 897 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.05 | ppl    57.52
| epoch 897 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.05 | ppl    57.61
| epoch 897 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.91 | ppl    49.74
| epoch 897 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.65 | loss  4.00 | ppl    54.69
| epoch 897 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.76 | loss  4.02 | ppl    55.73
| epoch 897 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.78 | loss  3.99 | ppl    54.11
-----------------------------------------------------------------------------------------
| end of epoch 897 | time: 26.71s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 898 |   200/ 2928 batches | lr 0.00 | ms/batch  8.49 | loss  4.07 | ppl    58.70
| epoch 898 |   400/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.08 | ppl    58.98
| epoch 898 |   600/ 2928 batches 

| epoch 903 |   200/ 2928 batches | lr 0.00 | ms/batch  9.26 | loss  4.07 | ppl    58.50
| epoch 903 |   400/ 2928 batches | lr 0.00 | ms/batch  9.46 | loss  4.08 | ppl    58.91
| epoch 903 |   600/ 2928 batches | lr 0.00 | ms/batch  9.92 | loss  3.94 | ppl    51.63
| epoch 903 |   800/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  4.00 | ppl    54.80
| epoch 903 |  1000/ 2928 batches | lr 0.00 | ms/batch 10.12 | loss  4.01 | ppl    55.24
| epoch 903 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.17 | loss  4.04 | ppl    57.00
| epoch 903 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  4.01 | ppl    55.37
| epoch 903 |  1600/ 2928 batches | lr 0.00 | ms/batch  9.84 | loss  4.07 | ppl    58.63
| epoch 903 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.72 | loss  4.06 | ppl    57.85
| epoch 903 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.82 | loss  4.05 | ppl    57.50
| epoch 903 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.76 | loss  3.90 | ppl    49.61
| epoch 903 |  2400/ 

| epoch 908 |  1800/ 2928 batches | lr 0.00 | ms/batch  9.46 | loss  4.05 | ppl    57.59
| epoch 908 |  2000/ 2928 batches | lr 0.00 | ms/batch  9.67 | loss  4.05 | ppl    57.48
| epoch 908 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.69 | loss  3.90 | ppl    49.49
| epoch 908 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.68 | loss  4.00 | ppl    54.59
| epoch 908 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.68 | loss  4.02 | ppl    55.73
| epoch 908 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.69 | loss  3.99 | ppl    54.20
-----------------------------------------------------------------------------------------
| end of epoch 908 | time: 28.58s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 909 |   200/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.07 | ppl    58.63
| epoch 909 |   400/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.08 | ppl    59.15
| epoch 909 |   600/ 2928 batches 

| epoch 914 |   200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.07 | ppl    58.73
| epoch 914 |   400/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.07 | ppl    58.77
| epoch 914 |   600/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  3.94 | ppl    51.39
| epoch 914 |   800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.00 | ppl    54.83
| epoch 914 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.32
| epoch 914 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.04 | ppl    56.60
| epoch 914 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  4.02 | ppl    55.63
| epoch 914 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.07 | ppl    58.63
| epoch 914 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.06 | ppl    57.80
| epoch 914 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.06 | ppl    58.01
| epoch 914 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  3.90 | ppl    49.58
| epoch 914 |  2400/ 

| epoch 919 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.40
| epoch 919 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.05 | ppl    57.50
| epoch 919 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  3.90 | ppl    49.45
| epoch 919 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.00 | ppl    54.62
| epoch 919 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.02 | ppl    55.75
| epoch 919 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.99 | ppl    54.28
-----------------------------------------------------------------------------------------
| end of epoch 919 | time: 25.77s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 920 |   200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.50
| epoch 920 |   400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.07 | ppl    58.77
| epoch 920 |   600/ 2928 batches 

| epoch 925 |   200/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.07 | ppl    58.65
| epoch 925 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.07 | ppl    58.82
| epoch 925 |   600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  3.94 | ppl    51.51
| epoch 925 |   800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.00 | ppl    54.50
| epoch 925 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.01 | ppl    55.21
| epoch 925 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.04 | ppl    56.69
| epoch 925 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.01 | ppl    55.36
| epoch 925 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.07 | ppl    58.43
| epoch 925 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.06 | ppl    57.78
| epoch 925 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.29
| epoch 925 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.90 | ppl    49.52
| epoch 925 |  2400/ 

| epoch 930 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.62
| epoch 930 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.05 | ppl    57.54
| epoch 930 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.90 | ppl    49.47
| epoch 930 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  4.00 | ppl    54.56
| epoch 930 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.78
| epoch 930 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  3.99 | ppl    53.81
-----------------------------------------------------------------------------------------
| end of epoch 930 | time: 25.84s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 931 |   200/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.07 | ppl    58.46
| epoch 931 |   400/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  4.07 | ppl    58.74
| epoch 931 |   600/ 2928 batches 

| epoch 936 |   200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.51
| epoch 936 |   400/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.08 | ppl    58.98
| epoch 936 |   600/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.94 | ppl    51.22
| epoch 936 |   800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.00 | ppl    54.87
| epoch 936 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.01 | ppl    55.25
| epoch 936 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.04 | ppl    56.92
| epoch 936 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.02 | ppl    55.56
| epoch 936 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.08 | ppl    59.02
| epoch 936 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.68
| epoch 936 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.62
| epoch 936 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  3.91 | ppl    49.73
| epoch 936 |  2400/ 

| epoch 941 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.55
| epoch 941 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.05 | ppl    57.24
| epoch 941 |  2200/ 2928 batches | lr 0.00 | ms/batch  9.48 | loss  3.90 | ppl    49.45
| epoch 941 |  2400/ 2928 batches | lr 0.00 | ms/batch  9.69 | loss  4.00 | ppl    54.51
| epoch 941 |  2600/ 2928 batches | lr 0.00 | ms/batch  9.69 | loss  4.02 | ppl    55.78
| epoch 941 |  2800/ 2928 batches | lr 0.00 | ms/batch  9.67 | loss  3.99 | ppl    54.00
-----------------------------------------------------------------------------------------
| end of epoch 941 | time: 27.06s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 942 |   200/ 2928 batches | lr 0.00 | ms/batch  9.30 | loss  4.07 | ppl    58.68
| epoch 942 |   400/ 2928 batches | lr 0.00 | ms/batch 10.68 | loss  4.08 | ppl    58.88
| epoch 942 |   600/ 2928 batches 

| epoch 947 |   200/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.07 | ppl    58.67
| epoch 947 |   400/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.07 | ppl    58.51
| epoch 947 |   600/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.94 | ppl    51.36
| epoch 947 |   800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.01 | ppl    55.02
| epoch 947 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.32 | loss  4.01 | ppl    55.41
| epoch 947 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.04 | ppl    56.94
| epoch 947 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.01 | ppl    55.09
| epoch 947 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.08 | ppl    58.89
| epoch 947 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.05 | ppl    57.34
| epoch 947 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.05 | ppl    57.48
| epoch 947 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.91 | ppl    49.75
| epoch 947 |  2400/ 

| epoch 952 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.53
| epoch 952 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.05 | ppl    57.47
| epoch 952 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  3.90 | ppl    49.52
| epoch 952 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.00 | ppl    54.57
| epoch 952 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.47 | loss  4.02 | ppl    55.84
| epoch 952 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.99 | ppl    53.95
-----------------------------------------------------------------------------------------
| end of epoch 952 | time: 26.13s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 953 |   200/ 2928 batches | lr 0.00 | ms/batch  9.22 | loss  4.07 | ppl    58.60
| epoch 953 |   400/ 2928 batches | lr 0.00 | ms/batch 11.00 | loss  4.08 | ppl    58.92
| epoch 953 |   600/ 2928 batches 

| epoch 958 |   200/ 2928 batches | lr 0.00 | ms/batch  8.80 | loss  4.07 | ppl    58.55
| epoch 958 |   400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.07 | ppl    58.82
| epoch 958 |   600/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  3.94 | ppl    51.47
| epoch 958 |   800/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.01 | ppl    54.91
| epoch 958 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.01 | ppl    55.34
| epoch 958 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.04 | ppl    57.05
| epoch 958 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.40 | loss  4.01 | ppl    55.19
| epoch 958 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.08 | ppl    58.86
| epoch 958 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.06 | ppl    57.77
| epoch 958 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.39 | loss  4.05 | ppl    57.56
| epoch 958 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.38 | loss  3.90 | ppl    49.38
| epoch 958 |  2400/ 

| epoch 963 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.05 | ppl    57.66
| epoch 963 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.06 | ppl    57.69
| epoch 963 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.51 | loss  3.90 | ppl    49.46
| epoch 963 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.00 | ppl    54.50
| epoch 963 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.02 | ppl    55.75
| epoch 963 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.52 | loss  3.98 | ppl    53.72
-----------------------------------------------------------------------------------------
| end of epoch 963 | time: 26.21s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 964 |   200/ 2928 batches | lr 0.00 | ms/batch  8.46 | loss  4.07 | ppl    58.49
| epoch 964 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.73
| epoch 964 |   600/ 2928 batches 

| epoch 969 |   200/ 2928 batches | lr 0.00 | ms/batch  8.86 | loss  4.07 | ppl    58.56
| epoch 969 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.70
| epoch 969 |   600/ 2928 batches | lr 0.00 | ms/batch  8.69 | loss  3.94 | ppl    51.55
| epoch 969 |   800/ 2928 batches | lr 0.00 | ms/batch  9.93 | loss  4.00 | ppl    54.82
| epoch 969 |  1000/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.01 | ppl    55.26
| epoch 969 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.02 | loss  4.05 | ppl    57.13
| epoch 969 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  4.01 | ppl    55.22
| epoch 969 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.10 | loss  4.08 | ppl    59.03
| epoch 969 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.14 | loss  4.06 | ppl    57.73
| epoch 969 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  4.05 | ppl    57.58
| epoch 969 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  3.91 | ppl    49.71
| epoch 969 |  2400/ 

| epoch 974 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.05 | ppl    57.61
| epoch 974 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  4.05 | ppl    57.49
| epoch 974 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.35 | loss  3.91 | ppl    49.70
| epoch 974 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.36 | loss  4.00 | ppl    54.50
| epoch 974 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.02 | ppl    55.60
| epoch 974 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.99 | ppl    54.27
-----------------------------------------------------------------------------------------
| end of epoch 974 | time: 25.83s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 975 |   200/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.07 | ppl    58.64
| epoch 975 |   400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.08 | ppl    59.04
| epoch 975 |   600/ 2928 batches 

| epoch 980 |   200/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.07 | ppl    58.63
| epoch 980 |   400/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  4.07 | ppl    58.73
| epoch 980 |   600/ 2928 batches | lr 0.00 | ms/batch  8.30 | loss  3.94 | ppl    51.27
| epoch 980 |   800/ 2928 batches | lr 0.00 | ms/batch  8.29 | loss  4.00 | ppl    54.81
| epoch 980 |  1000/ 2928 batches | lr 0.00 | ms/batch  8.29 | loss  4.01 | ppl    55.21
| epoch 980 |  1200/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.04 | ppl    57.01
| epoch 980 |  1400/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.01 | ppl    55.41
| epoch 980 |  1600/ 2928 batches | lr 0.00 | ms/batch  8.34 | loss  4.07 | ppl    58.47
| epoch 980 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.31 | loss  4.06 | ppl    57.84
| epoch 980 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  4.05 | ppl    57.60
| epoch 980 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.33 | loss  3.90 | ppl    49.60
| epoch 980 |  2400/ 

| epoch 985 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.05 | ppl    57.62
| epoch 985 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.41 | loss  4.05 | ppl    57.54
| epoch 985 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  3.91 | ppl    49.77
| epoch 985 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.86 | loss  4.00 | ppl    54.60
| epoch 985 |  2600/ 2928 batches | lr 0.00 | ms/batch 10.08 | loss  4.02 | ppl    55.75
| epoch 985 |  2800/ 2928 batches | lr 0.00 | ms/batch 10.11 | loss  3.99 | ppl    54.08
-----------------------------------------------------------------------------------------
| end of epoch 985 | time: 27.00s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 986 |   200/ 2928 batches | lr 0.00 | ms/batch  8.56 | loss  4.07 | ppl    58.75
| epoch 986 |   400/ 2928 batches | lr 0.00 | ms/batch  8.42 | loss  4.07 | ppl    58.58
| epoch 986 |   600/ 2928 batches 

| epoch 991 |   200/ 2928 batches | lr 0.00 | ms/batch  9.32 | loss  4.07 | ppl    58.31
| epoch 991 |   400/ 2928 batches | lr 0.00 | ms/batch  9.14 | loss  4.08 | ppl    58.86
| epoch 991 |   600/ 2928 batches | lr 0.00 | ms/batch  9.22 | loss  3.94 | ppl    51.58
| epoch 991 |   800/ 2928 batches | lr 0.00 | ms/batch  9.23 | loss  4.00 | ppl    54.82
| epoch 991 |  1000/ 2928 batches | lr 0.00 | ms/batch  9.43 | loss  4.01 | ppl    55.29
| epoch 991 |  1200/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.04 | ppl    56.82
| epoch 991 |  1400/ 2928 batches | lr 0.00 | ms/batch 10.00 | loss  4.01 | ppl    55.28
| epoch 991 |  1600/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  4.07 | ppl    58.75
| epoch 991 |  1800/ 2928 batches | lr 0.00 | ms/batch 10.01 | loss  4.05 | ppl    57.61
| epoch 991 |  2000/ 2928 batches | lr 0.00 | ms/batch 10.02 | loss  4.06 | ppl    57.75
| epoch 991 |  2200/ 2928 batches | lr 0.00 | ms/batch 10.06 | loss  3.91 | ppl    49.74
| epoch 991 |  2400/ 

| epoch 996 |  1800/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.05 | ppl    57.66
| epoch 996 |  2000/ 2928 batches | lr 0.00 | ms/batch  8.45 | loss  4.05 | ppl    57.61
| epoch 996 |  2200/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  3.90 | ppl    49.64
| epoch 996 |  2400/ 2928 batches | lr 0.00 | ms/batch  8.48 | loss  4.00 | ppl    54.50
| epoch 996 |  2600/ 2928 batches | lr 0.00 | ms/batch  8.44 | loss  4.03 | ppl    55.99
| epoch 996 |  2800/ 2928 batches | lr 0.00 | ms/batch  8.43 | loss  3.99 | ppl    54.04
-----------------------------------------------------------------------------------------
| end of epoch 996 | time: 26.17s | valid loss  5.54 | valid ppl   253.63
-----------------------------------------------------------------------------------------
| epoch 997 |   200/ 2928 batches | lr 0.00 | ms/batch  8.54 | loss  4.07 | ppl    58.67
| epoch 997 |   400/ 2928 batches | lr 0.00 | ms/batch  8.37 | loss  4.08 | ppl    58.85
| epoch 997 |   600/ 2928 batches 