In [64]:
### Necessary Library

import os
import torch
import math

In [65]:
#######################################################################################
################################# LSTM MODEL ##########################################
#######################################################################################

import torch.nn as nn


## Creating a class of nn.Module to include RNN with LSTM Units
class LSTM_Model(nn.Module):
    
    def __init__(self, vocab_size, embedding_size, hidden_size, batch_size = 1, n_layers = 1):
        
        super(LSTM_Model, self).__init__()
        
        ## Creating an embedding object to create 'embedding_size' dimensional encoding
        self.encoder = nn.Embedding(vocab_size,embedding_size)
        
        self.lstm = nn.LSTM(embedding_size,hidden_size,n_layers)
        
        ## The output size is assigned as hidden_size * vocab_size 
        ## (Since the output is one word from the entire vocab)
        self.linear_output = nn.Linear(hidden_size,vocab_size)
        
        self.init_weights()
        
        ## More variables
        self.hidden_size = hidden_size
        self.n_layers = n_layers
    
        
    def init_weights(self):
        random_range = 0.1
        
        ## Setting random values for different layers
        self.encoder.weight.data.uniform_(-random_range, random_range)
        
        self.linear_output.bias.data.zero_()
        self.linear_output.weight.data.uniform_(-random_range, random_range)

    # Defining the forward layer    
    def forward(self, input, hidden):
        
        #Passing through the embedding layer
        embedding_input = self.encoder(input)
        
        lstm_out, hidden = self.lstm(embedding_input, hidden)
        
        decoded = self.linear_output(lstm_out.view(lstm_out.size(0)*lstm_out.size(1), lstm_out.size(2)))
        
        return decoded.view(lstm_out.size(0), lstm_out.size(1), decoded.size(1)), hidden

    
    def init_hidden(self, batch_size):
        
        weight = next(self.parameters())
        
        return (weight.new_zeros(self.n_layers, batch_size, self.hidden_size), \
                    weight.new_zeros(self.n_layers, batch_size, self.hidden_size))
    
######################################################################################
######################################################################################

In [66]:
################################## FUNCTIONS TO READ THE DATA ########################

# Dictionary class to create word2idx and idx2word objects 
#(Mapping for a word to an id and vice versa)

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)
    

    
#### Tokenize function to convert text file to torch idx file #####################

def tokenize(path):
    """Tokenizes a text file."""
    assert os.path.exists(path)
    # Add words to the dictionary
    
    with open(path, 'r', encoding="utf8") as f:
        
        
        tokens = 0
        for line in f:
            words = line.split()
            tokens += len(words)
            for word in words:
                dictionary.add_word(word)

    # Tokenize file content
    with open(path, 'r', encoding="utf8") as f:
        ids = torch.LongTensor(tokens)
        token = 0
        for line in f:
            words = line.split()
            for word in words:
                ids[token] = dictionary.word2idx[word]
                token += 1

    return ids

######################################################################################
######################################################################################

In [126]:
########################## Additional Functions for training the model ##############

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


#### Separate data into batch sizes

def batch_split(data, pos, bs = mini_batch_size):
    
    nbatch = data.size(0) // mini_batch_size
    
    data = data.narrow(0, 0, nbatch * mini_batch_size)

    # Evenly divide the data across the mini_batch_size batches.
    data = data.view(mini_batch_size, -1).t().contiguous().to(device)
    
    
    in_data = data[pos:pos+bs]
    out_data = data[(pos+1):(pos+bs+1)].view(-1)
    
    return in_data,out_data
    
    
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [127]:
# def batchify(data, bsz):
#     # Work out how cleanly we can divide the dataset into bsz parts.
#     nbatch = data.size(0) // bsz
#     # Trim off any extra elements that wouldn't cleanly fit (remainders).
#     data = data.narrow(0, 0, nbatch * bsz)
#     # Evenly divide the data across the bsz batches.
#     data = data.view(bsz, -1).t().contiguous()
#     return data.to(device)

In [185]:
def train():
    
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    #start_time = time.time()
    
    hidden = model.init_hidden(mini_batch_size)
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, mini_batch_size)):
        
        
        data, targets = batch_split(train_data, i,mini_batch_size)
       
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        
        model.zero_grad()
        
        ## Calling the forward pass of the model 
        output, hidden = model(data, hidden)
        
        ## Calculating the loss
        loss = criterion(output.view(-1, vocab_size), targets)
        
        ## Computing gradient and optimizing
        loss.backward()
        optimizer.step()
        

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip_val)
        
        
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            #elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // mini_batch_size, lr,\
                    #elapsed * 1000 / args.log_interval, 
                        cur_loss, math.exp(cur_loss)))
            total_loss = 0
            #start_time = time.time()

In [202]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    
    hidden = model.init_hidden(eval_batch_size)
    
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, eval_batch_size):
            
            data, targets = batch_split(data_source, i,eval_batch_size)
            
            output, hidden = model(data, hidden)
            
            output_flat = output.view(-1, vocab_size)
                      
            total_loss +=  criterion(output_flat, targets).item()
            
            hidden = repackage_hidden(hidden)
            
    return total_loss / (len(data_source) - 1)

#### Reading the data

In [146]:
#Initializing dictionary object
dictionary = Dictionary()

train_data = tokenize('trn-wiki.txt')
val_data = tokenize('dev-wiki.txt')

In [192]:
train_data = train_data[0:320]
#val_data = val_data[0:320]
val_data = train_data[0:320]

### Input Arguments

In [204]:
#vocab_size = len(dictionary)

vocab_size = len(set(train_data))

embedding_size = 32 
hidden_size = 32
mini_batch_size = 1
eval_batch_size = mini_batch_size
n_layers = 1 # For single layer LSTM

##### Creating the model object from the LSTM_Model in the models.py

model = LSTM_Model(vocab_size, embedding_size, hidden_size) ## LSTM Model

optimizer = torch.optim.SGD(model.parameters(), lr=0.1) ## SGD Optimizer
criterion = nn.CrossEntropyLoss() ## Crossentropy Loss function


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


#### Model parameters 

lr = 10 # Learning rate
epochs = 20 #
gradient_clip_val = 1
log_interval = 1

ntokens = vocab_size

In [205]:
### Running the data over different epochs
for epoch in range(1, epochs+1):
    #epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print(val_loss)

| epoch   1 |     1/  320 batches | lr 10.00 | loss 16.04 | ppl 9289838.13
| epoch   1 |     2/  320 batches | lr 10.00 | loss  6.81 | ppl   904.19
| epoch   1 |     3/  320 batches | lr 10.00 | loss 11.49 | ppl 97697.35
| epoch   1 |     4/  320 batches | lr 10.00 | loss  6.56 | ppl   707.29
| epoch   1 |     5/  320 batches | lr 10.00 | loss  9.67 | ppl 15766.12
| epoch   1 |     6/  320 batches | lr 10.00 | loss  6.52 | ppl   676.76
| epoch   1 |     7/  320 batches | lr 10.00 | loss  9.87 | ppl 19259.15
| epoch   1 |     8/  320 batches | lr 10.00 | loss  6.50 | ppl   664.65
| epoch   1 |     9/  320 batches | lr 10.00 | loss  7.40 | ppl  1628.46
| epoch   1 |    10/  320 batches | lr 10.00 | loss  9.19 | ppl  9813.31
| epoch   1 |    11/  320 batches | lr 10.00 | loss  7.19 | ppl  1329.14
| epoch   1 |    12/  320 batches | lr 10.00 | loss  8.40 | ppl  4433.70
| epoch   1 |    13/  320 batches | lr 10.00 | loss  6.78 | ppl   876.06
| epoch   1 |    14/  320 batches | lr 10.00 | lo

| epoch   1 |   189/  320 batches | lr 10.00 | loss  9.12 | ppl  9167.87
| epoch   1 |   190/  320 batches | lr 10.00 | loss  6.85 | ppl   948.60
| epoch   1 |   191/  320 batches | lr 10.00 | loss  8.91 | ppl  7426.31
| epoch   1 |   192/  320 batches | lr 10.00 | loss  7.87 | ppl  2605.73
| epoch   1 |   193/  320 batches | lr 10.00 | loss  7.46 | ppl  1737.99
| epoch   1 |   194/  320 batches | lr 10.00 | loss  7.80 | ppl  2441.50
| epoch   1 |   195/  320 batches | lr 10.00 | loss  6.64 | ppl   767.89
| epoch   1 |   196/  320 batches | lr 10.00 | loss  8.15 | ppl  3475.69
| epoch   1 |   197/  320 batches | lr 10.00 | loss  5.97 | ppl   391.30
| epoch   1 |   198/  320 batches | lr 10.00 | loss  7.86 | ppl  2598.81
| epoch   1 |   199/  320 batches | lr 10.00 | loss  7.12 | ppl  1241.33
| epoch   1 |   200/  320 batches | lr 10.00 | loss  8.10 | ppl  3284.95
| epoch   1 |   201/  320 batches | lr 10.00 | loss 12.37 | ppl 235568.20
| epoch   1 |   202/  320 batches | lr 10.00 | los

6.842583348384845
| epoch   2 |     1/  320 batches | lr 10.00 | loss 15.89 | ppl 7976168.59
| epoch   2 |     2/  320 batches | lr 10.00 | loss  7.97 | ppl  2893.18
| epoch   2 |     3/  320 batches | lr 10.00 | loss  8.75 | ppl  6305.84
| epoch   2 |     4/  320 batches | lr 10.00 | loss  6.17 | ppl   475.85
| epoch   2 |     5/  320 batches | lr 10.00 | loss  4.16 | ppl    64.10
| epoch   2 |     6/  320 batches | lr 10.00 | loss 11.16 | ppl 69990.89
| epoch   2 |     7/  320 batches | lr 10.00 | loss  8.69 | ppl  5926.05
| epoch   2 |     8/  320 batches | lr 10.00 | loss  7.58 | ppl  1952.11
| epoch   2 |     9/  320 batches | lr 10.00 | loss  6.60 | ppl   736.67
| epoch   2 |    10/  320 batches | lr 10.00 | loss  8.29 | ppl  3984.39
| epoch   2 |    11/  320 batches | lr 10.00 | loss 17.70 | ppl 48625070.55
| epoch   2 |    12/  320 batches | lr 10.00 | loss  7.35 | ppl  1551.08
| epoch   2 |    13/  320 batches | lr 10.00 | loss  6.71 | ppl   817.84
| epoch   2 |    14/  320 ba

| epoch   2 |   186/  320 batches | lr 10.00 | loss  7.31 | ppl  1492.47
| epoch   2 |   187/  320 batches | lr 10.00 | loss  8.63 | ppl  5577.62
| epoch   2 |   188/  320 batches | lr 10.00 | loss 11.54 | ppl 102996.76
| epoch   2 |   189/  320 batches | lr 10.00 | loss 10.22 | ppl 27476.95
| epoch   2 |   190/  320 batches | lr 10.00 | loss 19.64 | ppl 338958398.48
| epoch   2 |   191/  320 batches | lr 10.00 | loss 12.51 | ppl 272034.27
| epoch   2 |   192/  320 batches | lr 10.00 | loss  7.57 | ppl  1936.57
| epoch   2 |   193/  320 batches | lr 10.00 | loss  7.44 | ppl  1700.90
| epoch   2 |   194/  320 batches | lr 10.00 | loss 12.25 | ppl 209114.86
| epoch   2 |   195/  320 batches | lr 10.00 | loss  6.59 | ppl   727.85
| epoch   2 |   196/  320 batches | lr 10.00 | loss  8.36 | ppl  4278.46
| epoch   2 |   197/  320 batches | lr 10.00 | loss 10.46 | ppl 34731.73
| epoch   2 |   198/  320 batches | lr 10.00 | loss  7.97 | ppl  2902.60
| epoch   2 |   199/  320 batches | lr 10.00

7.409538397594679
| epoch   3 |     1/  320 batches | lr 10.00 | loss 14.39 | ppl 1768878.26
| epoch   3 |     2/  320 batches | lr 10.00 | loss  9.77 | ppl 17470.76
| epoch   3 |     3/  320 batches | lr 10.00 | loss  4.98 | ppl   145.33
| epoch   3 |     4/  320 batches | lr 10.00 | loss 13.25 | ppl 568587.11
| epoch   3 |     5/  320 batches | lr 10.00 | loss 14.93 | ppl 3045536.93
| epoch   3 |     6/  320 batches | lr 10.00 | loss  5.38 | ppl   217.20
| epoch   3 |     7/  320 batches | lr 10.00 | loss  8.00 | ppl  2981.27
| epoch   3 |     8/  320 batches | lr 10.00 | loss  6.92 | ppl  1011.84
| epoch   3 |     9/  320 batches | lr 10.00 | loss  6.88 | ppl   969.09
| epoch   3 |    10/  320 batches | lr 10.00 | loss 10.56 | ppl 38576.00
| epoch   3 |    11/  320 batches | lr 10.00 | loss  9.91 | ppl 20117.58
| epoch   3 |    12/  320 batches | lr 10.00 | loss  9.69 | ppl 16091.30
| epoch   3 |    13/  320 batches | lr 10.00 | loss 12.08 | ppl 175605.36
| epoch   3 |    14/  320 b

| epoch   3 |   177/  320 batches | lr 10.00 | loss  8.62 | ppl  5522.47
| epoch   3 |   178/  320 batches | lr 10.00 | loss 15.55 | ppl 5671830.88
| epoch   3 |   179/  320 batches | lr 10.00 | loss 10.10 | ppl 24236.60
| epoch   3 |   180/  320 batches | lr 10.00 | loss  7.48 | ppl  1766.17
| epoch   3 |   181/  320 batches | lr 10.00 | loss  7.79 | ppl  2418.02
| epoch   3 |   182/  320 batches | lr 10.00 | loss  8.29 | ppl  3966.82
| epoch   3 |   183/  320 batches | lr 10.00 | loss  3.52 | ppl    33.90
| epoch   3 |   184/  320 batches | lr 10.00 | loss  4.92 | ppl   137.37
| epoch   3 |   185/  320 batches | lr 10.00 | loss  9.75 | ppl 17198.64
| epoch   3 |   186/  320 batches | lr 10.00 | loss 16.23 | ppl 11156654.48
| epoch   3 |   187/  320 batches | lr 10.00 | loss 13.55 | ppl 765267.91
| epoch   3 |   188/  320 batches | lr 10.00 | loss  9.09 | ppl  8903.33
| epoch   3 |   189/  320 batches | lr 10.00 | loss 15.29 | ppl 4358109.93
| epoch   3 |   190/  320 batches | lr 10.0

8.481808314129102
| epoch   4 |     1/  320 batches | lr 10.00 | loss 15.17 | ppl 3872625.00
| epoch   4 |     2/  320 batches | lr 10.00 | loss 17.37 | ppl 35060419.03
| epoch   4 |     3/  320 batches | lr 10.00 | loss 10.92 | ppl 55493.42
| epoch   4 |     4/  320 batches | lr 10.00 | loss  6.52 | ppl   678.58
| epoch   4 |     5/  320 batches | lr 10.00 | loss  9.78 | ppl 17735.97
| epoch   4 |     6/  320 batches | lr 10.00 | loss 11.61 | ppl 110176.45
| epoch   4 |     7/  320 batches | lr 10.00 | loss 13.65 | ppl 847399.98
| epoch   4 |     8/  320 batches | lr 10.00 | loss  7.19 | ppl  1323.24
| epoch   4 |     9/  320 batches | lr 10.00 | loss  4.34 | ppl    76.64
| epoch   4 |    10/  320 batches | lr 10.00 | loss  8.92 | ppl  7497.12
| epoch   4 |    11/  320 batches | lr 10.00 | loss 22.66 | ppl 6908518954.90
| epoch   4 |    12/  320 batches | lr 10.00 | loss  5.62 | ppl   275.84
| epoch   4 |    13/  320 batches | lr 10.00 | loss  9.80 | ppl 17973.29
| epoch   4 |    14/ 

| epoch   4 |   159/  320 batches | lr 10.00 | loss  8.03 | ppl  3083.30
| epoch   4 |   160/  320 batches | lr 10.00 | loss  6.21 | ppl   495.52
| epoch   4 |   161/  320 batches | lr 10.00 | loss 10.14 | ppl 25220.40
| epoch   4 |   162/  320 batches | lr 10.00 | loss  7.58 | ppl  1963.55
| epoch   4 |   163/  320 batches | lr 10.00 | loss  8.40 | ppl  4441.38
| epoch   4 |   164/  320 batches | lr 10.00 | loss  7.19 | ppl  1331.17
| epoch   4 |   165/  320 batches | lr 10.00 | loss  7.62 | ppl  2041.89
| epoch   4 |   166/  320 batches | lr 10.00 | loss 16.02 | ppl 9047626.36
| epoch   4 |   167/  320 batches | lr 10.00 | loss  8.16 | ppl  3493.81
| epoch   4 |   168/  320 batches | lr 10.00 | loss  7.72 | ppl  2249.11
| epoch   4 |   169/  320 batches | lr 10.00 | loss 15.47 | ppl 5240021.14
| epoch   4 |   170/  320 batches | lr 10.00 | loss 11.47 | ppl 95612.57
| epoch   4 |   171/  320 batches | lr 10.00 | loss 11.52 | ppl 100787.55
| epoch   4 |   172/  320 batches | lr 10.00 |

7.659954254903764
| epoch   5 |     1/  320 batches | lr 10.00 | loss 18.02 | ppl 66902912.00
| epoch   5 |     2/  320 batches | lr 10.00 | loss 15.83 | ppl 7523477.40
| epoch   5 |     3/  320 batches | lr 10.00 | loss  7.27 | ppl  1436.97
| epoch   5 |     4/  320 batches | lr 10.00 | loss  8.03 | ppl  3065.38
| epoch   5 |     5/  320 batches | lr 10.00 | loss 10.98 | ppl 58715.40
| epoch   5 |     6/  320 batches | lr 10.00 | loss  7.35 | ppl  1563.28
| epoch   5 |     7/  320 batches | lr 10.00 | loss  8.48 | ppl  4809.75
| epoch   5 |     8/  320 batches | lr 10.00 | loss  7.76 | ppl  2344.19
| epoch   5 |     9/  320 batches | lr 10.00 | loss  1.40 | ppl     4.04
| epoch   5 |    10/  320 batches | lr 10.00 | loss  8.07 | ppl  3187.81
| epoch   5 |    11/  320 batches | lr 10.00 | loss 21.19 | ppl 1587007198.20
| epoch   5 |    12/  320 batches | lr 10.00 | loss  7.81 | ppl  2476.59
| epoch   5 |    13/  320 batches | lr 10.00 | loss 12.68 | ppl 320593.40
| epoch   5 |    14/  

| epoch   5 |   159/  320 batches | lr 10.00 | loss 12.29 | ppl 217862.27
| epoch   5 |   160/  320 batches | lr 10.00 | loss  7.28 | ppl  1446.24
| epoch   5 |   161/  320 batches | lr 10.00 | loss  6.26 | ppl   525.45
| epoch   5 |   162/  320 batches | lr 10.00 | loss 10.17 | ppl 26077.87
| epoch   5 |   163/  320 batches | lr 10.00 | loss 10.35 | ppl 31365.81
| epoch   5 |   164/  320 batches | lr 10.00 | loss  6.52 | ppl   675.98
| epoch   5 |   165/  320 batches | lr 10.00 | loss  9.32 | ppl 11132.07
| epoch   5 |   166/  320 batches | lr 10.00 | loss  7.77 | ppl  2370.13
| epoch   5 |   167/  320 batches | lr 10.00 | loss  7.51 | ppl  1831.06
| epoch   5 |   168/  320 batches | lr 10.00 | loss 12.90 | ppl 400768.13
| epoch   5 |   169/  320 batches | lr 10.00 | loss  7.44 | ppl  1697.54
| epoch   5 |   170/  320 batches | lr 10.00 | loss  5.41 | ppl   223.19
| epoch   5 |   171/  320 batches | lr 10.00 | loss  6.72 | ppl   826.89
| epoch   5 |   172/  320 batches | lr 10.00 | lo

| epoch   5 |   318/  320 batches | lr 10.00 | loss  7.09 | ppl  1203.35
8.194782484287753
| epoch   6 |     1/  320 batches | lr 10.00 | loss 21.48 | ppl 2123810298.44
| epoch   6 |     2/  320 batches | lr 10.00 | loss 15.09 | ppl 3570188.82
| epoch   6 |     3/  320 batches | lr 10.00 | loss 15.17 | ppl 3868025.98
| epoch   6 |     4/  320 batches | lr 10.00 | loss  4.93 | ppl   138.43
| epoch   6 |     5/  320 batches | lr 10.00 | loss 10.24 | ppl 27884.21
| epoch   6 |     6/  320 batches | lr 10.00 | loss 14.89 | ppl 2933549.36
| epoch   6 |     7/  320 batches | lr 10.00 | loss  8.20 | ppl  3644.24
| epoch   6 |     8/  320 batches | lr 10.00 | loss  8.60 | ppl  5412.35
| epoch   6 |     9/  320 batches | lr 10.00 | loss 18.64 | ppl 124968687.14
| epoch   6 |    10/  320 batches | lr 10.00 | loss  4.85 | ppl   127.48
| epoch   6 |    11/  320 batches | lr 10.00 | loss 28.00 | ppl 1441066597182.71
| epoch   6 |    12/  320 batches | lr 10.00 | loss  9.06 | ppl  8563.03
| epoch   

| epoch   6 |   166/  320 batches | lr 10.00 | loss  6.74 | ppl   846.02
| epoch   6 |   167/  320 batches | lr 10.00 | loss  8.39 | ppl  4422.34
| epoch   6 |   168/  320 batches | lr 10.00 | loss  5.59 | ppl   268.35
| epoch   6 |   169/  320 batches | lr 10.00 | loss 10.78 | ppl 48236.66
| epoch   6 |   170/  320 batches | lr 10.00 | loss  4.85 | ppl   128.11
| epoch   6 |   171/  320 batches | lr 10.00 | loss  4.03 | ppl    56.15
| epoch   6 |   172/  320 batches | lr 10.00 | loss 13.09 | ppl 484818.49
| epoch   6 |   173/  320 batches | lr 10.00 | loss 10.44 | ppl 34242.71
| epoch   6 |   174/  320 batches | lr 10.00 | loss  6.63 | ppl   757.88
| epoch   6 |   175/  320 batches | lr 10.00 | loss  7.82 | ppl  2488.30
| epoch   6 |   176/  320 batches | lr 10.00 | loss 10.68 | ppl 43503.86
| epoch   6 |   177/  320 batches | lr 10.00 | loss 11.28 | ppl 79608.17
| epoch   6 |   178/  320 batches | lr 10.00 | loss  3.73 | ppl    41.62
| epoch   6 |   179/  320 batches | lr 10.00 | los

7.8473198047625985
| epoch   7 |     1/  320 batches | lr 10.00 | loss 21.10 | ppl 1453882879.28
| epoch   7 |     2/  320 batches | lr 10.00 | loss 15.62 | ppl 6088010.44
| epoch   7 |     3/  320 batches | lr 10.00 | loss  7.60 | ppl  1994.40
| epoch   7 |     4/  320 batches | lr 10.00 | loss  3.90 | ppl    49.51
| epoch   7 |     5/  320 batches | lr 10.00 | loss 17.57 | ppl 42604260.95
| epoch   7 |     6/  320 batches | lr 10.00 | loss 10.09 | ppl 23981.50
| epoch   7 |     7/  320 batches | lr 10.00 | loss 14.27 | ppl 1567566.03
| epoch   7 |     8/  320 batches | lr 10.00 | loss  7.40 | ppl  1641.24
| epoch   7 |     9/  320 batches | lr 10.00 | loss  8.25 | ppl  3842.33
| epoch   7 |    10/  320 batches | lr 10.00 | loss  9.55 | ppl 14003.30
| epoch   7 |    11/  320 batches | lr 10.00 | loss 32.23 | ppl 98889221253635.66
| epoch   7 |    12/  320 batches | lr 10.00 | loss  6.51 | ppl   673.59
| epoch   7 |    13/  320 batches | lr 10.00 | loss 12.90 | ppl 399544.66
| epoch   

| epoch   7 |   170/  320 batches | lr 10.00 | loss  0.01 | ppl     1.01
| epoch   7 |   171/  320 batches | lr 10.00 | loss  9.27 | ppl 10603.34
| epoch   7 |   172/  320 batches | lr 10.00 | loss 14.99 | ppl 3227289.09
| epoch   7 |   173/  320 batches | lr 10.00 | loss  4.94 | ppl   140.37
| epoch   7 |   174/  320 batches | lr 10.00 | loss  6.84 | ppl   936.66
| epoch   7 |   175/  320 batches | lr 10.00 | loss 17.32 | ppl 33257657.81
| epoch   7 |   176/  320 batches | lr 10.00 | loss  7.54 | ppl  1886.52
| epoch   7 |   177/  320 batches | lr 10.00 | loss  7.77 | ppl  2368.81
| epoch   7 |   178/  320 batches | lr 10.00 | loss 10.65 | ppl 42151.96
| epoch   7 |   179/  320 batches | lr 10.00 | loss 36.08 | ppl 4691451569700941.00
| epoch   7 |   180/  320 batches | lr 10.00 | loss 12.65 | ppl 313228.39
| epoch   7 |   181/  320 batches | lr 10.00 | loss 22.53 | ppl 6096002392.81
| epoch   7 |   182/  320 batches | lr 10.00 | loss  7.56 | ppl  1925.12
| epoch   7 |   183/  320 bat

8.170266764291028
| epoch   8 |     1/  320 batches | lr 10.00 | loss 17.55 | ppl 41763608.00
| epoch   8 |     2/  320 batches | lr 10.00 | loss 10.11 | ppl 24620.50
| epoch   8 |     3/  320 batches | lr 10.00 | loss  6.91 | ppl  1002.07
| epoch   8 |     4/  320 batches | lr 10.00 | loss  0.90 | ppl     2.45
| epoch   8 |     5/  320 batches | lr 10.00 | loss 10.82 | ppl 50128.35
| epoch   8 |     6/  320 batches | lr 10.00 | loss 16.80 | ppl 19863373.10
| epoch   8 |     7/  320 batches | lr 10.00 | loss  8.19 | ppl  3609.51
| epoch   8 |     8/  320 batches | lr 10.00 | loss 10.28 | ppl 29218.53
| epoch   8 |     9/  320 batches | lr 10.00 | loss 10.55 | ppl 38272.45
| epoch   8 |    10/  320 batches | lr 10.00 | loss  9.26 | ppl 10526.71
| epoch   8 |    11/  320 batches | lr 10.00 | loss 12.68 | ppl 321273.78
| epoch   8 |    12/  320 batches | lr 10.00 | loss 10.82 | ppl 50241.77
| epoch   8 |    13/  320 batches | lr 10.00 | loss  6.63 | ppl   760.33
| epoch   8 |    14/  320 

| epoch   8 |   168/  320 batches | lr 10.00 | loss  7.41 | ppl  1651.36
| epoch   8 |   169/  320 batches | lr 10.00 | loss 22.43 | ppl 5519014501.31
| epoch   8 |   170/  320 batches | lr 10.00 | loss 15.67 | ppl 6415709.43
| epoch   8 |   171/  320 batches | lr 10.00 | loss  2.65 | ppl    14.13
| epoch   8 |   172/  320 batches | lr 10.00 | loss 10.23 | ppl 27610.15
| epoch   8 |   173/  320 batches | lr 10.00 | loss 10.10 | ppl 24293.11
| epoch   8 |   174/  320 batches | lr 10.00 | loss 11.81 | ppl 133972.06
| epoch   8 |   175/  320 batches | lr 10.00 | loss  6.27 | ppl   530.34
| epoch   8 |   176/  320 batches | lr 10.00 | loss  8.49 | ppl  4855.45
| epoch   8 |   177/  320 batches | lr 10.00 | loss  9.36 | ppl 11670.47
| epoch   8 |   178/  320 batches | lr 10.00 | loss  5.04 | ppl   153.97
| epoch   8 |   179/  320 batches | lr 10.00 | loss  9.66 | ppl 15714.76
| epoch   8 |   180/  320 batches | lr 10.00 | loss  8.18 | ppl  3567.04
| epoch   8 |   181/  320 batches | lr 10.0

7.630773684821532
| epoch   9 |     1/  320 batches | lr 10.00 | loss 20.24 | ppl 617073991.74
| epoch   9 |     2/  320 batches | lr 10.00 | loss  6.01 | ppl   408.18
| epoch   9 |     3/  320 batches | lr 10.00 | loss  6.50 | ppl   667.91
| epoch   9 |     4/  320 batches | lr 10.00 | loss  7.74 | ppl  2287.88
| epoch   9 |     5/  320 batches | lr 10.00 | loss  9.49 | ppl 13265.78
| epoch   9 |     6/  320 batches | lr 10.00 | loss  8.24 | ppl  3787.81
| epoch   9 |     7/  320 batches | lr 10.00 | loss 20.28 | ppl 641881879.21
| epoch   9 |     8/  320 batches | lr 10.00 | loss  9.65 | ppl 15579.46
| epoch   9 |     9/  320 batches | lr 10.00 | loss 13.34 | ppl 619797.86
| epoch   9 |    10/  320 batches | lr 10.00 | loss  6.83 | ppl   921.79
| epoch   9 |    11/  320 batches | lr 10.00 | loss 31.65 | ppl 55506434411739.80
| epoch   9 |    12/  320 batches | lr 10.00 | loss 10.13 | ppl 25149.88
| epoch   9 |    13/  320 batches | lr 10.00 | loss  8.08 | ppl  3241.71
| epoch   9 |  

| epoch   9 |   176/  320 batches | lr 10.00 | loss  4.89 | ppl   132.43
| epoch   9 |   177/  320 batches | lr 10.00 | loss 10.40 | ppl 32734.25
| epoch   9 |   178/  320 batches | lr 10.00 | loss 12.95 | ppl 422057.60
| epoch   9 |   179/  320 batches | lr 10.00 | loss  8.61 | ppl  5507.17
| epoch   9 |   180/  320 batches | lr 10.00 | loss 10.50 | ppl 36210.13
| epoch   9 |   181/  320 batches | lr 10.00 | loss 14.22 | ppl 1500946.18
| epoch   9 |   182/  320 batches | lr 10.00 | loss  9.69 | ppl 16085.07
| epoch   9 |   183/  320 batches | lr 10.00 | loss 11.11 | ppl 67138.15
| epoch   9 |   184/  320 batches | lr 10.00 | loss  7.19 | ppl  1322.61
| epoch   9 |   185/  320 batches | lr 10.00 | loss 11.53 | ppl 102147.00
| epoch   9 |   186/  320 batches | lr 10.00 | loss  6.04 | ppl   420.81
| epoch   9 |   187/  320 batches | lr 10.00 | loss  9.16 | ppl  9482.48
| epoch   9 |   188/  320 batches | lr 10.00 | loss 16.67 | ppl 17364775.22
| epoch   9 |   189/  320 batches | lr 10.00

8.39101995121349
| epoch  10 |     1/  320 batches | lr 10.00 | loss 13.20 | ppl 541225.61
| epoch  10 |     2/  320 batches | lr 10.00 | loss  0.27 | ppl     1.31
| epoch  10 |     3/  320 batches | lr 10.00 | loss  9.00 | ppl  8126.43
| epoch  10 |     4/  320 batches | lr 10.00 | loss  3.31 | ppl    27.26
| epoch  10 |     5/  320 batches | lr 10.00 | loss 16.84 | ppl 20486830.36
| epoch  10 |     6/  320 batches | lr 10.00 | loss  2.68 | ppl    14.54
| epoch  10 |     7/  320 batches | lr 10.00 | loss 10.78 | ppl 47963.49
| epoch  10 |     8/  320 batches | lr 10.00 | loss 11.80 | ppl 132875.11
| epoch  10 |     9/  320 batches | lr 10.00 | loss  8.25 | ppl  3815.67
| epoch  10 |    10/  320 batches | lr 10.00 | loss 10.42 | ppl 33375.23
| epoch  10 |    11/  320 batches | lr 10.00 | loss 16.67 | ppl 17363980.35
| epoch  10 |    12/  320 batches | lr 10.00 | loss  7.64 | ppl  2081.43
| epoch  10 |    13/  320 batches | lr 10.00 | loss  7.75 | ppl  2320.59
| epoch  10 |    14/  320 

| epoch  10 |   177/  320 batches | lr 10.00 | loss 11.27 | ppl 78810.55
| epoch  10 |   178/  320 batches | lr 10.00 | loss 13.00 | ppl 442310.46
| epoch  10 |   179/  320 batches | lr 10.00 | loss  4.20 | ppl    66.77
| epoch  10 |   180/  320 batches | lr 10.00 | loss  7.86 | ppl  2594.15
| epoch  10 |   181/  320 batches | lr 10.00 | loss 11.80 | ppl 132667.45
| epoch  10 |   182/  320 batches | lr 10.00 | loss  9.81 | ppl 18244.46
| epoch  10 |   183/  320 batches | lr 10.00 | loss 13.08 | ppl 478422.24
| epoch  10 |   184/  320 batches | lr 10.00 | loss  9.74 | ppl 16971.78
| epoch  10 |   185/  320 batches | lr 10.00 | loss  6.79 | ppl   892.86
| epoch  10 |   186/  320 batches | lr 10.00 | loss  8.41 | ppl  4488.36
| epoch  10 |   187/  320 batches | lr 10.00 | loss  9.78 | ppl 17618.27
| epoch  10 |   188/  320 batches | lr 10.00 | loss  4.25 | ppl    70.05
| epoch  10 |   189/  320 batches | lr 10.00 | loss 11.83 | ppl 137454.08
| epoch  10 |   190/  320 batches | lr 10.00 | 

8.27419321514596
| epoch  11 |     1/  320 batches | lr 10.00 | loss 26.29 | ppl 260682647721.00
| epoch  11 |     2/  320 batches | lr 10.00 | loss 29.70 | ppl 7919399168379.52
| epoch  11 |     3/  320 batches | lr 10.00 | loss 13.23 | ppl 556176.95
| epoch  11 |     4/  320 batches | lr 10.00 | loss 22.26 | ppl 4657157247.83
| epoch  11 |     5/  320 batches | lr 10.00 | loss 23.45 | ppl 15357125856.92
| epoch  11 |     6/  320 batches | lr 10.00 | loss 13.63 | ppl 832895.72
| epoch  11 |     7/  320 batches | lr 10.00 | loss 10.64 | ppl 41578.12
| epoch  11 |     8/  320 batches | lr 10.00 | loss 12.44 | ppl 251579.53
| epoch  11 |     9/  320 batches | lr 10.00 | loss 23.30 | ppl 13188717464.13
| epoch  11 |    10/  320 batches | lr 10.00 | loss 10.06 | ppl 23390.59
| epoch  11 |    11/  320 batches | lr 10.00 | loss  0.01 | ppl     1.01
| epoch  11 |    12/  320 batches | lr 10.00 | loss 11.19 | ppl 72323.04
| epoch  11 |    13/  320 batches | lr 10.00 | loss  7.34 | ppl  1539.52

| epoch  11 |   175/  320 batches | lr 10.00 | loss 11.38 | ppl 87903.18
| epoch  11 |   176/  320 batches | lr 10.00 | loss  4.93 | ppl   138.86
| epoch  11 |   177/  320 batches | lr 10.00 | loss  8.61 | ppl  5472.33
| epoch  11 |   178/  320 batches | lr 10.00 | loss  8.84 | ppl  6928.21
| epoch  11 |   179/  320 batches | lr 10.00 | loss 16.66 | ppl 17201238.47
| epoch  11 |   180/  320 batches | lr 10.00 | loss  8.51 | ppl  4942.79
| epoch  11 |   181/  320 batches | lr 10.00 | loss 17.41 | ppl 36257420.89
| epoch  11 |   182/  320 batches | lr 10.00 | loss  7.60 | ppl  1990.07
| epoch  11 |   183/  320 batches | lr 10.00 | loss  4.60 | ppl    99.32
| epoch  11 |   184/  320 batches | lr 10.00 | loss  9.48 | ppl 13110.89
| epoch  11 |   185/  320 batches | lr 10.00 | loss 17.09 | ppl 26342112.67
| epoch  11 |   186/  320 batches | lr 10.00 | loss  8.09 | ppl  3267.78
| epoch  11 |   187/  320 batches | lr 10.00 | loss  7.98 | ppl  2923.18
| epoch  11 |   188/  320 batches | lr 10.

9.085251915791192
| epoch  12 |     1/  320 batches | lr 10.00 | loss 24.73 | ppl 55071323332.55
| epoch  12 |     2/  320 batches | lr 10.00 | loss 10.84 | ppl 50928.05
| epoch  12 |     3/  320 batches | lr 10.00 | loss 29.13 | ppl 4454972311452.91
| epoch  12 |     4/  320 batches | lr 10.00 | loss 20.91 | ppl 1208375029.47
| epoch  12 |     5/  320 batches | lr 10.00 | loss 15.77 | ppl 7049867.65
| epoch  12 |     6/  320 batches | lr 10.00 | loss  5.94 | ppl   379.28
| epoch  12 |     7/  320 batches | lr 10.00 | loss 28.37 | ppl 2097943635601.66
| epoch  12 |     8/  320 batches | lr 10.00 | loss 11.86 | ppl 141584.09
| epoch  12 |     9/  320 batches | lr 10.00 | loss 18.09 | ppl 72101402.97
| epoch  12 |    10/  320 batches | lr 10.00 | loss  7.85 | ppl  2554.38
| epoch  12 |    11/  320 batches | lr 10.00 | loss 23.36 | ppl 13936812115.81
| epoch  12 |    12/  320 batches | lr 10.00 | loss 11.16 | ppl 70521.27
| epoch  12 |    13/  320 batches | lr 10.00 | loss 10.96 | ppl 575

| epoch  12 |   178/  320 batches | lr 10.00 | loss 15.55 | ppl 5690061.67
| epoch  12 |   179/  320 batches | lr 10.00 | loss 15.22 | ppl 4055988.21
| epoch  12 |   180/  320 batches | lr 10.00 | loss  8.46 | ppl  4732.82
| epoch  12 |   181/  320 batches | lr 10.00 | loss 17.26 | ppl 31430714.05
| epoch  12 |   182/  320 batches | lr 10.00 | loss  8.46 | ppl  4730.20
| epoch  12 |   183/  320 batches | lr 10.00 | loss  0.11 | ppl     1.12
| epoch  12 |   184/  320 batches | lr 10.00 | loss  5.84 | ppl   344.66
| epoch  12 |   185/  320 batches | lr 10.00 | loss 11.50 | ppl 98479.85
| epoch  12 |   186/  320 batches | lr 10.00 | loss 12.06 | ppl 172243.34
| epoch  12 |   187/  320 batches | lr 10.00 | loss 13.68 | ppl 870763.71
| epoch  12 |   188/  320 batches | lr 10.00 | loss 21.65 | ppl 2533695041.25
| epoch  12 |   189/  320 batches | lr 10.00 | loss 14.05 | ppl 1260859.45
| epoch  12 |   190/  320 batches | lr 10.00 | loss  5.29 | ppl   198.31
| epoch  12 |   191/  320 batches |

8.819176599149793
| epoch  13 |     1/  320 batches | lr 10.00 | loss 20.21 | ppl 597247329.70
| epoch  13 |     2/  320 batches | lr 10.00 | loss  8.49 | ppl  4845.13
| epoch  13 |     3/  320 batches | lr 10.00 | loss  9.02 | ppl  8267.45
| epoch  13 |     4/  320 batches | lr 10.00 | loss 37.46 | ppl 18604451530862272.00
| epoch  13 |     5/  320 batches | lr 10.00 | loss 18.95 | ppl 169066822.80
| epoch  13 |     6/  320 batches | lr 10.00 | loss 14.69 | ppl 2387664.62
| epoch  13 |     7/  320 batches | lr 10.00 | loss 15.96 | ppl 8507851.95
| epoch  13 |     8/  320 batches | lr 10.00 | loss 11.58 | ppl 106625.49
| epoch  13 |     9/  320 batches | lr 10.00 | loss 12.26 | ppl 210239.05
| epoch  13 |    10/  320 batches | lr 10.00 | loss 25.57 | ppl 127230146206.56
| epoch  13 |    11/  320 batches | lr 10.00 | loss 37.50 | ppl 19399737883075804.00
| epoch  13 |    12/  320 batches | lr 10.00 | loss  8.83 | ppl  6806.80
| epoch  13 |    13/  320 batches | lr 10.00 | loss  6.45 | p

| epoch  13 |   180/  320 batches | lr 10.00 | loss  5.57 | ppl   263.13
| epoch  13 |   181/  320 batches | lr 10.00 | loss  9.89 | ppl 19685.51
| epoch  13 |   182/  320 batches | lr 10.00 | loss  9.60 | ppl 14715.28
| epoch  13 |   183/  320 batches | lr 10.00 | loss 12.48 | ppl 263337.47
| epoch  13 |   184/  320 batches | lr 10.00 | loss  6.71 | ppl   822.02
| epoch  13 |   185/  320 batches | lr 10.00 | loss  7.95 | ppl  2830.68
| epoch  13 |   186/  320 batches | lr 10.00 | loss 10.04 | ppl 22979.29
| epoch  13 |   187/  320 batches | lr 10.00 | loss  7.37 | ppl  1590.33
| epoch  13 |   188/  320 batches | lr 10.00 | loss  5.53 | ppl   250.95
| epoch  13 |   189/  320 batches | lr 10.00 | loss  4.88 | ppl   132.04
| epoch  13 |   190/  320 batches | lr 10.00 | loss  7.77 | ppl  2374.31
| epoch  13 |   191/  320 batches | lr 10.00 | loss  4.44 | ppl    84.96
| epoch  13 |   192/  320 batches | lr 10.00 | loss 12.85 | ppl 382397.39
| epoch  13 |   193/  320 batches | lr 10.00 | lo

8.951908352233026
| epoch  14 |     1/  320 batches | lr 10.00 | loss 14.73 | ppl 2483647.49
| epoch  14 |     2/  320 batches | lr 10.00 | loss 11.90 | ppl 147231.74
| epoch  14 |     3/  320 batches | lr 10.00 | loss 24.21 | ppl 32556148146.79
| epoch  14 |     4/  320 batches | lr 10.00 | loss 18.16 | ppl 77300564.98
| epoch  14 |     5/  320 batches | lr 10.00 | loss  1.73 | ppl     5.63
| epoch  14 |     6/  320 batches | lr 10.00 | loss 36.25 | ppl 5511520389886412.00
| epoch  14 |     7/  320 batches | lr 10.00 | loss  7.76 | ppl  2333.54
| epoch  14 |     8/  320 batches | lr 10.00 | loss  7.93 | ppl  2787.54
| epoch  14 |     9/  320 batches | lr 10.00 | loss 17.74 | ppl 50714774.13
| epoch  14 |    10/  320 batches | lr 10.00 | loss 14.01 | ppl 1212649.20
| epoch  14 |    11/  320 batches | lr 10.00 | loss 13.37 | ppl 643615.61
| epoch  14 |    12/  320 batches | lr 10.00 | loss 15.91 | ppl 8125142.49
| epoch  14 |    13/  320 batches | lr 10.00 | loss  0.32 | ppl     1.38
| 

| epoch  14 |   181/  320 batches | lr 10.00 | loss 27.24 | ppl 677278410346.15
| epoch  14 |   182/  320 batches | lr 10.00 | loss  8.88 | ppl  7192.81
| epoch  14 |   183/  320 batches | lr 10.00 | loss  5.08 | ppl   160.29
| epoch  14 |   184/  320 batches | lr 10.00 | loss 14.16 | ppl 1415011.48
| epoch  14 |   185/  320 batches | lr 10.00 | loss  7.61 | ppl  2025.94
| epoch  14 |   186/  320 batches | lr 10.00 | loss 10.22 | ppl 27410.50
| epoch  14 |   187/  320 batches | lr 10.00 | loss  9.09 | ppl  8842.21
| epoch  14 |   188/  320 batches | lr 10.00 | loss 12.08 | ppl 177058.28
| epoch  14 |   189/  320 batches | lr 10.00 | loss  9.30 | ppl 10924.63
| epoch  14 |   190/  320 batches | lr 10.00 | loss  6.45 | ppl   629.58
| epoch  14 |   191/  320 batches | lr 10.00 | loss  3.01 | ppl    20.19
| epoch  14 |   192/  320 batches | lr 10.00 | loss  8.59 | ppl  5375.94
| epoch  14 |   193/  320 batches | lr 10.00 | loss  7.32 | ppl  1508.56
| epoch  14 |   194/  320 batches | lr 10

8.61320955103094
| epoch  15 |     1/  320 batches | lr 10.00 | loss 21.29 | ppl 1756809292.25
| epoch  15 |     2/  320 batches | lr 10.00 | loss  3.75 | ppl    42.44
| epoch  15 |     3/  320 batches | lr 10.00 | loss  6.27 | ppl   527.31
| epoch  15 |     4/  320 batches | lr 10.00 | loss  9.29 | ppl 10855.23
| epoch  15 |     5/  320 batches | lr 10.00 | loss 24.43 | ppl 40738482366.15
| epoch  15 |     6/  320 batches | lr 10.00 | loss  3.79 | ppl    44.10
| epoch  15 |     7/  320 batches | lr 10.00 | loss 17.57 | ppl 42919741.51
| epoch  15 |     8/  320 batches | lr 10.00 | loss  8.22 | ppl  3706.38
| epoch  15 |     9/  320 batches | lr 10.00 | loss  0.16 | ppl     1.17
| epoch  15 |    10/  320 batches | lr 10.00 | loss 12.21 | ppl 200399.64
| epoch  15 |    11/  320 batches | lr 10.00 | loss 12.60 | ppl 296735.21
| epoch  15 |    12/  320 batches | lr 10.00 | loss  9.52 | ppl 13596.25
| epoch  15 |    13/  320 batches | lr 10.00 | loss  9.21 | ppl  9997.45
| epoch  15 |    1

| epoch  15 |   179/  320 batches | lr 10.00 | loss 13.50 | ppl 732605.11
| epoch  15 |   180/  320 batches | lr 10.00 | loss  7.07 | ppl  1173.29
| epoch  15 |   181/  320 batches | lr 10.00 | loss 34.90 | ppl 1433740431154944.75
| epoch  15 |   182/  320 batches | lr 10.00 | loss  8.51 | ppl  4969.31
| epoch  15 |   183/  320 batches | lr 10.00 | loss  0.02 | ppl     1.02
| epoch  15 |   184/  320 batches | lr 10.00 | loss  5.98 | ppl   397.17
| epoch  15 |   185/  320 batches | lr 10.00 | loss 13.26 | ppl 573562.72
| epoch  15 |   186/  320 batches | lr 10.00 | loss  4.97 | ppl   144.12
| epoch  15 |   187/  320 batches | lr 10.00 | loss 15.72 | ppl 6710556.93
| epoch  15 |   188/  320 batches | lr 10.00 | loss  3.07 | ppl    21.46
| epoch  15 |   189/  320 batches | lr 10.00 | loss  0.21 | ppl     1.24
| epoch  15 |   190/  320 batches | lr 10.00 | loss  8.54 | ppl  5108.16
| epoch  15 |   191/  320 batches | lr 10.00 | loss 15.81 | ppl 7369556.74
| epoch  15 |   192/  320 batches 

8.295309579484515
| epoch  16 |     1/  320 batches | lr 10.00 | loss 23.81 | ppl 21926080373.03
| epoch  16 |     2/  320 batches | lr 10.00 | loss 11.91 | ppl 148700.84
| epoch  16 |     3/  320 batches | lr 10.00 | loss  0.87 | ppl     2.40
| epoch  16 |     4/  320 batches | lr 10.00 | loss 13.96 | ppl 1160468.65
| epoch  16 |     5/  320 batches | lr 10.00 | loss  4.52 | ppl    92.14
| epoch  16 |     6/  320 batches | lr 10.00 | loss 12.17 | ppl 192590.35
| epoch  16 |     7/  320 batches | lr 10.00 | loss 12.58 | ppl 291610.90
| epoch  16 |     8/  320 batches | lr 10.00 | loss  9.03 | ppl  8382.59
| epoch  16 |     9/  320 batches | lr 10.00 | loss  5.77 | ppl   321.06
| epoch  16 |    10/  320 batches | lr 10.00 | loss 11.71 | ppl 121702.10
| epoch  16 |    11/  320 batches | lr 10.00 | loss 11.92 | ppl 150388.70
| epoch  16 |    12/  320 batches | lr 10.00 | loss  7.96 | ppl  2875.15
| epoch  16 |    13/  320 batches | lr 10.00 | loss 10.38 | ppl 32295.30
| epoch  16 |    14/

| epoch  16 |   180/  320 batches | lr 10.00 | loss  6.44 | ppl   626.23
| epoch  16 |   181/  320 batches | lr 10.00 | loss 17.66 | ppl 46741050.48
| epoch  16 |   182/  320 batches | lr 10.00 | loss  9.23 | ppl 10200.25
| epoch  16 |   183/  320 batches | lr 10.00 | loss  0.17 | ppl     1.18
| epoch  16 |   184/  320 batches | lr 10.00 | loss  8.49 | ppl  4859.65
| epoch  16 |   185/  320 batches | lr 10.00 | loss  8.80 | ppl  6665.23
| epoch  16 |   186/  320 batches | lr 10.00 | loss  8.50 | ppl  4924.60
| epoch  16 |   187/  320 batches | lr 10.00 | loss  7.67 | ppl  2148.35
| epoch  16 |   188/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  16 |   189/  320 batches | lr 10.00 | loss  9.63 | ppl 15232.14
| epoch  16 |   190/  320 batches | lr 10.00 | loss 10.03 | ppl 22695.82
| epoch  16 |   191/  320 batches | lr 10.00 | loss 16.97 | ppl 23556911.32
| epoch  16 |   192/  320 batches | lr 10.00 | loss  7.22 | ppl  1362.40
| epoch  16 |   193/  320 batches | lr 10.00 

7.6641739857234175
| epoch  17 |     1/  320 batches | lr 10.00 | loss 43.58 | ppl 8470306865216401408.00
| epoch  17 |     2/  320 batches | lr 10.00 | loss 11.28 | ppl 79609.09
| epoch  17 |     3/  320 batches | lr 10.00 | loss  6.31 | ppl   550.51
| epoch  17 |     4/  320 batches | lr 10.00 | loss  8.94 | ppl  7662.18
| epoch  17 |     5/  320 batches | lr 10.00 | loss 16.81 | ppl 19886952.41
| epoch  17 |     6/  320 batches | lr 10.00 | loss 27.27 | ppl 697925055413.62
| epoch  17 |     7/  320 batches | lr 10.00 | loss 10.87 | ppl 52627.98
| epoch  17 |     8/  320 batches | lr 10.00 | loss 10.93 | ppl 55794.89
| epoch  17 |     9/  320 batches | lr 10.00 | loss  0.30 | ppl     1.35
| epoch  17 |    10/  320 batches | lr 10.00 | loss 22.68 | ppl 7084933096.67
| epoch  17 |    11/  320 batches | lr 10.00 | loss 19.24 | ppl 226726272.05
| epoch  17 |    12/  320 batches | lr 10.00 | loss 12.48 | ppl 264231.78
| epoch  17 |    13/  320 batches | lr 10.00 | loss  7.28 | ppl  1451.0

| epoch  17 |   183/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  17 |   184/  320 batches | lr 10.00 | loss  7.96 | ppl  2862.96
| epoch  17 |   185/  320 batches | lr 10.00 | loss  9.47 | ppl 13026.69
| epoch  17 |   186/  320 batches | lr 10.00 | loss  7.66 | ppl  2118.53
| epoch  17 |   187/  320 batches | lr 10.00 | loss  6.48 | ppl   652.01
| epoch  17 |   188/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  17 |   189/  320 batches | lr 10.00 | loss  3.03 | ppl    20.79
| epoch  17 |   190/  320 batches | lr 10.00 | loss  7.05 | ppl  1154.14
| epoch  17 |   191/  320 batches | lr 10.00 | loss 23.65 | ppl 18585738948.62
| epoch  17 |   192/  320 batches | lr 10.00 | loss  3.81 | ppl    45.27
| epoch  17 |   193/  320 batches | lr 10.00 | loss  7.83 | ppl  2518.27
| epoch  17 |   194/  320 batches | lr 10.00 | loss  4.74 | ppl   114.32
| epoch  17 |   195/  320 batches | lr 10.00 | loss  7.73 | ppl  2273.06
| epoch  17 |   196/  320 batches | lr 10.00 

7.381828491964311
| epoch  18 |     1/  320 batches | lr 10.00 | loss 19.20 | ppl 217450982.75
| epoch  18 |     2/  320 batches | lr 10.00 | loss 10.42 | ppl 33377.68
| epoch  18 |     3/  320 batches | lr 10.00 | loss 12.54 | ppl 279428.73
| epoch  18 |     4/  320 batches | lr 10.00 | loss 11.94 | ppl 153332.91
| epoch  18 |     5/  320 batches | lr 10.00 | loss  0.16 | ppl     1.18
| epoch  18 |     6/  320 batches | lr 10.00 | loss 35.78 | ppl 3468059189440644.50
| epoch  18 |     7/  320 batches | lr 10.00 | loss  7.96 | ppl  2858.72
| epoch  18 |     8/  320 batches | lr 10.00 | loss  4.09 | ppl    59.73
| epoch  18 |     9/  320 batches | lr 10.00 | loss  0.70 | ppl     2.02
| epoch  18 |    10/  320 batches | lr 10.00 | loss 26.78 | ppl 426577698270.22
| epoch  18 |    11/  320 batches | lr 10.00 | loss 13.61 | ppl 811130.32
| epoch  18 |    12/  320 batches | lr 10.00 | loss  6.46 | ppl   637.85
| epoch  18 |    13/  320 batches | lr 10.00 | loss  5.37 | ppl   214.59
| epoch 

| epoch  18 |   181/  320 batches | lr 10.00 | loss 15.25 | ppl 4182855.83
| epoch  18 |   182/  320 batches | lr 10.00 | loss  8.54 | ppl  5091.35
| epoch  18 |   183/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  18 |   184/  320 batches | lr 10.00 | loss  8.48 | ppl  4821.31
| epoch  18 |   185/  320 batches | lr 10.00 | loss  7.22 | ppl  1363.54
| epoch  18 |   186/  320 batches | lr 10.00 | loss 10.18 | ppl 26489.17
| epoch  18 |   187/  320 batches | lr 10.00 | loss  6.39 | ppl   597.36
| epoch  18 |   188/  320 batches | lr 10.00 | loss  0.20 | ppl     1.22
| epoch  18 |   189/  320 batches | lr 10.00 | loss  4.88 | ppl   131.02
| epoch  18 |   190/  320 batches | lr 10.00 | loss  5.36 | ppl   211.81
| epoch  18 |   191/  320 batches | lr 10.00 | loss 15.55 | ppl 5674341.25
| epoch  18 |   192/  320 batches | lr 10.00 | loss 12.57 | ppl 288153.07
| epoch  18 |   193/  320 batches | lr 10.00 | loss  6.40 | ppl   602.62
| epoch  18 |   194/  320 batches | lr 10.00 |

7.549211061112933
| epoch  19 |     1/  320 batches | lr 10.00 | loss 28.52 | ppl 2434996730730.02
| epoch  19 |     2/  320 batches | lr 10.00 | loss 11.04 | ppl 62156.68
| epoch  19 |     3/  320 batches | lr 10.00 | loss  0.04 | ppl     1.04
| epoch  19 |     4/  320 batches | lr 10.00 | loss 17.03 | ppl 24853262.84
| epoch  19 |     5/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  19 |     6/  320 batches | lr 10.00 | loss 18.90 | ppl 160826980.72
| epoch  19 |     7/  320 batches | lr 10.00 | loss  8.89 | ppl  7275.62
| epoch  19 |     8/  320 batches | lr 10.00 | loss  5.83 | ppl   341.87
| epoch  19 |     9/  320 batches | lr 10.00 | loss  0.10 | ppl     1.11
| epoch  19 |    10/  320 batches | lr 10.00 | loss 36.66 | ppl 8369704113731189.00
| epoch  19 |    11/  320 batches | lr 10.00 | loss 20.69 | ppl 965057617.00
| epoch  19 |    12/  320 batches | lr 10.00 | loss  4.47 | ppl    87.76
| epoch  19 |    13/  320 batches | lr 10.00 | loss  8.92 | ppl  7481.38
| e

| epoch  19 |   182/  320 batches | lr 10.00 | loss 11.43 | ppl 91608.87
| epoch  19 |   183/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  19 |   184/  320 batches | lr 10.00 | loss  6.71 | ppl   821.73
| epoch  19 |   185/  320 batches | lr 10.00 | loss 10.23 | ppl 27759.83
| epoch  19 |   186/  320 batches | lr 10.00 | loss  8.19 | ppl  3596.00
| epoch  19 |   187/  320 batches | lr 10.00 | loss  0.64 | ppl     1.90
| epoch  19 |   188/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  19 |   189/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  19 |   190/  320 batches | lr 10.00 | loss  5.77 | ppl   320.90
| epoch  19 |   191/  320 batches | lr 10.00 | loss 15.63 | ppl 6111395.28
| epoch  19 |   192/  320 batches | lr 10.00 | loss  7.21 | ppl  1351.93
| epoch  19 |   193/  320 batches | lr 10.00 | loss  7.17 | ppl  1296.19
| epoch  19 |   194/  320 batches | lr 10.00 | loss  9.73 | ppl 16862.21
| epoch  19 |   195/  320 batches | lr 10.00 | lo

7.121390764242429
| epoch  20 |     1/  320 batches | lr 10.00 | loss  7.86 | ppl  2592.60
| epoch  20 |     2/  320 batches | lr 10.00 | loss  7.30 | ppl  1481.57
| epoch  20 |     3/  320 batches | lr 10.00 | loss  8.06 | ppl  3176.28
| epoch  20 |     4/  320 batches | lr 10.00 | loss  7.36 | ppl  1569.41
| epoch  20 |     5/  320 batches | lr 10.00 | loss  6.71 | ppl   822.30
| epoch  20 |     6/  320 batches | lr 10.00 | loss 14.55 | ppl 2079799.27
| epoch  20 |     7/  320 batches | lr 10.00 | loss 12.08 | ppl 176879.72
| epoch  20 |     8/  320 batches | lr 10.00 | loss  5.62 | ppl   276.49
| epoch  20 |     9/  320 batches | lr 10.00 | loss  2.76 | ppl    15.84
| epoch  20 |    10/  320 batches | lr 10.00 | loss 10.79 | ppl 48742.83
| epoch  20 |    11/  320 batches | lr 10.00 | loss 12.32 | ppl 224511.02
| epoch  20 |    12/  320 batches | lr 10.00 | loss  8.92 | ppl  7487.94
| epoch  20 |    13/  320 batches | lr 10.00 | loss  6.09 | ppl   442.83
| epoch  20 |    14/  320 bat

| epoch  20 |   182/  320 batches | lr 10.00 | loss  7.87 | ppl  2605.55
| epoch  20 |   183/  320 batches | lr 10.00 | loss  0.00 | ppl     1.00
| epoch  20 |   184/  320 batches | lr 10.00 | loss  9.88 | ppl 19462.77
| epoch  20 |   185/  320 batches | lr 10.00 | loss 10.94 | ppl 56249.45
| epoch  20 |   186/  320 batches | lr 10.00 | loss  4.84 | ppl   126.61
| epoch  20 |   187/  320 batches | lr 10.00 | loss  2.92 | ppl    18.59
| epoch  20 |   188/  320 batches | lr 10.00 | loss  0.01 | ppl     1.01
| epoch  20 |   189/  320 batches | lr 10.00 | loss  0.02 | ppl     1.02
| epoch  20 |   190/  320 batches | lr 10.00 | loss  5.33 | ppl   206.12
| epoch  20 |   191/  320 batches | lr 10.00 | loss 14.11 | ppl 1340525.16
| epoch  20 |   192/  320 batches | lr 10.00 | loss 11.39 | ppl 88224.17
| epoch  20 |   193/  320 batches | lr 10.00 | loss 11.43 | ppl 92169.19
| epoch  20 |   194/  320 batches | lr 10.00 | loss  0.23 | ppl     1.26
| epoch  20 |   195/  320 batches | lr 10.00 | lo

7.78661042395804


In [167]:
len(val_data)

320

In [206]:
#### Calculate perplexity
#### Batch inputs such a way that only same sentences are together