In [1]:
### Necessary Library
import os
import numpy as np
import torch
import math
import io
import torch.nn.functional as F

In [2]:
#######################################################################################
################################# LSTM MODEL ##########################################
#######################################################################################

import torch.nn as nn


## Creating a class of nn.Module to include RNN with LSTM Units
class LSTM_Model(nn.Module):
    
    def __init__(self, vocab_size, embedding_size, hidden_size, batch_size = 1, n_layers = 1):
        
        super(LSTM_Model, self).__init__()
        
        ## Creating an embedding object to create 'embedding_size' dimensional encoding
        self.encoder = nn.Embedding(vocab_size,embedding_size)
        
        self.lstm = nn.LSTM(embedding_size,hidden_size,n_layers)
        
        ## The output size is assigned as hidden_size * vocab_size 
        ## (Since the output is one word from the entire vocab)
        self.linear_output = nn.Linear(hidden_size,vocab_size)
        
        self.init_weights()
        
        ## More variables
        self.hidden_size = hidden_size
        self.n_layers = n_layers
    
        
    def init_weights(self):
        random_range = 1
        
        ## Setting random values for different layers
        self.encoder.weight.data.uniform_(-random_range, random_range)
        
        self.linear_output.bias.data.zero_()
        self.linear_output.weight.data.uniform_(-random_range, random_range)

    # Defining the forward layer    
    def forward(self, input, hidden):
        
        #Passing through the embedding layer
        embedding_input = self.encoder(input).view(len(input), 1, -1)
        
        lstm_out, hidden = self.lstm(embedding_input, hidden)
        
        decoded = self.linear_output(lstm_out.view(lstm_out.size(0)*lstm_out.size(1), lstm_out.size(2)))
        
        soft_max_out = F.log_softmax(decoded, dim = 1)
        
        #decoded.view(lstm_out.size(0), lstm_out.size(1), decoded.size(1))
        return soft_max_out, hidden

    
    def init_hidden(self, batch_size):
        
        weight = next(self.parameters())
        
        return (weight.new_zeros(self.n_layers, batch_size, self.hidden_size), \
                    weight.new_zeros(self.n_layers, batch_size, self.hidden_size))
    
######################################################################################
######################################################################################

#### Reading the train and validation dataset

In [3]:
## Training Records
train_records = []
with io.open('trn-wiki.txt','r',encoding = 'UTF-8') as f:
    for line in f:
        train_records.append(line)
        
        
### Validation Records
valid_records = []
with io.open('dev-wiki.txt','r',encoding = 'UTF-8') as f:
    for line in f:
        valid_records.append(line)

In [4]:
### Word to index
word_to_ix = {}
for sent in train_records:
    for word in sent.split(" "):
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [5]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq.split(" ")]
    
    data = idxs[:-1]
    target = idxs[1:]
    
    return (torch.tensor(data, dtype=torch.long).to(device),
            torch.tensor(target, dtype=torch.long).to(device))

In [6]:
def train():
    
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    #start_time = time.time()
    
    hidden = model.init_hidden(mini_batch_size)
    
    for batch, sent in enumerate(train_records):
        data, targets = prepare_sequence(sent, word_to_ix)
       
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        
        model.zero_grad()
         
        ## Calling the forward pass of the model 
        output, hidden = model(data, hidden)
        
        ## Calculating the loss
        loss = criterion(output.view(-1, vocab_size), targets)
        
        ## Computing gradient and optimizing
        loss.backward()
        optimizer.step()
        

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip_val)
        
        
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            
            cur_loss = total_loss / log_interval
            #elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_records) // mini_batch_size, lr,\
                    #elapsed * 1000 / args.log_interval, 
                        cur_loss, math.exp(cur_loss)))
            total_loss = 0
            #start_time = time.time()

In [7]:
def evaluate(data_source,perplexity_calc = False):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    perplexity = []
    
    hidden = model.init_hidden(eval_batch_size)
    
    with torch.no_grad():
        for batch, sent in enumerate(data_source):
            

            data, targets = prepare_sequence(sent, word_to_ix)
            output, hidden = model(data, hidden)
     
            output_flat = output.view(-1, vocab_size)
            total_loss +=  criterion(output_flat, targets).item()
            
            hidden = repackage_hidden(hidden)
            
            if(perplexity_calc):
                perplexity.append([output_flat[position,word_id].tolist() for position,word_id in enumerate(targets)])
                
            
            
    return perplexity,total_loss / (len(data_source) - 1)

In [8]:
#a

In [9]:
#np.exp(-a/(17556+1800340))

### Input Arguments

In [10]:
#vocab_size = len(dictionary)

vocab_size = len(word_to_ix)

embedding_size = 32 
hidden_size = 32

mini_batch_size = 1
eval_batch_size = mini_batch_size
n_layers = 1 # For single layer LSTM

#### Model parameters 

lr = 0.1 # Learning rate
epochs = 20 #
gradient_clip_val = 2
log_interval = 1000

ntokens = vocab_size


##### Creating the model object from the LSTM_Model in the models.py

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = LSTM_Model(vocab_size, embedding_size, hidden_size,n_layers = 1).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr= lr) ## SGD Optimizer
criterion = nn.CrossEntropyLoss() ## Crossentropy Loss function

cuda


In [11]:
########################## Additional Functions for training the model ##############
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [None]:
### Running the data over different epochs
#lr = 0.1
prev_val_loss = 10
for epoch in range(1, epochs+1):
    #epoch_start_time = time.time()
    train()
    _, val_loss = evaluate(valid_records)
    print(val_loss)
    
    if(val_loss > prev_val_loss):
        lr /= 1.5
        
    prev_val_loss = val_loss

| epoch   1 |  1000/17556 batches | lr 0.10 | loss  8.42 | ppl  4547.04
| epoch   1 |  2000/17556 batches | lr 0.10 | loss  7.67 | ppl  2144.43
| epoch   1 |  3000/17556 batches | lr 0.10 | loss  7.49 | ppl  1782.16
| epoch   1 |  4000/17556 batches | lr 0.10 | loss  7.58 | ppl  1953.35
| epoch   1 |  5000/17556 batches | lr 0.10 | loss  7.42 | ppl  1661.31
| epoch   1 |  6000/17556 batches | lr 0.10 | loss  7.31 | ppl  1501.19
| epoch   1 |  7000/17556 batches | lr 0.10 | loss  7.37 | ppl  1587.27
| epoch   1 |  8000/17556 batches | lr 0.10 | loss  7.32 | ppl  1514.79
| epoch   1 |  9000/17556 batches | lr 0.10 | loss  7.27 | ppl  1435.29
| epoch   1 | 10000/17556 batches | lr 0.10 | loss  7.10 | ppl  1217.96
| epoch   1 | 11000/17556 batches | lr 0.10 | loss  7.17 | ppl  1303.74
| epoch   1 | 12000/17556 batches | lr 0.10 | loss  7.07 | ppl  1180.85
| epoch   1 | 13000/17556 batches | lr 0.10 | loss  7.21 | ppl  1358.47
| epoch   1 | 14000/17556 batches | lr 0.10 | loss  7.13 | ppl  

In [None]:
train_perp,train_score = evaluate(valid_records,True)

In [None]:
tot_sent = len(train_perp)

In [None]:
word_prob = []
sum_words = 0
for sent in train_perp:
    word_prob.extend(sent)
    sum_words += len(sent)   

In [None]:
np.exp(-np.sum(word_prob)/(sum_words+tot_sent))