This notebook implements a word-level recurrent language model for text. It is trained and evaluated on the WikiText-2 dataset.

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:
import data
from importlib import reload
#reload(data)

In [15]:
reload(data)

<module 'data' from '/home/manoj/Documents/github/language-modelling/data.py'>

## Load and batchify the data

In [3]:
import os

In [4]:
# The dataset directory is resolved relative to the current working directory,
# so the notebook must be started from the folder that contains `data/`.
work_dir = os.getcwd()
data_path = os.path.join(work_dir, 'data/wikitext-2-raw/')

In [5]:
corpus = data.Corpus(data_path)

In [6]:
# Number of columns (parallel token streams) produced by batchify():
# one value for the training split, a smaller one for validation/test.
batch_size = 20
eval_batch_size = 10

In [7]:
def batchify(data, batch_size):
    """Arrange a 1-D token tensor into ``batch_size`` parallel columns.

    The stream is cut into ``batch_size`` equal, consecutive pieces and each
    piece becomes one column of the result, so row ``t`` holds the ``t``-th
    token of every piece. Trailing tokens that do not fill a complete row are
    dropped.

    Args:
        data: 1-D tensor of token ids.
        batch_size: number of columns in the result.

    Returns:
        A contiguous tensor of shape ``(len(data) // batch_size, batch_size)``,
        placed on the GPU when CUDA is available and left on the CPU otherwise.

    Raises:
        ValueError: if ``batch_size`` is not positive or ``data`` holds fewer
            than ``batch_size`` tokens.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    num_batches = data.size(0) // batch_size
    if num_batches == 0:
        raise ValueError("data has fewer tokens than batch_size")
    # Drop the remainder so the stream divides evenly into columns.
    data = data.narrow(0, 0, num_batches * batch_size)
    # view() lays the pieces out as rows; transpose so each piece is a column.
    data = data.view(batch_size, -1).t().contiguous()
    # Only move to the GPU when one exists; an unconditional .cuda() crashes
    # on CPU-only machines.
    return data.cuda() if torch.cuda.is_available() else data

In [8]:
# Each split becomes a 2-D tensor with one column per parallel stream.
# The validation and test splits use the smaller evaluation batch size.
train_data = batchify(corpus.train, batch_size)
valid_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [9]:
type(train_data)

torch.cuda.LongTensor

In [16]:
def data2words(corpus, data):
    """Translate a sequence of token ids into a space-separated string.

    Each id in ``data`` is looked up in ``corpus.dictionary.idx2word`` and the
    resulting words are joined with single spaces, in order.
    """
    words = []
    for token_id in data:
        words.append(corpus.dictionary.idx2word[token_id])
    return " ".join(words)

In [10]:
print(train_data.shape)
print(valid_data.shape)
print(test_data.shape)

torch.Size([104431, 20])
torch.Size([21764, 10])
torch.Size([24556, 10])


## Modelling

We create a basic RNN model first.

In [29]:
class RNNModel(nn.Module):
    """Embedding -> multi-layer tanh RNN -> linear decoder language model.

    Args:
        num_tokens: vocabulary size (embedding rows and decoder outputs).
        emb_inp: size of each token embedding.
        num_hidden: number of hidden units per RNN layer.
        num_layers: number of stacked RNN layers.
        dropout: dropout probability applied to the embeddings, between RNN
            layers, and to the RNN output. Defaults to 0.2.
    """

    def __init__(self, num_tokens, emb_inp, num_hidden, num_layers, dropout=0.2):
        super(RNNModel, self).__init__()
        self.num_hidden = num_hidden
        self.num_layers = num_layers

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(num_tokens, emb_inp)
        # nn.RNN only applies dropout *between* layers, so it has no effect on
        # a single-layer network; pass 0 there to avoid a pointless setting.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.RNN(emb_inp, num_hidden, num_layers, nonlinearity='tanh', dropout=rnn_dropout)
        self.decoder = nn.Linear(num_hidden, num_tokens)

        self.init_weights()

    def init_weights(self):
        """Initialise embedding and decoder weights uniformly in [-0.1, 0.1) and zero the decoder bias."""
        self.encoder.weight.data.uniform_(-0.1, 0.1)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-0.1, 0.1)

    def init_hidden(self, bsz):
        """Return an all-zero hidden state of shape (num_layers, bsz, num_hidden).

        The tensor is created from an existing parameter so that it has the
        same type (and device) as the model's weights.
        """
        weight = next(self.parameters()).data
        return Variable(weight.new(self.num_layers, bsz, self.num_hidden).zero_())

    def forward(self, input, hidden):
        """Run the model over a batch of token ids.

        Args:
            input: integer tensor of token ids, shape (seq_len, batch).
            hidden: hidden state as produced by ``init_hidden`` or a previous call.

        Returns:
            A tuple ``(decoded, hidden)`` where ``decoded`` holds the
            unnormalised scores with shape (seq_len, batch, num_tokens) and
            ``hidden`` is the updated hidden state.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        # Flatten time and batch so the linear decoder sees one row per token,
        # then restore the (seq_len, batch, num_tokens) layout.
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

In [30]:
# Model hyperparameters, passed to RNNModel below.
ntokens = len(corpus.dictionary)  # vocabulary size: embedding rows and decoder outputs
emb_size = 200  # size of each token embedding
num_hidden = 600  # hidden units per RNN layer
num_layers = 2  # number of stacked RNN layers

In [31]:
model = RNNModel(ntokens, emb_size, num_hidden, num_layers)

In [32]:
model.cuda()

RNNModel(
  (drop): Dropout(p=0.2)
  (encoder): Embedding(84608, 200)
  (rnn): RNN(200, 600, num_layers=2, dropout=0.2)
  (decoder): Linear(in_features=600, out_features=84608)
)

In [33]:
# nn.CrossEntropyLoss combines LogSoftmax and NLLLoss (negative log-likelihood loss),
# so the model's decoder can emit raw, unnormalised scores.
# It is used here because its value is the average negative log-likelihood of the
# target tokens, and exp(loss) is the perplexity reported in the training logs.
criterion = nn.CrossEntropyLoss()

In [34]:
bptt = 35  # sequence length: maximum number of time steps per training/evaluation chunk
clip = 0.25  # maximum gradient norm used for gradient clipping in train()

In [35]:
def get_batch(source, i, evaluation=False):
    """Cut one input/target chunk out of a batchified tensor.

    Starting at row ``i``, at most ``bptt`` rows are taken as inputs; the
    targets are the same rows shifted down by one and flattened to 1-D.
    The chunk is shortened near the end of ``source`` so that the shifted
    target rows still exist.

    Args:
        source: batchified token tensor, indexed by row along dimension 0.
        i: index of the first input row.
        evaluation: forwarded as the ``volatile`` flag of the input Variable.

    Returns:
        A tuple ``(data, target)`` of Variables.
    """
    rows_left = len(source) - 1 - i
    chunk_len = min(bptt, rows_left)
    inputs = source[i:i + chunk_len]
    shifted = source[i + 1:i + 1 + chunk_len]
    return Variable(inputs, volatile=evaluation), Variable(shifted.view(-1))

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history.

    Args:
        h: a single hidden-state Variable/tensor, or an iterable of them
            (arbitrarily nested).

    Returns:
        A Variable sharing ``h``'s data but with no autograd history, or a
        tuple of such Variables mirroring the structure of ``h``.
    """
    # isinstance (rather than an exact type comparison) also matches plain
    # tensors on PyTorch >= 0.4, where Variable and Tensor were merged; an
    # exact type check there falls through and tries to iterate the tensor.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h)

def evaluate(valid_data):
    """Compute the average per-row loss of the global ``model`` on a data split.

    Uses the module-level ``model``, ``criterion``, ``bptt`` and ``ntokens``.
    As a side effect the model is left in evaluation mode.

    Args:
        valid_data: batchified token tensor of shape (rows, batch).

    Returns:
        The loss summed over chunks (each weighted by its number of rows)
        divided by the number of rows in ``valid_data``, as a Python float.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.0
    # Size the hidden state from the data itself so any batchified split works,
    # whatever batch size it was built with.
    hidden = model.init_hidden(valid_data.size(1))
    for i in range(0, valid_data.size(0) - 1, bptt):
        data, targets = get_batch(valid_data, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        batch_loss = criterion(output_flat, targets).data
        # view(-1)[0] reads the scalar whether the loss is a 1-element tensor
        # (legacy API) or a 0-dim tensor (PyTorch >= 0.4).
        total_loss += len(data) * float(batch_loss.view(-1)[0])
        hidden = repackage_hidden(hidden)
    return total_loss / len(valid_data)

In [48]:
epochs = 10  # number of passes over the training data
lr = 1  # initial learning rate; divided by 4 whenever validation loss does not improve
log_interval = 200  # number of batches between training progress reports

In [42]:
import time
import math

In [43]:
def train():
    """Run one training epoch over the global ``train_data`` with plain SGD.

    Reads the module-level ``model``, ``criterion``, ``train_data``,
    ``batch_size``, ``bptt``, ``ntokens``, ``clip``, ``lr`` and
    ``log_interval``, plus ``epoch`` (set by the surrounding epoch loop and
    used only in the log line). Prints a progress line every
    ``log_interval`` batches and returns nothing.
    """
    start_time = time.time()
    model.train()
    total_loss = 0.0
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Detach the hidden state from the previous chunk's graph: its values
        # are carried over, but gradients stop at the chunk boundary.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # Clip gradients to handle exploding gradients.
        nn.utils.clip_grad_norm(model.parameters(), clip)
        # Manual SGD step (no optimizer object is used): p <- p - lr * grad.
        for p in model.parameters():
            if p.grad is None:
                # Parameter took no part in this step; nothing to apply.
                continue
            p.data.add_(-lr, p.grad.data)

        # Accumulate a plain float; view(-1)[0] reads the scalar from both
        # 1-element (legacy) and 0-dim (PyTorch >= 0.4) loss tensors.
        total_loss += float(loss.data.view(-1)[0])

        #Logs
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
        

In [44]:
# Lowest validation loss seen so far; None until the first epoch has been evaluated.
best_val_loss=None
# File the best model is written to by the training loop.
model_save_path = 'RNN_model.pt'

In [49]:
# At any point you can stop kernel to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(valid_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a truthiness test would also treat a
        # recorded best loss of 0.0 as "no best value yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(model_save_path, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

| epoch   1 |   200/ 2983 batches | lr 1.00 | ms/batch 62.19 | loss 11.83 | ppl 137908.16
| epoch   1 |   400/ 2983 batches | lr 1.00 | ms/batch 61.38 | loss 11.36 | ppl 85503.47
| epoch   1 |   600/ 2983 batches | lr 1.00 | ms/batch 60.72 | loss 10.58 | ppl 39225.94
| epoch   1 |   800/ 2983 batches | lr 1.00 | ms/batch 59.37 | loss 10.05 | ppl 23150.22
| epoch   1 |  1000/ 2983 batches | lr 1.00 | ms/batch 58.14 | loss  9.80 | ppl 18038.83
| epoch   1 |  1200/ 2983 batches | lr 1.00 | ms/batch 64.50 | loss  9.49 | ppl 13218.26
| epoch   1 |  1400/ 2983 batches | lr 1.00 | ms/batch 63.73 | loss  9.44 | ppl 12599.34
| epoch   1 |  1600/ 2983 batches | lr 1.00 | ms/batch 61.84 | loss  9.24 | ppl 10306.13
| epoch   1 |  1800/ 2983 batches | lr 1.00 | ms/batch 59.88 | loss  9.12 | ppl  9143.81
| epoch   1 |  2000/ 2983 batches | lr 1.00 | ms/batch 58.79 | loss  9.08 | ppl  8819.48
| epoch   1 |  2200/ 2983 batches | lr 1.00 | ms/batch 56.71 | loss  8.98 | ppl  7929.33
| epoch   1 |  2400/

  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 2983 batches | lr 1.00 | ms/batch 61.41 | loss  8.20 | ppl  3647.40
| epoch   2 |   400/ 2983 batches | lr 1.00 | ms/batch 59.00 | loss  8.04 | ppl  3115.58
| epoch   2 |   600/ 2983 batches | lr 1.00 | ms/batch 58.87 | loss  7.97 | ppl  2884.45
| epoch   2 |   800/ 2983 batches | lr 1.00 | ms/batch 58.56 | loss  7.93 | ppl  2784.90
| epoch   2 |  1000/ 2983 batches | lr 1.00 | ms/batch 59.49 | loss  7.86 | ppl  2601.32
| epoch   2 |  1200/ 2983 batches | lr 1.00 | ms/batch 59.65 | loss  7.89 | ppl  2665.56
| epoch   2 |  1400/ 2983 batches | lr 1.00 | ms/batch 61.28 | loss  7.88 | ppl  2646.91
| epoch   2 |  1600/ 2983 batches | lr 1.00 | ms/batch 58.85 | loss  7.87 | ppl  2629.33
| epoch   2 |  1800/ 2983 batches | lr 1.00 | ms/batch 58.73 | loss  7.79 | ppl  2405.15
| epoch   2 |  2000/ 2983 batches | lr 1.00 | ms/batch 58.55 | loss  7.79 | ppl  2422.83
| epoch   2 |  2200/ 2983 batches | lr 1.00 | ms/batch 58.79 | loss  7.78 | ppl  2398.28
| epoch   2 |  2400/ 

| epoch   7 |  1800/ 2983 batches | lr 0.06 | ms/batch 56.07 | loss  7.45 | ppl  1718.52
| epoch   7 |  2000/ 2983 batches | lr 0.06 | ms/batch 56.11 | loss  7.48 | ppl  1774.77
| epoch   7 |  2200/ 2983 batches | lr 0.06 | ms/batch 56.07 | loss  7.47 | ppl  1754.17
| epoch   7 |  2400/ 2983 batches | lr 0.06 | ms/batch 57.31 | loss  7.47 | ppl  1749.32
| epoch   7 |  2600/ 2983 batches | lr 0.06 | ms/batch 58.85 | loss  7.49 | ppl  1781.27
| epoch   7 |  2800/ 2983 batches | lr 0.06 | ms/batch 58.98 | loss  7.45 | ppl  1727.01
-----------------------------------------------------------------------------------------
| end of epoch   7 | time: 178.89s | valid loss  7.47 | valid ppl  1762.01
-----------------------------------------------------------------------------------------
| epoch   8 |   200/ 2983 batches | lr 0.06 | ms/batch 60.97 | loss  7.55 | ppl  1903.01
| epoch   8 |   400/ 2983 batches | lr 0.06 | ms/batch 58.73 | loss  7.50 | ppl  1816.45
| epoch   8 |   600/ 2983 batches

In [68]:
outf = "generated.txt"  # file that generate() writes the sampled text to
no_of_words = 50  # number of words sampled by generate()

In [73]:
def generate(temperature=1.0):
    """Sample ``no_of_words`` words from the global ``model`` and write them to ``outf``.

    Generation starts from a uniformly random token id. At every step the next
    word is drawn from the model's output scores and fed back as the next
    input. Words are written space-separated, with a newline after every
    20th word.

    Uses the module-level ``model``, ``ntokens``, ``corpus``, ``outf`` and
    ``no_of_words``. As a side effect the model is switched to evaluation mode.

    Args:
        temperature: divisor applied to the scores before exponentiation.
            Must be positive. The default of 1.0 samples from the scores
            unchanged.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Sampling must run with dropout disabled; the model may still be in
    # training mode, e.g. after training was interrupted.
    model.eval()

    start = torch.rand(1, 1).mul(ntokens).long()
    # Put the input wherever the model's weights live instead of assuming CUDA.
    if next(model.parameters()).is_cuda:
        start = start.cuda()
    token = Variable(start, volatile=True)
    hidden = model.init_hidden(1)
    with open(outf, 'w') as of:
        for i in range(no_of_words):
            output, hidden = model(token, hidden)
            logits = output.squeeze().data.div(temperature)
            # Shifting by the maximum leaves the sampling distribution unchanged
            # (multinomial normalises the weights) but keeps exp() from overflowing.
            word_weights = (logits - logits.max()).exp().cpu()
            # int() gives a plain index whether indexing returns a Python number
            # (legacy API) or a 0-dim tensor (PyTorch >= 0.4).
            word_idx = int(torch.multinomial(word_weights, 1)[0])
            token.data.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]

            of.write(word + ('\n' if i % 20 == 19 else ' '))

    
    

In [74]:
generate()