Implementing this paper: [Unsupervised Natural Language Generation with Denoising Autoencoders](https://arxiv.org/pdf/1804.07899.pdf)

Data from here: http://www.macs.hw.ac.uk/InteractionLab/E2E/#


In [1]:
from collections import Counter
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import utils

In [2]:
# For E2E Dataset
trainset = pd.read_csv('e2e-dataset/trainset.csv')
trainset = trainset.assign(clean=utils.replace_punctuation(trainset['ref']))
vocab_to_int, int_to_vocab = utils.get_tokens(trainset['clean'])
as_tokens = trainset['clean'].apply(lambda x: [vocab_to_int[each] for each in x.split()])
trainset = trainset.assign(tokenized=as_tokens)

In [3]:
def dataloader(dataset, p_drop=0.6, max_length=50):
    
    # Corrupt dataset by randomly dropping words
    corrupted = utils.corrupt(dataset)
    # Shuffle words in each sequence
    shuffled = [utils.shuffle(seq, cor_seq) for seq, cor_seq in zip(dataset, corrupted)]

    for shuffled_seq, original_seq in zip(shuffled, dataset):
        # need to make sure our input_tensors have at least one element
        if len(shuffled_seq) == 0:
            shuffled_seq = [original_seq[np.random.randint(0, len(original_seq))]]
        
        input_tensor = torch.Tensor(shuffled_seq).view(-1, 1).type(torch.LongTensor)
        
        # Append <EOS> token to the end of original sequence
        target = original_seq.copy()
        target.append(1)
        target_tensor = torch.Tensor(target).view(-1, 1).type(torch.LongTensor)
            
        yield input_tensor, target_tensor

In [4]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_size=300, hidden_size=256, num_layers=2, drop_p=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, 
                            dropout=drop_p, bidirectional=True)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden
    
    def init_hidden(self, device='cpu'):
        """ Create two tensors with shape (num_layers * num_directions, batch, hidden_size)
            for the hidden state and cell state
        """
        h_0, c_0 = torch.zeros(2, 2*self.num_layers, 1, self.hidden_size, device=device)
        
        return h_0, c_0

In [5]:
# Attention network from http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_size=300, hidden_size=256, 
                       num_layers=2, drop_p=0.1, max_length=50):
        
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.max_length = max_length

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.attn = nn.Linear(self.hidden_size + embedding_size, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2 + embedding_size, self.hidden_size)
        self.dropout = nn.Dropout(drop_p)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, 
                            dropout=drop_p, bidirectional=True)
        
        self.out = nn.Linear(2 * hidden_size, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        
        # Learns the attention vector (a probability distribution) here for weighting
        # encoder outputs based on the decoder input and encoder hidden vector
        attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        
        # Applies the attention vector (again, a probability distribution) to the encoder
        # outputs which weight the encoder_outputs
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        
        # Now the decoder input is combined with the weighted encoder_outputs and
        # passed through a linear transformation as input to the LSTM layer
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        
        output, hidden = self.lstm(output, hidden)
        output = self.out(output).view(1, -1)
        output = self.log_softmax(output)
    
        return output, hidden, attn_weights
        
    def init_hidden(self, device='cpu'):
        """ Create two tensors with shape (num_layers * num_directions, batch, hidden_size)
            for the hidden state and cell state
        """
        h_0, c_0 = torch.zeros(2, 2*self.num_layers, 1, self.hidden_size, device=device)
        return h_0, c_0

In [22]:
def train(dataset, encoder, decoder, enc_opt, dec_opt, criterion, 
          max_length=50, print_every=1000, plot_every=100, 
          teacher_forcing=0.5, device=None):
    
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    steps = 0
    plot_losses = []
    for input_tensor, target_tensor in dataloader(dataset):
        loss = 0
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        
        steps += 1
        
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)

        enc_opt.zero_grad()
        dec_opt.zero_grad()

        h, c = encoder.init_hidden(device=device)
        encoder_outputs = torch.zeros(max_length, 2*encoder.hidden_size).to(device)

        # Run input through encoder
        enc_outputs, enc_hidden = encoder.forward(input_tensor, (h, c))
        
        # Prepare encoder_outputs for attention
        encoder_outputs[:min(enc_outputs.shape[0], max_length)] = enc_outputs[:max_length,0,:]

        # First decoder input is the <SOS> token
        dec_input = torch.Tensor([[0]]).type(torch.LongTensor).to(device)
        dec_hidden = enc_hidden

        dec_outputs = []
        for ii in range(target_tensor.shape[0]):
            # Pass in previous output and hidden state
            dec_out, dec_hidden, dec_attn = decoder.forward(dec_input, dec_hidden, encoder_outputs)
            _, out_token = dec_out.topk(1)
            
            # Curriculum learning, sometimes use the decoder output as the next input,
            # sometimes use the correct token from the target sequence
            if np.random.rand() < teacher_forcing:
                dec_input = target_tensor[ii].view(*out_token.shape)
            else:
                dec_input = out_token.detach().to(device)  # detach from history as input
            
            dec_outputs.append(out_token)

            loss += criterion(dec_out, target_tensor[ii])
            
            # If the input is the <EOS> token (end of sentence)...
            if dec_input.item() == 1:
                break

        loss.backward()
        
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(encoder.parameters(), 5)
        nn.utils.clip_grad_norm_(decoder.parameters(), 5)

        enc_opt.step()
        dec_opt.step()
        
        print_loss_total += loss
        plot_loss_total += loss

        if steps % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f"Loss avg. = {print_loss_avg}")
            print([int_to_vocab[each.item()] for each in input_tensor])
            print([int_to_vocab[each.item()] for each in dec_outputs])

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# max length for attention
max_length = 50

encoder = Encoder(len(vocab_to_int), hidden_size=512, drop_p=0.1).to(device)
decoder = Decoder(len(vocab_to_int), hidden_size=512, drop_p=0.1, max_length=max_length).to(device)

enc_opt = optim.Adam(encoder.parameters(), lr=0.001, amsgrad=True)
dec_opt = optim.Adam(decoder.parameters(), lr=0.001, amsgrad=True)
criterion = nn.NLLLoss()

In [24]:
epochs = 10
for e in range(1, epochs+1):
    print(f"Starting epoch {e}")
    train(trainset['tokenized'], encoder, decoder, enc_opt, dec_opt, criterion, 
          teacher_forcing=0.9/e, device=device, print_every=4200,
          max_length=max_length)

Starting epoch 1
Loss avg. = 0.007070066407322884
['coffee', 'city', 'centre', 'is', 'a', 'an']
['The', 'Rice', 'Curry', 'is', 'a', 'family', 'restaurant', 'restaurant', 'shop', 'located', 'near', 'the', 'riverside', 'centre', 'near', 'near', 'has', 'a', 'average', 'customer', 'rating', '<PERIOD>', '<EOS>']
Loss avg. = 0.03228044509887695
['<COMMA>', 'The', 'customer', 'Rating', 'that', 'out', 'a', 'located', 'near', 'Plaza', 'Browns', 'Cambridge', 'is', 'is', 'of', '<PERIOD>', 'The']
['The', 'Cambridge', 'is', 'a', 'coffee', 'shop', '<COMMA>', 'in', 'the', '<COMMA>', 'near', '<COMMA>', 'Hotel', '<COMMA>', 'It', 'price', 'is', 'is', 'not', 'for', 'children', 'food', 'It', 'price', 'is', 'not', 'friendly', '<PERIOD>', 'has', 'Rice', 'rating', 'is', 'low', 'low', 'out', 'of', 'of', '5', '<PERIOD>', '<EOS>']
Loss avg. = 0.007794644217938185
['in', 'one', 'star', 'Curry', 'is', 'a', 'family', 'near', 'the']
['The', 'Waterman', 'Curry', 'is', 'a', 'family', 'friendly', 'restaurant', 'star',

KeyboardInterrupt: 

> [0;32m/home/mat/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/_functions/rnn.py[0m(287)[0;36mforward[0;34m()[0m
[0;32m    285 [0;31m            [0mbatch_first[0m[0;34m,[0m [0mdropout[0m[0;34m,[0m [0mtrain[0m[0;34m,[0m [0mbool[0m[0;34m([0m[0mbidirectional[0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0m
[0m[0;32m    286 [0;31m            [0mlist[0m[0;34m([0m[0mbatch_sizes[0m[0;34m.[0m[0mdata[0m[0;34m)[0m [0;32mif[0m [0mvariable_length[0m [0;32melse[0m [0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0m
[0m[0;32m--> 287 [0;31m            dropout_ts)
[0m[0;32m    288 [0;31m[0;34m[0m[0m
[0m[0;32m    289 [0;31m        [0;32mif[0m [0mcx[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0m
[0m
ipdb> q


In [15]:
checkpoint = {"hidden_size": 256,
              "num_layers": 512,
              "encoder_sd": encoder.state_dict(),
              "decoder_sd": decoder.state_dict(),
              "epochs": 5}

torch.save(checkpoint, "nlg_07052018.pth"