<a href="https://colab.research.google.com/github/madhugopinathan/deep-nlu/blob/master/date_normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Date Normalization with RNN Encoder / Decoder

In [0]:
import pandas as pd
import numpy as np
import os

## Download Data

In [0]:
DATA_DIR = "date-normalization"
if not os.path.exists(DATA_DIR):
  os.mkdir(DATA_DIR)

In [0]:
DIR = "https://raw.githubusercontent.com/madhugopinathan/deep-nlu/master/data"
for fn in ["training.csv", "validation.csv", "test.csv"]:
  df = pd.read_csv(f"{DIR}/human-machine-date-{fn}")
  df.to_csv(f"date-normalization/{fn}", index=False)

In [0]:
test_df = pd.read_csv(f"{DATA_DIR}/test.csv", header=None)

In [5]:
test_df.head()

Unnamed: 0,0,1
0,"26, SEP 2007",2007-09-26
1,"27 September, 1986",1986-09-27
2,24.02.82,1982-02-24
3,04-may-1983,1983-05-04
4,"19 Jun, 1971",1971-06-19


## Use TorchText to create train / val / test sets

In [0]:
import torchtext
from torchtext import data

In [0]:
HUMAN = data.ReversibleField(init_token='<sot>',eos_token='<eot>',
                             include_lengths=True,
                             tokenize=list,
                             lower=True)

In [0]:
MACHINE = data.ReversibleField(init_token='<sot>',eos_token='<eot>',
                               include_lengths=True,
                               tokenize=list,
                               lower=True)

In [0]:
train, val, test = data.TabularDataset.splits(path="./date-normalization/", 
                                              format='csv', 
                                              train='training.csv', 
                                              validation='validation.csv',
                                              test='test.csv',
                                              fields=[('human',HUMAN),('machine',MACHINE)])

In [0]:
HUMAN.build_vocab(train)

In [0]:
MACHINE.build_vocab(train)

In [0]:
ex = train[0]

In [13]:
ex.human, ex.machine

(['3', '0', '/', '0', '7', '/', '7', '7'],
 ['1', '9', '7', '7', '-', '0', '7', '-', '3', '0'])

In [14]:
HUMAN.vocab.stoi['3']

17

In [15]:
HUMAN.vocab.itos[17]

'3'

In [16]:
HUMAN.process(['dec 7th 2019'])

(tensor([[ 2],
         [28],
         [ 9],
         [26],
         [ 4],
         [12],
         [27],
         [39],
         [ 4],
         [ 7],
         [ 6],
         [ 5],
         [ 8],
         [ 3]]), tensor([14]))

In [17]:
HUMAN.reverse(HUMAN.process(['dec 7th 2019'])[0])

['dec 7th 2019']

## RNN Model

In [0]:
import torch
from torch import nn
from torch import optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn import functional as F

In [0]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        # one hot encoding of characters
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, input_size, _weight=torch.eye(input_size))
        self.rnn = nn.GRU(input_size=input_size, hidden_size=hidden_size, bidirectional=True)
    
    def forward(self, input_batch, input_length, hidden=None):
        embedded = self.embedding(input_batch) # T,B,F
        packed = pack_padded_sequence(embedded, input_length)
        outputs, hidden = self.rnn(packed, hidden)
        outputs, _ = pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        return outputs, hidden

In [0]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, 
                 hidden_size, output_size, 
                 n_layers=1, dropout=0.1):
        super().__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        # one hot encoding
        self.embedding = nn.Embedding(input_size, input_size, _weight=torch.eye(input_size))
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(input_size, self.hidden_size, self.n_layers,
                          dropout=(0 if n_layers==1 else self.dropout))
        self.concat = nn.Linear(self.hidden_size*2, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input_step, last_hidden, encoder_outputs):
        # this is run one word (one step) at a time
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)

        rnn_output, hidden = self.gru(embedded, last_hidden)

        output = self.out(rnn_output.squeeze(0))
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [0]:
# https://github.com/pytorch/text/issues/251
def sequence_mask(length):
    return (torch.arange(0, length.max())
                .type_as(length)
                .repeat(length.numel(), 1)
                .lt(length.unsqueeze(1))).t()

In [0]:
def mask_nll_loss(inp, target, mask):
    n_total = mask.sum()
    cross_entropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)))
    loss = cross_entropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, n_total.item()

In [0]:
def do_train(input_variable, lengths, target_variable, mask, max_target_len, 
          encoder, decoder,
          encoder_optimizer, decoder_optimizer, 
          batch_size, clip, max_length=12):
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    
    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[MACHINE.vocab.stoi['<sos>'] for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = False if np.random.rand() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, n_total = mask_nll_loss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * n_total)
            n_totals += n_total
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, n_total = mask_nll_loss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * n_total)
            n_totals += n_total

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [0]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        input_seq = input_seq.to(device)
        input_length = input_length.to(device)

        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        # TODO: changed from decoder.n_layers to encoder.n_layers. Is this correct??
        #import pdb; pdb.set_trace();
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        #decoder_input = torch.LongTensor([[MACHINE.vocab.stoi['<sos>'] for _ in range(batch_size)]])
        decoder_input = torch.ones(1, input_seq.shape[-1], device=device, dtype=torch.long) * MACHINE.vocab.stoi['<sos>']
        #print(f"decoder_input {decoder_input.shape}, decoder_hidden {decoder_hidden.shape}")
        # Initialize tensors to append decoded words to
        all_tokens = []
        all_scores = []

        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, 
                                                           decoder_hidden, 
                                                           encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            #print(f"di = {decoder_input.shape}, ds = {decoder_scores.shape}")
            # Record token and score
            all_tokens.append(decoder_input)
            all_scores.append(decoder_scores)

            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return torch.stack(all_tokens), torch.stack(all_scores)

In [0]:
def evaluate(encoder, decoder, batch):
    with torch.no_grad():
        encoder.train(mode=False)
        decoder.train(mode=False)
        gsd = GreedySearchDecoder(encoder, decoder)
        all_tokens, all_scores  = gsd(batch.human[0], batch.human[1], 12)
        correct = np.sum(np.all(batch.machine[0].data.numpy() == all_tokens.cpu().data.numpy(),
                                axis=0))
        batch_size = batch.machine[0].shape[-1]
        return correct / batch_size, all_tokens, all_scores

In [0]:
def train_iters(encoder, decoder, 
                encoder_optimizer, decoder_optimizer, 
                encoder_n_layers, decoder_n_layers, 
                n_epochs=50, 
                clip=50):

    max_target_len = 12
        
    for epoch in range(n_epochs):
        training_losses = []
        train_iter, val_iter = get_train_val_iter()
        
        print(f"Epoch = {epoch}")
        # Ensure dropout layers are in train mode
        encoder.train()
        decoder.train()
        
        for batch in train_iter:
            input_variable, input_lengths = batch.human
            target_variable, target_lengths = batch.machine
            mask = sequence_mask(target_lengths)
        

            # Run a training iteration with batch
            loss = do_train(input_variable, input_lengths, 
                         target_variable, mask, max_target_len, 
                         encoder, decoder, 
                         encoder_optimizer, decoder_optimizer, 
                         batch_size, 
                         clip)
            training_losses.append(loss)
            
        val_accs = []
        for batch in val_iter:
            val_acc, _, _ = evaluate(encoder, decoder, batch)
            val_accs.append(val_acc)
            

        print(f"Mean Loss: {np.mean(training_losses)}, Mean Val Acc: {np.mean(val_accs)}")

In [0]:
batch_size = 25
def get_train_val_iter():
    train_iter, val_iter = data.BucketIterator.splits((train, val), 
                                                  batch_sizes=(batch_size,batch_size),
                                                  sort_key=lambda x: len(x.human),
                                                  sort_within_batch=True,
                                                )
    return iter(train_iter), iter(val_iter)      

In [37]:
# Configure models
hidden_size = 256
encoder_n_layers = 1
decoder_n_layers = 1
dropout = 0.1

print('Building encoder and decoder ...')
input_size = len(HUMAN.vocab)
output_size = len(MACHINE.vocab)
# Initialize encoder & decoder models
encoder = EncoderRNN(input_size, hidden_size)
decoder = DecoderRNN(input_size, 
                     hidden_size, output_size, 
                     decoder_n_layers, dropout)

gpu_is_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_is_available else "cpu")
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Model is ready to go!')

Building encoder and decoder ...
Model is ready to go!


In [39]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 0.5 #0.5 #1.0
learning_rate = 0.001
decoder_learning_ratio = 5.0
n_epochs = 10

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

# Run training iterations
print("Starting Training!")
train_iters(encoder, decoder, encoder_optimizer, decoder_optimizer,
            encoder_n_layers, decoder_n_layers, n_epochs=n_epochs)

Building optimizers ...
Starting Training!
Epoch = 0
Mean Loss: 0.15787426652136458, Mean Val Acc: 0.561
Epoch = 1
Mean Loss: 0.07634578954401502, Mean Val Acc: 0.635
Epoch = 2
Mean Loss: 0.02502268770417586, Mean Val Acc: 0.9730000000000001
Epoch = 3
Mean Loss: 0.00999346716466682, Mean Val Acc: 0.593
Epoch = 4
Mean Loss: 0.015643942681193495, Mean Val Acc: 1.0
Epoch = 5
Mean Loss: 0.00035623386887313107, Mean Val Acc: 1.0
Epoch = 6
Mean Loss: 0.00016742369553896555, Mean Val Acc: 1.0
Epoch = 7
Mean Loss: 0.00010463752979969006, Mean Val Acc: 1.0
Epoch = 8
Mean Loss: 7.134340589576663e-05, Mean Val Acc: 1.0
Epoch = 9
Mean Loss: 5.0772896396564426e-05, Mean Val Acc: 1.0


In [0]:
def predict(encoder, decoder, human):
    with torch.no_grad():
        encoder.train(mode=False)
        decoder.train(mode=False)
        gsd = GreedySearchDecoder(encoder, decoder)
        all_tokens, all_scores = gsd(human[0], human[1], 12)
        return all_tokens.cpu(), all_scores.cpu()

In [0]:
def get_input_output_labels(input_text, output_text):
    input_text = list(input_text)
    input_text.insert(0,'<sot>')
    input_text.append('<eot>')
    
    output_text = list(MACHINE.reverse(output_text)[0])
    output_text.insert(0,'<sot>')
    output_text.append('<eot>')
        
    return input_text, output_text

In [0]:
input_text = '7 dec, 19'
output_text, scores = predict(encoder, decoder, HUMAN.process([input_text]))

In [69]:
input_label, output_label = get_input_output_labels(input_text, output_text)
input_label, output_label

(['<sot>', '7', ' ', 'd', 'e', 'c', ',', ' ', '1', '9', '<eot>'],
 ['<sot>', '2', '0', '1', '9', '-', '1', '2', '-', '0', '7', '<eot>'])

# References

1. [Recurrent Neural Network with Attention](https://medium.com/datalogue/attention-in-keras-1892773a4f22])