In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
import torch.utils.data

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [27]:
def load_data(path):
    df = pd.read_csv(path, header=None)
    X = df[0].values
    y = df[1].values
    x_tok = Tokenizer(char_level=True, filters='')
    x_tok.fit_on_texts(X)
    y_tok = Tokenizer(char_level=True, filters='')
    y_tok.fit_on_texts(y)
    
    X = x_tok.texts_to_sequences(X)
    y = y_tok.texts_to_sequences(y)
    
    X = pad_sequences(X)
    y = np.asarray(y)
    
    return X, y, x_tok.word_index, y_tok.word_index

X, y, x_wid, y_wid= load_data('data/data.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y)
print('train size: {} - test size: {}'.format(len(X_train), len(X_test)))

train size: 18750 - test size: 6250


In [3]:
hidden_size = 128
learning_rate = 0.001
decoder_learning_ratio = 0.1

# plus 1 is padding token
input_size = len(x_wid) + 1
# plus 2 is sos and eos token
output_size = len(y_wid) + 2
sos_idx = len(y_wid) 
eos_idx = len(y_wid) + 1

max_length = y.shape[1]
print("input vocab: {} - output vocab: {} - length of target: {}".format(input_size, output_size, max_length))

input vocab: 55 - output vocab: 13 - length of target: 10


In [67]:
def decoder_sentence(idxs, vocab):
    for 

In [4]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
    
    def forward(self, input):
        # input: SxB        
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded)
        return output, hidden # SxBxH, 1xBxH              

class Attn(nn.Module):
    def __init__(self, hidden_size):
        super(Attn ,self).__init__()
        
    def forward(self, hidden, encoder_outputs):
        # encoder_outputs: TxBxH
        # hidden: SxBxH
        encoder_outputs = torch.transpose(encoder_outputs, 0, 1) #BxTxH
        hidden = torch.transpose(torch.transpose(hidden, 0, 1), 1, 2) # BxHxS
        energies = torch.bmm(encoder_outputs, hidden) # BxTxS
        energies = torch.transpose(energies, 1, 2) # BxSxT
        attn_weights = F.softmax(energies, dim=-1) #BxSxT
        
        output = torch.bmm(attn_weights, encoder_outputs) # BxSxH
        output = torch.transpose(output, 0, 1) # SxBxH
        attn_weights = torch.transpose(attn_weights, 0, 1) #SxBxT
        
        return output, attn_weights
    
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, dropout):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = Attn(hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.concat = nn.Linear(self.hidden_size*2, hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input, hidden, encoder_outputs):
        # input: SxB
        # encoder_outputs: BxSxH
        # hidden: 1xBxH
        embedded = self.embedding(input) # 1xBxH
        embedded = self.dropout(embedded)
        rnn_output, hidden = self.gru(embedded, hidden)  #SxBxH, 1xBxH
        context, attn_weights = self.attn(rnn_output, encoder_outputs) # SxBxH
        concat_input = torch.cat((rnn_output, context), -1)
        concat_output = torch.tanh(self.concat(concat_input)) #SxBxH
        
        output = self.out(concat_output) # SxBxoutput_size
        return output, hidden, attn_weights

  

In [5]:
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(output_size, hidden_size, 0.1)

# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
criterion = nn.CrossEntropyLoss()


input_encoder = torch.randint(1, input_size, (34, 6), dtype=torch.long)
encoder_outputs, hidden = encoder(input_encoder)
input_decoder = torch.randint(1, output_size, (10, 6), dtype=torch.long)
output, hidden, attn_weights = decoder(input_decoder, hidden, encoder_outputs)

In [80]:
def forward_and_compute_loss(inputs, targets, encoder, decoder, criterion):
    batch_size = inputs.size()[1]
    
    sos = Variable(torch.ones((1, batch_size), dtype=torch.long)*sos_idx)
    eos = Variable(torch.ones((1, batch_size), dtype=torch.long)*eos_idx)
    
    decoder_inputs = torch.cat((sos, targets[:-1]), dim=0)
    decoder_targets = torch.cat((targets[1:], eos), dim=0)
    
    encoder_outputs, encoder_hidden = encoder(inputs)
    output, hidden, attn_weights = decoder(decoder_inputs, encoder_hidden, encoder_outputs)
    
    output = torch.transpose(torch.transpose(output, 0, 1), 1, 2) # BxCxS
    decoder_targets = torch.transpose(decoder_targets, 0, 1)
    loss = criterion(output, decoder_targets)
    
    return loss

def train(inputs, targets,  encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    encoder.train()
    decoder.train()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    train_loss = forward_and_compute_loss(inputs, targets,encoder, decoder,criterion)    
    
    train_loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
            
    return train_loss.item()

def evaluate(inputs, targets, encoder, decoder, criterion):
    encoder.eval()
    decoder.eval()
    eval_loss = forward_and_compute_loss(*eval_data, encoder, decoder,criterion)
    
    return eval_loss.item()

def predict(inputs, encoder, decoder, target_length=max_length):
    
    batch_size = inputs.size()[1]
    decoder_inputs = Variable(torch.ones((1, batch_size), dtype=torch.long)*sos_idx)
    encoder_outputs, encoder_hidden = encoder(inputs)
    hidden = encoder_hidden
    preds = []
    for i in range(target_length):
        output, hidden, attn_weights = decoder(decoder_inputs, hidden, encoder_outputs)
        output = output.squeeze(dim=0)
        pred_idx = torch.argmax(output, dim=-1)
        
        decoder_inputs = Variable(torch.ones((1, batch_size), dtype=torch.long)*pred_idx)
        preds.append(decoder_inputs)
        
    preds = torch.cat(preds, dim=0)
    preds = torch.transpose(preds, 0, 1).data.cpu().numpy()
    return preds

In [38]:
train(input_encoder, input_decoder, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

2.995781183242798

In [41]:
epochs = 20
batch_size = 64

encoder = Encoder(input_size, hidden_size)
decoder = Decoder(output_size, hidden_size, 0.1)

# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
criterion = nn.CrossEntropyLoss()

X_val = torch.tensor(X_test, dtype=torch.long)
y_val = torch.tensor(y_test, dtype=torch.long)
X_val = torch.transpose(X_val, 0, 1)
y_val = torch.transpose(y_val, 0, 1)
eval_data = X_val, y_val

for epoch in range(epochs):
    for idx in range(len(X_train)//batch_size):
        X_train_batch = torch.tensor(X_train[batch_size*idx:batch_size*(idx+1)], dtype=torch.long)
        y_train_batch = torch.tensor(y_train[batch_size*idx:batch_size*(idx+1)], dtype=torch.long)
        
        X_train_batch = torch.transpose(X_train_batch, 0, 1)
        y_train_batch = torch.transpose(y_train_batch, 0, 1)
        train_loss= train(X_train_batch, y_train_batch, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
    eval_loss = evaluate(X_val, y_val, encoder, decoder, criterion)
    print('train loss: {:.3f} - eval loss: {:.3f}'.format(train_loss, eval_loss))

train loss: 0.267 - eval loss: 0.272
train loss: 0.065 - eval loss: 0.085
train loss: 0.037 - eval loss: 0.056
train loss: 0.026 - eval loss: 0.046
train loss: 0.025 - eval loss: 0.041
train loss: 0.020 - eval loss: 0.044
train loss: 0.019 - eval loss: 0.037
train loss: 0.017 - eval loss: 0.030
train loss: 0.011 - eval loss: 0.026
train loss: 0.009 - eval loss: 0.021
train loss: 0.008 - eval loss: 0.019
train loss: 0.006 - eval loss: 0.014
train loss: 0.004 - eval loss: 0.013
train loss: 0.003 - eval loss: 0.009
train loss: 0.003 - eval loss: 0.009
train loss: 0.005 - eval loss: 0.008
train loss: 0.008 - eval loss: 0.006
train loss: 0.003 - eval loss: 0.005
train loss: 0.002 - eval loss: 0.004
train loss: 0.005 - eval loss: 0.004


In [81]:
predict(X_val[:, :2] ,encoder, decoder, target_length=10)

array([[ 2,  8,  2,  2,  2,  2,  2,  2,  2,  3],
       [ 5,  7,  1,  6,  1,  3, 12,  3, 12,  3]])