In [None]:
# create dataset of number sequences
# let's assume that we have a vocabulary size of 100 tokens (matches `vocab_size` below)
# token ids as used by the code below: 0 is PAD, 1 is the EOS token, and 2 is the SOS token

In [1]:
# get LM model
import torch 
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
import torch.nn.functional as F
from models.copynet import CopyEncoder, CopyDecoder
from models.seq2seq import Encoder01, Decoder01
from logger import Logger
# Seed PyTorch's RNG. This call only covers torch; Python's `random` and NumPy
# keep their own, separate generators.
torch.manual_seed(1000)
# Hyperparameters
embed_size = 150   # passed to the CopyEncoder / CopyDecoder constructors
hidden_size = 300  # passed to the CopyEncoder / CopyDecoder constructors
num_layers = 1     # NOTE(review): not referenced by the cells visible here - confirm it is still needed
bin_size = 10      # NOTE(review): not referenced by the cells visible here - confirm it is still needed
num_epochs = 30    # number of passes the training loop makes over `train`
batch_size = 20    # number of lines handed to toData() per training step
lr = 0.0001        # initial Adam learning rate; the training loop divides it by 10 when epoch % 20 == 19
vocab_size = 100   # passed to the CopyEncoder / CopyDecoder constructors

step = 0 # number of steps taken; incremented once per batch and used as the x-axis for logger summaries

In [2]:
# get data
# Each line of the file is one example; toData() later splits it on tabs into
# an input sequence, an output sequence and a third field that is ignored.
import random

# with open('data/copynet_data_simple.txt') as f:
with open('data/copynet_data_v1.txt') as f:
    lines = f.readlines()

# Seed Python's RNG as well (torch.manual_seed does not cover it) so that the
# shuffle, and therefore the train/test split, is reproducible across runs.
random.seed(1000)
random.shuffle(lines)
half = len(lines) // 2
# train = lines[:half]
# Small training subset for quick experiments. It is capped at `half` so that
# it can never overlap the held-out `test` slice when the file has fewer than
# 200 lines.
train = lines[:min(100, half)]
test = lines[half:]

In [3]:
# Logger comes from the project's `logger` module; the training loop calls its
# scalar_summary() and histo_summary() methods once per step.
# NOTE(review): './logs' is presumably the output directory - confirm in the logger module.
logger = Logger('./logs')

In [4]:
def toData(batch):
    """Turn raw text lines into padded index arrays, longest target first.

    Each line must hold exactly three tab-separated fields: a comma-separated
    input sequence, a comma-separated output sequence, and a third field that
    is ignored. The token 1 is appended to the end of every sequence and
    shorter sequences are right-padded with 0.

    Args:
        batch: iterable of strings, one example per line.

    Returns:
        (input_out, output_out, in_len, out_len) where
        input_out / output_out are int arrays of shape [b x max_seq] and
        in_len / out_len are arrays of shape [b] holding each row's unpadded
        length (including the appended 1). All four are reordered by
        descending output length.
    """
    src_seqs = []
    tgt_seqs = []
    for raw in batch:
        src_field, tgt_field, _ = raw.replace('\n', '').split('\t')
        src_seqs.append([int(tok) for tok in src_field.split(',')] + [1])
        tgt_seqs.append([int(tok) for tok in tgt_field.split(',')] + [1])

    in_len = np.array([len(seq) for seq in src_seqs])
    out_len = np.array([len(seq) for seq in tgt_seqs])

    n_rows = len(src_seqs)
    input_out = np.zeros([n_rows, max(in_len)], dtype=int)
    output_out = np.zeros([n_rows, max(out_len)], dtype=int)
    for row, (src, tgt) in enumerate(zip(src_seqs, tgt_seqs)):
        input_out[row, :len(src)] = src
        output_out[row, :len(tgt)] = tgt

    # Longest output first, which is the order pack_padded_sequence is fed in
    # the training loop.
    order = np.argsort(out_len)[::-1]
    return input_out[order], output_out[order], in_len[order], out_len[order]

def to_np(x):
    """Return the values held by a Variable/tensor as a NumPy array on the host."""
    host_tensor = x.data.cpu()
    return host_tensor.numpy()

def to_var(x):
    """Wrap a tensor in a Variable, moving it to the GPU first when CUDA is available."""
    device_tensor = x.cuda() if torch.cuda.is_available() else x
    return Variable(device_tensor)

In [5]:
# Batch bookkeeping: floor division means a trailing partial batch is never
# visited by the training loop.
num_samples = len(train)
num_batches = num_samples // batch_size

In [11]:
################ copynet model #####################
# Build a fresh encoder/decoder pair (uncomment the torch.load lines below to
# resume from saved checkpoints instead).
encoder = CopyEncoder(vocab_size, embed_size, hidden_size)
decoder = CopyDecoder(vocab_size, embed_size, hidden_size)
# encoder = torch.load(f='models/encoder01.pth')
# decoder = torch.load(f='models/decoder01.pth')
# NOTE(review): nn.CrossEntropyLoss applies log-softmax itself, so it expects
# unnormalized scores. If CopyDecoder already returns probabilities the loss
# will be squashed - confirm against models/copynet.
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()

# Create the optimizers ONCE, after the models sit on their final device.
# Re-building Adam at the start of every epoch would throw away its running
# moment estimates, so the learning-rate decay below edits param_groups instead.
opt_e = optim.Adam(params=encoder.parameters(), lr=lr)
opt_d = optim.Adam(params=decoder.parameters(), lr=lr)
# opt_e = optim.SGD(params=encoder.parameters(), lr=lr)
# opt_d = optim.SGD(params=decoder.parameters(), lr=lr)

for epoch in range(num_epochs):
    if epoch % 20 == 19:
        # Step decay: divide the learning rate by 10 and push the new value
        # into the existing optimizers. Note that this rebinds the notebook-wide
        # `lr`, so re-running this cell starts from the decayed value.
        lr = lr/10
        for optimizer in (opt_e, opt_d):
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    print("Epoch ",epoch+1)

    # shuffle data
    random.shuffle(train)

    for i in range(num_batches):
        # initialize gradient buffers
        opt_e.zero_grad()
        opt_d.zero_grad()

        # obtain batch outputs (rows come back sorted by descending output length)
        batch = train[i*batch_size:(i+1)*batch_size]
        input_out, output_out, in_len, out_len = toData(batch)

        # mask input to remove padding (1 where the token id is > 0)
        input_mask = np.array(input_out>0, dtype=int)

        # input and output in Variable form (moved to the GPU when available)
        x = to_var(torch.LongTensor(input_out))
        y = to_var(torch.LongTensor(output_out))
        encoded, _ = encoder(x)

        # get initial input of decoder: token id 2 for every row of the batch
        decoder_in = to_var(torch.LongTensor(np.ones(x.size(0),dtype=int))*2)
        s = None
        w = None

        # out_list to store the predicted token ids of every time step
        out_list=[]
        for j in range(y.size(1)): # for every target time step
            # decoder_in (Variable): [b]
            # encoded (Variable): [b x seq x hid]
            # input_out (np.array): [b x seq]
            # s (Variable): [b x hid]
            tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                            encoded_idx=input_out, prev_state=s,
                            weighted=w, order=j)
            # accumulate the per-step outputs along dim 1
            out = tmp_out if j == 0 else torch.cat([out, tmp_out], dim=1)

            # arg-max token of the newest time step
            predicted = out[:,-1].max(1)[1].squeeze()
            # select next input
            if epoch % 2 == 0:
                decoder_in = predicted # train with sequence outputs
            else:
                decoder_in = y[:,j] # train with ground truth
            out_list.append(predicted.cpu().data.numpy())

        # get loss: packing keeps only the first out_len[b] steps of each row,
        # so padded positions do not contribute
        target = pack_padded_sequence(y,out_len.tolist(), batch_first=True)[0]
        out = pack_padded_sequence(out,out_len.tolist(), batch_first=True)[0]
        loss = criterion(out, target)
        loss.backward()
        opt_e.step()
        opt_d.step()
        step += 1
        info = {
            'loss': loss.data[0]
        }
        for tag, value in info.items():
            logger.scalar_summary(tag,value,step)

        for tag, value in encoder.named_parameters():
            tag = 'encoder/'+tag
            logger.histo_summary(tag, to_np(value), step)
            # a parameter that took no part in this step's graph has no gradient
            if value.grad is not None:
                logger.histo_summary(tag+'/grad', to_np(value.grad), step)

        for tag, value in decoder.named_parameters():
            tag = 'decoder/'+tag
            logger.histo_summary(tag, to_np(value), step)
            if value.grad is not None:
                logger.histo_summary(tag+'/grad', to_np(value.grad), step)

    # print loss, targets and predictions of the epoch's last batch
    # (both arrays are laid out as [time step x batch entry])
    print("Loss: ", loss.data[0])
    print("Ground truth: ")
    print(y.cpu().data.numpy().transpose())
    print("Predictions: ")
    print(np.array(out_list))

Epoch  1
Loss:  4.511541843414307
Ground truth: 
[[71 12 87 88  9 85 61 30 48 71 69 73 50 34 99 65 37 31 93 70]
 [58 23 75 15 57 83 22 21 56 52 79 61 87 21 31 66 39 73 19  5]
 [46 27 94 94 54 35 83 26 82 92 35 14 75 47 98 95 75 93 44 79]
 [ 4 90  3 31 77  4 53 25  5 72 31 93 62 62 16 68 29 72 27 20]
 [67 92 82 68 11 72 67 22 58 46 53 49 79 66 49 41 75 98 25 60]
 [ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1]]
Predictions: 
[[0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 1]
 [1 1 1 0 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1]
 [0 0 0 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 1]
 [0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 0 0]
 [1 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0 1 1]]
Epoch  2
Loss:  4.505768299102783
Ground truth: 
[[71 50 58 63 85 63 88  8 75 30 61  6 68 32 83 89 27 24 15  3]
 [52 87 52 47 38 96 15 53 32 53 22 50 46 16 59 27 83 54 11 96]
 [92 75 84 34  9 36 94 59 48 99 83 98 47 54 10 68 19 58 78 23]
 [72 62 52 45 62 61 31 45 42 88 53 96  8 12 23 65 25 34 69 74