In [None]:
# Create a dataset of number sequences.
# Assume a vocabulary of 100 token ids (see vocab_size below).
# Token conventions used by the code: 0 is PAD, 1 is EOS, and 2 is SOS.

In [3]:
import torch 
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn.functional as F
from models.copynet import CopyEncoder, CopyDecoder
from models.seq2seq import Encoder01, Decoder01
from logger import Logger
import matplotlib.pyplot as plt
%matplotlib inline
import random  # the training loop shuffles `train` with random.shuffle

# Seed every RNG the notebook relies on.  Seeding torch alone leaves the
# per-epoch random.shuffle(train) unseeded, so batch order (and therefore the
# loss curve) would differ from run to run.
SEED = 1000
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Hyperparameters
embed_size = 150    # embedding width handed to CopyEncoder / CopyDecoder
hidden_size = 300   # hidden width handed to CopyEncoder / CopyDecoder
num_layers = 1      # not referenced by the cells shown in this notebook
bin_size = 10       # not referenced by the cells shown in this notebook
num_epochs = 100    # passes over `train` in the training loop
batch_size = 20     # examples per optimisation step
lr = 0.001          # Adam learning rate
vocab_size = 100    # vocabulary size handed to CopyEncoder / CopyDecoder

step = 0  # global optimisation-step counter, used as the x-axis for logging

In [4]:
import random

# Read the raw dataset: one example per line, trailing newlines retained.
# (Alternative file: 'data/copynet_data_simple.txt'.)
with open('data/copynet_data_v2.txt') as f:
    lines = list(f)

# Midpoint of the file; only relevant to a half/half split, which the fixed
# slices below do not use.
half = len(lines) // 2

# Fixed windows of up to 100 lines each, disjoint by index.
train = lines[200:300]
test = lines[100:200]

In [5]:
# Logger comes from the project's `logger` module and is pointed at ./logs.
# The training loop calls its scalar_summary / histo_summary methods once per
# optimisation step (presumably a TensorBoard-style writer — confirm in logger.py).
logger = Logger('./logs')

In [6]:
def toData(batch):
    """Turn raw dataset lines into padded integer arrays.

    Each line holds three tab-separated fields.  The first two are
    comma-separated integer token ids (source and target); the third is
    ignored.  Token 1 is appended to both sequences as the end marker and
    shorter rows are right-padded with 0.

    Args:
        batch: list of strings, one example per line.

    Returns:
        input_out:  int array [b x max source length], padded source ids.
        output_out: int array [b x max target length], padded target ids.
        in_len:     int array [b], source lengths (end marker included).
        out_len:    int array [b], target lengths (end marker included).
        All four are ordered by decreasing target length.
    """
    stripped = [raw.replace('\n', '') for raw in batch]

    sources, targets = [], []
    for record in stripped:
        src_field, tgt_field, _ = record.split('\t')
        sources.append([int(tok) for tok in src_field.split(',')] + [1])
        targets.append([int(tok) for tok in tgt_field.split(',')] + [1])

    in_len = np.array([len(seq) for seq in sources])
    out_len = np.array([len(seq) for seq in targets])

    # Zero-filled canvases wide enough for the longest row of each side.
    n_rows = len(batch)
    input_out = np.zeros([n_rows, max(in_len)], dtype=int)
    output_out = np.zeros([n_rows, max(out_len)], dtype=int)
    for row, (src, tgt) in enumerate(zip(sources, targets)):
        input_out[row, :len(src)] = src
        output_out[row, :len(tgt)] = tgt

    # Longest target first.
    order = out_len.argsort()[::-1]
    return input_out[order], output_out[order], in_len[order], out_len[order]

def to_np(x):
    """Return the values held by `x` as a numpy array on the CPU."""
    host_copy = x.data.cpu()
    return host_copy.numpy()

def to_var(x):
    """Wrap `x` in a Variable, moving it to the GPU first when CUDA is available."""
    placed = x.cuda() if torch.cuda.is_available() else x
    return Variable(placed)

def visualize(x):
    """Draw `x` as a pseudocolor grid with matplotlib (via plt.pcolor)."""
    grid = x.cpu().data.numpy()
    plt.pcolor(grid)

In [7]:
# Number of full batches per epoch; a trailing partial batch is dropped.
num_samples = len(train)
num_batches = num_samples // batch_size

In [None]:
################ copynet model #####################
# Build the CopyNet encoder/decoder pair, one Adam optimiser for each, and the
# loss, then train for num_epochs passes over `train`.
encoder = CopyEncoder(vocab_size, embed_size, hidden_size)
decoder = CopyDecoder(vocab_size, embed_size, hidden_size)
opt_e = optim.Adam(params=encoder.parameters(), lr=lr)
opt_d = optim.Adam(params=decoder.parameters(), lr=lr)
# To resume from saved checkpoints instead of fresh models:
# encoder = torch.load(f='models/encoder01.pth')
# decoder = torch.load(f='models/decoder01.pth')
# NOTE(review): CrossEntropyLoss applies log-softmax itself, so it expects
# unnormalised scores — confirm CopyDecoder does not already return probabilities.
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()


for epoch in range(num_epochs):
    # Disabled learning-rate decay (divide by 10 every 20 epochs):
#     if epoch % 20 == 19:
#         lr=lr/10
    print("==================================================")
    print("Epoch ",epoch+1)
    # Disabled alternative: rebuild SGD optimisers each epoch.
#     opt_e = optim.SGD(params=encoder.parameters(), lr=lr)
#     opt_d = optim.SGD(params=decoder.parameters(), lr=lr)
    
    # shuffle data (in place, so `train` itself is reordered every epoch)
    random.shuffle(train)
    
    for i in range(num_batches):
        # initialize gradient buffers
        opt_e.zero_grad()
        opt_d.zero_grad()

        # obtain batch outputs; toData returns rows ordered by decreasing
        # target length, which pack_padded_sequence below relies on
        batch = train[i*batch_size:(i+1)*batch_size]
        input_out, output_out, in_len, out_len = toData(batch)
        
        # mask input to remove padding (1 where token id > 0);
        # not referenced again in this cell
        input_mask = np.array(input_out>0, dtype=int)
        
        # input and output in Variable form
        x = torch.LongTensor(input_out)
        y = torch.LongTensor(output_out)
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
        x = Variable(x)
        y = Variable(y)
#         print("input",x)
        encoded, _ = encoder(x)
        
        # get initial input of decoder: token id 2 for every row of the batch
        decoder_in = torch.LongTensor(np.ones(x.size(0),dtype=int))*2
        # decoder state and weighted value start empty and are threaded
        # through successive decoder calls
        s = None
        w = None
        if torch.cuda.is_available():
            decoder_in = decoder_in.cuda()
        decoder_in = Variable(decoder_in)
        
        # out_list to store outputs (predicted ids, one array per time step)
        out_list=[]
#         for j in range(3): # for all sequences
        for j in range(y.size(1)): # one decoder call per target time step
            """
            decoder_in (Variable): [b]
            encoded (Variable): [b x seq x hid]
            input_out (np.array): [b x seq]
            s (Variable): [b x hid]
            """
            # first step creates `out`; later steps are appended along dim 1
            if j==0:
                out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s, 
                                weighted=w, order=j)
            else:
                tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s, 
                                weighted=w, order=j)
                out = torch.cat([out,tmp_out],dim=1)
            # select next input: odd epochs feed back the decoder's own
            # argmax, even epochs feed the ground-truth token
            if epoch % 2 ==1:
                decoder_in = out[:,-1].max(1)[1].squeeze() # train with sequence outputs
            else:
                decoder_in = y[:,j] # train with ground truth
            # record this step's argmax prediction for every row
            out_list.append(out[:,-1].max(1)[1].squeeze().cpu().data.numpy())
        # get loss: element [0] of a PackedSequence is its flattened data, so
        # only the non-padded positions of target and output are compared
        target = pack_padded_sequence(y,out_len.tolist(), batch_first=True)[0]
        pad_out = pack_padded_sequence(out,out_len.tolist(), batch_first=True)[0]
        loss = criterion(pad_out, target)
        loss.backward()
        opt_e.step()
        opt_d.step()
        step += 1
        # NOTE(review): loss.data[0] is the pre-0.4 PyTorch idiom; releases
        # where the loss is 0-dimensional need loss.item() instead.
        info = {
            'loss': loss.data[0]
        }
        for tag, value in info.items():
            logger.scalar_summary(tag,value,step)
        
        # histogram every parameter and its gradient at every step
        # NOTE(review): to_np(value.grad) would fail for a parameter whose
        # grad is None — assumes every parameter takes part in the loss.
        for tag, value in encoder.named_parameters():
            tag = 'encoder/'+tag
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag+'/grad', to_np(value.grad), step)

        for tag, value in decoder.named_parameters():
            tag = 'decoder/'+tag
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag+'/grad', to_np(value.grad), step)
    
    # every 20th epoch, dump the last batch; arrays are printed so that each
    # column is one example and each row one time step
    if epoch % 20==0:
        print("-----------------------------")
        print("Inputs:")
        print(x.cpu().data.numpy().transpose())
        print("Ground truth: ")
        print(y.cpu().data.numpy().transpose())
        print("Predictions: ")
        print(np.array(out_list))
    # loss of the last batch of this epoch
    print("Loss: ", loss.data[0])
#     print("-----------------------------")
#     print("Inputs:")
#     print(x.cpu().data.numpy().transpose())
#     print("Ground truth: ")
#     print(y.cpu().data.numpy().transpose())
#     print("Predictions: ")
#     print(np.array(out_list))

Epoch  1
-----------------------------
Inputs:
[[64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64]
 [58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58]
 [ 3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3]
 [ 7 70 56 17 20 85 89 74 50 46 65 89 54 67 99 66 98 14 94 54]
 [32 63 12 82 85 20  9 22 59 76 72 29 84 10 14 45  5 33 13 47]
 [35 80 19 10 28 85 17 51 45 83 25 95 13 24 83 29 57 10 64 32]
 [76 26 87 14  9 65 20 78 53 77 69 78 66 30 90 96 85  8 16 76]
 [68 10 99 52 54 32  5 97 15 41 70 23 60  5 69 16 76 63 49 19]
 [ 3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3]
 [65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65]
 [ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1]]
Ground truth: 
[[37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37]
 [85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85]
 [63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63]
 [ 7 70 56 17 20 85 89 74 50 46 65 89 54 67 99 66 98 14

In [None]:
################ test copynet model #####################
# Runs a single extra pass with the decoder fed its own predictions.
#
# The original header was `for epoch in 1:`, which raises
# "TypeError: 'int' object is not iterable".  Iterating over [1] keeps the
# literal epoch value: epoch 1 is odd, so the branch below feeds the decoder its
# own argmax output at every step instead of the ground truth.
#
# NOTE(review): despite the "test" label this cell still iterates over `train`
# and performs optimiser steps; evaluating on `test` without updates would be a
# separate change.
#
# To evaluate saved checkpoints instead of the in-memory models:
# encoder = torch.load(f='models/encoder01.pth')
# decoder = torch.load(f='models/decoder01.pth')

for epoch in [1]:
    # Learning-rate decay hook kept from the training loop (inactive for epoch 1).
    if epoch % 20 == 19:
        lr = lr / 10
    print("==================================================")
    print("Epoch ", epoch + 1)
    # Fresh Adam optimisers: their moment estimates restart from zero here.
    opt_e = optim.Adam(params=encoder.parameters(), lr=lr)
    opt_d = optim.Adam(params=decoder.parameters(), lr=lr)

    # shuffle data (in place)
    random.shuffle(train)

    for i in range(num_batches):
        # initialize gradient buffers
        opt_e.zero_grad()
        opt_d.zero_grad()

        # obtain batch outputs; rows come back ordered by decreasing target
        # length, which pack_padded_sequence below relies on
        batch = train[i * batch_size:(i + 1) * batch_size]
        input_out, output_out, in_len, out_len = toData(batch)

        # 1 where the input holds a real token (id > 0); not referenced again
        # in this cell
        input_mask = np.array(input_out > 0, dtype=int)

        # input and output in Variable form
        x = torch.LongTensor(input_out)
        y = torch.LongTensor(output_out)
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
        x = Variable(x)
        y = Variable(y)
        encoded, _ = encoder(x)

        # initial decoder input: token id 2 for every row of the batch
        decoder_in = torch.LongTensor(np.ones(x.size(0), dtype=int)) * 2
        s = None
        w = None
        if torch.cuda.is_available():
            decoder_in = decoder_in.cuda()
        decoder_in = Variable(decoder_in)

        # predicted ids, one array per time step
        out_list = []
        for j in range(y.size(1)):  # one decoder call per target time step
            """
            decoder_in (Variable): [b]
            encoded (Variable): [b x seq x hid]
            input_out (np.array): [b x seq]
            s (Variable): [b x hid]
            """
            # first step creates `out`; later steps are appended along dim 1
            if j == 0:
                out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                    encoded_idx=input_out, prev_state=s,
                                    weighted=w, order=j)
            else:
                tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                        encoded_idx=input_out, prev_state=s,
                                        weighted=w, order=j)
                out = torch.cat([out, tmp_out], dim=1)

            # argmax prediction of the newest step, computed once and reused
            step_pred = out[:, -1].max(1)[1].squeeze()

            # select next input
            if epoch % 2 == 1:
                decoder_in = step_pred  # feed back the model's own output
            else:
                decoder_in = y[:, j]    # feed the ground truth
            out_list.append(step_pred.cpu().data.numpy())

        # get loss: element [0] of a PackedSequence is its flattened data, so
        # only the non-padded positions of target and output are compared
        target = pack_padded_sequence(y, out_len.tolist(), batch_first=True)[0]
        pad_out = pack_padded_sequence(out, out_len.tolist(), batch_first=True)[0]
        loss = criterion(pad_out, target)
        loss.backward()
        opt_e.step()
        opt_d.step()
        step += 1
        # NOTE(review): loss.data[0] is the pre-0.4 PyTorch idiom used across
        # this notebook; newer releases need loss.item() instead.
        info = {
            'loss': loss.data[0]
        }
        for tag, value in info.items():
            logger.scalar_summary(tag, value, step)

        # histogram every parameter and its gradient at every step
        for tag, value in encoder.named_parameters():
            tag = 'encoder/' + tag
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag + '/grad', to_np(value.grad), step)

        for tag, value in decoder.named_parameters():
            tag = 'decoder/' + tag
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag + '/grad', to_np(value.grad), step)

    # Sample dump, same condition as the training loop (not triggered for
    # epoch 1; x, y and out stay available to the inspection cells below).
    if epoch % 20 == 0:
        print("-----------------------------")
        print("Inputs:")
        print(x.cpu().data.numpy().transpose())
        print("Ground truth: ")
        print(y.cpu().data.numpy().transpose())
        print("Predictions: ")
        print(np.array(out_list))
    # loss of the last batch
    print("Loss: ", loss.data[0])

In [None]:
# Stack the list decoder.prob_c_to_g along a new dim 1 and plot index 0 of that dim.
visualize(torch.stack(decoder.prob_c_to_g,dim=1)[:,0])

In [None]:
# Argmax indices along dim 1 of the same decoder.prob_c_to_g slice plotted above.
torch.stack(decoder.prob_c_to_g,dim=1)[:,0].max(1)[1]

In [None]:
# Stack the list decoder.probs along a new dim 1 and plot index 0 of that dim.
visualize(torch.stack(decoder.probs,dim=1)[:,0])

In [None]:
# Sum the entries from index 100 onward of the decoder.probs slice.
# 100 equals vocab_size, so this is presumably the mass placed outside the
# fixed vocabulary — confirm against CopyDecoder.
torch.stack(decoder.probs,dim=1)[:,0,100:].sum(1)

In [None]:
# Same expression as the first decoder.prob_c_to_g plot cell, repeated.
visualize(torch.stack(decoder.prob_c_to_g,dim=1)[:,0])

In [None]:
# Plot index 0 along dim 1 of `out`, the decoder output left over from the last batch run.
visualize(out[:,0])

In [None]:
# Stack the list decoder.attn along a new dim 1 and plot index 2 of that dim.
visualize(torch.stack(decoder.attn,dim=1)[:,2])

In [None]:
# Display the input batch left over from the last loop iteration.
x

In [None]:
# Display the target batch left over from the last loop iteration.
y

In [None]:
# Argmax indices over dim 2 of `out`, with size-1 dims squeezed away.
out.max(2)[1].squeeze()

In [None]:
# Stack the list decoder.W along a new dim 1, squeeze size-1 dims, take index 1
# along dim 1 and sum the result over dim 1.
torch.stack(decoder.W,1).squeeze()[:,1].sum(1)