In [None]:
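# For real data:
# create a dataset of number sequences.
# Let's assume that we have a vocabulary size of 1000 words.
# Let's assume that 0 is the EOS token, 1 is the SOS token, and 2 is PAD.
# NOTE(review): toData pads with 0 and the cells below build input_mask from
# `input_out>0`, i.e. index 0 is treated as padding there -- confirm the ids against w2i.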

In [1]:
def toData(batch):
    """Turn raw ``"input::output"`` lines into zero-padded index matrices.

    Every line holds two whitespace-separated sequences of integers joined by
    ``'::'``. Tokens are split on any run of whitespace, so trailing newlines,
    trailing spaces and repeated spaces are tolerated. No EOS/SOS token is
    added here; the sequences are used exactly as written in the line.

    [input] batch: list of strings
    [output] input_out, output_out: np.array([b x seq]), int, right-padded with 0
    [output] in_len, out_len: np.array([b]), number of tokens of each sequence

    All four outputs are reordered with the same permutation so that rows are
    in descending order of output length. An empty batch yields two arrays of
    shape (0, 0) and two empty length arrays.

    Raises ValueError if a line does not contain exactly one ``'::'``, if a
    token is not an integer, or if either side of a line has no tokens.
    """
    inputs_ = []
    outputs_ = []
    for line in batch:
        inputs, outputs = line.split('::')
        # split() without argument ignores leading/trailing/repeated whitespace,
        # which split(' ') turned into empty tokens that int() rejects.
        in_seq = [int(num) for num in inputs.split()]
        out_seq = [int(num) for num in outputs.split()]
        if not in_seq or not out_seq:
            raise ValueError("empty sequence in line: %r" % (line,))
        inputs_.append(in_seq)
        outputs_.append(out_seq)

    batch_size = len(batch)
    in_len = np.array([len(seq) for seq in inputs_], dtype=int)
    out_len = np.array([len(seq) for seq in outputs_], dtype=int)
    # max() of an empty array raises, so fall back to width 0 for an empty batch
    max_in = int(in_len.max()) if batch_size else 0
    max_out = int(out_len.max()) if batch_size else 0

    input_out = np.zeros([batch_size, max_in], dtype=int)
    output_out = np.zeros([batch_size, max_out], dtype=int)
    for b in range(batch_size):
        input_out[b, :in_len[b]] = inputs_[b]
        output_out[b, :out_len[b]] = outputs_[b]

    # row order: longest output sequence first
    out_rev = out_len.argsort()[::-1]
    return input_out[out_rev], output_out[out_rev], in_len[out_rev], out_len[out_rev]

In [2]:
import numpy as np

# w2i / i2w are stored as single Python objects inside .npy files, hence `.item()`.
# NumPy >= 1.16.3 refuses to unpickle such object arrays unless allow_pickle=True.
# SECURITY: unpickling can execute arbitrary code -- only load vocabulary files you trust.
w2i = np.load('data/en-django/en-django/w2i.npy', allow_pickle=True).item()
i2w = np.load('data/en-django/en-django/i2w.npy', allow_pickle=True).item()
vocab_size = len(w2i)
print(vocab_size)

7667


In [24]:
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn.functional as F
# from models.copynet import CopyEncoder, CopyDecoder
from models.copynet_dbg import CopyEncoder, CopyDecoder
from models.functions import numpy_to_var, to_np, to_var, visualize, decoder_initial, update_logger
import time
import sys
import math
import random

# Seed PyTorch *and* Python's `random`. The train/test split below is made with
# random.shuffle; without a seed it changes on every kernel start, so a model
# resumed from a checkpoint could be trained on lines that were test lines before.
torch.manual_seed(1000)
random.seed(1000)

# Hyperparameters
embed_size = 150
hidden_size = 300
num_layers = 1
bin_size = 10
num_epochs = 40
prev_end=0
batch_size = 16
lr = 0.001
weight_decay = 0.99 # per-epoch multiplier applied to lr by the training cell
use_saved = True # whether to train from a previous model
continue_from = 7 # checkpoint suffix that is loaded when use_saved is True
# version = 'django'
version = 'django_fixed'
step = 0 # number of steps taken

# input and output directories
# vocab_size is derived from the vocabulary file (a former placeholder value of
# 100 was always overwritten here and has been dropped).
# NumPy >= 1.16.3 needs allow_pickle=True to load these pickled objects.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
w2i = np.load('data/en-django/en-django/w2i.npy', allow_pickle=True).item()
i2w = np.load('data/en-django/en-django/i2w.npy', allow_pickle=True).item()
vocab_size = len(w2i)
# file_dir = 'data/en-django/en-django/idx_lists.txt'
file_dir = 'data/en-django/en-django/idx_lists_fixed.txt'
# get training and test data
with open(file_dir) as f:
    lines = f.readlines()

# shuffled split: the first 200 lines are held out for testing
random.shuffle(lines)
test = lines[:200]
train = lines[200:]

# get number of batches (full batches only)
num_samples = len(train)
num_batches = num_samples // batch_size

################ load copynet model #####################
if use_saved:
    # if using from previous data
    # NOTE: torch.load unpickles whole model objects; only load trusted checkpoints.
    encoder_dir = 'models/encoder_%s_%s.pckl' % (version,continue_from)
    decoder_dir = 'models/decoder_%s_%s.pckl' % (version,continue_from)
    encoder = torch.load(f=encoder_dir)
    decoder = torch.load(f=decoder_dir)
else:
    encoder = CopyEncoder(vocab_size, embed_size, hidden_size)
    decoder = CopyDecoder(vocab_size, embed_size, hidden_size)
    continue_from = 0
if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()
    

In [None]:
# Interactive lookup only: prints the documentation of the encoder's
# load_state_dict method. Has no effect on the notebook state.
help(encoder.load_state_dict)

In [None]:
################################# training ##################################
# Teacher-forced training of the encoder/decoder pair on `train`.
# Uses names created by the setup cell: encoder, decoder, train, lr,
# weight_decay, batch_size, num_epochs, version, continue_from, step.

# set loss
# NLLLoss consumes log-probabilities; torch.log is applied to the decoder
# output further down before the loss is computed.
criterion = nn.NLLLoss()

start = time.time()
for epoch in range(num_epochs):
    print("==================================================")
    print("Epoch ",epoch+1)
    # New optimizers are built every epoch so that the decayed lr is picked up.
    # NOTE(review): this also discards Adam's running moment estimates each epoch.
    opt_e = optim.Adam(params=encoder.parameters(), lr=lr)
    opt_d = optim.Adam(params=decoder.parameters(), lr=lr)
    lr= lr * weight_decay # learning-rate decay for the next epoch (despite the name)
    # shuffle data
    random.shuffle(train)
    samples_read = 0
    while(samples_read<len(train)):
        # initialize gradient buffers
        opt_e.zero_grad()
        opt_d.zero_grad()

        # obtain batch outputs
        batch = train[samples_read:min(samples_read+batch_size,len(train))]
        # Bind the toData results to the names the rest of this cell (and the
        # validation cells) use. Previously they were unpacked into other names,
        # so `input_out`/`output_out` below were undefined on a fresh kernel or
        # silently taken from an earlier validation batch.
        input_out, output_out, in_len, out_len = toData(batch)
        # cap target sequences at 50 steps; lengths are clamped to match
        output_out = output_out[:,:50]
        out_len = np.array([min(50,n) for n in out_len])
        samples_read+=len(batch)

        # mask input to remove padding (1 where the index is > 0)
        input_mask = np.array(input_out>0, dtype=int)

        # input and output in Variable form
        x = numpy_to_var(input_out)
        y = numpy_to_var(output_out)

        # apply to encoder
        encoded, _ = encoder(x)

        # get initial input of decoder
        decoder_in, s, w = decoder_initial(x.size(0))

        # out_list to store outputs
        out_list=[]
        for j in range(y.size(1)): # one decoder step per target position
            # decoder_in (Variable): [b]
            # encoded (Variable): [b x seq x hid]
            # input_out (np.array): [b x seq]
            # s (Variable): [b x hid]
            # 1st state
            if j==0:
                out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s,
                                weighted=w, order=j)
            # remaining states: append this step's output along dim 1
            else:
                tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s,
                                weighted=w, order=j)
                out = torch.cat([out,tmp_out],dim=1)
            # for debugging: stop if nan
            if math.isnan(w[-1][0][0].data[0]):
                sys.exit()
            # select next input
            decoder_in = y[:,j] # train with ground truth

        # Packing drops the padded positions so they do not contribute to the
        # loss; [0] is the packed data. toData orders rows by descending output
        # length and the clamp to 50 keeps that order non-increasing.
        target = pack_padded_sequence(y,out_len.tolist(), batch_first=True)[0]
        pad_out = pack_padded_sequence(out,out_len.tolist(), batch_first=True)[0]
        # include log computation as we are using log-softmax and NLL
        pad_out = torch.log(pad_out)
        loss = criterion(pad_out, target)
        loss.backward()
        # modulus 1 logs every batch; raise it to log less often
        if samples_read%1==0:
            print("[%d/%d] Loss: %1.4f"%(samples_read,len(train),loss.data[0]))
        opt_e.step()
        opt_d.step()
        step += 1
        info = {
            'loss': loss.data[0]
        }
    elapsed = time.time()
    print("Elapsed time for epoch: ",elapsed-start)
    start = time.time()

    # NOTE(review): with continue_from > 0 the first epoch writes to the same
    # file name the model was loaded from (suffix epoch+continue_from with
    # epoch == 0) -- confirm that overwriting the resumed checkpoint is intended.
    torch.save(f='models/encoder_%s_%s.pckl' % (version,str(epoch+continue_from)),obj=encoder)
    torch.save(f='models/decoder_%s_%s.pckl' % (version,str(epoch+continue_from)),obj=decoder)

In [25]:
# Separate encoder/decoder instances used for debugging and evaluation.
# They start with freshly initialised weights.
# encoder_debug = encoder
# decoder_debug = decoder
encoder_debug = CopyEncoder(vocab_size, embed_size, hidden_size)
decoder_debug = CopyDecoder(vocab_size, embed_size, hidden_size)
# Same guard as for the main models: only move to the GPU when one is available.
if torch.cuda.is_available():
    encoder_debug.cuda()
    decoder_debug.cuda()
# last expression of the cell: display the decoder's layer summary
decoder_debug

CopyDecoder (
  (embed): Embedding(7667, 150)
  (gru): GRU(750, 300, batch_first=True)
  (Ws): Linear (600 -> 300)
  (Wo): Linear (300 -> 7667)
  (Wc): Linear (600 -> 300)
  (nonlinear): Tanh ()
)

In [26]:
# Copy the parameters of the main encoder/decoder into the debug instances,
# so both pairs hold the same weights from here on.
encoder_debug.load_state_dict(encoder.state_dict())
decoder_debug.load_state_dict(decoder.state_dict())

In [27]:
def IdxToWords(idx_list,dic):
    """Look up each element of ``idx_list`` in ``dic``; return the results as a list, in order."""
    words = []
    for key in idx_list:
        words.append(dic[key])
    return words

In [28]:
################################# validation ##################################
# Greedy decoding of held-out lines test[50:70]: the encoder output is fed to
# decoder_debug one step at a time, and each step's argmax becomes the next input.
# toData returns zero-padded arrays ordered by descending output length.
input_out, output_out, in_len, out_len = toData(test[50:70])
# 1 where the input index is > 0; not used further in this cell
input_mask = np.array(input_out>0, dtype=int)
x = numpy_to_var(input_out)
y = numpy_to_var(output_out)
encoded, _ = encoder(x)
decoder_in, s, w = decoder_initial(x.size(0))
# one entry per decoding step, each holding the predicted index of every sample
out_list=[]
for j in range(y.size(1)): # one decoding step per ground-truth position
    if j==0:
        # first step: its output starts the accumulated `out`
#         out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
        out, s, w = decoder_debug(input_idx=decoder_in, encoded=encoded,
                        encoded_idx=input_out, prev_state=s, 
                        weighted=w, order=j)
    else:
#         tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
        tmp_out, s, w = decoder_debug(input_idx=decoder_in, encoded=encoded,
                        encoded_idx=input_out, prev_state=s, 
                        weighted=w, order=j)
        # Zero entry 0 of the last axis in place, then re-wrap the raw data.
        # NOTE(review): presumably this keeps index 0 from being predicted;
        # it is not applied to the first step (j==0) -- confirm that is intended.
        tmp_data = tmp_out.data
        tmp_data[:,:,0].zero_()
        tmp_out = Variable(tmp_data)
        out = torch.cat([out,tmp_out],dim=1)
#     decoder_in = y[:,j] # train with ground truth
    # feed the model's own prediction for the latest step back in as next input
    decoder_in = out[:,-1].max(1)[1].squeeze() # train with sequence outputs
    out_list.append(out[:,-1].max(1)[1].squeeze().cpu().data.numpy())
# with open(save_dir,'a') as f:
#     out = np.hstack(tup=(out,iden))
#     f.write('\n')
#     for line in out:
#         f.write(','.join([str(y_) for y_ in line])+'\n')
# steps-major list -> array, transposed so that each row belongs to one sample
# (assumes every per-step entry is a 1-D array over the batch -- TODO confirm)
out_list = np.array(out_list).transpose()

In [29]:
# Print input, ground truth and prediction of every sample in the current batch as words.
_SPECIAL_WORDS = ('<PAD>', '<SOS>', '<EOS>')

def _visible_words(index_row):
    """Translate a row of indices to words and leave out the special markers."""
    return [word for word in IdxToWords(index_row,i2w) if word not in _SPECIAL_WORDS]

for i in range(len(input_out)): # for each sample in batch
    print("==============================================================")

    i_line = input_out[i]
    i_out = _visible_words(i_line)
    print("[INPUT]")
    print(' '.join(i_out))

    a_line = output_out[i]
    a_out = _visible_words(a_line)
    print("[GROUND OUTPUT]")
    print(' '.join(a_out))

    # prediction: stop at the first <EOS>, skip <PAD>/<SOS> before it
    o_line = out_list[i]
    o_out = []
    for idx in o_line:
        if idx==w2i['<EOS>']:
            break
        if idx==w2i['<PAD>'] or idx==w2i['<SOS>']:
            continue
        o_out.append(idx)
    o_out = IdxToWords(o_out,i2w)
    print("[PREDICTED OUTPUT]")
    print(' '.join(o_out))

[INPUT]
ADDRESS_HEADERS = set ( [ ' from ' , ' sender ' , ' reply - to ' , ' to ' , ' cc ' , ' bcc ' , ' resent - from ' , ' resent - sender ' , ' resent - to ' , ' resent - cc ' , ' resent - bcc ' , ] )
[GROUND OUTPUT]
ADDRESS_HEADERS is a set containing strings : ' from ' , ' sender ' , ' reply - to ' , ' to ' , ' cc ' , ' bcc ' , ' resent - from ' , ' resent - sender ' ,
[PREDICTED OUTPUT]
' ' ' , ' resent ' for ' , ' , ' for ' , ' for ' , ' for ' , ' for ' , ' for ' , ' for ' , ' for ' , ' for ' ' ' '
[INPUT]
raise InvalidTemplateLibrary ( " Unsupported arguments to " " Library . filter : ( % r , % r ) " , ( name , filter_func ) )
[GROUND OUTPUT]
raise an InvalidTemplateLibrary exception with an argument string ( " Unsupported arguments to Library . filter : ( % r , % r ) " ,
[PREDICTED OUTPUT]
raise an InvalidTemplateLibrary exception with an argument string " Unsupported arguments % s . " , where ' % s ' is replaced by :
[INPUT]
@ property
[GROUND OUTPUT]
where ' % s ' is replace

In [None]:
# Print each ground-truth row of the current batch as words, without padding.
# Note: this rebinds the notebook-level name `out` to a list of words.
for line in output_out:
    out = list(filter(lambda word: word!='<PAD>', IdxToWords(line,i2w)))
    print(' '.join(out))

In [None]:
# Print each predicted row as words; <EOS> markers are dropped, everything else is kept.
# Note: this rebinds the notebook-level name `out` to a list of words.
for line in out_list:
    out = []
    for word in IdxToWords(line,i2w):
        if word=='<EOS>':
            continue
        out.append(word)
    print(' '.join(out))

In [None]:
%matplotlib inline
# debug
# get a sample input, ground truth, output
# Inspects sample `idx` of the most recently decoded batch (x, y) and gathers the
# lists recorded on decoder_debug into tensors for inspection.
idx = 3
print("input: ",x[idx].cpu().data.numpy())
print("input: ",' '.join(IdxToWords(x[idx].cpu().data.numpy(),i2w)))
print("truth: ",y[idx].cpu().data.numpy())
print("truth: ",' '.join(IdxToWords(y[idx].cpu().data.numpy(),i2w)))
# decoder_debug.O is concatenated along dim 1
# (presumably one entry per decoding step -- TODO confirm against models.copynet_dbg)
O = torch.cat(decoder_debug.O,1)
# argmax over dim 1 of the selected sample, collected up to the first <EOS>
out_sample = []
for o in O[idx].max(1)[1].cpu().numpy().squeeze():
    if o==w2i['<EOS>']:
        break
    else:
        out_sample.append(o)

print("output: ",out_sample)
print("output: ",' '.join(IdxToWords(out_sample,i2w)))
# Remaining recorded lists, stacked/concatenated along dim 1. Their meaning is
# defined in models.copynet_dbg, which is not visible here.
A = torch.stack(decoder_debug.A,1)
A2 = torch.stack(decoder_debug.A2,1)
P = torch.stack(decoder_debug.P,1)
I = torch.stack(decoder_debug.I,1)
# E = torch.stack(decoder_debug.E,1)
S = torch.stack(decoder_debug.S,1)
W = torch.cat(decoder_debug.W,1)
Y = torch.stack(decoder_debug.Y,1)
# scores = torch.stack(decoder_debug.scores,1)
sc = torch.stack(decoder_debug.sc,1)
pcg = torch.stack(decoder_debug.prob_c_to_g,1)
# split the last axis of P at vocab_size
# (presumably pg = generate part, pc = copy part -- TODO confirm)
pc = P[:,:,vocab_size:]
pg = P[:,:,:vocab_size]

In [None]:
# Plot rows 6..11 of sample `idx` from `pc` (the part of P beyond vocab_size);
# needs the debug cell that defines `pc` and `idx` to have run first.
visualize(Variable(pc[idx][6:12]))

In [None]:
# Display the predicted indices of the most recently decoded batch.
out_list

In [None]:
# NOTE(review): `a` is not defined in any visible cell of this notebook; this
# looks like leftover scratch code and raises NameError on a fresh kernel.
b = a.data

In [22]:
# Greedy-decode the whole test set in slices of 20 lines and collect one
# formatted text block (input / ground truth / prediction) per sample in print_list.
batch_count = 0
print_list = []
while batch_count<len(test):
    ################################# validation ##################################
    # toData returns zero-padded arrays ordered by descending output length
    input_out, output_out, in_len, out_len = toData(test[batch_count:min(batch_count+20,len(test))])
    batch_count+=20
    print(batch_count)
    # 1 where the input index is > 0; not used further in this cell
    input_mask = np.array(input_out>0, dtype=int)
    x = numpy_to_var(input_out)
    y = numpy_to_var(output_out)
    encoded, _ = encoder(x)
    decoder_in, s, w = decoder_initial(x.size(0))
    # one entry per decoding step, each holding the predicted index of every sample
    out_list=[]
    for j in range(y.size(1)): # one decoding step per ground-truth position
        if j==0:
            # first step: its output starts the accumulated `out`
    #         out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
            out, s, w = decoder_debug(input_idx=decoder_in, encoded=encoded,
                            encoded_idx=input_out, prev_state=s, 
                            weighted=w, order=j)
        else:
    #         tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
            tmp_out, s, w = decoder_debug(input_idx=decoder_in, encoded=encoded,
                            encoded_idx=input_out, prev_state=s, 
                            weighted=w, order=j)
            # Zero entry 0 of the last axis in place, then re-wrap the raw data.
            # NOTE(review): presumably keeps index 0 from being predicted; it is
            # not applied to the first step (j==0) -- confirm that is intended.
            tmp_data = tmp_out.data
            tmp_data[:,:,0].zero_()
            tmp_out = Variable(tmp_data)
            out = torch.cat([out,tmp_out],dim=1)
    #     decoder_in = y[:,j] # train with ground truth
        # feed the model's own prediction for the latest step back in as next input
        decoder_in = out[:,-1].max(1)[1].squeeze() # train with sequence outputs
        out_list.append(out[:,-1].max(1)[1].squeeze().cpu().data.numpy())
    # with open(save_dir,'a') as f:
    #     out = np.hstack(tup=(out,iden))
    #     f.write('\n')
    #     for line in out:
    #         f.write(','.join([str(y_) for y_ in line])+'\n')
    # steps-major list -> array, transposed so that each row belongs to one sample
    # (assumes every per-step entry is a 1-D array over the batch -- TODO confirm)
    out_list = np.array(out_list).transpose()
    for i in range(len(input_out)): # for each sample in batch
#         print("==============================================================")
        # input words without the special markers; i_line is then reused for the text
        i_line = input_out[i]
        i_out = [idx for idx in IdxToWords(i_line,i2w) if (idx!='<PAD>') & (idx!='<SOS>') & (idx!='<EOS>')]
#         print("[INPUT]")
#         print(' '.join(i_out))
        i_line = '[INPUT]\n'+' '.join(i_out)

        # ground truth, written with a leading '# '
        a_line = output_out[i]
        a_out = [idx for idx in IdxToWords(a_line,i2w) if (idx!='<PAD>') & (idx!='<SOS>') & (idx!='<EOS>')]
#         print("[GROUND OUTPUT]")
#         print(' '.join(a_out))
        a_line = '[GROUND TRUTH]\n# '+' '.join(a_out)

        # prediction: stop at the first <EOS>, skip <PAD>/<SOS> before it
        o_line = out_list[i]
        o_out = []
        for idx in o_line:
            if idx==w2i['<EOS>']:
                break
            if (idx!=w2i['<PAD>']) & (idx!=w2i['<SOS>']):
                o_out.append(idx)
        o_out = IdxToWords(o_out,i2w)
#         print("[PREDICTED OUTPUT]")
#         print(' '.join(o_out))
        o_line = '[PREDICTED]\n# '+' '.join(o_out)
        # one text block per sample: input, ground truth, prediction
        print_line = '\n'.join([i_line,a_line,o_line])
        print_list.append(print_line)


20
40
60
80
100
120
140
160
180
200


In [23]:
# Dump all collected result blocks into one file, separated by blank lines.
with open('django-results.py','w') as f:
    results_text = '\n\n'.join(print_list)
    f.write(results_text)