In [1]:
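# For real data: create a dataset of number sequences.
# Assumption: the vocabulary has 1000 words.
#   NOTE(review): the config cell below sets vocab_size = 108 - confirm which value is current.
# Assumption: 0 is the EOS token, 1 is the SOS token, and 2 is PAD.
#   NOTE(review): the training cell builds its padding mask as `input_out > 0`,
#   i.e. it treats 0 as padding - confirm the token ids against toData.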

In [1]:
import torch 
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn.functional as F
from models.copynet_debug import CopyEncoder, CopyDecoder
from models.functions import numpy_to_var, toData, to_np, to_var, visualize, decoder_initial, update_logger
from models.seq2seq import Encoder01, Decoder01
from logger import Logger
import matplotlib.pyplot as plt
import time
%matplotlib inline
# Seed torch's random number generator.
torch.manual_seed(1000)
# Hyperparameters
embed_size = 150  # passed to CopyEncoder / CopyDecoder in the training cell
hidden_size = 300  # passed to CopyEncoder / CopyDecoder in the training cell
num_layers = 1  # not referenced by any other cell visible in this notebook
bin_size = 10  # not referenced by any other cell visible in this notebook
num_epochs = 1000  # number of epoch indices the training loop iterates over
prev_end=0  # offset added to every epoch index (presumably the last epoch of an earlier run)
batch_size = 100  # number of lines per training batch
lr = 0.001  # initial Adam learning rate; the training loop multiplies it by 0.99 after each epoch
# Passed to CopyEncoder / CopyDecoder.
# NOTE(review): a later scratch cell rebinds vocab_size to 100.
vocab_size = 108

step = 0 # number of steps taken

In [8]:
import sys
import math
# Despite the *_dir names, both values are file paths.
# Text file that the data-loading cell reads line by line.
data_dir = 'js_dataset/var_dataset_3_shorter.txt'
# data_dir = 'data/copynet_data_v2.2.txt'
# CSV file that the validation code opens in append mode.
save_dir = 'eval_3.2.4.csv'

In [9]:

# get data
# Read every line of `data_dir`, shuffle the lines, hold out the first 50
# shuffled lines as `test` and keep the rest as `train`.
# with open('data/copynet_data_simple.txt') as f:
with open(data_dir) as f:
    lines = f.readlines()
import random
# Seed Python's RNG (same value the config cell gives torch.manual_seed) so the
# train/test split is identical on every run. Without this, each run appends
# results for a different test set to the same CSV, and a model reloaded from a
# checkpoint can be evaluated on lines it was trained on.
random.seed(1000)
# random.shuffle(lines)
half = int(len(lines)/2)  # only needed by the commented-out half/half split below
# train = lines[:half]
# test = lines[half:]
train = lines  # alias, not a copy: the shuffle below reorders `lines` too
random.shuffle(train)
test = train[:50]
train = train[50:]

In [10]:
# Project Logger constructed with './logs' (presumably its output directory).
# The only use visible in this notebook is the commented-out update_logger call
# in the training cell.
logger = Logger('./logs')

In [11]:
# encoder = torch.load(f='models/encoder_41.pckl')
# decoder = torch.load(f='models/decoder_41.pckl')

In [12]:
# Number of training lines, and number of full batches per epoch.
# int() truncates, so the last `num_samples % batch_size` lines of each
# shuffled epoch are not visited by the batch loop.
num_samples = len(train)
num_batches = int(num_samples/batch_size)

In [None]:
################ copynet model #####################
# Build a fresh CopyNet encoder/decoder pair and train it. Every second epoch
# the held-out `test` lines are decoded by feeding the decoder its own argmax
# output, the predictions are appended to `save_dir`, and both models are saved.
encoder = CopyEncoder(vocab_size, embed_size, hidden_size)
decoder = CopyDecoder(vocab_size, embed_size, hidden_size)
# opt_e = optim.Adam(params=encoder.parameters(), lr=lr)
# opt_d = optim.Adam(params=decoder.parameters(), lr=lr)
# encoder = torch.load(f='models/encoder_1000_95.pckl')
# decoder = torch.load(f='models/decoder_1000_95.pckl')
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()


################################# training ##################################

# Print the running loss about five times per epoch. max(1, ...) keeps the
# modulo in the batch loop from dividing by zero when num_batches < 5; for
# num_batches >= 5 the interval is unchanged.
log_every = max(1, int(num_batches/5))

start = time.time()
# for epoch in range(num_epochs):
# Epoch indices are offset by prev_end.
epoch_arr = np.arange(num_epochs,dtype=int)
epoch_arr+=prev_end
for epoch in epoch_arr:
    #     if epoch % 20 == 19:
    #         lr=lr/3
    print("==================================================")
    print("Epoch ",epoch+1)
    # Optimizers are rebuilt every epoch so they pick up the decayed `lr`.
    # NOTE(review): each new Adam instance starts with empty optimizer state,
    # and `lr` is a notebook-level variable, so re-running this cell continues
    # from the already-decayed value.
    opt_e = optim.Adam(params=encoder.parameters(), lr=lr)
    opt_d = optim.Adam(params=decoder.parameters(), lr=lr)
    lr= lr * 0.99
    # shuffle data
    random.shuffle(train)

    for i in range(num_batches):
        # initialize gradient buffers
        opt_e.zero_grad()
        opt_d.zero_grad()

        # obtain batch outputs
        batch = train[i*batch_size:(i+1)*batch_size]
        input_out, output_out, in_len, out_len = toData(batch)

        # mask input to remove padding (not used again in this cell)
        input_mask = np.array(input_out>0, dtype=int)

        # input and output in Variable form
        x = numpy_to_var(input_out)
        y = numpy_to_var(output_out)

        # apply to encoder
        encoded, _ = encoder(x)

        # get initial input of decoder
        decoder_in, s, w = decoder_initial(x.size(0))

        # out_list to store outputs
        out_list=[]
        for j in range(y.size(1)): # one decoder call per output position
            # Shapes as noted by the original author:
            #   decoder_in (Variable): [b]
            #   encoded (Variable): [b x seq x hid]
            #   input_out (np.array): [b x seq]
            #   s (Variable): [b x hid]
            # 1st state
            if j==0:
                out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s,
                                weighted=w, order=j)
            # remaining states: append this step's output along dim 1
            else:
                tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s,
                                weighted=w, order=j)
                out = torch.cat([out,tmp_out],dim=1)

            # for debugging: stop if nan
            if math.isnan(w[-1][0][0].data[0]):
                sys.exit()
            # select next input
            # NOTE(review): `epoch % 2` is only ever 0 or 1, so this condition
            # is never true and the ground-truth branch always runs. Kept as-is
            # because the intended schedule is not recoverable from this cell.
            if epoch % 2 ==13:
                decoder_in = out[:,-1].max(1)[1].squeeze() # train with sequence outputs
            else:
                decoder_in = y[:,j] # train with ground truth
            out_list.append(out[:,-1].max(1)[1].squeeze().cpu().data.numpy())

        # print(torch.stack(decoder.prob_c_to_g,1))
        # Pack targets and outputs with the same lengths so both are trimmed
        # identically before the loss is computed.
        target = pack_padded_sequence(y,out_len.tolist(), batch_first=True)[0]
        pad_out = pack_padded_sequence(out,out_len.tolist(), batch_first=True)[0]
        loss = criterion(pad_out, target)
        loss.backward()
        if i%log_every==0:
            print("[%d/%d] Loss: %1.4f"%(i,num_batches,loss.data[0]))
        opt_e.step()
        opt_d.step()
        step += 1
        info = {
            'loss': loss.data[0]
        }
    print("Loss: ",loss.data[0])
    elapsed = time.time()
    print("Elapsed time: ",elapsed-start)
    start = time.time()
        # update for tensorboard
#         logger = update_logger(logger, [encoder,decoder], loss, step)

    ################################# validation ##################################
    if epoch % 2==0:
        print("Printing results")
        input_out, output_out, in_len, out_len = toData(test)
        input_mask = np.array(input_out>0, dtype=int)
        x = numpy_to_var(input_out)
        y = numpy_to_var(output_out)
        # Transposed ground truth, compared against the predictions below.
        # Computed on every validation pass: it used to be assigned only when
        # epoch == 0, so a run with prev_end != 0 hit a NameError (or reused a
        # stale array left in the kernel) at the `iden` comparison.
        y_input = y.cpu().data.numpy().transpose()
        if epoch==0:
            # Written only at epoch 0: a block with the transposed targets in
            # the left columns and the transposed inputs in the right columns,
            # zero-filled where one is shorter than the other.
            with open(save_dir,'a') as f:
                x_input = x.cpu().data.numpy().transpose()
                m = max(x_input.shape[0],y_input.shape[0])
                out_ = np.zeros([m,x_input.shape[1]*2])
                out_[:x_input.shape[0],x_input.shape[1]:]=x_input
                out_[:y_input.shape[0],:y_input.shape[1]]=y_input
                out = []
                for line in out_:
                    tmp = ','.join([str(x_) for x_ in line])
                    out.append(tmp)
                f.write('\n'.join(out)+'\n')
        encoded, _ = encoder(x)
        decoder_in, s, w = decoder_initial(x.size(0))
        out_list=[]
        for j in range(y.size(1)): # one decoder call per output position
            if j==0:
                out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s,
                                weighted=w, order=j)
            else:
                tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                encoded_idx=input_out, prev_state=s,
                                weighted=w, order=j)
                out = torch.cat([out,tmp_out],dim=1)
            # feed the decoder's own argmax prediction back in as the next input
            decoder_in = out[:,-1].max(1)[1].squeeze()
            out_list.append(out[:,-1].max(1)[1].squeeze().cpu().data.numpy())
        # One row per output position; `iden` is 1 where prediction == target.
        out = np.array(out_list)
        iden = np.array(out==y_input,dtype=int)
        with open(save_dir,'a') as f:
            # predictions in the left columns, match flags in the right columns
            out = np.hstack(tup=(out,iden))
            f.write('\n')
            for line in out:
                f.write(','.join([str(y_) for y_ in line])+'\n')
        torch.save(f='models/encoder_1000_'+str(epoch)+'_v4.pckl',obj=encoder)
        torch.save(f='models/decoder_1000_'+str(epoch)+'_v4.pckl',obj=decoder)

Epoch  1
[0/112] Loss: 4.6690


In [9]:
# Show the current value of the training loop's `epoch` variable.
epoch

100

In [None]:
################################# validation ##################################
# Stand-alone pass over the held-out `test` lines. The decoder is fed the
# ground-truth token at every step, and the argmax prediction of each step is
# collected in `out_list`.
print("Printing results")
input_out, output_out, in_len, out_len = toData(test)
input_mask = np.array(input_out > 0, dtype=int)
x, y = numpy_to_var(input_out), numpy_to_var(output_out)
encoded, _ = encoder(x)
decoder_in, s, w = decoder_initial(x.size(0))
out_list = []
for j in range(y.size(1)):  # one decoder call per output position
    step_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                             encoded_idx=input_out, prev_state=s,
                             weighted=w, order=j)
    # The first step starts the output tensor; later steps are appended on dim 1.
    out = step_out if j == 0 else torch.cat([out, step_out], dim=1)
    decoder_in = y[:, j]  # next decoder input is the ground-truth token
    step_pred = out[:, -1].max(1)[1].squeeze()
    out_list.append(step_pred.cpu().data.numpy())
# with open(save_dir,'a') as f:
#     out = np.hstack(tup=(out,iden))
#     f.write('\n')
#     for line in out:
#         f.write(','.join([str(y_) for y_ in line])+'\n')

In [None]:
# get a sample input, ground truth, output
idx = 2
for label, batch_var in (("input: ", x), ("truth: ", y)):
    print(label, batch_var[idx].cpu().data.numpy())
O = torch.cat(decoder.O, 1)
print("output: ", O[idx].max(1)[1].cpu().numpy().squeeze())


def _stack_steps(attr):
    """Stack the list stored on `decoder` under `attr` along dim 1."""
    return torch.stack(getattr(decoder, attr), 1)


# Turn the decoder's debug lists into single tensors for inspection.
A = _stack_steps('A')
A2 = _stack_steps('A2')
P = _stack_steps('P')
I = _stack_steps('I')
E = _stack_steps('E')
S = _stack_steps('S')
W = torch.cat(decoder.W, 1)  # concatenated rather than stacked, like O above
Y = _stack_steps('Y')
scores = _stack_steps('scores')
sc = _stack_steps('sc')
pcg = _stack_steps('prob_c_to_g')
# Split the last dim of P at index 100.
# NOTE(review): the 100 / 113 bounds are hard-coded while the config cell sets
# vocab_size = 108 - confirm they match the model that produced P.
pc = P[:, :, 100:113]
pg = P[:, :, :100]

In [None]:
# Largest entry of sc (the decoder.sc list stacked along dim 1).
sc.max()

In [None]:
# Pull out single entries and weight matrices to probe by hand in later cells.
tmp_s = S[0][0] # state: entry [0][0] of S (decoder.S stacked along dim 1)
tmp_c = sc[0][0] # from copy: entry [0][0] of sc (decoder.sc stacked along dim 1)
# NOTE(review): both weights below carry the original label "from generate" -
# confirm which mode Wc and Wo each belong to.
tmp_wc = decoder.Wc.weight.data # from generate
tmp_wo = decoder.Wo.weight.data # from generate

In [None]:
# Print the largest entry of each probed tensor, in the order tmp_c, tmp_wc, tmp_wo.
for probe in (tmp_c, tmp_wc, tmp_wo):
    print(probe.max())

In [None]:
# Matrix product of the Wo weight with tmp_s after unsqueeze(1)
# (a column vector if tmp_s is 1-D).
torch.mm(tmp_wo,tmp_s.unsqueeze(1))

In [None]:
# tanh of the matrix product of tmp_c with tmp_s after unsqueeze(1).
F.tanh(torch.mm(tmp_c,tmp_s.unsqueeze(1)))

In [None]:
# Display the Wo weight matrix captured in tmp_wo.
tmp_wo

In [None]:
# Save the whole encoder and decoder objects to fixed checkpoint paths,
# encoder first.
for ckpt_path, module in (('models/encoder_41.pckl', encoder),
                          ('models/decoder_41.pckl', decoder)):
    torch.save(f=ckpt_path, obj=module)

In [None]:
# Toy check: a 10x13 ramp holding the values 0..129, multiplied elementwise by
# I[idx] and then passed through softmax.
a = np.arange(130,dtype=float).reshape([10,13])
b = (torch.Tensor(a))
# NOTE(review): softmax is called without an explicit dim - confirm which axis
# is intended.
F.softmax(Variable(b*I[idx]))

In [None]:
# Display I (the decoder.I list stacked along dim 1).
I

In [None]:
# Maximum over dim 2 of P for indices 4..7 along dim 0
# ([0] keeps the max values rather than their positions).
P[4:8,:,:].max(2)[0].squeeze()

In [None]:
# Re-stack decoder.A along dim 1 (same expression as in the sample-inspection cell).
A = torch.stack(decoder.A,1)

In [None]:
# Re-stack three of the decoder's debug lists along dim 1.
P = torch.stack(decoder.P,1)
p = torch.stack(decoder.prob_c_to_g,1)
# NOTE: this rebinds pg; the sample-inspection cell sets pg to the slice P[:,:,:100].
pg = torch.stack(decoder.prob_g,1)

In [None]:
# Second-to-last entry of pg along dim 0.
pg[-2]

In [None]:
# From O (decoder.O concatenated along dim 1): index 4 on dim 0, every
# position on dim 1, index 3 on the last dim.
O[4,:,3]

In [None]:
# Shape of P.
P.size()

In [None]:
# Shape of p (decoder.prob_c_to_g stacked along dim 1).
p.size()

In [None]:
import numpy as np
import torch

# Scratch check of the one-hot construction: two sequences of six token ids
# become a [b x seq x vocab] tensor with a single 1 per position.
# NOTE: this rebinds the notebook-level vocab_size (the config cell sets 108).
vocab_size = 100
# test one-hot
encoded_idx = np.array([
    [38, 4, 73, 57, 59, 49],
    [54, 57, 59, 49, 89, 26],
])
# Index tensor with a trailing singleton axis, as scatter_ needs: [b x seq x 1].
en = torch.LongTensor(encoded_idx).unsqueeze(2)
one_hot = torch.zeros(en.size(0), en.size(1), vocab_size)
one_hot.scatter_(2, en, 1)  # one hot tensor: [b x seq x vocab]

In [None]:
# Pass the one-hot matrix at index 1 of one_hot to the project's visualize helper.
visualize(Variable(one_hot[1]))

In [None]:
# Stack decoder.prob_c_to_g along dim 1, take index 0 on that dim, and return
# the argmax positions over dim 1 of the result.
torch.stack(decoder.prob_c_to_g,dim=1)[:,0].max(1)[1]

In [None]:
# Visualize index 0 (along dim 1) of the stacked decoder.probs list.
visualize(torch.stack(decoder.probs,dim=1)[:,0])

In [None]:
# For index 0 along dim 1 of the stacked decoder.probs, sum the entries from
# position 100 onward on the last dim.
# NOTE(review): 100 is hard-coded; confirm it matches the vocabulary size of
# the model that produced these probabilities.
torch.stack(decoder.probs,dim=1)[:,0,100:].sum(1)

In [None]:
# Visualize index 0 (along dim 1) of the stacked decoder.prob_c_to_g list.
visualize(torch.stack(decoder.prob_c_to_g,dim=1)[:,0])

In [None]:
# Visualize index 0 along dim 1 of `out`.
# NOTE(review): `out` is rebound several times in this notebook (decoder
# output tensor, list of strings, numpy array) - this cell relies on whichever
# cell ran last.
visualize(out[:,0])

In [None]:
# Visualize index 2 (along dim 1) of the stacked decoder.attn list.
visualize(torch.stack(decoder.attn,dim=1)[:,2])

In [None]:
# Display the current input batch variable x.
x

In [None]:
# Display the current target batch variable y.
y

In [None]:
# Argmax positions over dim 2 of `out`, with singleton dims squeezed away.
out.max(2)[1].squeeze()

In [None]:
# Stack decoder.W along dim 1, squeeze singleton dims, take index 1 on the
# second remaining dim, and sum over dim 1 of the result.
torch.stack(decoder.W,1).squeeze()[:,1].sum(1)