In [None]:
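# Create a dataset of number sequences.
# Assume a vocabulary size of 1000 words.
# Assume that 0 is the EOS token, 1 is the SOS token, and 2 is PAD.
# NOTE: the cells below do not follow this mapping -- toData appends 1 to the
# end of every sequence and pads with 0, and the training loop feeds 2 as the
# first decoder input. Confirm which assignment is intended.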

In [None]:
# get LM model
import torch 
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
import torch.nn.functional as F
from models.copynet import CopyEncoder, CopyDecoder

# Fix PyTorch's random seed (this does not seed Python's `random` or NumPy).
torch.manual_seed(1000)
# Hyperparameters
# embed_size / hidden_size: passed to CopyEncoder and CopyDecoder below.
embed_size = 150
hidden_size = 300
# NOTE(review): num_layers and bin_size are not referenced in any visible cell.
num_layers = 1
bin_size = 10
# num_epochs: number of passes made by the training (and testing) loops.
num_epochs = 100
# batch_size: number of lines sliced out of the data per step.
batch_size = 20
# lr: learning rate handed to the Adam optimizers.
lr = 0.1
# vocab_size: vocabulary size passed to CopyEncoder and CopyDecoder.
vocab_size = 1000

In [None]:
# get data
import random

# Seed Python's RNG so the shuffle below (and therefore the train/test split)
# is reproducible; torch.manual_seed does not affect the `random` module.
random.seed(1000)

with open('data/copynet_data.txt') as f:
    lines = f.readlines()
random.shuffle(lines)

# First half is the training pool, second half is held out for testing.
half = len(lines) // 2
# train = lines[:half]
# Train on a small subset (at most 100 lines), capped at `half` so that the
# training lines can never overlap the held-out test lines on small files.
train = lines[:min(100, half)]
test = lines[half:]

In [None]:
def toData(batch):
    """Turn a batch of tab-separated text lines into padded index arrays.

    Every line must contain exactly three tab-separated fields. The first two
    are comma-separated integers (input and output sequence); the third field
    is ignored. The token 1 is appended to the end of each sequence, and
    sequences are right-padded with 0 up to the longest one in the batch.

    Args:
        batch: list of strings, one example per string.

    Returns:
        (input_out, output_out, in_len, out_len)
        input_out, output_out: int np.array [b x seq], zero padded.
        in_len, out_len: np.array [b], unpadded lengths (appended 1 included).
        All four are reordered so that the output lengths are non-increasing.
    """
    cleaned = [raw.replace('\n', '') for raw in batch]

    sources, targets = [], []
    for entry in cleaned:
        src_field, tgt_field, _ = entry.split('\t')
        sources.append([int(tok) for tok in src_field.split(',')] + [1])
        targets.append([int(tok) for tok in tgt_field.split(',')] + [1])

    in_len = np.array([len(seq) for seq in sources])
    out_len = np.array([len(seq) for seq in targets])
    widest_in = max(in_len)
    widest_out = max(out_len)

    n_rows = len(cleaned)
    input_out = np.zeros([n_rows, widest_in], dtype=int)
    output_out = np.zeros([n_rows, widest_out], dtype=int)
    for row, (src, tgt) in enumerate(zip(sources, targets)):
        input_out[row][:len(src)] = np.array(src)
        output_out[row][:len(tgt)] = np.array(tgt)

    # Longest output sequence first.
    order = out_len.argsort()[::-1]
    return input_out[order], output_out[order], in_len[order], out_len[order]

In [None]:
# Number of full batches per epoch; a trailing partial batch is dropped.
num_samples = len(train)
num_batches = num_samples // batch_size

In [None]:
################ copynet model #####################
# Fresh encoder/decoder pair. To resume from saved checkpoints instead, use:
#   encoder = torch.load(f='models/encoder01.pth')
#   decoder = torch.load(f='models/decoder01.pth')
encoder = CopyEncoder(vocab_size, embed_size, hidden_size)
decoder = CopyDecoder(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()

# Move both networks to the GPU when one is available.
if torch.cuda.is_available():
    for network in (encoder, decoder):
        network.cuda()

# Training loop for the copynet encoder/decoder.
#
# The optimizers are built once, before the epoch loop, so that Adam's running
# moment estimates persist across epochs instead of being reset every epoch.
opt_e = optim.Adam(params=encoder.parameters(), lr=lr)
opt_d = optim.Adam(params=decoder.parameters(), lr=lr)
#     opt_e = optim.SGD(params=encoder.parameters(), lr=lr)
#     opt_d = optim.SGD(params=decoder.parameters(), lr=lr)

for epoch in range(num_epochs):
    print("Epoch ",epoch+1)

    # shuffle data
    random.shuffle(train)

    # Even epochs feed the ground-truth token back into the decoder (teacher
    # forcing); odd epochs feed the decoder's own argmax prediction.
    teacherForcing = (epoch % 2 == 0)

    for i in range(num_batches):
        # initialize gradient buffers
        opt_e.zero_grad()
        opt_d.zero_grad()

        # obtain batch outputs (rows come back sorted by decreasing output
        # length, which pack_padded_sequence below relies on)
        batch = train[i*batch_size:(i+1)*batch_size]
        input_out, output_out, in_len, out_len = toData(batch)

        # input and output in Variable form
        x = torch.LongTensor(input_out)
        y = torch.LongTensor(output_out)
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
        x = Variable(x)
        y = Variable(y)
        encoded, _ = encoder(x)

        # get initial input of decoder: token 2 for every row in the batch
        decoder_in = torch.LongTensor(np.ones(x.size(0),dtype=int))*2
        s = None
        w = None
        if torch.cuda.is_available():
            decoder_in = decoder_in.cuda()
        decoder_in = Variable(decoder_in)

        # Shapes, as noted by the original author:
        #   decoder_in (Variable): [b]
        #   encoded (Variable): [b x seq x hid]
        #   input_out (np.array): [b x seq]
        #   s (Variable): [b x hid]
        step_outputs = []
        for j in range(y.size(1)): # one decoder step per output position
            # tmp_out is assigned on EVERY step (including j == 0) so the
            # free-running branch below never reads a stale value left over
            # from a previous batch.
            tmp_out, s, w = decoder(input_idx=decoder_in, encoded=encoded,
                                    encoded_idx=input_out, prev_state=s,
                                    weighted=w, order=j)
            step_outputs.append(tmp_out)

            # select next input
            if teacherForcing:
                decoder_in = y[:,j]
            else:
                decoder_in = tmp_out.max(2)[1].squeeze()

        # Concatenate the per-step outputs along the sequence dimension.
        out = torch.cat(step_outputs, dim=1)

        # Drop padded positions from both targets and predictions before the loss.
        target = pack_padded_sequence(y,out_len.tolist(), batch_first=True)[0]
        out = pack_padded_sequence(out,out_len.tolist(), batch_first=True)[0]
        loss = criterion(out, target)
        loss.backward()
        opt_e.step()
        opt_d.step()
    # Reports the loss of the last batch of the epoch only.
    # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch idiom; newer
    # releases need `loss.item()` instead.
    print("Loss: ", loss.data[0])
                    

In [None]:
# Scratch: show the second row of `x`, the input batch left over from the last loop iteration.
x[1]

In [None]:
# Scratch: show the second row of `y`, the target batch left over from the last loop iteration.
y[1]

In [None]:
# Scratch: build a one-hot tensor of depth 1000 by scattering 1s along the last
# dimension at the indices held in `a`.
# NOTE(review): `a` is not defined in any visible cell -- presumably a
# [b x seq x 1] LongTensor left over in the kernel; confirm or remove this cell.
one_hot = torch.FloatTensor(a.size(0),a.size(1),1000).zero_()
one_hot.scatter_(2,a, 1)

In [None]:
# Scratch: check the Python type of a single element of a default np.zeros array.
type(np.zeros([5,4])[2][0])

In [None]:
# Scratch: show entries 700..704 of the first one-hot vector built above.
one_hot[0][0][700:705]

In [None]:
# torch.save(encoder,'models/seq2seq_encoder.pth')
# torch.save(decoder,'models/seq2seq_decoder.pth')

In [None]:
# Load saved encoder/decoder objects from disk, replacing the `encoder` and
# `decoder` variables created earlier in the notebook.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
encoder = torch.load(f='models/encoder01.pth')
decoder = torch.load(f='models/decoder01.pth')

In [None]:
# Load two saved NumPy arrays from disk.
# NOTE(review): the testing cell below reads `tr_str` / `tr_out`, not
# `t_str` / `t_out` -- confirm which names are intended.
t_str = np.load('data/t_str.npy')
t_out = np.load('data/t_out.npy')

In [None]:
################ testing #####################
# Greedy decoding with the saved checkpoints: at every step the decoder's
# argmax prediction is printed and fed back in as the next input, and the
# cross-entropy against the reference is accumulated per batch.
# encoder = Encoder01(vocab_size, hidden_size)
# decoder = Decoder01(vocab_size, hidden_size)
# NOTE(review): torch.load unpickles the file -- load trusted checkpoints only.
encoder = torch.load(f='models/encoder01.pth')
decoder = torch.load(f='models/decoder01.pth')
criterion = nn.CrossEntropyLoss()

# Only move to the GPU when one exists, matching the guard used by the
# training cell; unconditional .cuda() calls crash on CPU-only machines.
use_cuda = torch.cuda.is_available()
if use_cuda:
    encoder.cuda()
    decoder.cuda()

# for epoch in range(num_epochs):
for epoch in range(num_epochs):

    for i in range(num_batches):
        # obtain batches
        # NOTE(review): `tr_str` / `tr_out` are not defined in any visible
        # cell (only `t_str` / `t_out` are loaded) -- confirm the intended data.
        batch_str = tr_str[i*batch_size:(i+1)*batch_size]
        batch_out = tr_out[i*batch_size:(i+1)*batch_size]

        # batches in Variables
        x = torch.LongTensor(batch_str)
        y = torch.LongTensor(batch_out)
        # first decoder input: token 1 for every row in the batch
        decoder_in = torch.LongTensor(np.ones(x.size(0),dtype=int))
        if use_cuda:
            x = x.cuda()
            y = y.cuda()
            decoder_in = decoder_in.cuda()
        x = Variable(x)
        y = Variable(y)
        decoder_in = Variable(decoder_in)

        # get outputs of encoder
        _, h = encoder(x)

        loss = 0
        print("input: ",batch_str[0])
        print("answer: ",batch_out[0])
        for j in range(y.size(1)):
            out, h = decoder(decoder_in, h)
            target = y[:,j]
            loss+=criterion(out, target)
            # feed the prediction back in and print it for the first example
            decoder_in = out.max(1)[1].squeeze()
            print(decoder_in[0].data[0])
    # Reports the accumulated loss of the last batch only.
    print("Loss for epoch %d: %.3f" %(epoch+1, loss.data[0]))
            # in_decoder : [b x 1]

Our model fails to generalize.