In [None]:
# arguments related to the dataset
parser.add_argument("--data_dir",type=str, default='/home/mjc/datasets/CNN_DailyMail/cnn/stories_merged_100/',
                    help='directory where data files are located')
parser.add_argument("--word2idx",type=str, default='word2idx.npy', help='file name for word2idx file')
parser.add_argument("--idx2word",type=str, default='idx2word.npy', help='file name for idx2word file')
parser.add_argument("--max_enc",type=int, default=400, help='max length of encoder sequence')
parser.add_argument("--max_dec",type=int, default=100, help='max length of decoder sequence')
parser.add_argument("--min_dec",type=int, default=35, help='min length of decoder sequence')
parser.add_argument("--vocab_size",type=int, default=50000, help='vocabulary size')
parser.add_argument("--max_oovs",type=int, default=20, help='max number of OOVs to accept in a sample')


# arguments related to model training and inference
parser.add_argument("--train",type=bool, default=True, help='train/test model. Set by default to True(=train)')
parser.add_argument("--epochs",type=int, default=20, help='Number of epochs. Set by default to 20')
parser.add_argument("--load_model",type=str, default='', help='input model name to start from a pretrained model')
parser.add_argument("--hidden",type=int, default=256, help='size of hidden dimension')
parser.add_argument("--embed",type=int, default=128, help='size of embedded word dimension')
parser.add_argument("--lr",type=float, default=0.15, help='learning rate')
parser.add_argument("--cov_lambda",type=float, default=1.0, help='lambda for coverage loss')
parser.add_argument("--beam",type=int, default=4, help='beam size')
parser.add_argument("--cuda",type=bool, default=True, help='whether to use GPU')

args = parser.parse_args()

In [1]:
import argparse
import numpy as np
from packages.vocab import Vocab
from packages.batch import Batch
from model import Model
from packages.functions import to_cuda, num_to_var
import torch
from torch import nn,optim
import os
import random
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as pad

parser = argparse.ArgumentParser()

In [2]:
class Args(object):
    data_dir='/home/mjc/datasets/CNN_DailyMail/cnn/stories_merged_100/'
    word2idx='word2idx.npy'
    idx2word='idx2word.npy'
    max_enc=400
    max_dec=100
    min_dec=35
    vocab_size=50000
    max_oovs = 20
    
    train = True
    epochs = 20
    load_model = ''
    hidden_size = 256
    embed_size = 128
    lr = 0.15
    cov_lambda = 1.0
    beam = 4
    cuda = True
args = Args()

In [3]:
# def main(args):
# obtain vocabulary
vocab = Vocab(args.vocab_size)
vocab.w2i = np.load(args.word2idx).item()
vocab.i2w = np.load(args.idx2word).item()
vocab.count = len(vocab.w2i)

# obtain dataset in batches
file_list = os.listdir(args.data_dir)
batch = Batch(file_list, args.max_enc, args.max_dec, args.max_oovs)

# load model
if args.load_model != '':
    model = torch.load(args.load_model)
else:
    model = Model(args)
model = to_cuda(model)

# get loss and optimizers
opt = optim.Adam(params=model.parameters(),lr=args.lr)
criterion = nn.NLLLoss()

# computation for each epoch
epoch = 0
while (epoch<args.epochs):
    epoch+=1
    random.shuffle(file_list)
    for file in file_list:
        opt.zero_grad()
        with open(os.path.join(args.data_dir,file)) as f:
            minibatch = f.read()
        stories,summaries = batch.process_minibatch(minibatch,vocab)
        out_list, cov_loss = model(stories, summaries, batch, vocab, True)
        
        # get packed versions
        target = num_to_var(summaries[:,1:])
        target = pack(target, batch.output_lens.to_list(),batch_first=True)[0]
        pad_out = pack(out_list, batch.output_lens.to_list(),batch_first=True)[0]
        pad_out = torch.log(pad_out)
        loss = criterion(pad_out,target)+cov_loss
        loss.backward()
        opt.step()
        print("got thru batch!")
# if __name__ == "__main__":
#     main(args)

0


RuntimeError: inconsistent tensor sizes at /home/mjc/github/pytorch/torch/lib/TH/generic/THTensorMath.c:489

In [6]:
A = torch.randn(4,10)

In [8]:
idx_x = torch.LongTensor([0,0,1,2])
idx_y = torch.LongTensor([3,5,8,8])

In [13]:
A[idx_x,idx_y]

TypeError: indexing a tensor with an object of type torch.LongTensor. The only supported types are integers, slices, numpy scalars and torch.LongTensor or torch.ByteTensor as the only argument.

In [None]:
a = np.arange(20,dtype=float).reshape([2,5,-1])
b = np.ones(100,dtype=float).reshape([2,5,-1])*40

In [None]:
from torch.autograd import Variable
A = Variable(torch.Tensor(a))
B = Variable(torch.Tensor(b))

In [None]:
A.expand_as(B)

In [None]:
import argparse

In [None]:
# hyperparameters
hidden_dim = 256
embed_dim = 128
batch_size = 16
num_samples = 92579
max_encoder_steps = 400
max_decoder_steps = 100
beam_size = 4
min_decoder_steps = 35 # min size of generated sequence
vocab_size = 50000
lr = 0.15
adagrad_init_acc = 0.1 # deprecated for pytorch
rand_unif_init_mag = 0.02 # magnitude for lstm cells during random init
trunc_norm_init_std = 1e-4 # std of truncated norm initialization
max_grad_norm = 2.0 # so they do apply gradient clipping
max_oovs = 20 # maximum number of oovs allowed?
coverage_loss = 1.0 # lambda

In [None]:
# get vocabulary
vocab = Vocab(50000)
vocab.w2i = np.load('word2idx.npy').item()
vocab.i2w = np.load('idx2word.npy').item()
vocab.count = len(vocab.w2i)

In [None]:
# get dataset in batches
file_dir = '/home/mjc/datasets/CNN_DailyMail/cnn/stories_merged_100/'
file_list = os.listdir(file_dir)
batch = Batch(file_list,400,100,50)

In [None]:
batch.init_minibatch()
with open(os.path.join(file_dir,file_list[70])) as f:
    minibatch = f.read()
    minibatch = minibatch.split('\n\n')
    minibatch = [line for line in minibatch if not line.startswith(":==:")]
stories, summaries = batch.process_minibatch(minibatch,vocab)

In [None]:
stories[idx]

In [None]:
idx=22
' '.join(vocab.idx_list_to_word_list(stories[idx]))

In [None]:
batch.idx2oov_list

In [None]:
' '.join(vocab.idx_list_to_word_list(stories[idx],batch.idx2oov_list[idx]))

In [None]:
unked = batch.unk_minibatch(stories[idx],vocab)
' '.join(vocab.idx_list_to_word_list(unked,batch.idx2oov_list[idx]))

In [None]:
' '.join(vocab.idx_list_to_word_list(summaries[idx],batch.idx2oov_list[idx]))

In [None]:
import torch.nn as nn
help(nn.LSTMCell)

In [None]:
batch.oov2idx_list[idx]

In [None]:
np.ones([10])