In [1]:
import torch
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F

import csv
import argparse
import os 
import numpy as np
import operator
import random
import sys
import time
from tqdm import tqdm
from collections import defaultdict

from constants import *
import datasets
import evaluation
import persistence
import learn.interpret
import learn.models as models
import learn.tools as tools

from dataproc import extract_wvs
import torch.nn as nn

In [2]:
command = '''/home/stack/Documents/w266_project/vaersdata/toy2.csv /home/stack/Documents/w266_project/vocab/vocab.csv full conv_attn 200 --filter-size 10 --num-filter-maps 50 --dropout 0.2 --patience 10 --criterion prec_at_8 --lr 0.0001 --lmbda 0.01 --embed-file /home/stack/Documents/w266_project/vaersdata/train_only.embed --embed-size 100 --gpu'''

In [3]:
parser = argparse.ArgumentParser(description="train a neural network on some clinical documents")
parser.add_argument("data_path", type=str,
                    help="path to a file containing sorted train data. dev/test splits assumed to have same name format with 'train' replaced by 'dev' and 'test'")
parser.add_argument("vocab", type=str, help="path to a file holding vocab word list for discretizing words")
parser.add_argument("Y", type=str, help="size of label space")
parser.add_argument("model", type=str, choices=["cnn_vanilla", "rnn", "conv_attn", "multi_conv_attn", "logreg", "saved"], help="model")
parser.add_argument("n_epochs", type=int, help="number of epochs to train")
parser.add_argument("--embed-file", type=str, required=False, dest="embed_file",
                    help="path to a file holding pre-trained embeddings")
parser.add_argument("--cell-type", type=str, choices=["lstm", "gru"], help="what kind of RNN to use (default: GRU)", dest='cell_type',
                    default='gru')
parser.add_argument("--rnn-dim", type=int, required=False, dest="rnn_dim", default=128,
                    help="size of rnn hidden layer (default: 128)")
parser.add_argument("--bidirectional", dest="bidirectional", action="store_const", required=False, const=True,
                    help="optional flag for rnn to use a bidirectional model")
parser.add_argument("--rnn-layers", type=int, required=False, dest="rnn_layers", default=1,
                    help="number of layers for RNN models (default: 1)")
parser.add_argument("--embed-size", type=int, required=False, dest="embed_size", default=100,
                    help="size of embedding dimension. (default: 100)")
parser.add_argument("--filter-size", type=str, required=False, dest="filter_size", default=4,
                    help="size of convolution filter to use. (default: 3) For multi_conv_attn, give comma separated integers, e.g. 3,4,5")
parser.add_argument("--num-filter-maps", type=int, required=False, dest="num_filter_maps", default=50,
                    help="size of conv output (default: 50)")
parser.add_argument("--pool", choices=['max', 'avg'], required=False, dest="pool", help="which type of pooling to do (logreg model only)")
parser.add_argument("--code-emb", type=str, required=False, dest="code_emb", 
                    help="point to code embeddings to use for parameter initialization, if applicable")
parser.add_argument("--weight-decay", type=float, required=False, dest="weight_decay", default=0,
                    help="coefficient for penalizing l2 norm of model weights (default: 0)")
parser.add_argument("--lr", type=float, required=False, dest="lr", default=1e-3,
                    help="learning rate for Adam optimizer (default=1e-3)")
parser.add_argument("--batch-size", type=int, required=False, dest="batch_size", default=16,
                    help="size of training batches")
parser.add_argument("--dropout", dest="dropout", type=float, required=False, default=0.5,
                    help="optional specification of dropout (default: 0.5)")
parser.add_argument("--lmbda", type=float, required=False, dest="lmbda", default=0,
                    help="hyperparameter to tradeoff BCE loss and similarity embedding loss. defaults to 0, which won't create/use the description embedding module at all. ")
parser.add_argument("--dataset", type=str, choices=['mimic2', 'mimic3'], dest="version", default='mimic3', required=False,
                    help="version of MIMIC in use (default: mimic3)")
parser.add_argument("--test-model", type=str, dest="test_model", required=False, help="path to a saved model to load and evaluate")
parser.add_argument("--criterion", type=str, default='f1_micro', required=False, dest="criterion",
                    help="which metric to use for early stopping (default: f1_micro)")
parser.add_argument("--patience", type=int, default=3, required=False,
                    help="how many epochs to wait for improved criterion metric before early stopping (default: 3)")
parser.add_argument("--gpu", dest="gpu", action="store_const", required=False, const=True,
                    help="optional flag to use GPU if available")
parser.add_argument("--public-model", dest="public_model", action="store_const", required=False, const=True,
                    help="optional flag for testing pre-trained models from the public github")
parser.add_argument("--stack-filters", dest="stack_filters", action="store_const", required=False, const=True,
                    help="optional flag for multi_conv_attn to instead use concatenated filter outputs, rather than pooling over them")
parser.add_argument("--samples", dest="samples", action="store_const", required=False, const=True,
                    help="optional flag to save samples of good / bad predictions")
parser.add_argument("--quiet", dest="quiet", action="store_const", required=False, const=True,
                    help="optional flag not to print so much during training")
args = parser.parse_args(command.split())
args.command = command
args

Namespace(Y='full', batch_size=16, bidirectional=None, cell_type='gru', code_emb=None, command='/home/stack/Documents/w266_project/vaersdata/toy2.csv /home/stack/Documents/w266_project/vocab/vocab.csv full conv_attn 200 --filter-size 10 --num-filter-maps 50 --dropout 0.2 --patience 10 --criterion prec_at_8 --lr 0.0001 --lmbda 0.01 --embed-file /home/stack/Documents/w266_project/vaersdata/train_only.embed --embed-size 100 --gpu', criterion='prec_at_8', data_path='/home/stack/Documents/w266_project/vaersdata/toy2.csv', dropout=0.2, embed_file='/home/stack/Documents/w266_project/vaersdata/train_only.embed', embed_size=100, filter_size='10', gpu=True, lmbda=0.01, lr=0.0001, model='conv_attn', n_epochs=200, num_filter_maps=50, patience=10, pool=None, public_model=None, quiet=None, rnn_dim=128, rnn_layers=1, samples=None, stack_filters=None, test_model=None, version='mimic3', vocab='/home/stack/Documents/w266_project/vocab/vocab.csv', weight_decay=0)

In [4]:
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    model = tools.pick_model(args, dicts)
    print(model)

    if not args.test_model:
        optimizer = optim.Adam(model.parameters(), weight_decay=args.weight_decay, lr=args.lr)
    else:
        optimizer = None

    params = tools.make_param_dict(args)
    
    return args, model, optimizer, params, dicts

args, model, optimizer, params, dicts = init(args)
ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
unseen_code_inds = set(ind2c.keys())
desc_embed = model.lmbda > 0

def train(model, optimizer, Y, epoch, batch_size, data_path, gpu, version, dicts, quiet):
    """
        Training loop.
        output: losses for each example for this iteration
    """
    print("EPOCH %d" % epoch)
    num_labels = len(dicts['ind2c'])

    losses = []
    #how often to print some info to stdout
    print_every = 25

    ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
    unseen_code_inds = set(ind2c.keys())
    desc_embed = model.lmbda > 0

    model.train()
    gen = datasets.data_generator(data_path, dicts, batch_size, num_labels, version=version, desc_embed=desc_embed)
    
    return gen

def one_epoch(model, optimizer, Y, epoch, n_epochs, batch_size, data_path, version, testing, dicts, model_dir, 
              samples, gpu, quiet):
    return train(model, optimizer, Y, epoch, batch_size, data_path, gpu, version, dicts, quiet)

loading lookups...
loading pretrained embeddings...
adding unk embedding
ConvAttnPool(
  (embed_drop): Dropout(p=0.2)
  (embed): Embedding(59609, 100, padding_idx=0)
  (conv): Conv1d (100, 50, kernel_size=(10,), stride=(1,), padding=(5,))
  (U): Linear(in_features=50, out_features=330)
  (final): Linear(in_features=50, out_features=330)
  (desc_embedding): Embedding(59609, 100, padding_idx=0)
  (label_conv): Conv1d (100, 50, kernel_size=(10,), stride=(1,), padding=(5,))
  (label_fc1): Linear(in_features=50, out_features=50)
)


In [5]:
gen = one_epoch(model, optimizer, args.Y, 1, args.n_epochs, args.batch_size, args.data_path,
          args.version, False, dicts, MODEL_DIR, 
          args.samples, args.gpu, args.quiet)

EPOCH 1


In [6]:
tup = next(gen)

In [7]:
data, target, _, code_set, descs = tup

In [29]:
for inst in descs[:1]:
    inst = np.array([inst[0]])
inst

array([[56627, 25065]])

In [9]:
descs[0]

array([list([56627, 25065])], dtype=object)

In [19]:
W = torch.Tensor(extract_wvs.load_embeddings(args.embed_file))

adding unk embedding


In [20]:
embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)
embed.weight.data = W.clone()
W = embed.weight.data
desc_embedding = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)

In [24]:
lt = Variable(torch.LongTensor(np.array(inst[0])))
d = desc_embedding(lt)
d = d.transpose(1,2)

RuntimeError: invalid argument 3: Index is supposed to be a vector at /opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/TH/generic/THTensorMath.c:248

In [67]:
d

tensor([[[-0.2094,  0.3963],
         [ 0.9777,  0.9178],
         [ 0.9537, -0.3299],
         [-0.6207,  0.0328],
         [ 0.2803, -1.1058],
         [-0.8163, -1.5004],
         [ 1.4209,  0.8168],
         [ 0.2664, -0.3262],
         [ 2.2038, -0.8914],
         [-0.7598, -0.0435],
         [ 0.5017, -0.2119],
         [-0.0657, -0.7658],
         [-0.3663, -0.3825],
         [ 0.0736,  0.1690],
         [ 0.4003, -0.7683],
         [ 1.9004, -2.0641],
         [-0.0515,  0.8780],
         [-1.0260, -2.0651],
         [-0.2277, -1.8174],
         [-0.5912, -0.6174],
         [-1.0741, -0.4870],
         [-1.7110,  1.4376],
         [-0.3606, -2.3478],
         [ 1.1292,  1.2159],
         [ 0.3338,  1.2606],
         [ 0.4167,  1.3694],
         [-2.0438, -0.4475],
         [-2.0636, -1.6892],
         [-0.3210, -0.8554],
         [ 0.2676,  0.1202],
         [-0.0785, -1.8840],
         [ 0.6836, -0.0339],
         [-0.1857, -0.0136],
         [-0.7712,  0.0796],
         [ 1.0