<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# Imports
import gzip
import os
import pandas as pd
import numpy as np
import torch
import basic_model
import random
from collections import Counter
from tqdm import tqdm
import ujson as json
from torch.autograd import Variable
import util
import args
import pprint
import Answer_Selection_Model

args = args.get_setup_args()


In [2]:
# Specific param for this model
args.question_len = 20
args.answer_len = 200
args.embedding_dim = 512
args.hidden_dim = 512
args.margin = 0.05
args.mode = 'test'
args.resume = 1
args.batch_size = 1

In [3]:
vars(args)

{'vocab_path': '/home/mouadh/Desktop/insuranceQA/V2/vocabulary',
 'label_path': '/home/mouadh/Desktop/insuranceQA/V2/InsuranceQA.label2answer.token.encoded',
 'train_path': '/home/mouadh/Desktop/insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.train.encoded',
 'test_path': '/home/mouadh/Desktop/insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.test.encoded',
 'glove_path': '/home/mouadh/Desktop/insuranceQA/glove.840B.300d/glove.840B.300d.txt',
 'glove_dim': 300,
 'glove_num_vecs': 2196017,
 'hidden_size': 100,
 'word_emb_file': '/home/mouadh/Desktop/insuranceQA/glove.840B.300d/word_embedding',
 'word2idx_file': '/home/mouadh/Desktop/insuranceQA/glove.840B.300d/word2idx',
 'seed': 0,
 'create_matrix_embedding': True,
 'learning_rate': 0.01,
 'num_epochs': 30,
 'drop_prob': 0.2,
 'margin': 0.05,
 'batch_size': 1,
 'use_glove': False,
 'embd_size': 200,
 'max_sent_len': 200,
 'question_len': 20,
 'answer_len': 200,
 'embedding_dim': 512,
 'hidden_dim': 512,


In [4]:
# Load answers labels and answer label with text
id2w, l2a, l2at = util.load_vocabulary(args.vocab_path, args.label_path)

# Create word to index vocabulary
w2i = {w: i for i, w in enumerate(id2w.values(), 1)}
# Add pad to the vocabulary
PAD = '<PAD>'
w2i[PAD] = 0

vocab_size = len(w2i)
args.vocab_size = vocab_size
print('vocab_size:', vocab_size)

vocab_size: 68581


In [5]:
train = util.load_data_train(args.train_path, id2w, l2at)

In [6]:
test = util.load_data_train(args.test_path, id2w, l2at)

In [7]:
if args.use_glove:
    if not args.create_matrix_embedding:
        # Create word embedding and word2idx for glove
        print("Creating word embedding")
        word_emb_mat, word2idx_dict = util.get_embedding('word', emb_file=args.glove_path,
                                        vec_size=args.glove_dim, num_vectors=args.glove_num_vecs)
        util.save(args.word_emb_file, word_emb_mat, message="word embedding")
        util.save(args.word2idx_file, word2idx_dict, message="word dictionary")
    else:
        # Get embeddings
        print('Loading word vectors embeddings...')
        word_vectors = util.torch_from_json(args.word_emb_file)
        print("Loading word to index dictionary")
        word2idx_dict = json.load(open(args.word2idx_file))
else:
    word2idx_dict = w2i

In [8]:
train[0][0]

['Is', 'Disability', 'Insurance', 'Required', 'By', 'Law', '?']

In [9]:
word2idx_dict['?']

50370

In [10]:
util.make_vector([train[0][0]], w2i, 20)

tensor([[66164, 54421, 29876, 56902, 59631, 57715, 50370,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [11]:
q, pos, negs = train[0][0], train[0][1], train[0][2]

In [30]:
vec_q = util.make_vector([q], word2idx_dict, args.question_len)
vec_pos = util.make_vector([pos], word2idx_dict,args.answer_len)

In [152]:
vec_neg = util.make_vector([negs[1]], word2idx_dict, args.answer_len)

In [153]:
q_ = torch.LongTensor(vec_q)
ga_ = torch.LongTensor(vec_pos)

In [154]:
ba_ = torch.LongTensor(vec_neg)


In [194]:
import pickle
import random
import numpy as np
from scipy.stats import rankdata
import torch
#import torch.autograd as autograd
from torch.autograd import Variable
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class AnswerSelection(nn.Module):
    def __init__(self, args):
        super(AnswerSelection, self).__init__()
        self.vocab_size = args.vocab_size
        self.hidden_dim = args.hidden_dim
        self.embedding_dim = args.embedding_dim
        self.question_len = args.question_len
        self.answer_len = args.answer_len
        self.batch_size = args.batch_size

        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.cnns = nn.ModuleList([nn.Conv1d(self.hidden_dim, 500, filter_size, stride=1, padding=filter_size-(i+1)) for i, filter_size in enumerate([1,3])])
        self.question_maxpool = nn.MaxPool1d(self.question_len, stride=1)
        self.answer_maxpool = nn.MaxPool1d(self.answer_len, stride=1)
        self.dropout = nn.Dropout(p=0.1)
        self.init_weights()
        self.hiddenq = self.init_hidden(self.batch_size)
        self.hiddena = self.init_hidden(self.batch_size)

    def init_hidden(self, batch_len):
        return (Variable(torch.randn(2, batch_len, self.hidden_dim // 2)),
                Variable(torch.randn(2, batch_len, self.hidden_dim // 2)))

    def init_weights(self):
        initrange = 0.1
        self.word_embeddings.weight.data.uniform_(-initrange, initrange)

    def forward(self, question, answer):
        question_embedding = self.word_embeddings(question)
        answer_embedding = self.word_embeddings(answer)
        q_lstm, self.hiddenq = self.lstm(question_embedding, self.hiddenq)
        a_lstm, self.hiddena = self.lstm(answer_embedding, self.hiddena)
        q_lstm = q_lstm.contiguous()
        a_lstm = a_lstm.contiguous()
        q_lstm = question_embedding
        a_lstm = answer_embedding
        print(q_lstm.size())
        q_lstm = q_lstm.view(-1,self.hidden_dim, self.question_len)
        a_lstm = a_lstm.view(-1,self.hidden_dim, self.answer_len)

        question_pool = []
        answer_pool = []
        for cnn in self.cnns:
            question_conv = cnn(q_lstm)
            answer_conv = cnn(a_lstm)
            question_max_pool = self.question_maxpool(question_conv)
            answer_max_pool = self.answer_maxpool(answer_conv)
            question_activation = F.tanh(torch.squeeze(question_max_pool))
            answer_activation = F.tanh(torch.squeeze(answer_max_pool))
            question_pool.append(question_activation)
            answer_pool.append(answer_activation)
            
        question_output = torch.cat(question_pool, dim = 0)
        answer_output = torch.cat(answer_pool, dim = 0)
        #print((question_output != torch.cat(question_pool, dim = -1)).sum())
        question_output = self.dropout(question_output)
        answer_output = self.dropout(answer_output)
        
        similarity = F.cosine_similarity(question_output, answer_output, dim = -1)

        return similarity

In [195]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

In [196]:
model = AnswerSelection(args)

In [205]:
model(vec_q, vec_pos)

torch.Size([1, 20, 512])


tensor(0.8401, grad_fn=<DivBackward0>)

In [206]:
model(vec_q, vec_neg)

torch.Size([1, 20, 512])


tensor(0.8548, grad_fn=<DivBackward0>)

In [None]:
def train(self):
        batch_size = args.batch_size
        epochs = args.epochs
        training_set = load('train')

        questions = list()
        good_answers = list()
        for i, q in enumerate(training_set):
            questions += [q['question']] * len(q['answers'])
            good_answers += [self.all_answers[j] for j in q['answers']]

        questions = torch.LongTensor(self.pad_question(questions))
        good_answers = torch.LongTensor(self.pad_answer(good_answers))
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.conf['learning_rate'])

        for i in xrange(epochs):
            bad_answers = torch.LongTensor(self.pad_answer(random.sample(self.all_answers.values(), len(good_answers))))
            train_loader = data.DataLoader(dataset=torch.cat([questions,good_answers,bad_answers],dim=1), batch_size=batch_size)
	    avg_loss = []
	    avg_acc = []
	    self.model.train()
            for step, train in enumerate(train_loader):
                batch_question = autograd.Variable(train[:,:self.conf['question_len']]).cuda()
                batch_good_answer = autograd.Variable(train[:,self.conf['question_len']:self.conf['question_len']+self.conf['answer_len']]).cuda()
                batch_bad_answer = autograd.Variable(train[:,self.conf['question_len']+self.conf['answer_len']:]).cuda()
                optimizer.zero_grad()
		self.model.hiddenq = self.model.init_hidden(len(train))
		self.model.hiddena = self.model.init_hidden(len(train))
		loss, acc = self.model.fit(batch_question, batch_good_answer, batch_bad_answer)
		avg_loss.append(loss.data[0])
		avg_acc.append(acc)
                loss.backward()
	        torch.nn.utils.clip_grad_norm(self.model.parameters(), 0.25)
                optimizer.step()

	    print "Epoch: {0} Epoch Average loss: {1} Accuracy {2}".format(str(i), str(np.mean(avg_loss)), str(np.mean(avg_acc)))
            torch.save(self.model.state_dict(), "saved_model/answer_selection_model_cnnlstm")
	    if i % 50 == 0 and i > 0:
		self.validate(validation=True)