In [1]:
import torch
import torch
import torch.optim as optim
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import json
import numpy as np
import pickle
from tqdm import tqdm
from loadcorpus import load_corpus, load_vocab

In [2]:
from layers.Highway import Highway
from models.lstm_pool_overlap import LSTMClassifier

In [3]:
x_train_ans, x_train_ans_pos, x_train_que, \
x_train_que_pos, x_train_ans_overlap, x_train_que_overlap, y_train, \
x_valid_ans, x_valid_ans_pos, x_valid_que, \
x_valid_que_pos, x_valid_ans_overlap, x_valid_que_overlap, y_valid = load_corpus()

In training set: ans,que,label 264416 264416 264416 264416 264416
In validation set: ans,que,label: 39997 39997 39997 39997 39997


In [4]:
vocab = load_vocab()
PADDINGIDX = 1
UNKNOWNIDX = 0
vocab['<unk>'] = UNKNOWNIDX
vocab['<pad>'] = PADDINGIDX
inv_vocab = {}
inv_vocab[UNKNOWNIDX] = '<unk>'
inv_vocab[PADDINGIDX] = '<pad>'

for x in (vocab.items()):
    inv_vocab[x[1]] = x[0]

In [5]:
posLookup={}
posLookup['<pad>'] = 0
for x in x_valid_ans_pos:
    for pos_tag in x:
        posLookup[pos_tag] = posLookup.get(pos_tag, len(posLookup))

for x in x_train_ans_pos:
    for pos_tag in x:
        posLookup[pos_tag] = posLookup.get(pos_tag, len(posLookup))

for x in x_train_que_pos:
    for pos_tag in x:
        posLookup[pos_tag] = posLookup.get(pos_tag, len(posLookup))

for x in x_valid_que_pos:
    for pos_tag in x:
        posLookup[pos_tag] = posLookup.get(pos_tag, len(posLookup))

In [6]:
len(posLookup)

36

In [7]:
def get_vocab_freq():
    vocab_freq = {}
    with open('../DATA/tf_idf.data') as fin:
        for _idx, line in enumerate(fin.readlines()):
            line = line.split('\t')
            vocab_freq[_idx] = float(line[1])
    return vocab_freq
vocab_freq = get_vocab_freq()

In [8]:
embedding_pretrained = []
with open('../DATA/embedding_matrix.data') as fin:
    for line in fin.readlines():
        embedding_pretrained.append(eval(line))
id_mapper = eval(open('../DATA/vocab_id_matrix.data').read())
embedding_pretrained = np.array(embedding_pretrained)
inv_id_mapper = {}
for x in (id_mapper.items()):
    inv_id_mapper[x[1]] = x[0]
inv_id_mapper[1] = 1

In [9]:
def prepare_train_corpus():
    TRAINCORPUS = [{'question': que, 'answer':ans, 'question_pos':q_pos, \
                    'answer_pos':a_pos, 'question_overlap': q_ovl, 'answer_overlap': a_ovl,'label':lab} \
                   for que, ans, q_pos, a_pos, q_ovl, a_ovl, lab in zip(x_train_que, \
                                                               x_train_ans, x_train_que_pos, \
                                                               x_train_ans_pos, \
                                                               x_train_que_overlap, x_train_ans_overlap, y_train)]
    return TRAINCORPUS

In [10]:
def prepare_valid_corpus():
    que_to_anss = {}
    for que, ans, q_pos, a_pos, q_ovl, a_ovl, lab in zip(x_valid_que, x_valid_ans, x_valid_que_pos,\
                                                         x_valid_ans_pos, x_valid_que_overlap,
                                                         x_valid_ans_overlap, y_valid):
        que_to_anss[tuple(que)] = que_to_anss.get(tuple(que), [])
        que_to_anss[tuple(que)].append((ans, q_pos, a_pos, q_ovl, a_ovl, lab))
    
    VALIDCORPUS = []
    for x in que_to_anss.items():
        tmp = {'question':list(x[0]), 'answers':[y[0] for y in x[1]], 'question_pos':x[1][0][1], 
               'answers_pos':[y[2] for y in x[1]], 'question_overlap': [y[3] for y in x[1]], \
               'answers_overlap': [y[4] for y in x[1]],'labels':[y[5] for y in x[1]]}
        VALIDCORPUS.append(tmp)
    return VALIDCORPUS

In [11]:
TRAINCORPUS, VALIDCORPUS = prepare_train_corpus(), prepare_valid_corpus()

In [12]:
class MyDataset_Train(Dataset):
    def __init__(self, que_len, ans_len, corpus):        
        self.corpus = corpus
        self.que_len = que_len
        self.ans_len = ans_len
        self.mapper = id_mapper
    
    def posToInd(self, pos_tag):
        posLookup[pos_tag] = posLookup.get(pos_tag, len(posLookup))
        return posLookup[pos_tag]
        
    def word_mapper(self, wrd):
        if wrd in self.mapper:
            return self.mapper[wrd]
        else:
            return wrd
        
    def __getitem__(self, index):        
        que = torch.LongTensor(np.ones((self.que_len), dtype=np.int64))
        ans = torch.LongTensor(np.ones((self.ans_len), dtype=np.int64))
        que_pos = torch.LongTensor(np.zeros((self.que_len), dtype=np.int64))
        ans_pos = torch.LongTensor(np.zeros((self.ans_len), dtype=np.int64))
        
        que_overlap = torch.LongTensor(np.zeros((self.que_len), dtype=np.int64))
        ans_overlap = torch.LongTensor(np.zeros((self.ans_len), dtype=np.int64))
        
        txt_q = list(self.corpus[index]['question'])
        pos_q = list(self.corpus[index]['question_pos'])
        ovl_q = list(self.corpus[index]['question_overlap'])
        
        txt_a = list(self.corpus[index]['answer'])
        pos_a = list(self.corpus[index]['answer_pos'])
        ovl_a = list(self.corpus[index]['answer_overlap'])
        
        assert len(txt_q) == len(pos_q) == len(ovl_q) and len(txt_a) == len(pos_a) == len(ovl_a), "Error!"
        
        for count, (wrd, ps, indicat) in enumerate(zip(txt_q, pos_q, ovl_q)):
            wrd = self.word_mapper(wrd)
            que[count] = wrd
            que_pos[count] = self.posToInd(ps)
            que_overlap[count] = indicat
            if count+1 >= self.que_len:
                break
        for count, (wrd, ps, indicat) in enumerate(zip(txt_a, pos_a, ovl_a)):
            wrd = self.word_mapper(wrd)
            ans[count] = wrd
            ans_pos[count] = self.posToInd(ps)
            ans_overlap[count] = indicat
            if count+1 >= self.ans_len:
                break            
        label = torch.LongTensor([self.corpus[index]['label']])
        return min(self.que_len, len(txt_q)), que, que_pos, que_overlap, min(self.ans_len, len(txt_a)),\
                    ans, ans_pos, ans_overlap, label    
    def __len__(self):
        return len(self.corpus)
    
def make_weights_for_balanced_classes(samples, nclasses, PosOverNeg=1):                        
    count = [0] * nclasses                                                      
    for item in samples:                                                         
        count[item['label']] += 1                                                     
    weight_per_class = [0.] * nclasses                                      
    N = float(sum(count))                                                   
    for i in range(nclasses):                                                   
        weight_per_class[i] = N/float(count[i])
    
    weight_per_class[1] *= PosOverNeg 
    weight = [0] * len(samples)                                              
    for idx, val in enumerate(samples):                                          
        weight[idx] = weight_per_class[val['label']]
    return weight

In [13]:
class MyDataset_Valid(Dataset):
    def __init__(self, que_len, ans_len, corpus):        
        self.corpus = corpus
        self.que_len = que_len
        self.ans_len = ans_len
        self.mapper = id_mapper    
    
    def posToInd(self, pos_tag):
        posLookup[pos_tag] = posLookup.get(pos_tag, len(posLookup))
        return posLookup[pos_tag]
        
    def word_mapper(self, wrd):
        if wrd in self.mapper:
            return self.mapper[wrd]
        else:
            return wrd
    
    def __getitem__(self, index):
        ques_ovl, anss, anss_pos, anss_ovl, labels = [], [], [], [], []
        
        que = torch.LongTensor(np.ones((self.que_len), dtype=np.int64))
        que_pos = torch.LongTensor(np.zeros((self.que_len), dtype=np.int64))
        
        txt_q = list(self.corpus[index]['question'])
        pos_q = list(self.corpus[index]['question_pos'])
        ovl_q = list(self.corpus[index]['question_overlap'])
        
        txt_a = list(self.corpus[index]['answers'])
        pos_a = list(self.corpus[index]['answers_pos'])
        ovl_a = list(self.corpus[index]['answers_overlap'])
        
        
        len_anss = torch.LongTensor(np.array([min(self.ans_len, len(a)) for a in txt_a], dtype=np.int64))
        
        all_labs = list(self.corpus[index]['labels'])
        N = len(all_labs)
        
        assert len(txt_q) == len(pos_q), "Error!"
        for _txt_a, _pos_a in zip(txt_a, pos_a):
            assert len(_txt_a) == len(_pos_a)
        
        for count, (wrd, ps) in enumerate(zip(txt_q, pos_q)):
            wrd = self.word_mapper(wrd)
            que[count] = wrd
            que_pos[count] = self.posToInd(ps)
            
            if count+1 >= self.que_len:
                break
                
        
        for _ovl_q, _txt_a, _pos_a, _ovl_a, lab in zip(ovl_q, txt_a, pos_a, ovl_a, all_labs):
            qovl = torch.LongTensor(np.zeros((self.que_len), dtype=np.int64))
            for count, oq in enumerate(_ovl_q):
                qovl[count] = oq
                if count+1 >= self.que_len:
                    break                
            ques_ovl.append(qovl)

            aovl = torch.LongTensor(np.zeros((self.ans_len), dtype=np.int64))
            ans = torch.LongTensor(np.ones((self.ans_len), dtype=np.int64))
            ans_pos = torch.LongTensor(np.zeros((self.ans_len), dtype=np.int64))
            for count, (wrd, ps, oa) in enumerate(zip(_txt_a, _pos_a, _ovl_a)):
                wrd = self.word_mapper(wrd)
                aovl[count] = oa
                ans[count] = wrd
                ans_pos[count] = self.posToInd(ps)
                if count+1 >= self.ans_len:
                    break
            anss_ovl.append(aovl)
            anss.append(ans)
            anss_pos.append(ans_pos)
        
        for lab in all_labs:            
            label = torch.LongTensor([lab])
            labels.append(label)
        
        que = [que] * N
        que_pos = [que_pos] * N
        
        len_ques = torch.LongTensor(np.array([min(self.que_len, len(txt_q))]*len(txt_a), dtype=np.int64))
        len_anss, len_ques = torch.squeeze(len_anss, dim=0), torch.squeeze(len_ques, dim=0)
        
        def get_stacked(x):
            return torch.squeeze(torch.stack(x))
        que, que_pos, anss, anss_pos, labels, ques_ovl, anss_ovl =  get_stacked(que), get_stacked(que_pos), \
                                                get_stacked(anss), get_stacked(anss_pos), get_stacked(labels),\
                                                get_stacked(ques_ovl), get_stacked(anss_ovl)
        return len_ques, que, que_pos,ques_ovl, len_anss, anss, anss_pos,anss_ovl, labels
    def __len__(self):
        return len(self.corpus)

In [14]:
QUESTION_LENGTH = 15
ANSWER_LENGTH = 45
TRAINSET = MyDataset_Train(QUESTION_LENGTH, ANSWER_LENGTH, TRAINCORPUS)
VALIDSET = MyDataset_Valid(QUESTION_LENGTH, ANSWER_LENGTH, VALIDCORPUS)

weights_train = make_weights_for_balanced_classes(TRAINSET.corpus, 2, PosOverNeg=1)
weights_train = torch.DoubleTensor(weights_train)                                       
sampler_train = torch.utils.data.sampler.WeightedRandomSampler(weights_train, len(weights_train))

TRAINSET_LOADER = DataLoader(TRAINSET, batch_size=32, shuffle=False, drop_last=True, sampler=sampler_train, num_workers=5)
VALIDSET_LOADER = DataLoader(VALIDSET, batch_size=1, shuffle=False, drop_last=True, num_workers=5)

In [15]:
VALIDCORPUSASTRAIN = [{'question': que, 'answer':ans, 'question_pos':q_pos, 'answer_pos':a_pos, 'label':lab} for que, ans, q_pos, a_pos, lab in zip(x_valid_que, x_valid_ans, x_valid_que_pos, x_valid_ans_pos, y_valid)]
VALIDSETASTRAIN = MyDataset_Train(QUESTION_LENGTH, ANSWER_LENGTH, VALIDCORPUSASTRAIN)

weights_valid = make_weights_for_balanced_classes(VALIDSETASTRAIN.corpus, 2)
weights_valid = torch.DoubleTensor(weights_valid)                                       
sampler_valid = torch.utils.data.sampler.WeightedRandomSampler(weights_valid, len(weights_valid))

VALIDSETASTRAIN_LOADER = DataLoader(VALIDSETASTRAIN, batch_size=64, shuffle=False, drop_last=True, sampler=sampler_valid, num_workers=5)

In [16]:
lstm_classifier = LSTMClassifier(embedding_dim=300, hidden_dim=128,  que_len=QUESTION_LENGTH,
                                 ans_len=ANSWER_LENGTH, pos_embedding_dim=30, posLookup=posLookup,
                                 vocab_size=len(inv_id_mapper), PADDINGIDX=PADDINGIDX,
                                 label_size=2, batch_size=64, use_gpu=True)

lstm_classifier.word_embeddings.weight.data.copy_(torch.from_numpy(embedding_pretrained))
lstm_classifier = lstm_classifier.cuda()
learning_rate = 0.001
optimizer = optim.Adam(lstm_classifier.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

In [17]:
class SimLoss(nn.Module):
    # L = max(0, M + cossim(q, a_neg) - cossim(q, a_pos))
    def __init__(self, M=0.2):
        super(SimLoss, self).__init__()
        self.M = M
    def forward(self, inputs, targets):
        batch_size = Variable(torch.ones(1).cuda() * targets.shape[0])
        inputs = torch.squeeze(inputs)
        loss = (-1)*targets.float()*inputs + (targets.float() - 1)*inputs + self.M
        loss /= batch_size
        loss = loss.sum()
        return loss   # a single number (averaged loss over batch samples)

In [18]:
# class LSTMClassifier(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, pos_embedding_dim, que_len, ans_len,
#                  vocab_size, label_size, batch_size, use_gpu):
#         super(LSTMClassifier, self).__init__()
#         self.hidden_dim = hidden_dim
#         self.batch_size = batch_size
#         self.use_gpu = use_gpu
#         self.pos_embedding_dim = pos_embedding_dim
#         self.que_len = que_len
#         self.ans_len = ans_len

#         self.pos_embeddings = nn.Embedding(len(posLookup), pos_embedding_dim, padding_idx=0)
#         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=PADDINGIDX)
        
#         self.lstm = nn.LSTM(embedding_dim+pos_embedding_dim, hidden_dim, bidirectional=True)
        
#         self.que_max_pooling = nn.MaxPool1d(que_len)
#         self.ans_max_pooling = nn.MaxPool1d(ans_len)
        
#         self.attention_word = nn.Linear(hidden_dim*4, hidden_dim)
#         self.tanh = nn.Tanh()
#         self.attention_score = nn.Linear(hidden_dim, 1)
        
#         self.dropout = nn.Dropout(0.3)
#         self.sigmoid = nn.Sigmoid()
#         self.softmax = nn.Softmax(dim=-1)
        
#         self.highway = Highway(4*hidden_dim, num_layers=1, f=nn.ReLU())
        
#         self.fullyconnected = nn.Linear(10*hidden_dim+2, 100)
#         self.hidden2label_1 = nn.Linear(100, 20)
#         self.hidden2label_2 = nn.Linear(20, label_size)
        
#         self.hidden2label = nn.Sequential(self.hidden2label_1, self.hidden2label_2)
#         self.hidden = self.init_hidden()
#         self.word_embeddings.weight.data.copy_(torch.from_numpy(embedding_pretrained))

#     def init_hidden(self):
#         h0 = Variable(torch.zeros(2, self.batch_size, self.hidden_dim).cuda())
#         c0 = Variable(torch.zeros(2, self.batch_size, self.hidden_dim).cuda())
#         return (h0, c0)

    
#     def forward(self, lq, que, q_pos, q_ovl, la, ans, a_pos, a_ovl):   
#         lq, la = torch.squeeze(lq), torch.squeeze(la)  
#         q_ovl, a_ovl = torch.squeeze(q_ovl), torch.squeeze(a_ovl)
#         emb_que = self.word_embeddings(que)
#         emb_ans = self.word_embeddings(ans)
#         pos_que = self.pos_embeddings(q_pos)
#         pos_ans = self.pos_embeddings(a_pos)
        
#         pos_que = pos_que.view(self.que_len, self.batch_size, -1)
#         emb_que = emb_que.view(self.que_len, self.batch_size, -1)
#         pos_ans = pos_ans.view(self.ans_len, self.batch_size, -1)
#         emb_ans = emb_ans.view(self.ans_len, self.batch_size, -1)
        
#         emb_que = torch.cat([emb_que, pos_que], dim=-1)
#         emb_ans = torch.cat([emb_ans, pos_ans], dim=-1)
        
#         vec_que, _ = self.lstm(emb_que, self.init_hidden())
#         vec_ans, _ = self.lstm(emb_ans, self.init_hidden())
        
#         mask_que = torch.arange(self.que_len).expand(self.batch_size, self.que_len).cuda() < lq.float().unsqueeze(1)
#         mask_ans = torch.arange(self.ans_len).expand(self.batch_size, self.ans_len).cuda() < la.float().unsqueeze(1)
        
#         vec_que = vec_que.view(-1, self.batch_size, self.que_len)
        
#         final_que = self.que_max_pooling(vec_que)
#         final_que = torch.squeeze(final_que)
#         final_que = final_que.view(self.batch_size, -1)
        
        
#         vec_ans = vec_ans.view(self.ans_len, self.batch_size, -1)
        
#         att_scores = []
#         ans_att_vec = []
#         for ih, h in enumerate(vec_ans):
#             wr_expr = torch.cat([final_que, h], dim=1)
#             wr_expr = self.highway(wr_expr)
#             att_vec = self.tanh(self.attention_word(wr_expr))
#             att_score = self.attention_score(att_vec)
#             att_scores.append(att_score)
            
#         att_scores = torch.stack(att_scores) # batch_size, ans_len
#         att_scores = self.softmax(att_scores) # batch_size, ans_len
        
#         ans_att_vec = (vec_ans*att_scores)
            
#         ans_att_vec = ans_att_vec.view(-1, self.batch_size, self.ans_len)
#         final_ans = self.ans_max_pooling(ans_att_vec)
#         final_ans = torch.squeeze(final_ans)
#         final_ans = final_ans.view(self.batch_size, -1)
        
#         diff_vec = torch.abs(final_ans - final_que)
#         ans_coverance = torch.sum(a_ovl, dim=1).float() / (1 + la.float())
#         que_coverance = torch.sum(q_ovl, dim=1).float() / (1 + lq.float()) # normalized coverance
        
#         ans_coverance = torch.unsqueeze(ans_coverance, dim=1)
#         que_coverance = torch.unsqueeze(que_coverance, dim=1)
        
        
#         vec_ans_covered = a_ovl.float().unsqueeze(2) * vec_ans.view(self.batch_size, self.ans_len, -1)
#         vec_que_covered = q_ovl.float().unsqueeze(2) * vec_que.view(self.batch_size, self.que_len, -1)
        
#         vec_que_covered = vec_que_covered.view(-1, self.batch_size, self.que_len)
#         vec_ans_covered = vec_ans_covered.view(-1, self.batch_size, self.ans_len)
#         vec_que_covered = self.que_max_pooling(vec_que_covered)
#         vec_ans_covered = self.ans_max_pooling(vec_ans_covered)
#         vec_que_covered = torch.squeeze(vec_que_covered).view(self.batch_size, -1)
#         vec_ans_covered = torch.squeeze(vec_ans_covered).view(self.batch_size, -1)
        
        
#         cos_sim = nn.CosineSimilarity(dim=-1)(final_que, final_ans)
#         cos_sim = cos_sim.view(-1, 1)
        
#         features = torch.cat([diff_vec, final_que, final_ans,\
#                               vec_que_covered, vec_ans_covered,\
#                               ans_coverance, que_coverance], dim=-1) # no sim
#         denser_features = self.fullyconnected(features)
#         # y  = cos_sim
#         y = self.hidden2label(denser_features)
#         return y, cos_sim
# print(len(inv_id_mapper)+1)
# lstm_classifier = LSTMClassifier(embedding_dim=300, hidden_dim=128,  que_len=QUESTION_LENGTH,
#                                  ans_len=ANSWER_LENGTH, pos_embedding_dim=30,
#                                  vocab_size=len(inv_id_mapper), 
#                                  label_size=2, batch_size=64, use_gpu=True)
# lstm_classifier = lstm_classifier.cuda()
# learning_rate = 0.001
# optimizer = optim.Adam(lstm_classifier.parameters(), lr=learning_rate)
# # loss_function = SimLoss(0.2)
# loss_function = nn.CrossEntropyLoss()

In [19]:
def get_recall_precision_f(TP, TN, FP, FN):
    return {'recall':TP.cpu().numpy()/(TP.cpu().numpy()+FN.cpu().numpy()), 
          'precision':TP.cpu().numpy()/(TP.cpu().numpy()+FP.cpu().numpy())}

In [20]:
def validate_vocabulary():
    for iter, valdata in enumerate(VALIDSET_LOADER):
        if iter > 5:
            break
        q, q_pos, a, a_pos, val_labels = valdata
        val_labels = torch.squeeze(val_labels)

        q, a, val_labels = Variable(q.cuda()), Variable(a.cuda()), val_labels.cuda()

        required_answers = val_labels.sum()        
        ind_true = (val_labels==1).nonzero()

        if required_answers <= 0:
            continue

        if not ind_true.shape:
            continue

        if len(val_labels.shape) == 0:
            continue

        N = val_labels.shape[0]
        if N == 0:
            continue
        lstm_classifier.batch_size = len(val_labels)
        lstm_classifier.hidden = lstm_classifier.init_hidden()
        output = lstm_classifier(q, a)
        output = output[-1]
        for qs in q:
            for wrds in qs:
                for wrd in wrds:
                    print(inv_vocab[inv_id_mapper[int(wrd)]])

In [21]:
def translate_seq(seq):
    ans = []
    for wrd in seq:
        ans.append(inv_vocab[inv_id_mapper[int(wrd)]])
    return ans

In [22]:
def evaluate_model(model, VALIDSET_LOADER, input_type='probs'):
    model.eval()
    
    inv_rank_sum = 0.0
    ave_prec_sum = 0.0
    total_questions = 0.0
    
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    
    for iter, valdata in enumerate(VALIDSET_LOADER):
        lq, q, q_pos, q_ovl, las, a, a_pos, a_ovl, val_labels = valdata
        val_labels = torch.squeeze(val_labels)

        q, a, val_labels = Variable(q.cuda()), Variable(a.cuda()), val_labels.cuda()
        q_pos, a_pos = Variable(q_pos.cuda()), Variable(a_pos.cuda())
        q_ovl, a_ovl = Variable(q_ovl.cuda()), Variable(a_ovl.cuda())
        lq, las = Variable(lq.cuda()), Variable(las.cuda())
        
        required_answers = val_labels.sum()
        ind_true = (val_labels==1).nonzero()
        
        if required_answers <= 0:
            continue
        
        if not ind_true.shape:
            continue
        
        if len(val_labels.shape) == 0:
            continue
        
        N = val_labels.shape[0]
        if N == 0:
            continue
        model.batch_size = len(val_labels)
        model.hidden = model.init_hidden()
        output = model(lq, q, q_pos, q_ovl, las, a, a_pos, a_ovl)
        
        _, predicted = torch.max(output[0].data, 1)
        FP += ((1-val_labels)*(predicted)).sum().double()
        FN += ((val_labels)*(1-predicted)).sum().double()
        TP += (val_labels*predicted).sum().double()            
        TN += ((1-val_labels)*(1-predicted)).sum().double()
        
        output = output[0][:,1]
        _, ranking = torch.sort(output, dim=0, descending=True)
        ranking = torch.squeeze(ranking)
        
        rank_of_true = []
        
        ave_p = 0.0
        for x in ind_true:
            rank_of_true.append(torch.squeeze((ranking == x).nonzero()))

        rank_of_true = torch.stack(rank_of_true)
        rank_of_true, _ = torch.sort(rank_of_true)

        for _idx, x in enumerate(rank_of_true):
            ave_p += _idx + 1 / (x.double() + 1)
        
        ave_p =  ave_p / required_answers.double()
        
        inv_rank_sum += 1.0 / (1 + rank_of_true[0].double())
        ave_prec_sum += ave_p
        total_questions += 1.0
        
    result = {'samples': total_questions, 'MRR': inv_rank_sum / total_questions, 'MAP': ave_prec_sum/total_questions}
    print(get_recall_precision_f(TP, TN, FP, FN))
    return result


In [23]:
def train_model(model, TRAINSET_LOADER, VALIDSET_LOADER, runs=5, steps_per_run=100):
    for epoch_id in range(runs):
        num_batches = 0
        model.train()

        total_acc = 0.0
        total_loss = 0.0
        total = 0.0
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for iter, traindata in enumerate(TRAINSET_LOADER): #VALIDSETASTRAIN_LOADER): # 
            if num_batches > steps_per_run:
                break
            num_batches += 1
            lq, q, q_pos, q_ovl, la, a, a_pos, a_ovl, train_labels = traindata
            train_labels = torch.squeeze(train_labels)

            q, a, train_labels = Variable(q.cuda()), Variable(a.cuda()), train_labels.cuda()
            q_pos, a_pos = Variable(q_pos.cuda()), Variable(a_pos.cuda())
            q_ovl, a_ovl = Variable(q_ovl.cuda()), Variable(a_ovl.cuda())

            lq, la = Variable(lq.cuda()), Variable(la.cuda())

            model.zero_grad()
            model.batch_size = len(train_labels)
            model.hidden = model.init_hidden()        
            output = model(lq, q, q_pos, q_ovl, la, a, a_pos, a_ovl)
            # output: [prob_neg, prob_pos], score
            loss = loss_function(output[0], Variable(train_labels))
            loss.backward()
            optimizer.step()
        
            _, predicted = torch.max(output[0].data, 1)
            FP += ((1-train_labels)*(predicted)).sum().double()
            FN += ((train_labels)*(1-predicted)).sum().double()
            TP += (train_labels*predicted).sum().double()            
            TN += ((1-train_labels)*(1-predicted)).sum().double()
            
            total_acc += (predicted == train_labels).sum().double() / len(train_labels)
            total += 1.0
            total_loss += loss.data
            pass
        
        with open('./logs/train_log.log', 'a') as fout:
            fout.write('train:'+' '+ str(total_acc.cpu().numpy() / total) +' '+ str(total_loss.cpu().numpy() / total))
            fout.write(str(get_recall_precision_f(TP, TN, FP, FN)))
            fout.write('\n')
            fout.write(str(evaluate_model(model, VALIDSET_LOADER, input_type="probs")))
            fout.write('\n\n')

In [24]:
evaluate_model(lstm_classifier, VALIDSET_LOADER)

{'recall': 0.056173200702165006, 'precision': 0.033566433566433566}


{'MAP': tensor(0.1954, dtype=torch.float64, device='cuda:0'),
 'MRR': tensor(0.1773, dtype=torch.float64, device='cuda:0'),
 'samples': 1637.0}

In [27]:
def load_model(PATH):
    loaded =  torch.load(PATH)
    loaded = loaded.cuda()
    loaded.eval()
    return loaded

In [26]:
train_model(lstm_classifier, TRAINSET_LOADER, VALIDSET_LOADER, runs=3, steps_per_run=50)

{'recall': 0.85020479812756, 'precision': 0.1348491879350348}


Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f694190f128>>
Traceback (most recent call last):
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 349, in __del__
    self._shutdown_workers()
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 328, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 70, in rebuild_storage_fd
    fd = df.detach()
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connec

{'recall': 0.7682855471035693, 'precision': 0.23589651455264105}


Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f6941c476d8>>
Traceback (most recent call last):
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 349, in __del__
    self._shutdown_workers()
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 328, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 70, in rebuild_storage_fd
    fd = df.detach()
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connec

{'recall': 0.6612053832650673, 'precision': 0.34855027760641577}


In [33]:
torch.save(lstm_classifier, '../PRETRAINED/LSTMCLASSIFIER.model')

In [26]:
# torch.save(lstm_classifier.state_dict(), './logs/lstm_classifier.model')

In [25]:
# me_vec, us_vec, you_vec = embedding_pretrained[id_mapper[vocab['我']]],embedding_pretrained[id_mapper[vocab['美国']]],embedding_pretrained[id_mapper[vocab['你']]]
# me_vec = lstm_classifier.word_embeddings.weight[id_mapper[vocab['我']]].cpu().detach().numpy()
# us_vec= lstm_classifier.word_embeddings.weight[id_mapper[vocab['美国']]].cpu().detach().numpy()
# you_vec = lstm_classifier.word_embeddings.weight[id_mapper[vocab['你']]].cpu().detach().numpy()

In [126]:
def check_valid_output(lstm_classifier, VALIDSET_LOADER, indx):
    lstm_classifier.eval()

    for _idx, valdata in enumerate(VALIDSET_LOADER):
        if not _idx == indx:
            continue

        lq, q, q_pos, q_ovl, las, a, a_pos, a_ovl, val_labels = valdata
        val_labels = torch.squeeze(val_labels)

        q, a, val_labels = Variable(q.cuda()), Variable(a.cuda()), val_labels.cuda()
        q_pos, a_pos = Variable(q_pos.cuda()), Variable(a_pos.cuda())
        q_ovl, a_ovl = Variable(q_ovl.cuda()), Variable(a_ovl.cuda())

        lq, las = Variable(lq.cuda()), Variable(las.cuda())
        print(q_ovl)
        print(a_ovl)

        lstm_classifier.batch_size = len(val_labels)
        lstm_classifier.hidden = lstm_classifier.init_hidden()
        output = lstm_classifier(lq, q, q_pos, q_ovl, las, a, a_pos, a_ovl)

        output = output[0][:,1]
        
        _, ranking = torch.sort(output, dim=0, descending=True)
        ranking = torch.squeeze(ranking)
        
        rank_of_true = []
        ind_true = (val_labels==1).nonzero()
        
        ave_p = 0.0
        for x in ind_true:
            rank_of_true.append(torch.squeeze((ranking == x).nonzero()))

        rank_of_true = torch.stack(rank_of_true)
        rank_of_true, _ = torch.sort(rank_of_true)

        for _idx, x in enumerate(rank_of_true):
            ave_p += _idx + 1 / (x.double() + 1)
        
        inv_rank = 1.0 / (1 + rank_of_true[0].double())
        
        print('ave_p', ave_p)
        print('inv_rank', inv_rank, rank_of_true[0].double())
        for que, ans, t in zip(q, a, [val_labels]):
            for qq, aa, pp, tt in zip(que, ans, output, t):
                print(''.join(translate_seq(qq)))
                print(''.join(translate_seq(aa)))
                print(tt, pp)
        break

In [127]:
def check_train_output(indx):
    for _idx, traindata in enumerate(TRAINSET_LOADER):
        if not _idx == indx:
            continue
        lstm_classifier.eval()
        lq, q, q_pos, q_ovl, la, a, a_pos, a_ovl, train_labels = traindata

        q, a, train_labels = Variable(q.cuda()), Variable(a.cuda()), train_labels.cuda()
        q_pos, a_pos = Variable(q_pos.cuda()), Variable(a_pos.cuda())
        q_ovl, a_ovl = Variable(q_ovl.cuda()), Variable(a_ovl.cuda())

        lq, la = Variable(lq.cuda()), Variable(la.cuda())
        
        lstm_classifier.zero_grad()
        lstm_classifier.batch_size = len(train_labels)
        lstm_classifier.hidden = lstm_classifier.init_hidden()        
        output = lstm_classifier(lq, q, q_pos, q_ovl, la, a, a_pos, a_ovl)

        print(q.shape)
        print(a.shape)
        print(output[0].shape)
        for que, ans, t, p, sim in zip(q, a, train_labels, output[0], output[-1]):
                print(''.join(translate_seq(que)))
                print(''.join(translate_seq(ans)))
                print(t, p, sim)
        break

In [128]:
check_train_output(30)

torch.Size([64, 15])
torch.Size([64, 45])
torch.Size([64, 2])
易方达India.代销哪些银行？<pad><pad><pad><pad><pad><pad><pad><pad>
托管费率0<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
tensor([ 0], device='cuda:0') tensor([ 0.6394, -1.6225], device='cuda:0') tensor([ 0.1949], device='cuda:0')
前0<pad>齐国国夏进攻哪个国家？<pad><pad><pad><pad><pad><pad>
前India.，齐国国夏进攻鲁国。<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
tensor([ 1], device='cuda:0') tensor([-0.6050,  0.7585], device='cuda:0') tensor([ 0.2230], device='cuda:0')
大岩山（岩）组是浙江地矿局India.地质大队？<pad>
浙江地矿局India.地质大队，0<pad>命名。<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [129]:
check_valid_output(lstm_classifier, VALIDSET_LOADER, 56)

tensor([[[ 0,  1,  1,  0,  1,  0,  1,  1,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  1,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 0,  0,  0,  0,  0

Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f551bbda438>>
Traceback (most recent call last):
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 349, in __del__
    self._shutdown_workers()
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 328, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/home/jeff/anaconda3/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 70, in rebuild_storage_fd
    fd = df.detach()
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/home/jeff/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connec