In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell is executed so source edits are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from importlib import reload 

In [3]:
from deeprank.dataset import DataLoader, PairGenerator, ListGenerator
from deeprank import utils

In [4]:
# Seed PyTorch's global random number generator.
# The trailing semicolon keeps the cell from rendering the returned Generator
# repr, whose memory address differs on every run and only adds output noise.
seed = 1234
torch.manual_seed(seed);

<torch._C.Generator at 0x7fc43014da90>

In [5]:
# Build the shared data loader from the model config file. Its word dictionary,
# embedding matrix and query/document data are used by the cells below.
config_path = './config/letor07_mp_fold1.model'
loader = DataLoader(config_path)

[./data/letor/r5w/word_dict.txt]
	Word dict size: 193367
[./data/letor/r5w/qid_query.txt]
	Data size: 1692
[./data/letor/r5w/docid_doc.txt]
	Data size: 65323
[./data/letor/r5w/embed_wiki-pdc_d50_norm]
	Embedding size: 109282
Generate numpy embed: (193368, 50)


In [6]:
import json

# Parse the JSON model configuration. The context manager closes the file
# handle as soon as parsing is done instead of leaving it to the garbage collector.
with open('./config/letor07_mp_fold1.model') as config_file:
    letor_config = json.load(config_file)

# Device used for every tensor and module in this notebook; swap the two lines
# below to run on a GPU.
#device = torch.device("cuda")
device = torch.device("cpu")

In [7]:
# Data directory as declared in the config file.
Letor07Path = letor_config['data_dir']

# Copy loader-derived values into the shared config dict.
# 'fill_word' and 'pad_value' are both set to the loader's padding id.
letor_config.update({
    'fill_word': loader._PAD_,
    'embedding': loader.embedding,
    'feat_size': loader.feat_size,
    'vocab_size': loader.embedding.shape[0],
    'embed_dim': loader.embedding.shape[1],
    'pad_value': loader._PAD_,
})

# Pair generator over the training relation file of the configured fold.
train_rel_file = Letor07Path + '/relation.train.fold%d.txt' % (letor_config['fold'])
pair_gen = PairGenerator(rel_file=train_rel_file, config=letor_config)

[./data/letor/r5w/relation.train.fold1.txt]
	Instance size: 47828
Pair Instance Count: 325439


In [8]:
from deeprank import select_module
from deeprank import rank_module

In [9]:
# Selector option 1: IdentityNet, put in training mode and moved to `device`.
# NOTE: the following cells rebind `select_net`, so on a top-to-bottom run this
# instance is replaced before it is used.
select_net = select_module.IdentityNet(config=letor_config).train().to(device)

In [10]:
# Selector option 2: QueryCentricNet, configured through the shared config dict,
# put in training mode and moved to `device`. Rebinds `select_net`.
letor_config.update(q_limit=20, d_limit=500, max_match=5, win_size=5)
select_net = select_module.QueryCentricNet(config=letor_config).train().to(device)

In [11]:
# Selector option 3: PointerNet. It is the last cell to bind `select_net`, so it
# is the selector in effect for the rest of a top-to-bottom run.
letor_config.update(
    q_limit=20,
    d_limit=500,
    max_match=5,
    win_size=5,
    finetune_embed=False,
)
select_net = select_module.PointerNet(config=letor_config).train().to(device)

In [12]:
# Disabled alternative MatchPyramidNet setup (different conv_params / fc_params),
# kept for reference only; the active configuration is in the next cell.
# letor_config['simmat_channel'] = 1
# letor_config['conv_params'] = [(8, 2, 10)]
# letor_config['fc_params'] = [50]
# letor_config['dpool_size'] = [3, 10]
# letor_config['lr'] = 0.001
# letor_config['finetune_embed'] = False
# rank_net = rank_module.MatchPyramidNet(config=letor_config)
# rank_net.embedding.weight.data.copy_(torch.from_numpy(loader.embedding))
# rank_net.train()
# optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [13]:
# Hyper-parameters for the ranking network, written into the shared config dict.
letor_config.update({
    'simmat_channel': 1,
    'conv_params': [(8, 3, 3)],
    'fc_params': [200],
    'dpool_size': [3, 10],
    'lr': 0.001,
    'finetune_embed': False,
})

# Build the ranker on `device`, overwrite its embedding table with the
# embedding matrix held by the loader, and switch it to training mode.
rank_net = rank_module.MatchPyramidNet(config=letor_config).to(device)
pretrained_embedding = torch.from_numpy(loader.embedding)
rank_net.embedding.weight.data.copy_(pretrained_embedding)
rank_net.train()

# Adam over every parameter exposed by the ranker, with the configured learning rate.
optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [14]:
def to_device(*variables):
    """Lazily turn NumPy arrays into tensors on the notebook-level ``device``.

    Yields one tensor per positional argument, in argument order, so the result
    can be tuple-unpacked by the caller. Each array is converted only when the
    returned generator is advanced.
    """
    for array in variables:
        yield torch.from_numpy(array).to(device)

In [15]:
def show_text(x):
    """Print the words behind a sequence of word ids as one space-separated line.

    Every element of ``x`` must provide ``.item()``; the resulting id is looked
    up in the notebook-level ``loader.word_dict``.
    """
    words = (loader.word_dict[token.item()] for token in x)
    print(' '.join(words))

In [16]:
# Draw one batch from the pair generator and move the array fields to `device`.
# The eighth field is bound to `feats` rather than `F` so that it does not
# overwrite the `torch.nn.functional as F` alias from the import cell.
# (Presumably this field holds extra per-pair features - confirm against PairGenerator.)
X1, X1_len, X1_id, X2, X2_len, X2_id, Y, feats = \
        pair_gen.get_batch(data1=loader.query_data, data2=loader.doc_data)
X1, X1_len, X2, X2_len, Y, feats = \
        to_device(X1, X1_len, X2, X2_len, Y, feats)

# Text of the first X2 entry before selection.
show_text(X2[0])

# Run the selector, then print the first X1 entry and the first five entries
# selected for the first X2 item.
X1, X2_new, X1_len, X2_len_new = select_net(X1, X2, X1_len, X2_len, X1_id, X2_id)
show_text(X1[0])
for i in range(5):
    print(i, end=' ')
    show_text(X2_new[0][i])

department regulation license report decision state wisconsin board nursing matter application frank sacco rn order adopt stipulation respondent july frank sacco rn file application license practice register nurse examination sacco application indicate criminal conviction record invite make appearance board connection application sacco appear december board sacco enter stipulation board agree grant sacco agree accept limited license practice register nurse wisconsin base stipulation information record ordered follow order ordered frank sacco rn grant limited license practice professional nurse wisconsin imposing following terms conditions effective date limited license shall date sacco notify pass national council license examination register nurse term limited license sacco shall responsible submission board quarterly written report prepared sacco employment supervise setting sacco activity progress employment days effective date limited license sacco shall submit board documentation 

In [17]:
# Keep only the first example of every batch field (slicing with [:1] keeps
# the leading batch dimension).
X1, X1_len, X2, X2_len, X1_id, X2_id = (
    batch_field[:1]
    for batch_field in (X1, X1_len, X2, X2_len, X1_id, X2_id)
)

In [18]:
# Same preview as before, now on the single-example batch: print the X2 text,
# run the selector, then print the X1 text and the first five selected entries.
show_text(X2[0])
selector_output = select_net(X1, X2, X1_len, X2_len, X1_id, X2_id)
X1, X2_new, X1_len, X2_len_new = selector_output
show_text(X1[0])
for i in range(5):
    print(i, end=' ')
    show_text(X2_new[0][i])

department regulation license report decision state wisconsin board nursing matter application frank sacco rn order adopt stipulation respondent july frank sacco rn file application license practice register nurse examination sacco application indicate criminal conviction record invite make appearance board connection application sacco appear december board sacco enter stipulation board agree grant sacco agree accept limited license practice register nurse wisconsin base stipulation information record ordered follow order ordered frank sacco rn grant limited license practice professional nurse wisconsin imposing following terms conditions effective date limited license shall date sacco notify pass national council license examination register nurse term limited license sacco shall responsible submission board quarterly written report prepared sacco employment supervise setting sacco activity progress employment days effective date limited license sacco shall submit board documentation 

In [19]:
# Padding-token sanity check.
# NOTE: a notebook cell only renders its last expression, so the next two
# lookups are evaluated (and would raise on a missing key/attribute) but their
# values are not shown.
loader.iword_dict['[PAD]']
loader._PAD_
# Embedding vector of the padding id. The index tensor is created on `device`
# so the lookup keeps working when the notebook is switched to a GPU.
select_net.embedding(torch.tensor([loader._PAD_], device=device))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

In [25]:
import time

# Number of mini-batch updates performed by this cell.
n_steps = 150

start_t = time.time()
for step in range(n_steps):
    # Draw a batch of pairs. The eighth field is bound to `feats` rather than
    # `F` so it does not overwrite the `torch.nn.functional as F` alias.
    X1, X1_len, X1_id, X2, X2_len, X2_id, Y, feats = \
        pair_gen.get_batch(data1=loader.query_data, data2=loader.doc_data)
    X1, X1_len, X2, X2_len, Y, feats = \
        to_device(X1, X1_len, X2, X2_len, Y, feats)

    # Selector -> adaptor -> ranker forward pass.
    X1, X2, X1_len, X2_len = select_net(X1, X2, X1_len, X2_len, X1_id, X2_id)
    X2, X2_len = utils.data_adaptor(X2, X2_len, select_net, rank_net, letor_config)
    output = rank_net(X1, X2, X1_len, X2_len, 0)

    # Pairwise loss, then one optimizer step on the ranker's parameters.
    loss = rank_net.pair_loss(output, Y)
    print(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
end_t = time.time()
print('Time Cost: %s s' % (end_t-start_t))

0.8452639579772949
0.7408360242843628
0.8817998170852661
0.8717394471168518
0.7674793601036072
0.9300913214683533
0.7658073902130127
0.9253232479095459
0.8339310884475708
1.0122829675674438
0.9256452322006226
1.0089644193649292
0.7635301351547241
0.9707406163215637
0.7831112146377563
0.9157482981681824
0.9178041219711304
0.7672064304351807
0.7871608138084412
0.8634348511695862
0.7072489261627197
0.9312631487846375
0.6571201682090759
0.8149318099021912
0.8332382440567017
0.8976726531982422
0.7403347492218018
0.8171766996383667
0.8814313411712646
0.7945161461830139
0.8885066509246826
0.6474040746688843
0.8585633039474487
0.8258967399597168
0.801921010017395
0.8014179468154907
0.7138195633888245
0.9144049286842346
0.9989900588989258
0.9851740002632141
0.7311701774597168
0.9250863790512085
0.9955953359603882
0.8455612659454346
0.85248202085495
0.8517870306968689
0.7686854004859924
0.7951820492744446
0.6983522772789001
0.844882071018219
0.7505353093147278
0.7465106248855591
0.86195194721221

In [26]:
# Save the parameters (state_dict) of both networks.
# NOTE: the selector file is named "identity.ckpt" regardless of which selector
# class `select_net` is currently bound to.
for net, ckpt_path in ((select_net, "identity.ckpt"), (rank_net, "matchpyramid.ckpt")):
    torch.save(net.state_dict(), ckpt_path)

In [27]:
# Serialize the complete module objects (not just their parameters); these
# files are what the evaluation cell loads back.
for net, model_path in ((select_net, "identity.model"), (rank_net, "matchpyramid.model")):
    torch.save(net, model_path)

In [28]:
# Display the ranking network's layer structure.
rank_net

MatchPyramidNet(
  (embedding): Embedding(193368, 50, padding_idx=193367)
  (conv_sequential): Sequential(
    (0): Conv2d(1, 8, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  )
  (dpool_layer): AdaptiveMaxPool2d(output_size=[3, 10])
  (fc_sequential): Sequential(
    (0): Linear(in_features=240, out_features=200, bias=True)
  )
  (out_layer): Linear(in_features=200, out_features=1, bias=True)
)

In [29]:
# Reload the serialized modules onto the current device and put them in
# evaluation mode. `map_location` lets files saved on another device be loaded here.
# NOTE(review): these files are full pickles; only load files produced by this notebook.
select_net_e = torch.load(f='identity.model', map_location=device)
rank_net_e = torch.load(f='matchpyramid.model', map_location=device)
select_net_e.eval()
rank_net_e.eval()

list_gen = ListGenerator(rel_file=Letor07Path+'/relation.test.fold%d.txt'%(letor_config['fold']),
                         config=letor_config)
map_v = 0.0
map_c = 0.0

with torch.no_grad():
    # One batch per generated list. The eighth field is bound to `feats` rather
    # than `F` so it does not overwrite the `torch.nn.functional as F` alias.
    for X1, X1_len, X1_id, X2, X2_len, X2_id, Y, feats in \
            list_gen.get_batch(data1=loader.query_data, data2=loader.doc_data):
        X1, X1_len, X2, X2_len, Y, feats = to_device(X1, X1_len, X2, X2_len, Y, feats)
        X1, X2, X1_len, X2_len = select_net_e(X1, X2, X1_len, X2_len, X1_id, X2_id)
        # Use the reloaded modules throughout, not the in-memory training objects.
        X2, X2_len = utils.data_adaptor(X2, X2_len, select_net_e, rank_net_e, letor_config)
        pred = rank_net_e(X1, X2, X1_len, X2_len, 0)
        map_o = utils.eval_MAP(pred.tolist(), Y.tolist())
        map_v += map_o
        map_c += 1.0

# Average over lists; report NaN instead of dividing by zero when the
# generator produced no lists.
map_v = map_v / map_c if map_c else float('nan')

print('[Test]', map_v)

[./data/letor/r5w/relation.test.fold1.txt]
	Instance size: 13652
List Instance Count: 336
[Test] 0.3786513414875789
