In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from importlib import reload 

In [3]:
from deeprank.dataset import DataLoader, PairGenerator, ListGenerator

In [4]:
# Build the deeprank DataLoader from the fold-1 model config file.
# Per the recorded output below, construction reads the word dict, the
# query and document tables and the embedding file, then generates a
# numpy embedding matrix.
loader = DataLoader('./config/letor07_mp_fold1.model')

[./data/letor/r5w/word_dict.txt]
	Word dict size: 193367
[./data/letor/r5w/qid_query.txt]
	Data size: 1692
[./data/letor/r5w/docid_doc.txt]
	Data size: 65323
[./data/letor/r5w/embed_wiki-pdc_d50_norm]
	Embedding size: 109282
Generate numpy embed: (193368, 50)


In [5]:
import json
# Parse the model config (a JSON document) into a plain dict.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left for the garbage collector.
with open('./config/letor07_mp_fold1.model') as config_file:
    letor_config = json.load(config_file)

# All tensors and modules in this notebook are placed on `device`.
# Swap the two lines below to run on a GPU instead.
# device = torch.device("cuda")
device = torch.device("cpu")

In [91]:
# Root directory of the LETOR data, taken from the JSON config.
Letor07Path = letor_config['data_dir']

# Add the values that are only known once the data has been loaded:
# padding word id, embedding matrix and the sizes derived from it.
letor_config.update({
    'fill_word': loader._PAD_,
    'embedding': loader.embedding,
    'feat_size': loader.feat_size,
    'vocab_size': loader.embedding.shape[0],
    'embed_dim': loader.embedding.shape[1],
})

# Pair generator over the training relation file of the configured fold.
pair_gen = PairGenerator(
    rel_file=Letor07Path + '/relation.train.fold%d.txt' % letor_config['fold'],
    config=letor_config,
)

[./data/letor/r5w/relation.train.fold1.txt]
	Instance size: 47828
Pair Instance Count: 325439


In [7]:
from deeprank import select_module
from deeprank import rank_module

# Re-execute both deeprank sub-modules and rebind the names to the results,
# selection module first, ranking module second.
select_module, rank_module = reload(select_module), reload(rank_module)

In [8]:
# select_net = select_module.IdentityNet(config=letor_config)
# select_net.train()
# select_net = select_net.to(device)

In [98]:
# Build the query-centric selection network from the shared config.
select_net = select_module.QueryCentricNet(config=letor_config)
# Presumably an nn.Module: .train() would then switch it to training mode — TODO confirm.
select_net.train()
# Place the network on the device chosen in the config cell.
select_net = select_net.to(device)

In [57]:
# letor_config['simmat_channel'] = 1
# letor_config['conv_params'] = [(8, 2, 10)]
# letor_config['fc_params'] = [50]
# letor_config['dpool_size'] = [3, 10]
# letor_config['lr'] = 0.001
# letor_config['finetune_embed'] = False
# rank_net = rank_module.MatchPyramidNet(config=letor_config)
# rank_net.embedding.weight.data.copy_(torch.from_numpy(loader.embedding))
# rank_net.train()
# optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [58]:
# Hyper-parameters for the MatchPyramid ranker. The recorded repr of
# `rank_net` further down shows how they materialise:
#   simmat_channel=1        -> Conv2d input channels = 1
#   conv_params=[(8, 3, 3)] -> one Conv2d with 8 output channels, 3x3 kernel
#   dpool_size=[3, 10]      -> AdaptiveMaxPool2d(output_size=[3, 10])
#   fc_params=[200]         -> one Linear layer with 200 output features
letor_config['simmat_channel'] = 1
letor_config['conv_params'] = [(8, 3, 3)]
letor_config['fc_params'] = [200]
letor_config['dpool_size'] = [3, 10]
letor_config['lr'] = 0.001
letor_config['finetune_embed'] = False

rank_net = rank_module.MatchPyramidNet(config=letor_config)
rank_net = rank_net.to(device)

# Initialise the embedding table from the loader's numpy matrix.
# The in-place copy runs under no_grad rather than through `.data`, so autograd
# does not record it while the tensor's bookkeeping stays consistent.
with torch.no_grad():
    rank_net.embedding.weight.copy_(torch.from_numpy(loader.embedding))

rank_net.train()
optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [59]:
def to_device(*variables, target_device=None):
    """Convert numpy arrays to torch tensors placed on a device.

    Args:
        *variables: numpy arrays, each converted with ``torch.from_numpy``.
        target_device: optional ``torch.device`` (or device string) to move
            the tensors to. When omitted, the notebook-level ``device``
            global is used, which matches the original behaviour.

    Returns:
        A tuple of tensors in the same order as ``variables``. A tuple is
        returned instead of a lazy generator so the result can be indexed,
        measured with ``len`` and iterated more than once; tuple unpacking
        by existing callers works unchanged.
    """
    if target_device is None:
        # Fall back to the device selected in the configuration cell.
        target_device = device
    return tuple(torch.from_numpy(variable).to(target_device) for variable in variables)

In [12]:
import time

# Short training run: 50 pair batches through the selection network and the
# ranker, printing the pairwise loss of every step and the total wall time.
#
# NOTE(review): the later inspection cell unpacks eight values from
# pair_gen.get_batch (including X1_id / X2_id) and passes the ids to
# select_net, while this loop uses the six-value / four-argument form. The
# execution counts (In [12] here vs In [110] there) suggest this cell last ran
# against an earlier version of those APIs — confirm it still runs on a fresh
# kernel before relying on it.
start_t = time.time()
for i in range(50):
    # The last element is bound to `feat`, not `F`, so the batch does not
    # overwrite the `torch.nn.functional as F` import from the top of the
    # notebook. Presumably it holds the per-pair feature array — TODO confirm.
    X1, X1_len, X2, X2_len, Y, feat = pair_gen.get_batch(data1=loader.query_data, data2=loader.doc_data)
    X1, X1_len, X2, X2_len, Y, feat = to_device(X1, X1_len, X2, X2_len, Y, feat)
    X1, X2, X1_len, X2_len = select_net(X1, X2, X1_len, X2_len)
    output = rank_net(X1, X2, X1_len, X2_len)
    loss = rank_net.pair_loss(output, Y)
    print(loss.item())
    # Standard optimisation step: clear old gradients, backpropagate, update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
end_t = time.time()
print('Time Cost: %s s' % (end_t-start_t))

0.9942600131034851
0.9926823377609253
0.9657775163650513
1.0432099103927612
0.8972198963165283
0.9261519908905029
1.0405492782592773
1.0220781564712524
1.013228178024292
1.0005723237991333
0.8980894684791565
1.0080732107162476
0.9084506034851074
0.9161213636398315
0.8033925294876099
0.9157216548919678
0.9668008089065552
0.7468870282173157
0.8263317346572876
0.8480381369590759
0.8733361959457397
0.8737118244171143
0.8586342930793762
0.7700794339179993
0.581363320350647
0.6984531283378601
0.9883303046226501
0.7731813192367554
0.5680857300758362
1.338994026184082
0.7105446457862854
0.7083640098571777
0.7716987133026123
0.6268550157546997
0.7208281755447388
0.7603832483291626
1.070916771888733
0.5627222061157227
0.6443978548049927
0.8027175664901733
0.5620582103729248
0.456104576587677
0.7922399640083313
1.0084519386291504
1.1783521175384521
0.9524479508399963
1.2779537439346313
1.0437955856323242
0.4700107276439667
0.5086148381233215
Time Cost: 102.60761904716492 s


In [13]:
# Show the module repr to inspect the layer layout of the ranker.
rank_net

MatchPyramidNet(
  (embedding): Embedding(193368, 50, padding_idx=0)
  (conv_sequential): Sequential(
    (0): Conv2d(1, 8, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  )
  (dpool_layer): AdaptiveMaxPool2d(output_size=[3, 10])
  (fc_sequential): Sequential(
    (0): Linear(in_features=240, out_features=200, bias=True)
  )
  (out_layer): Linear(in_features=200, out_features=1, bias=True)
)

In [110]:
# Pull a single batch and inspect what the selection network produces.
# The last element is bound to `feat`, not `F`, so the batch does not overwrite
# the `torch.nn.functional as F` import from the top of the notebook.
# Presumably it holds the per-pair feature array — TODO confirm.
X1, X1_len, X1_id, X2, X2_len, X2_id, Y, feat = pair_gen.get_batch(data1=loader.query_data, data2=loader.doc_data)
# The id arrays are passed to select_net as returned; only the numeric arrays are converted.
X1, X1_len, X2, X2_len, Y, feat = to_device(X1, X1_len, X2, X2_len, Y, feat)
print(X1.shape, X2.shape, X1_len.shape, X2_len.shape)
X1, X2, X1_len, X2_len = select_net(X1, X2, X1_len, X2_len, X1_id, X2_id)
print(X2[0].shape, X2_len[0])
# Shapes after selection (author's notes):
# X2     - list of tensor, batch_size * tensor([total_snip * win_size])
# X2_len - list of list,   dim0: batch_size, dim1: n_query, value: n_match

torch.Size([128, 20]) torch.Size([128, 500]) torch.Size([128]) torch.Size([128])
torch.Size([54, 31]) [20, 20, 14, 0]


In [111]:
# Number of entries select_net returned for X2.
# NOTE(review): the recorded value is 117 although the batch above has 128
# rows — presumably some pairs are dropped during selection; confirm.
len(X2)

117