In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes so on-disk edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from importlib import reload 

In [3]:
from deeprank.dataset import DataLoader, PairGenerator, ListGenerator

In [4]:
# Build the data loader from the fold-1 model config. As the recorded output
# shows, construction reads the word dict, queries, documents and embeddings.
loader = DataLoader('./config/letor07_mp_fold1.model')

[./data/letor/r5w/word_dict.txt]
	Word dict size: 193367
[./data/letor/r5w/qid_query.txt]
	Data size: 1692
[./data/letor/r5w/docid_doc.txt]
	Data size: 65323
[./data/letor/r5w/embed_wiki-pdc_d50_norm]
	Embedding size: 109282
Generate numpy embed: (193368, 50)


In [5]:
import json

# Path of the JSON experiment config (the same file that is handed to DataLoader).
CONFIG_PATH = './config/letor07_mp_fold1.model'

# Read the config through a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(CONFIG_PATH) as config_file:
    letor_config = json.load(config_file)

# The selection network is kept on the CPU.
select_device = torch.device("cpu")
# The ranking network goes on the GPU; fall back to the CPU when no CUDA
# device is available so the notebook still runs end to end.
rank_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Root directory of the data files, as named by the experiment config.
Letor07Path = letor_config['data_dir']

# Copy the loader-derived values into the shared config in a single update.
letor_config.update({
    'fill_word': loader._PAD_,
    'embedding': loader.embedding,
    'feat_size': loader.feat_size,
    'vocab_size': loader.embedding.shape[0],
    'embed_dim': loader.embedding.shape[1],
    'pad_value': loader._PAD_,
})

# Pair generator over the training relation file of the configured fold.
pair_gen = PairGenerator(
    rel_file=Letor07Path + '/relation.train.fold%d.txt' % (letor_config['fold']),
    config=letor_config,
)

[./data/letor/r5w/relation.train.fold1.txt]
	Instance size: 47828
Pair Instance Count: 325439


In [7]:
from deeprank import select_module
from deeprank import rank_module

# Explicitly re-import both modules and rebind the names to the reloaded
# module objects, so later cells use the current on-disk code.
select_module = reload(select_module)
rank_module = reload(rank_module)

In [8]:
# Alternative selector (disabled): IdentityNet in place of QueryCentricNet.
# select_net = select_module.IdentityNet(config=letor_config)
# select_net.train()
# select_net = select_net.to(select_device)

In [9]:
# Snippet selector. The module itself is placed on select_device (CPU), while
# out_device=rank_device is passed in; the batch cell's recorded output shows
# the returned X2_pos tensor on cuda:0.
select_net = select_module.QueryCentricNet(config=letor_config, out_device=rank_device)
select_net.train()  # put the module in training mode
select_net = select_net.to(select_device)

In [10]:
# letor_config['simmat_channel'] = 1
# letor_config['conv_params'] = [(8, 2, 10)]
# letor_config['fc_params'] = [50]
# letor_config['dpool_size'] = [3, 10]
# letor_config['lr'] = 0.001
# letor_config['finetune_embed'] = False
# rank_net = rank_module.MatchPyramidNet(config=letor_config)
# rank_net.embedding.weight.data.copy_(torch.from_numpy(loader.embedding))
# rank_net.train()
# optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [11]:
# Hyper-parameters for the ranking network, passed in through letor_config.
letor_config['simmat_channel'] = 1
letor_config['conv_params'] = [(8, 3, 3)]
letor_config['fc_params'] = [200]
letor_config['dpool_size'] = [3, 10]
letor_config['lr'] = 0.001
letor_config['finetune_embed'] = False

# Build the ranking network and move it to the ranking device.
rank_net = rank_module.DeepRankNet(config=letor_config)
rank_net = rank_net.to(rank_device)

# Load the pretrained embedding matrix into the model. The in-place copy runs
# under torch.no_grad() instead of going through `.data`, so the write is not
# recorded by autograd while the parameter's version tracking stays intact.
with torch.no_grad():
    rank_net.embedding.weight.copy_(torch.from_numpy(loader.embedding))

rank_net.train()  # put the module in training mode
optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [12]:
def prepare(*variables, device=None):
    """Convert numpy arrays into torch tensors placed on a device.

    Args:
        *variables: numpy arrays; each one is wrapped with torch.from_numpy.
        device: keyword-only target device. When omitted, the notebook-level
            ``select_device`` is used, which matches the previous behaviour.

    Returns:
        A generator yielding one tensor per input, in input order. It is lazy
        and single-use, so unpack it right away.
    """
    # Only touch the global when no explicit device was given, so the helper
    # can also be used where select_device is not defined.
    target = select_device if device is None else device
    return (torch.from_numpy(variable).to(target) for variable in variables)

In [21]:
# Draw one pairwise batch. The last returned value is bound to `feats` instead
# of `F`, because `F` would overwrite the `torch.nn.functional as F` import.
X1, X1_len, X1_id, X2, X2_len, X2_id, Y, feats = pair_gen.get_batch(data1=loader.query_data, data2=loader.doc_data)
# X1_id / X2_id are not converted to tensors; they go to select_net as returned.
X1, X1_len, X2, X2_len, Y, feats = prepare(X1, X1_len, X2, X2_len, Y, feats)
print(X1.shape, X2.shape, X1_len.shape, X2_len.shape)
# select_net rebinds X1, X2, X1_len, X2_len to its outputs and also returns X2_pos.
X1, X2, X1_len, X2_len, X2_pos = select_net(X1, X2, X1_len, X2_len, X1_id, X2_id)
print(X2[0].shape, X2_len[0])
print(X2_pos[0].shape, X2_pos[0])
# X2     - list of tensor, batch_size * tensor of shape (total_snip, win_size)
# X2_len - list of list,   dim0: batch_size, dim1: n_query, value: n_match
# X2_pos - per sample, one value per snippet row of X2 (presumably the match
#          position in the document - TODO confirm against select_module)

torch.Size([200, 20]) torch.Size([200, 2000]) torch.Size([200]) torch.Size([200])
torch.Size([20, 31]) [4, 6, 0, 7, 3, 0]
torch.Size([20]) tensor([204., 259., 281., 295.,  19.,  34.,  52.,  91., 284., 414.,  18.,  33.,
         51.,  90., 282., 413., 430.,  43.,  61.,  84.], device='cuda:0')


In [24]:
# Run the ranking network on the selector's outputs and print the shape of
# whatever it returns, as a quick inspection of the forward pass.
input_tensor = rank_net(X1, X2, X1_len, X2_len, X2_pos)
print(input_tensor.shape)

torch.Size([59, 5, 1])


In [None]:
import time

# Timed training loop. The calls mirror the single-batch cells earlier in the
# notebook: get_batch returns eight values, the arrays are converted with
# prepare(), select_net takes the two id values and also returns X2_pos, and
# rank_net takes X2_pos as its fifth argument.
N_STEPS = 50  # number of batches to train on and time

start_t = time.time()
for i in range(N_STEPS):
    # `feats` (not `F`) keeps the torch.nn.functional alias intact.
    X1, X1_len, X1_id, X2, X2_len, X2_id, Y, feats = pair_gen.get_batch(data1=loader.query_data, data2=loader.doc_data)
    X1, X1_len, X2, X2_len, Y, feats = prepare(X1, X1_len, X2, X2_len, Y, feats)
    X1, X2, X1_len, X2_len, X2_pos = select_net(X1, X2, X1_len, X2_len, X1_id, X2_id)
    output = rank_net(X1, X2, X1_len, X2_len, X2_pos)
    # NOTE(review): prepare() leaves Y on select_device while rank_net lives on
    # rank_device; Y is moved on the assumption that pair_loss needs both
    # operands on one device - confirm against rank_module.
    Y = Y.to(rank_device)
    loss = rank_net.pair_loss(output, Y)
    print(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
end_t = time.time()
print('Time Cost: %s s' % (end_t-start_t))