In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from importlib import reload 

In [3]:
from deeprank.dataset import DataLoader, PairGenerator, ListGenerator

In [4]:
import random

import numpy as np

# Seed every RNG the run may touch, not only torch's.
# NOTE(review): whether the deeprank generators draw from `random` / NumPy is
# not visible from this notebook; seeding them is harmless and keeps the run
# reproducible if they do.
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fecfc187af0>

In [5]:
# Build the project's data loader from the experiment config file (the same
# file is parsed as JSON into `letor_config` in a later cell).
# Note: this is deeprank.dataset.DataLoader, not torch.utils.data.DataLoader.
loader = DataLoader('./config/letor07_mp_fold1.model')

[./data/letor/r5w/word_dict.txt]
	Word dict size: 193367
[./data/letor/r5w/qid_query.txt]
	Data size: 1692
[./data/letor/r5w/docid_doc.txt]
	Data size: 65323
[./data/letor/r5w/embed_wiki-pdc_d50_norm]
	Embedding size: 109282
Generate numpy embed: (193368, 50)


In [6]:
import json
# Parse the experiment configuration (a JSON document) into a plain dict.
# The `with` block guarantees the file handle is closed once parsing is done.
with open('./config/letor07_mp_fold1.model') as config_file:
    letor_config = json.load(config_file)

# Device used for all tensors and networks in this notebook.
# Swap in the commented line to run on a GPU instead.
#device = torch.device("cuda")
device = torch.device("cpu")

In [7]:
# Data directory, taken from the 'data_dir' entry of the JSON config.
Letor07Path = letor_config['data_dir']

# Push the loader-derived settings into the shared config dict in one step;
# the vocabulary size and embedding width come from the embedding matrix shape.
letor_config.update({
    'fill_word': loader._PAD_,
    'embedding': loader.embedding,
    'feat_size': loader.feat_size,
    'vocab_size': loader.embedding.shape[0],
    'embed_dim': loader.embedding.shape[1],
})

# Pair generator over the training relation file of the configured fold.
pair_gen = PairGenerator(
    rel_file=Letor07Path + '/relation.train.fold%d.txt' % letor_config['fold'],
    config=letor_config,
)

[./data/letor/r5w/relation.train.fold1.txt]
	Instance size: 47828
Pair Instance Count: 325439


In [8]:
from deeprank import select_module
from deeprank import rank_module

# Re-import both project modules so the networks built in the following cells
# use the current source on disk.
# NOTE(review): `%autoreload 2` is already enabled in the first cell, so these
# explicit reloads are probably redundant — confirm before removing them.
select_module = reload(select_module)
rank_module = reload(rank_module)

In [9]:
# Selection network: built from the shared config, switched to training mode
# and placed on the configured device in a single fluent chain.
select_net = (
    select_module.IdentityNet(config=letor_config)
    .train()
    .to(device)
)

In [10]:
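# Disabled alternative setup (conv_params [(8, 2, 10)], fc_params [50]), kept
# for reference only; the next cell holds the configuration actually trained.
# letor_config['simmat_channel'] = 1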
# letor_config['conv_params'] = [(8, 2, 10)]
# letor_config['fc_params'] = [50]
# letor_config['dpool_size'] = [3, 10]
# letor_config['lr'] = 0.001
# letor_config['finetune_embed'] = False
# rank_net = rank_module.MatchPyramidNet(config=letor_config)
# rank_net.embedding.weight.data.copy_(torch.from_numpy(loader.embedding))
# rank_net.train()
# optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [11]:
# Hyper-parameters of the MatchPyramid ranker trained in this notebook.
letor_config['simmat_channel'] = 1
letor_config['conv_params'] = [(8, 3, 3)]
letor_config['fc_params'] = [200]
letor_config['dpool_size'] = [3, 10]
letor_config['lr'] = 0.001
letor_config['finetune_embed'] = False

# Build the ranking network and move it to the configured device.
rank_net = rank_module.MatchPyramidNet(config=letor_config)
rank_net = rank_net.to(device)

# Initialise the embedding table from the pre-trained vectors held by the
# loader. The copy is done in place under no_grad() rather than through
# `.data`, so autograd is bypassed explicitly instead of behind its back.
with torch.no_grad():
    rank_net.embedding.weight.copy_(torch.from_numpy(loader.embedding))

rank_net.train()
optimizer = optim.Adam(rank_net.parameters(), lr=letor_config['lr'])

In [12]:
def to_device(*variables):
    """Convert NumPy arrays to tensors on the notebook-level ``device``.

    Args:
        *variables: NumPy arrays accepted by ``torch.from_numpy``.

    Returns:
        A tuple with one tensor per input array, in the same order. The
        conversion is eager (the original returned a lazy, single-use
        generator), so a bad input fails here rather than at the unpacking
        site, and the result can be indexed or iterated more than once.
    """
    return tuple(torch.from_numpy(variable).to(device) for variable in variables)

In [13]:
import time

# Run 50 optimisation steps on randomly drawn training pairs, printing the
# pairwise loss of every step and the total wall-clock time at the end.
start_t = time.time()
for step in range(50):
    # The sixth batch element is named `feats` rather than `F` so that it does
    # not overwrite the `torch.nn.functional as F` alias from the import cell.
    X1, X1_len, X2, X2_len, Y, feats = pair_gen.get_batch(data1=loader.query_data, data2=loader.doc_data)
    X1, X1_len, X2, X2_len, Y, feats = to_device(X1, X1_len, X2, X2_len, Y, feats)
    X1, X2, X1_len, X2_len = select_net(X1, X2, X1_len, X2_len)
    # NOTE(review): the meaning of the trailing positional `0` is defined by
    # MatchPyramidNet.forward and is not visible here — confirm.
    output = rank_net(X1, X2, X1_len, X2_len, 0)
    loss = rank_net.pair_loss(output, Y)
    print(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
end_t = time.time()
print('Time Cost: %s s' % (end_t-start_t))

1.0036543607711792
0.9859936237335205
0.978439450263977
0.9510518908500671
0.9726119041442871
0.9723504185676575
0.9515997767448425
0.9267768263816833
0.9044245481491089
0.8598275184631348
0.887164831161499
0.9301736950874329
0.8329542279243469
0.8854843378067017
0.8413276672363281
0.7854862213134766
0.795760452747345
0.9298571348190308
0.808600127696991
0.6962187886238098
0.7879980802536011
0.8256113529205322
0.7483929395675659
0.6751953959465027
0.8718575239181519
0.8322612643241882
0.6763977408409119
0.7689079642295837
0.7520681619644165
0.726057231426239
0.7707747220993042
0.8906329274177551
0.8174879550933838
0.7817857265472412
1.0007716417312622
0.836122453212738
0.6929255127906799
0.5756625533103943
0.8190739750862122
0.8637646436691284
0.8407763838768005
0.8978973627090454
0.8313198685646057
0.586416482925415
0.8453450202941895
0.772385835647583
0.6902652978897095
0.7688271999359131
0.7985737323760986
0.8601698875427246
Time Cost: 12.905720233917236 s


In [14]:
# Persist only the parameters (state_dict) of each network.
torch.save(select_net.state_dict(), "identity.ckpt")
torch.save(rank_net.state_dict(), "matchpyramid.ckpt")

In [15]:
# Also save the complete module objects; the evaluation cell reloads these
# "*.model" files with torch.load.
# NOTE(review): whole-module files presumably need the deeprank classes to be
# importable at load time; the state_dict checkpoints saved above are likely
# the more portable artifact — confirm which one should be relied on.
torch.save(select_net, "identity.model")
torch.save(rank_net, "matchpyramid.model")

In [16]:
# Display the layer structure of the trained ranking network.
rank_net

MatchPyramidNet(
  (embedding): Embedding(193368, 50, padding_idx=0)
  (conv_sequential): Sequential(
    (0): Conv2d(1, 8, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  )
  (dpool_layer): AdaptiveMaxPool2d(output_size=[3, 10])
  (fc_sequential): Sequential(
    (0): Linear(in_features=240, out_features=200, bias=True)
  )
  (out_layer): Linear(in_features=200, out_features=1, bias=True)
)

In [17]:
from deeprank import utils

# Reload the whole-module files written by the earlier torch.save cell.
# map_location keeps the tensors on the notebook's configured device even if
# the files were written from a different one.
# NOTE(review): torch.load unpickles arbitrary objects — only load files that
# were produced by this notebook or another trusted source.
select_net_e = torch.load(f='identity.model', map_location=device)
rank_net_e = torch.load(f='matchpyramid.model', map_location=device)

# Both networks were in train() mode when they were saved, so switch them to
# inference mode explicitly before scoring the test lists.
select_net_e.eval()
rank_net_e.eval()

list_gen = ListGenerator(rel_file=Letor07Path+'/relation.test.fold%d.txt'%(letor_config['fold']),
                         config=letor_config)
map_v = 0.0  # running sum of per-list MAP, turned into the mean below
map_c = 0.0  # number of lists evaluated

with torch.no_grad():
    # `feats` (not `F`) keeps the `torch.nn.functional as F` alias intact.
    for X1, X1_len, X2, X2_len, Y, feats in list_gen.get_batch(data1=loader.query_data, data2=loader.doc_data):
        X1, X1_len, X2, X2_len, Y, feats = to_device(X1, X1_len, X2, X2_len, Y, feats)
        X1, X2, X1_len, X2_len = select_net_e(X1, X2, X1_len, X2_len)
        pred = rank_net_e(X1, X2, X1_len, X2_len, 0)
        map_o = utils.eval_MAP(pred.tolist(), Y.tolist())
        map_v += map_o
        map_c += 1.0
    # Mean of the per-list MAP values.
    map_v /= map_c

print('[Test]', map_v)

[./data/letor/r5w/relation.test.fold1.txt]
	Instance size: 13652
List Instance Count: 336
[Test] 0.4162057003846964
