In [78]:
import pandas as pd
import numpy as np
import utils
import xgboost as xgb
from collections import defaultdict

In [79]:
from importlib import reload
utils = reload(utils)

In [5]:
!ls

bm25.py		lr_bigrams.ipynb	not_found.py   temp.ipynb
coverage.ipynb	lr_char_trigrams.ipynb	pkls	       tmp
data		lr.ipynb		__pycache__    translate.ipynb
evaluate.ipynb	metrics.py		queries.ipynb  utils.py
indexer.ipynb	norm.py			split.py


In [80]:
df_bm25_unigram_scores = utils.load_from_file("pkls/df_bm25_unigram_scores.pkl")
df_bm25_unigram_scores_smooth = utils.load_from_file("pkls/df_bm25_unigram_scores_smooth.pkl")
df_bm25_bigram_scores = utils.load_from_file("pkls/df_bm25_bigram_scores.pkl")
df_bm25_bigram_scores_smooth = utils.load_from_file("pkls/df_bm25_bigram_scores_smooth.pkl")
df_bm25_trigram_scores = utils.load_from_file("pkls/df_bm25_trigram_scores.pkl")
df_bm25_trigram_scores_smooth = utils.load_from_file("pkls/df_bm25_trigram_scores_smooth.pkl")
df_coverage_scores = utils.load_from_file("pkls/df_coverage_scores.pkl")

In [81]:
test_data = df_bm25_unigram_scores
for another_data in [ df_bm25_unigram_scores_smooth,
                      df_bm25_bigram_scores,
                      df_bm25_bigram_scores_smooth,
                      df_bm25_trigram_scores,
                      df_bm25_trigram_scores_smooth,
                      df_coverage_scores]:
    for column in another_data.columns:
        if column=='query_id' or column=='doc_id':
            continue
        test_data[column] = another_data[column]

In [9]:
df_train_bm25_unigram_scores = utils.load_from_file("pkls/df_train_bm25_unigram_scores.pkl")
df_train_bm25_unigram_scores_smooth = utils.load_from_file("pkls/df_train_bm25_unigram_scores_smooth.pkl")
df_train_bm25_bigram_scores = utils.load_from_file("pkls/df_train_bm25_bigram_scores.pkl")
df_train_bm25_bigram_scores_smooth = utils.load_from_file("pkls/df_train_bm25_bigram_scores_smooth.pkl")
df_train_bm25_trigram_scores = utils.load_from_file("pkls/df_train_bm25_trigram_scores.pkl")
df_train_bm25_trigram_scores_smooth = utils.load_from_file("pkls/df_train_bm25_trigram_scores_smooth.pkl")
df_train_coverage_scores = utils.load_from_file("pkls/df_train_coverage_scores.pkl")

In [10]:
train_data = df_train_bm25_unigram_scores
for another_data in [ df_train_bm25_unigram_scores_smooth,
                      df_train_bm25_bigram_scores,
                      df_train_bm25_bigram_scores_smooth,
                      df_train_bm25_trigram_scores,
                      df_train_bm25_trigram_scores_smooth,
                      df_train_coverage_scores]:
    for column in another_data.columns:
        if column=='query_id' or column=='doc_id':
            continue
        train_data[column] = another_data[column]

In [11]:
test_data.columns

Index(['query_id', 'doc_id', 'score_title_bm25', 'score_body_bm25',
       'score_title_bm25_smooth', 'score_body_bm25_smooth',
       'score_title_bm25_bigram', 'score_body_bm25_bigram',
       'score_title_bm25_bigram_smooth', 'score_body_bm25_bigram_smooth',
       'score_title_bm25_trigram', 'score_body_bm25_trigram',
       'score_title_bm25_trigram_smooth', 'score_body_bm25_trigram_smooth',
       'title_count', 'body_count', 'whole_count', 'title_ratio', 'body_ratio',
       'whole_ratio'],
      dtype='object')

In [36]:
queries = utils.load_queries('data/queries_tr.tsv')
test_queries = utils.load_test('data/test.tsv')
train_queries = utils.load_train('data/clicks.train.tsv')

In [13]:
train_data['target'] = 0
test_data['target'] = 0

In [14]:
clicked_pairs = set()
for query_id, docs, clicks in train_queries:
    clicks_set = set(clicks)
    for doc_id in docs:
        if doc_id in clicks:
            #train_data.target[(train_data.query_id==query_id) & (train_data.doc_id==doc_id)] = 1
            clicked_pairs.add((query_id, doc_id))

In [15]:
train_data.target = train_data.apply(lambda x: (x['query_id'], x['doc_id']) in clicked_pairs, axis=1)

In [75]:
feature_columns = ['score_title_bm25', 'score_body_bm25',
       'score_title_bm25_smooth', 'score_body_bm25_smooth',
       'score_title_bm25_bigram', 'score_body_bm25_bigram',
       'score_title_bm25_bigram_smooth', 'score_body_bm25_bigram_smooth',
       'score_title_bm25_trigram', 'score_body_bm25_trigram',
       'score_title_bm25_trigram_smooth', 'score_body_bm25_trigram_smooth',
       'title_count', 'body_count', 'whole_count', 'title_ratio', 'body_ratio',
       'whole_ratio']

In [84]:
feature_columns = ['score_title_bm25', 'score_body_bm25', 'score_whole_bm25', 
       #'score_title_bm25_smooth', 'score_body_bm25_smooth',
       #'score_title_bm25_bigram', 'score_body_bm25_bigram',
       #'score_title_bm25_bigram_smooth', 'score_body_bm25_bigram_smooth',
       #'score_title_bm25_trigram', 'score_body_bm25_trigram',
       #'score_title_bm25_trigram_smooth', 'score_body_bm25_trigram_smooth',
       'title_count', 'body_count', 'whole_count', 'title_ratio', 'body_ratio',
       'whole_ratio']

In [104]:
dtrain = xgb.DMatrix(train_data[feature_columns], label=train_data.target)
train_groups = [len(docs) for query_id,docs,clicks in train_queries]
dtrain.set_group(train_groups)

In [105]:
dtest = xgb.DMatrix(test_data[feature_columns], label=test_data.target)
test_groups = [len(docs) for query_id,docs in test_queries]
dtest.set_group(test_groups)

In [106]:
param = {'max_depth':5, 'eta':1, 'silent':1, 'objective':'rank:pairwise',
         'eval_metric': ['ndcg@5']}

In [107]:
watchlist  = [(dtest,'eval'), (dtrain,'train')]

In [108]:
num_round = 500

In [109]:
model = xgb.train(param, dtrain, num_round, watchlist)

[0]	eval-ndcg@5:1	train-ndcg@5:0.116932
[1]	eval-ndcg@5:1	train-ndcg@5:0.11943
[2]	eval-ndcg@5:1	train-ndcg@5:0.120965
[3]	eval-ndcg@5:1	train-ndcg@5:0.12162
[4]	eval-ndcg@5:1	train-ndcg@5:0.122976
[5]	eval-ndcg@5:1	train-ndcg@5:0.124404
[6]	eval-ndcg@5:1	train-ndcg@5:0.125901
[7]	eval-ndcg@5:1	train-ndcg@5:0.126952
[8]	eval-ndcg@5:1	train-ndcg@5:0.128664
[9]	eval-ndcg@5:1	train-ndcg@5:0.129868
[10]	eval-ndcg@5:1	train-ndcg@5:0.130471
[11]	eval-ndcg@5:1	train-ndcg@5:0.131269
[12]	eval-ndcg@5:1	train-ndcg@5:0.131692
[13]	eval-ndcg@5:1	train-ndcg@5:0.132212
[14]	eval-ndcg@5:1	train-ndcg@5:0.133477
[15]	eval-ndcg@5:1	train-ndcg@5:0.133932
[16]	eval-ndcg@5:1	train-ndcg@5:0.13434
[17]	eval-ndcg@5:1	train-ndcg@5:0.13549
[18]	eval-ndcg@5:1	train-ndcg@5:0.136416
[19]	eval-ndcg@5:1	train-ndcg@5:0.13632
[20]	eval-ndcg@5:1	train-ndcg@5:0.137206
[21]	eval-ndcg@5:1	train-ndcg@5:0.137652
[22]	eval-ndcg@5:1	train-ndcg@5:0.138182
[23]	eval-ndcg@5:1	train-ndcg@5:0.138433
[24]	eval-ndcg@5:1	train-ndcg@5

In [36]:
predictions = model.predict(dtest)

In [40]:
predictions[0:50]

array([-4.61346388, -2.62902498, -6.0048995 , -7.14047623, -7.51059341,
       -7.24228573, -1.00295293, -7.15137625, -5.95473385, -4.44423485,
        0.26970857, -7.56391525, -2.04789257, -2.51592636, -1.63177133,
       -3.63443327,  1.17775869, -3.34799314, -6.58916569, -7.29824018,
       -6.47084379, -6.31928396, -3.60964632,  1.29346585, -2.3839283 ,
       -1.53302574, -2.86201382, -1.73214769,  3.84289837, -4.78115702,
        0.35583115, -0.7107234 , -3.27604222,  3.14303112, -3.15917635,
       -7.96436882, -4.57471895, -0.18780178, -7.58180141,  1.3587296 ,
       -3.76664829, -4.9190526 , -4.82646799, -4.9190526 , -2.66047931,
       -6.58549595, -3.75814486, -1.96842837, -1.41662848, -7.81976986], dtype=float32)

In [43]:
i = 0
query_predictions = {}
for query_id, docs in test_queries:
    query_predictions[query_id] = np.array(predictions[i:i+len(docs)])
    i+=len(docs)

In [44]:
result = []
for query_id, docs in test_queries:
    top_docs = np.array(docs)[np.flip(np.argsort(query_predictions[query_id]),0)[0:5]]
    for doc_id in top_docs:
        result.append((query_id, doc_id))

In [51]:
df = pd.DataFrame(result, columns=['QueryId','DocID'])

In [53]:
df.to_csv('submit.csv', index=False)

In [1]:
#### Use only bm25 unigrams

In [98]:
test_data.head(50)

Unnamed: 0,query_id,doc_id,score_title_bm25,score_body_bm25,score_whole_bm25,score_title_bm25_smooth,score_body_bm25_smooth,score_title_bm25_bigram,score_body_bm25_bigram,score_title_bm25_bigram_smooth,...,score_title_bm25_trigram,score_body_bm25_trigram,score_title_bm25_trigram_smooth,score_body_bm25_trigram_smooth,title_count,body_count,whole_count,title_ratio,body_ratio,whole_ratio
0,0,14903,0.0,1.853975,9.960806,0.0,-2.671199,0.0,3.600716,0.0,...,0.0,0.0,0.0,0.0,0,11,0,0.0,0.177419,0.0
1,0,16325,0.0,2.385418,9.263034,0.0,-1.957138,0.0,7.945453,0.0,...,0.0,0.0,0.0,0.0,0,11,0,0.0,0.177419,0.0
2,0,22778,0.0,1.354222,10.728679,0.0,-0.75876,0.0,14.478266,0.0,...,0.0,0.0,0.0,0.0,0,11,0,0.0,0.177419,0.0
3,0,23653,0.0,1.504163,10.98081,0.0,-0.854451,0.0,11.381692,0.0,...,0.0,0.0,0.0,0.0,0,11,0,0.0,0.177419,0.0
4,0,23736,0.0,1.139433,10.111624,0.0,-0.828363,0.0,1.965721,0.0,...,0.0,0.0,0.0,0.0,0,11,0,0.0,0.177419,0.0
5,0,24166,0.0,1.39579,10.586598,0.0,-0.818385,0.0,7.666852,0.0,...,0.0,0.0,0.0,0.0,0,11,0,0.0,0.177419,0.0
6,0,25282,0.0,1.868991,15.542785,0.0,-1.17806,0.0,16.899424,0.0,...,0.0,0.0,0.0,0.0,0,12,0,0.0,0.193548,0.0
7,0,25405,0.0,1.710171,14.11313,0.0,-0.888972,0.0,3.144768,0.0,...,0.0,0.0,0.0,0.0,0,12,0,0.0,0.193548,0.0
8,0,48866,0.0,2.130297,16.345831,0.0,-1.137953,0.0,7.105152,0.0,...,0.0,0.0,0.0,0.0,0,12,0,0.0,0.193548,0.0
9,0,56054,0.0,1.692331,10.582083,0.0,-0.588656,0.0,2.617538,0.0,...,0.0,0.0,0.0,0.0,0,11,0,0.0,0.177419,0.0


In [83]:
new_data = test_data[['query_id', 'doc_id']].copy()
new_data.doc_id = new_data.doc_id.astype(int)

In [91]:
new_data['score_final'] = test_data.score_title_bm25 + test_data.score_body_bm25 #+ test_data.score_whole_bm25

In [92]:
query_scores = defaultdict(list)
for record in new_data.iterrows():
    query_scores[record[1]['query_id']].append((int(record[1]['doc_id']), record[1]['score_final']))

In [93]:
result = []
for query_id, docs in test_queries:
    docs, query_predictions = zip(*query_scores[query_id])
    top_docs = np.array(docs)[np.argsort(query_predictions)][::-1][0:5]
    for doc_id in top_docs:
        result.append((query_id, doc_id))

In [94]:
df = pd.DataFrame(result, columns=['QueryId','DocID'])

In [95]:
df.head()

Unnamed: 0,QueryId,DocID
0,0,310558
1,0,316309
2,0,236085
3,0,123741
4,0,309447


In [90]:
df.to_csv('submit4.csv', index=False)