In [1]:
import pandas as pd
import math
import numpy as np

In [2]:
# Load the tab-separated query file. The file's own first line is skipped and the
# three columns are named explicitly; the third column is named 'trash'.
queries = pd.read_csv(
    'queries.csv',
    sep='\t',
    skiprows=1,
    names=['query_id', 'query', 'trash'],
)

In [3]:
# Keep only the id and text columns of the query table (the 'trash' column is left out).
queries = queries.reindex(columns=['query_id', 'query'])

In [4]:
# Preview the first rows of the query table (query_id and query columns).
queries.head()

Unnamed: 0,query_id,query
0,1,what similarity laws must be obeyed when const...
1,2,what are the structural and aeroelastic proble...
2,4,what problems of heat conduction in composite ...
3,8,can a criterion be developed to show empirical...
4,9,what chemical kinetic system is applicable to ...


In [5]:
# Load the relevance judgements. Column names are supplied explicitly for both files.
# Note the different separators: the training file is tab-separated, the test file
# is comma-separated.
train = pd.read_csv(
    'relevance_train.csv',
    sep='\t',
    names=['query_id', 'document_id', 'relevance'],
)
test = pd.read_csv(
    'relevance_test.csv',
    sep=',',
    names=['query_id', 'document_id'],
)

In [6]:
# Drop the first row (index label 0) of each frame.
# Label-based slicing is used instead of the positional `[1:]` so the cell is
# idempotent: on the first run it removes row 0, and on any re-run labels start at 1
# already, so nothing more is removed. The positional form silently discarded one more
# real data row every time the cell was executed again.
train = train.loc[1:]
test = test.loc[1:]

In [7]:
# Show the distinct relevance labels in the training data.
# The recorded output lists them as strings ('1'..'4' and '-1'), not integers.
print(set(train['relevance']))

{'2', '4', '3', '-1', '1'}


In [8]:
import re
# Line patterns for the document file; the parser matches each one from the start
# of a line.
# Document header: ".Id " followed by digits; group 2 captures the numeric id.
document_id = re.compile(r'(\.Id )(\d+)')
# Marker line beginning with ".T".
document_title = re.compile(r'\.T.*')
# Marker line beginning with ".W".
document_words = re.compile(r'\.W.*')
# Any other line beginning with "."; the parser checks this pattern last.
document_other = re.compile(r'\..*')

In [9]:
# Parser state: the document store (document id -> token list) and the two flags
# that record whether the parser is currently inside a title or a body section.
documents = dict()
is_title = is_doc = False

In [10]:
# Parse the document file into `documents`: {document id (int) -> list of word tokens}.
#
# The file is a sequence of records. A ".Id <n>" line starts a new record, a ".T" or
# ".W" marker line switches text collection on, and any other "."-prefixed line
# switches it off. Ordinary lines are tokenised and collected only while a flag is on.
#
# Changes from the previous version of this cell:
#   * tokens come from re.findall(r'\w+', ...), so no empty-string tokens are stored
#     and the final character of an unterminated last line is no longer chopped off
#     (the old code split `line[:-1]` on a non-raw '\W+' pattern);
#   * tokens are appended in place instead of rebuilding the list on every line;
#   * the section flags are reset here, so the cell does not depend on flag values
#     left behind by a previous run.
with open('Documents.csv', 'r') as f:
    document = []
    index = -1  # -1 means no ".Id" header has been seen yet
    is_title = False
    is_doc = False
    for line in f:
        match = document_id.match(line)
        if match is not None:
            # New record: store the tokens of the previous one, if there was one.
            if index != -1:
                documents[index] = document
            document = []
            index = int(match.group(2))
            is_title = False
            is_doc = False
        elif document_title.match(line):
            is_title = True
        elif document_words.match(line):
            is_doc = True
        elif document_other.match(line):
            # Any other marker ends both the title and the body section.
            is_doc = False
            is_title = False
        elif is_doc or is_title:
            document.extend(re.findall(r'\w+', line))
    # Store the last record, which has no following ".Id" line to trigger the flush.
    if index != -1:
        documents[index] = document

In [11]:
# print(documents)

In [12]:
# for doc_id, document in documents.items():
#     lemmatized = []
#     for word in document:
#         lemmatized.append(lancaster_stemmer.stem(word))
#     documents[doc_id] = Counter(lemmatized)

In [13]:
# Build the feature vocabulary: every distinct word token that occurs in any query.
#
# * re.findall(r'\w+', ...) returns only non-empty tokens. Splitting on '\W+' also
#   produced an empty-string "word" for queries that start or end with punctuation.
# * sorted() pins the word -> feature index mapping. The iteration order of a set of
#   strings can differ between interpreter sessions, which would silently mismatch a
#   model saved in one session against feature files regenerated in another.
feature_words = sorted({
    word
    for query in queries['query']
    for word in re.findall(r'\w+', query)
})

In [14]:
# words_in_queries = {}
# for index, row in queries.iterrows():
#     query = row['query']
#     words = re.split('\W+', query)
#     new_words = set()
#     for word in words:
#         new_words.add(lancaster_stemmer.stem(word))
#     words_in_queries[row['query_id']] = new_words

In [15]:
# Write the training set as one text line per (query, document) pair:
#     "<relevance> qid:<query id> <i>:<0|1> <i>:<0|1> ... "
# Feature i is 1 when feature_words[i] occurs in the document and 0 otherwise.
# The file content is byte-for-byte what the previous version of this cell produced.
#
# val.txt is created (and truncated) but left empty: no validation split is written,
# yet the training command is given "-validate ./val.txt", so the file must exist.
#
# NOTE(review): feature ids are 0-based here. LETOR-style files usually number features
# from 1 -- confirm that RankLib does not ignore feature 0. Any renumbering must be
# applied identically to the cell that writes test.txt.
with open('learning.txt', 'w') as learning_f, open('val.txt', 'w') as val_f:
    for index, row in train.iterrows():
        # Hash the document's tokens once, so each of the feature lookups below is a
        # constant-time set probe rather than a scan of the whole token list.
        doc_words = set(documents[int(row['document_id'])])
        features = "".join(
            str(i) + (":1 " if word in doc_words else ":0 ")
            for i, word in enumerate(feature_words)
        )
        # str() keeps the line well-formed even if the id/label columns are not strings.
        learning_f.write(
            str(row['relevance']) + " qid:" + str(row['query_id']) + " " + features + '\n'
        )

In [44]:
%%time
# Train a RankLib model on learning.txt and save it to ./LambdaMART_100_5.txt.
# NOTE(review): the log below reports "-ranker 8" as "Random Forests", so the
# "LambdaMART" file name is misleading; the scoring cell loads the same path.
# NOTE(review): the log below shows 0 ranked lists read from val.txt, so -validate
# currently has no data to evaluate on.
! java -jar RankLib-2.1-patched.jar -train ./learning.txt \
-validate ./val.txt \
-ranker 8 -tree 15 -bag 200 -metric2t NDCG@5 \
-save ./LambdaMART_100_5.txt


[+] General Parameters:
Training data:	./learning.txt
Validation data:	./val.txt
Feature vector representation: Dense.
Ranking method:	Random Forests
Feature description file:	Unspecified. All features will be used.
Train metric:	NDCG@5
Test metric:	NDCG@5
Feature normalization: No
Model file: ./LambdaMART_100_5.txt

[+] Random Forests's Parameters:
No. of bags: 200
Sub-sampling: 1.0
Feature-sampling: 0.3
No. of trees: 15
No. of leaves: 100
No. of threshold candidates: 256
Learning rate: 0.1

Reading feature file [./learning.txt]... [Done.]            
(125 ranked lists, 990 entries read)
Reading feature file [./val.txt]... [Done.]            
(0 ranked lists, 0 entries read)
Initializing... [Done]
------------------------------------
Training starts...
------------------------------------
bag       | NDCG@5-B  | NDCG@5-OOB  | 
------------------------------------
b[1]      | 0.9771    | 
b[2]      | 0.9676    | 
b[3]      | 0.9857    | 
b[4]      | 0.9699    | 
b[5]      | 0.9803    

In [45]:
# Write the test pairs in the same text format as the training file:
#     "-1 qid:<query id> <i>:<0|1> <i>:<0|1> ... "
# The label is the fixed placeholder -1. Feature i is 1 when feature_words[i] occurs
# in the document and 0 otherwise. The file content is byte-for-byte what the previous
# version of this cell produced.
with open('test.txt', 'w') as test_f:
    for index, row in test.iterrows():
        # Hash the document's tokens once, so each feature lookup is a constant-time
        # set probe rather than a scan of the whole token list.
        doc_words = set(documents[int(row['document_id'])])
        features = "".join(
            str(i) + (":1 " if word in doc_words else ":0 ")
            for i, word in enumerate(feature_words)
        )
        # str() keeps the line well-formed even if the id column is not a string.
        test_f.write("-1 qid:" + str(row['query_id']) + " " + features + '\n')

In [46]:
%%time
# Load the saved model and score every line of test.txt; the scores are written to
# myscorefile.txt, which the next cell reads back.
! java -jar RankLib-2.1-patched.jar \
-load ./LambdaMART_100_5.txt \
-rank ./test.txt \
-score myscorefile.txt


[+] General Parameters:
Model file:	./LambdaMART_100_5.txt
Feature normalization: No
Model:		Random Forests
Reading feature file [./test.txt]... [Done.]            
(100 ranked lists, 847 entries read)
CPU times: user 3.43 s, sys: 492 ms, total: 3.92 s
Wall time: 1min 50s


In [47]:
# Read the tab-separated score file. It has no header row, so the columns are
# labelled 0, 1, 2.
answer = pd.read_csv('myscorefile.txt', sep='\t', header=None)

In [48]:
# Preview the scores. In the recorded output column 0 holds the query id, column 1 a
# per-query running index and column 2 the model score.
answer.head()

Unnamed: 0,0,1,2
0,126,0,2.513562
1,126,1,1.741886
2,126,2,2.537909
3,126,3,1.391144
4,126,4,1.804242


In [49]:
# Group the test pairs by query: query id -> list of document ids, in file order.
document_ids = {}
for _, pair in test.iterrows():
    document_ids.setdefault(pair['query_id'], []).append(pair['document_id'])

In [50]:
# Accumulator for the final ranking: (query id, document id) tuples.
super_final_answer = list()

In [51]:
# Turn the model scores into a ranking: for every query, list its documents from the
# highest score to the lowest as (query id, document id) tuples.
#
# The i-th score row of a query is paired with the i-th entry of document_ids for that
# query, i.e. the score file is taken to list documents in the order they were written
# to test.txt.
#
# Changes from the previous version of this cell:
#   * the result list is rebuilt here, so re-running the cell no longer appends a
#     second copy of every pair;
#   * rows are grouped by query id instead of tracking a previous/current id seeded
#     with the hard-coded value 126, and the duplicated flush after the loop is gone;
#   * the sort is stable, so equal scores keep their original relative order;
#   * a score/document count mismatch is reported instead of mis-assigning documents.
super_final_answer = []
for query_id, group in answer.groupby(0, sort=False):  # sort=False keeps file order
    scores = group[2].values
    # document_ids is keyed by the query id as a string.
    doc_ids = document_ids[str(int(query_id))]
    if len(scores) != len(doc_ids):
        raise ValueError(
            "query %d: %d scores but %d test documents"
            % (int(query_id), len(scores), len(doc_ids))
        )
    # Negate the scores so that argsort yields positions in descending score order.
    for position in np.argsort(-scores, kind='mergesort'):
        super_final_answer.append((int(query_id), doc_ids[position]))

In [52]:
# Spot check: the test documents for query '126' (the keys of document_ids are strings).
print(document_ids['126'])

['974', '1326', '187', '969', '970', '971', '972', '973', '942']


In [53]:
# Dump the ranked (query id, document id) pairs for a visual check.
# NOTE(review): this prints every pair; consider showing only a slice such as
# super_final_answer[:10] to keep the notebook output short.
print(super_final_answer)

[(126, '187'), (126, '974'), (126, '973'), (126, '971'), (126, '970'), (126, '972'), (126, '1326'), (126, '969'), (126, '942'), (127, '101'), (127, '983'), (127, '164'), (127, '944'), (127, '981'), (127, '982'), (128, '985'), (128, '990'), (128, '945'), (129, '984'), (129, '986'), (129, '985'), (129, '990'), (129, '988'), (129, '945'), (129, '989'), (129, '987'), (130, '12'), (130, '859'), (130, '858'), (130, '1008'), (130, '948'), (130, '766'), (131, '1014'), (131, '1016'), (131, '950'), (131, '1017'), (131, '1012'), (131, '1018'), (131, '1020'), (131, '1019'), (131, '1013'), (132, '1014'), (132, '1016'), (132, '1023'), (132, '950'), (132, '1017'), (132, '1015'), (132, '1025'), (132, '1026'), (132, '1012'), (132, '1018'), (132, '951'), (132, '1024'), (132, '1020'), (132, '1019'), (132, '952'), (132, '1013'), (133, '1016'), (133, '950'), (133, '1022'), (133, '1012'), (133, '1018'), (133, '1024'), (133, '1020'), (133, '1019'), (134, '1027'), (134, '1028'), (134, '951'), (135, '1016'), (

In [54]:
# Tabulate the ranked pairs under the column names used in the output file.
submission_columns = ['QueryId', 'DocumentId']
df = pd.DataFrame(super_final_answer, columns=submission_columns)

In [55]:
# Preview the first rows of the ranking table.
df.head()

Unnamed: 0,QueryId,DocumentId
0,126,187
1,126,974
2,126,973
3,126,971
4,126,970


In [56]:
# Write the ranking to out.csv; index=False leaves the DataFrame index out of the file.
df.to_csv('out.csv', index=False)