In [1]:
import copy
import json
from tqdm import tqdm
import os
import re

from sklearn import linear_model

In [2]:
def _load_json(path):
    """Return the parsed JSON content of the text file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Relevance-judgement data prepared earlier and stored as JSON under ./info.
qrel_dict_labeled = _load_json('./info/qrel_dict_labeled.txt')
qrel_dict_all = _load_json('./info/qrel_dict_all.txt')

# Query collections used to split the data into train and test sets.
qrel_train = _load_json('./info/qrel_train.txt')
qrel_test = _load_json('./info/qrel_test.txt')

In [3]:
# Training rows: one per (query, doc) pair in qrel_dict_labeled whose query
# belongs to qrel_train. Rows are keyed "<query>-<doc>" and start out holding
# only the label; the score features are added later.
matrix_train = {
    query + '-' + doc: {'label': label}
    for query, label_map in qrel_dict_labeled.items()
    if query in qrel_train
    for doc, label in label_map.items()
}

# Test rows: one per (query, doc) pair in qrel_dict_all whose query belongs to
# qrel_test. They start empty because no label is attached to test rows.
matrix_test = {
    query + '-' + doc: {}
    for query, doc_list in qrel_dict_all.items()
    if query in qrel_test
    for doc in doc_list
}



# Load train scores

In [4]:

# Attach the score of each retrieval model to the matching training row.
# Each result line is whitespace-separated with the query in column 0, the
# document in column 2 and the score in column 4. Files are processed in a
# fixed order so the per-row feature keys are inserted in the same order as
# before (this order is visible in the dumped JSON).
for feature in ('okapi_tf', 'tf_idf', 'bm25', 'unigram_laplace', 'unigram_jm'):
    # The context manager closes each result file as soon as it has been read.
    with open('./result/{}_train.txt'.format(feature), 'r') as score_file:
        for line in score_file:
            fields = line.split()  # split once instead of three times per line
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            entry = fields[0] + '-' + fields[2]
            # A pair that is not already a row of matrix_train raises KeyError.
            matrix_train[entry][feature] = round(float(fields[4]), 4)
    

# Load test scores

In [5]:

# Attach the score of each retrieval model to the matching test row.
# Each result line is whitespace-separated with the query in column 0, the
# document in column 2 and the score in column 4. Files are processed in a
# fixed order so the per-row feature keys are inserted in the same order as
# before (this order is visible in the dumped JSON).
for feature in ('okapi_tf', 'tf_idf', 'bm25', 'unigram_laplace', 'unigram_jm'):
    # The context manager closes each result file as soon as it has been read.
    with open('./result/{}_test.txt'.format(feature), 'r') as score_file:
        for line in score_file:
            fields = line.split()  # split once instead of three times per line
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            entry = fields[0] + '-' + fields[2]
            # A pair that is not already a row of matrix_test raises KeyError.
            matrix_test[entry][feature] = round(float(fields[4]), 4)
    

In [6]:
# Persist both feature matrices as JSON so later runs can reuse them.
for matrix_path, matrix in (
    ('./info/matrix_train.txt', matrix_train),
    ('./info/matrix_test.txt', matrix_test),
):
    with open(matrix_path, 'w') as matrix_file:
        json.dump(matrix, matrix_file)


In [7]:
# Flatten matrix_train into three parallel lists (same index = same sample):
# the "<query>-<doc>" key, the five scores in a fixed column order, and the
# label.
train_entry_array = list(matrix_train)
train_feature_array = [
    [
        row['bm25'],
        row['tf_idf'],
        row['okapi_tf'],
        row['unigram_laplace'],
        row['unigram_jm'],
    ]
    for row in matrix_train.values()
]
train_label_array = [row['label'] for row in matrix_train.values()]
    
    

In [8]:
# Flatten matrix_test into parallel lists (same index = same sample). The
# feature columns use the same order as the training features. No labels are
# collected for test rows, so test_label_array stays empty.
test_entry_array = list(matrix_test)
test_feature_array = [
    [
        row['bm25'],
        row['tf_idf'],
        row['okapi_tf'],
        row['unigram_laplace'],
        row['unigram_jm'],
    ]
    for row in matrix_test.values()
]
test_label_array = []
    
    

In [9]:
# Fit a linear regression that maps the five retrieval scores to the label.
lm = linear_model.LinearRegression()
lm.fit(train_feature_array, train_label_array)
# fit() returns the estimator itself, so `model` is the same object as `lm`.
model = lm




In [10]:
# Score every test sample with the fitted model. predictions[i] belongs to
# test_entry_array[i] because both were built in the same pass over matrix_test.
predictions = lm.predict(test_feature_array)

In [11]:

# Materialise the query identifiers of each split as plain lists.
test_queries = list(qrel_test)
train_queries = list(qrel_train)

In [12]:
# Start with an empty result list for every test query, so queries without
# any scored document still appear in the map.
test_pred_map = {str(query): [] for query in test_queries}

# Group the predictions by query. Keys are "<query>-<docno>", and only the
# first '-' is used as the separator.
for position, entry in enumerate(test_entry_array):
    query, docno = entry.split('-', 1)
    test_pred_map[query].append([docno, predictions[position]])

In [13]:
def rank_scores(query, ml_scores, out, max_rank=1000):
    """Write the best-scoring documents of one query to ``out``.

    Documents are ordered by descending score (ties keep their input order)
    and one line is written per document, in the form::

        <query> Q0 <docno> <rank> <score> Exp

    with ranks starting at 1.

    Args:
        query: Query identifier written in the first column.
        ml_scores: Iterable of ``[docno, score]`` pairs.
        out: Object with a ``write`` method that accepts text.
        max_rank: Maximum number of documents written per query. Defaults
            to 1000, the cutoff that used to be hard-coded.
    """
    ranked = sorted(ml_scores, key=lambda pair: pair[1], reverse=True)
    # Slicing applies the cutoff, so no separate counter is needed.
    for rank, pair in enumerate(ranked[:max_rank], start=1):
        # Named `line` rather than `str` to avoid shadowing the builtin.
        line = '{} Q0 {} {} {} Exp'.format(query, pair[0], rank, pair[1])
        out.write(line + "\n")



In [14]:
# Write the ranked test predictions. The context manager closes (and thereby
# flushes) the run file once every query has been written, so the file on disk
# is complete as soon as this cell finishes.
with open('./result/ml_test.txt', 'w') as ml_out:
    for test_query, ml_scores in test_pred_map.items():
        rank_scores(test_query, ml_scores, ml_out)

In [15]:
# Score the training samples with the same fitted model. This rebinds
# `predictions`, so from here on predictions[i] belongs to train_entry_array[i]
# and no longer to the test samples.
predictions = lm.predict(train_feature_array)

In [16]:
# Start with an empty result list for every training query, so queries
# without any scored document still appear in the map.
train_pred_map = {str(query): [] for query in train_queries}

# Group the predictions by query. Keys are "<query>-<docno>", and only the
# first '-' is used as the separator.
for position, entry in enumerate(train_entry_array):
    query, docno = entry.split('-', 1)
    train_pred_map[query].append([docno, predictions[position]])

In [17]:
# Write the ranked training predictions. The context manager closes (and
# thereby flushes) the run file once every query has been written; without it
# the handle stayed open and buffered output might not reach disk.
with open('./result/ml_train.txt', 'w') as ml_out:
    for train_query, ml_scores in train_pred_map.items():
        rank_scores(train_query, ml_scores, ml_out)