In [1]:
import pyserini.search as pys
from pyserini.index import IndexReader
import numpy as np
import pandas as pd
import time
import math as m

In [2]:
def findDoc(id, mode="chunk"):
    """Fetch one line of the passage collection and return its tab-separated fields.

    Parameters
    ----------
    id : int
        Numeric passage id (the name shadows the builtin but is kept for callers).
    mode : str
        "linear": read ``data/collection.tsv`` from the top and use line number
        ``id + 1``.
        "chunk" (default): read ``data/collection_chunks/<base>.txt`` where
        ``base = id - id % 10000`` and use line number ``id % 10000`` of that
        file. An id that is an exact multiple of 10000 is instead taken from
        the last line of the previous chunk file, ``<id - 10000>.txt``.

    Returns
    -------
    list of str, or None
        The stripped line split on tabs. If the file is shorter than the
        requested line, a message is printed (chunk mode) and ``['']`` is
        returned. An unrecognised ``mode`` returns None.

    Raises
    ------
    FileNotFoundError
        If the collection or chunk file that is actually needed is missing.
    """
    if mode == "linear":
        line = ""
        with open("data/collection.tsv", encoding="utf8") as f:
            for _ in range(id + 1):
                line = f.readline()
        return line.strip().split('\t')
    elif mode == "chunk":
        res = id % 10000
        if res == 0:
            # Chunk boundary: the passage is the last line of the previous
            # chunk. Only that file is opened, so a missing "<id>.txt" no
            # longer causes a spurious FileNotFoundError.
            with open(f"data/collection_chunks/{id-10000}.txt", encoding="utf8") as f:
                lines = f.readlines()
            line = lines[-1] if lines else ""
        else:
            line = ""
            with open(f"data/collection_chunks/{id - res}.txt", encoding="utf8") as f:
                for _ in range(res):
                    line = f.readline()

        # readline() yields "" (not None) past end-of-file, so test for emptiness.
        if not line:
            print(f"Document not found/read, id: {id}, res id: {res}")

        return line.strip().split('\t')
    
def loadQueries(filename = "data/queries/queries.train.tsv"):
    """Read a ``<query id>\\t<query text>`` file into a dict.

    Parameters
    ----------
    filename : str
        Path of a UTF-8, tab-separated queries file.

    Returns
    -------
    dict
        Maps the integer query id (first column) to the query text (second
        column). Blank lines are skipped; an empty file yields ``{}``.

    The first raw line is printed as a quick sanity check when the file is
    not empty.
    """
    # The context manager closes the file even if reading fails.
    with open(filename, encoding="utf8") as f:
        lines = f.readlines()
    if lines:
        print(lines[0])

    by_id = {}
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # e.g. a trailing empty line at the end of the file
            continue
        fields = stripped.split('\t')
        by_id[int(fields[0])] = fields[1]
    return by_id

def loadQRels(filename):
    """Read a relevance-judgment file into two lookup dicts.

    Each non-blank line is split on whitespace; column 0 is used as the query
    id, column 2 as the document id and column 3 as the integer label.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 qrels file.

    Returns
    -------
    (dict, dict)
        ``qrels`` maps ``"<query id>:<doc id>"`` to the integer label, and
        ``query_qrels`` maps each query id (as a string) to True.

    The first parsed row is printed as a sanity check when the file is not
    empty.
    """
    # split() without arguments accepts single spaces, tabs and repeated
    # separators alike; blank lines are dropped instead of raising IndexError.
    with open(filename, encoding="utf8") as f:
        rows = [line.split() for line in f if line.strip()]

    qrels = {}
    query_qrels = {}
    for row in rows:
        qrels[f"{row[0]}:{row[2]}"] = int(row[3])
        query_qrels[row[0]] = True
    if rows:
        print(rows[0])
    return qrels, query_qrels

def findQuery(id, cached=True):
    """Return a query as ``[id, text]``.

    Parameters
    ----------
    id : int
        Query id to look up.
    cached : bool
        True (default): read the text from the module-level ``queries`` dict;
        a missing id raises KeyError.
        False: scan ``data/queries/queries.train.tsv`` line by line, printing
        the elapsed seconds when the scan ends.

    Returns
    -------
    list
        ``[id, text]`` for a hit. In the uncached path the list is the
        tab-split line with its first field converted to int; if no line
        matches, an empty list is returned.
    """
    if cached:
        return [id, queries[id]]

    t = time.time()
    # Iterating the file stops cleanly at EOF; the previous readline() loop
    # reached int('') there and raised ValueError before its own EOF check.
    with open("data/queries/queries.train.tsv", encoding="utf8") as f:
        for raw in f:
            fields = raw.strip().split('\t')
            if not fields[0]:
                # blank line
                continue
            fields[0] = int(fields[0])
            if fields[0] == id:
                print(time.time() - t)
                return fields
    print(time.time() - t)
    return []

def getTrip(f):
    """Read the next triple line from ``f`` and resolve its ids.

    The line is split on tabs and every field is converted to int. Field 0 is
    then replaced by ``findQuery(...)`` and fields 1 and 2 by ``findDoc(...)``
    of the respective id.

    Returns the resulting list (query record, first document, second document,
    followed by any remaining integer fields).
    """
    raw_fields = f.readline().strip().split('\t')
    record = list(map(int, raw_fields))
    record[0] = findQuery(record[0])
    record[1] = findDoc(record[1])
    record[2] = findDoc(record[2])
    return record

In [8]:
# Load the eval queries into the module-level `queries` dict (int id -> text);
# findQuery(id, cached=True) reads from this dict.
queries = loadQueries("data/queries/queries.eval.tsv")


786436	what is prescribed to treat thyroid storm



In [116]:
# Spot-check one judgment; keys have the form "<query id>:<doc id>" as built by loadQRels.
# NOTE(review): in the visible notebook `qrels` is first assigned in a later cell,
# so this lookup depends on leftover kernel state — confirm or move below that cell.
qrels["19335:1017759"]

0

In [169]:
# Module-level Pyserini IndexReader shared by the feature functions defined below
# (calc_features, calc_tf, calc_tf_idf, calc_idf, calcTermPlacement).
index_reader = IndexReader('indexes/sample_collection_jsonl')

def calc_features(doc, query, improv = False):
    """Build the feature vector for one (document, query) pair.

    ``doc`` and ``query`` are indexable records: ``doc[0]`` is the document id,
    ``doc[1]`` the document text and ``query[1]`` the query text.

    Features, in order:
      1. ``index_reader.compute_query_document_score`` for the pair
      2. ``calc_tf(doc, query)``
      3. ``calc_tf_idf(doc, query)``
      4. number of space-separated tokens in the document text
      5. ``calcTermPlacement(doc text, query text)`` — only when ``improv`` is true

    Returns the features as a list.
    """
    doc_text = doc[1]
    query_text = query[1]
    features = [
        index_reader.compute_query_document_score(str(doc[0]), query_text),
        calc_tf(doc, query),
        calc_tf_idf(doc, query),
        len(doc_text.split(" ")),
    ]
    if improv:
        features.append(calcTermPlacement(doc_text, query_text))
    return features

def calc_tf(doc, query):
    """Mean length-normalised term frequency of the query terms in the document.

    The query text (``query[1]``) is analyzed into terms. For each term, its
    count in the document vector of ``doc[0]`` is divided by the number of
    space-separated tokens in ``doc[1]``; terms absent from the vector count
    as 0. The per-term values are averaged over all analyzed query terms.

    Returns 0 when the analyzed query has no terms.
    """
    query_terms = index_reader.analyze(query[1])
    term_counts = index_reader.get_document_vector(doc[0])
    doc_len = len(doc[1].split(" "))
    n_terms = len(query_terms)

    total = 0
    for t in query_terms:
        share = term_counts[t]/doc_len if t in term_counts else 0
        total = total + share/n_terms
    return total

def calc_tf_idf(doc, query):
    """Mean TF-IDF weight of the query terms in the document.

    Same averaging scheme as ``calc_tf``, except that the length-normalised
    count of each term found in the document vector is multiplied by
    ``calc_idf(term)``. ``calc_idf`` is only called for terms that occur in
    the document vector; other terms contribute 0.

    Returns 0 when the analyzed query has no terms.
    """
    query_terms = index_reader.analyze(query[1])
    term_counts = index_reader.get_document_vector(doc[0])
    doc_len = len(doc[1].split(" "))
    n_terms = len(query_terms)

    total = 0
    for t in query_terms:
        weight = term_counts[t]/doc_len*calc_idf(t) if t in term_counts else 0
        total = total + weight/n_terms
    return total

def calc_idf(term):
    """Return ``log(N / n_k)`` for an already-analyzed term.

    ``N`` is the ``'documents'`` entry of ``index_reader.stats()`` and ``n_k``
    is the first value returned by ``index_reader.get_term_counts`` (called
    with ``analyzer=None``) — presumably the document frequency; confirm
    against the Pyserini documentation.

    Raises ZeroDivisionError when that count is 0.
    """
    collection_size = index_reader.stats()['documents']
    term_count, *_ = index_reader.get_term_counts(term, analyzer=None)
    return m.log(collection_size/term_count)

def calcTermPlacement(doc, query):
    """Score how early the query terms first appear in the document text.

    Parameters
    ----------
    doc : str
        Document text; it is split into sentences on ".".
    query : str
        Query text; it is analyzed into terms with ``index_reader.analyze``.

    Returns
    -------
    numpy scalar
        Sum over query terms of ``sent_score * sent_pos`` for the first
        sentence containing the term, where
        ``sent_score = max(0, 1 - 0.02 * k**2)`` with ``k`` the term's index
        among the analyzed words of that sentence, and
        ``sent_pos = max(0, 1 - (j/n)**2)`` with ``j`` the sentence index and
        ``n`` the number of sentences. Terms found in no sentence add 0.
    """
    terms = index_reader.analyze(query)
    sentences = doc.split(".")
    n = len(sentences)

    # Each sentence is analyzed at most once, on first use, instead of once
    # per query term; the analysis does not depend on the term.
    analyzed = [None] * n

    term_scores = [0]*len(terms)
    for i, term in enumerate(terms):
        for j in range(n):
            if analyzed[j] is None:
                analyzed[j] = index_reader.analyze(sentences[j].lower())
            words = analyzed[j]
            if term in words:
                sent_score = max(0, 1 - 0.02 * words.index(term)**2)
                sent_pos = max(0, 1 - (j/n)**2)
                term_scores[i] = sent_score*sent_pos
                # Only the first sentence containing the term counts.
                break

    return np.sum(term_scores)
        

def f_string(f, offset=1):
    """Format a feature list as space-separated ``index:value`` pairs.

    Parameters
    ----------
    f : sequence
        Feature values.
    offset : int
        Index given to the first feature (default 1).

    Returns
    -------
    str
        e.g. ``"1:0.5 2:3"``; an empty string for an empty sequence.
    """
    # Iterate over the argument itself; the previous version took its length
    # from the unrelated global ``fl``.
    return " ".join(f"{i+offset}:{value}" for i, value in enumerate(f))

In [161]:
# Scratch cell: try the placement score, IDF values and analyzer on one example passage.
# NOTE(review): this cell's execution count (161) is lower than that of the cell defining
# these functions (169), so the saved output may come from older definitions — re-run to confirm.

# The result of this call is not displayed because it is not the cell's last expression.
calcTermPlacement("Insulin (from the Latin, insula meaning island) is a peptide hormone produced by beta cells of the pancreatic islets, and it is considered to be the main anabolic hormone of the body.",
                 "where does real insulin come from")

# Compare the IDF of a content word with that of a very common word.
print( calc_idf("insulin"))
# print( index_reader.get_term_counts("where", analyzer=None)[0])
print( calc_idf("from"))

# print( index_reader.get_term_counts("and", analyzer=None)[0])
# print(index_reader.get_term_counts("is", analyzer=None)[0])
# print(index_reader.get_term_counts("the", analyzer=None)[0])

# [index_reader.analyze(w) for w in "Insulin (from the Latin, insula meaning island) is a peptide hormone produced by beta cells of the pancreatic islets, and it is considered to be the main anabolic hormone of the body.".split(" ")]
# Last expression of the cell: the analyzed token list is shown as the cell output.
index_reader.analyze("Insulin (from the Latin, insula meaning island) is a peptide hormone produced by beta cells of the pancreatic islets, and it is considered to be the main anabolic hormone of the body.")

1.0: insulin -> insulin (from the latin, insula meaning island) is a peptide hormone produced by beta cells of the pancreatic islets, and it is considered to be the main anabolic hormone of the body
0.7: from -> insulin (from the latin, insula meaning island) is a peptide hormone produced by beta cells of the pancreatic islets, and it is considered to be the main anabolic hormone of the body
6.4439855959926
1.580250614477024


['insulin',
 'from',
 'latin',
 'insula',
 'mean',
 'island',
 'peptid',
 'hormon',
 'produc',
 'beta',
 'cell',
 'pancreat',
 'islet',
 'consid',
 'main',
 'anabol',
 'hormon',
 'bodi']

In [176]:
# Build training features: for each triple (query, first doc, second doc) write two
# lines of the form "<label> qid:<query id> 1:<f1> 2:<f2> ...", label 1 for the
# first document and label 0 for the second.
queries = loadQueries("data/queries/queries.train.tsv")

train_size = 100000   # number of triples to convert in this run
start = 100000        # number of leading triples to skip

# Report progress once per percent of the run (every row for runs under 100 rows).
progress_step = max(1, train_size // 100)

# The context managers close both files even when a lookup or feature
# computation raises part-way through the run.
with open("data/qidpidtriples.train.full.2.tsv") as triples, \
        open("train_improv2_features_100k_200k.txt", "w") as fw:
    # Skip the triples that precede this slice.
    for i in range(0, start):
        triples.readline()
    print("Starting now")

    for i in range(start, start + train_size):
        triple = getTrip(triples)
        fl = calc_features(triple[1], triple[0], True)
        fr = calc_features(triple[2], triple[0], True)
        fw.write(f"1 qid:{triple[0][0]} {f_string(fl)}\n")
        fw.write(f"0 qid:{triple[0][0]} {f_string(fr)}\n")
        # Integer test relative to `start`, so it stays exact for any start/size.
        if (i - start) % progress_step == 0:
            print(f"{(i-start)/train_size*100}%")

121352	define extreme

Starting now
0.0%
1.0%
2.0%
3.0%
4.0%
5.0%
6.0%
7.000000000000001%
8.0%
9.0%
10.0%
11.0%
12.0%
13.0%
14.000000000000002%
15.0%
16.0%
17.0%
18.0%
19.0%
20.0%
21.0%
22.0%
23.0%
24.0%
25.0%
26.0%
27.0%
28.000000000000004%
28.999999999999996%
30.0%
31.0%
32.0%
33.0%
34.0%
35.0%
36.0%
37.0%
38.0%
39.0%
40.0%
41.0%
42.0%
43.0%
44.0%
45.0%
46.0%
47.0%
48.0%
49.0%
50.0%
51.0%
52.0%
53.0%
54.0%
55.00000000000001%
56.00000000000001%
56.99999999999999%
57.99999999999999%
59.0%
60.0%
61.0%
62.0%
63.0%
64.0%
65.0%
66.0%
67.0%
68.0%
69.0%
70.0%
71.0%
72.0%
73.0%
74.0%
75.0%
76.0%
77.0%
78.0%
79.0%
80.0%
81.0%
82.0%
83.0%
84.0%
85.0%
86.0%
87.0%
88.0%
89.0%
90.0%
91.0%
92.0%
93.0%
94.0%
95.0%
96.0%
97.0%
98.0%
99.0%


In [175]:
# Manual cleanup: close the feature output file and the triples file.
# NOTE(review): this cell's execution count (175) is lower than that of the
# feature-extraction cell (176) — presumably it was run after an interrupted
# earlier attempt; confirm whether it is still needed.
fw.close()
triples.close()

In [177]:
# Evaluation setup: a searcher over the same index path used by index_reader,
# the judgments from data/2019qrels-pass.txt, and the test queries.
searcher = pys.SimpleSearcher('indexes/sample_collection_jsonl')
# qrels: "<query id>:<doc id>" -> label; query_qrels: query id (str) -> True.
qrels, query_qrels = loadQRels("data/2019qrels-pass.txt")
# Rebinds the module-level `queries`, which findQuery(cached=True) reads from here on.
queries = loadQueries("data/msmarco-test2019-queries.tsv")



['19335', 'Q0', '1017759', '0']
1108939	what slows down the flow of blood



In [178]:
# Build test features: for every query that has judgments, retrieve up to 1000
# passages and write one line per hit of the form
# "<label> qid:<query id> 1:<f1> 2:<f2> ... #<doc id>".
its = 0
n = len(qrels)

# The context manager closes the output file even if retrieval or feature
# extraction raises part-way through.
with open("data/test_improv2_features.tsv", "w", encoding="utf-8") as fw:
    for k in query_qrels:
        query = findQuery(int(k))
        hits = searcher.search(query[1], k = 1000)
        for hit in hits:
            combi = f"{query[0]}:{hit.docid}"
            # Retrieved passages without a judgment get label 0.
            rel = qrels.get(combi, 0)
            fl = calc_features(findDoc(int(hit.docid)), query, True)
            fw.write(f"{rel} qid:{query[0]} {f_string(fl)} #{hit.docid}\n")
        # Progress: number of queries processed so far.
        its = its + 1
        print(its)
    
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
