In [None]:

# Install Pyserini and the CPU build of FAISS into the runtime.
!pip install pyserini
!pip install faiss-cpu

# Fetch the Anserini sources together with their git submodules.
# NOTE(review): nothing visible in this notebook reads from the clone — confirm it is still needed.
!git clone https://github.com/castorini/anserini.git --recurse-submodules

In [None]:
from google.colab import drive
# Mount the Colab drive integration at /content/drive so the notebook folder is reachable.
drive.mount('/content/drive')


In [None]:
import os
# Switch the working directory; every relative path used in later cells
# (trec-covid/, corpus/, indexes/, output files) is resolved from here.
os.chdir('/content/drive/MyDrive/Colab Notebooks')

In [None]:
# Two distinct JSON Lines packages are installed: `json_lines` is used below
# for reading (json_lines.reader) and `jsonlines` for writing (jsonlines.open).
!pip install json_lines
!pip install jsonlines

In [None]:
import json_lines
import jsonlines
import csv

# Flatten the query file (JSON Lines) into a two-column TSV: query id, query text.
# newline='' leaves line-ending handling to the csv module; the explicit UTF-8
# encoding keeps non-ASCII query text intact regardless of the platform default.
with open('queries.tsv', 'w', newline='', encoding='utf-8') as f_output:
  tsv_w = csv.writer(f_output, delimiter='\t')
  with open('trec-covid/queries.jsonl', 'rb') as f:
    for item in json_lines.reader(f):
      tsv_w.writerow([item['_id'], item['text']])
# Both handles are closed by their `with` blocks, so no explicit close() calls are needed.

    


In [None]:
# Re-key every corpus record to {"id", "contents"} and write the result to
# corpus/corpus_new.jsonl, the directory handed to the indexing step below.
# Only `_id` and `text` are carried over; other fields of the record are dropped.
os.makedirs('corpus', exist_ok=True)  # jsonlines.open does not create missing directories
with open('trec-covid/corpus/corpus.jsonl', 'rb') as f:
    # The writer is a context manager, so it is flushed and closed even if a
    # record is malformed and the loop raises part-way through.
    with jsonlines.open('corpus/corpus_new.jsonl', 'w') as file:
        for item in json_lines.reader(f):
            item_new = {"id": item["_id"], "contents": item["text"]}
            file.write(item_new)
    

In [None]:
# Build a Lucene index at indexes/covid_new from the JSON files under corpus/,
# single-threaded, storing positions, document vectors and the raw documents.
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input corpus \
  --index indexes/covid_new \
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors --storeRaw

In [None]:
#from pyserini.search.lucene import LuceneSearcher

#searcher = LuceneSearcher('indexes/covid_new')
#hits = searcher.search('document')

#for i in range(len(hits)):
    #print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')

In [None]:
# Run BM25 retrieval for the topics in queries.tsv against indexes/covid_new
# and write the run to result_without_bm.txt.
# NOTE(review): the evaluation cell further down scores 'run.sample.txt', not
# this output file — confirm which run is meant to be evaluated.
!python -m pyserini.search.lucene \
  --index indexes/covid_new \
  --topics queries.tsv \
  --output result_without_bm.txt \
  --bm25

In [None]:
import json
import math
import numpy as np

In [None]:
def parse_qrel_line(line):
    """Split one whitespace-separated qrel row into ``(query, document, relevancy)``.

    The first and third fields are converted to ``int``; the second field is
    returned unchanged as a string. Any fields after the third are ignored.
    """
    fields = line.split()
    return int(fields[0]), fields[1], int(fields[2])

In [None]:
def parse_results_line(line):
    """Split one whitespace-separated run-file row into ``(query, document, rank)``.

    Field 0 is the query id (converted to ``int``), field 2 the document id
    (kept as a string) and field 3 the rank (converted to ``int``). Field 1 and
    anything after field 3 are ignored.
    """
    columns = line.split()
    return int(columns[0]), columns[2], int(columns[3])

In [None]:
class relevancy_lookup(object):
    """Two-level mapping ``query -> document -> relevancy`` built from qrels."""

    def __init__(self):
        # {query: {document: relevancy}}
        self.relevancies = {}

    def add(self, query, document, relevancy):
        """Record the relevancy of ``document`` for ``query``.

        Adding the same (query, document) pair again overwrites the earlier value.
        """
        self.relevancies.setdefault(query, {})[document] = relevancy

    def get(self, query, document):
        """Return the stored relevancy, or 0 when the pair was never added.

        A query with no recorded judgements is handled the same way as an
        unjudged document: the result is 0 rather than a ``KeyError``.
        """
        return self.relevancies.get(query, {}).get(document, 0)

In [None]:
def get_ranked_labels(rel_lookup, query, doc_rank_list):
    """Return the relevancy labels of a query's documents ordered by rank.

    Args:
        rel_lookup: object exposing ``get(query, document)`` that returns a label.
        query: query identifier forwarded to ``rel_lookup.get``.
        doc_rank_list: sequence of ``(document, rank)`` entries, ranks 1-based.

    Returns:
        Integer numpy array in which position ``rank - 1`` holds the label of
        the document at that rank. Ranks with no entry keep the label 0.

    Raises:
        ValueError: if a rank is smaller than 1. Such a rank would otherwise be
            used as a negative index and silently overwrite the tail of the array.
    """
    if not doc_rank_list:
        return np.zeros(0, dtype=int)

    ranks = [entry[1] for entry in doc_rank_list]
    if min(ranks) < 1:
        raise ValueError(
            'ranks must be 1-based, got %r for query %r' % (min(ranks), query))

    # Size by the largest rank as well as by the entry count, so a gap in the
    # ranks cannot index past the end of the array.
    result = np.zeros(max(len(doc_rank_list), max(ranks)), dtype=int)
    for entry in doc_rank_list:
        result[entry[1] - 1] = rel_lookup.get(query, entry[0])
    return result

In [None]:
from itertools import islice

def process_files(qrel_path, results_path, max_rank=100):
    """Yield one array of ranked relevancy labels per query in a run file.

    Args:
        qrel_path: path of the qrel file. Its first line is skipped; every
            other non-blank line is parsed with ``parse_qrel_line``.
        results_path: path of the run file, parsed with ``parse_results_line``.
            Rows belonging to the same query must be adjacent: a new group
            starts whenever the query id changes.
        max_rank: rows whose rank exceeds this value are ignored (default 100).

    Yields:
        The result of ``get_ranked_labels`` for each query group, in file order.
        Nothing is yielded when the run file holds no usable rows.
    """
    relevancies = relevancy_lookup()
    with open(qrel_path, 'r') as qrel_file:
        # Skip the first line. The default stops an empty file from raising
        # StopIteration, which inside a generator surfaces as RuntimeError.
        next(qrel_file, None)
        for line in qrel_file:
            if not line.strip():
                continue
            query, document, relevancy = parse_qrel_line(line)
            relevancies.add(query, document, relevancy)

    current_query = None
    doc_rank_list = []
    with open(results_path, 'r') as results_file:
        for line in results_file:
            if not line.strip():
                continue
            query, document, rank = parse_results_line(line)
            # The cutoff applies to every row, including the first one.
            if rank > max_rank:
                continue

            if query != current_query:
                if doc_rank_list:
                    yield get_ranked_labels(relevancies, current_query, doc_rank_list)
                current_query = query
                doc_rank_list = []
            doc_rank_list.append((document, rank))

    if doc_rank_list:
        yield get_ranked_labels(relevancies, current_query, doc_rank_list)

In [None]:
def precision(query_relevancy_labels, k):
    """Precision@k: the fraction of the top ``k`` positions holding a relevant document.

    A document counts as relevant when its label is greater than 0, so graded
    labels (e.g. 2) contribute one hit each and the result stays within [0, 1].
    The divisor is always ``k``, even when fewer than ``k`` labels are supplied.
    Returns 0.0 for ``k <= 0``.
    """
    if k <= 0:
        return 0.0
    relevant_in_top_k = sum(1 for label in query_relevancy_labels[:k] if label > 0)
    return relevant_in_top_k / k

In [None]:
def recall(query_relevancy_labels, k):
    """Recall@k: the share of relevant documents that appear in the top ``k``.

    A document counts as relevant when its label is greater than 0, so graded
    labels contribute one hit each. The denominator is the number of relevant
    labels in the supplied list, i.e. among the retrieved documents only;
    relevant documents that were never retrieved are not visible here.
    Returns 0.0 when the list contains no relevant label.
    """
    total_relevant = sum(1 for label in query_relevancy_labels if label > 0)
    if total_relevant == 0:
        return 0.0
    relevant_in_top_k = sum(1 for label in query_relevancy_labels[:k] if label > 0)
    return relevant_in_top_k / total_relevant

In [None]:
def F_score(query_relevancy_labels, k):
    """F-score@k: ``2 * P * R / (P + R)`` built from ``precision`` and ``recall`` at ``k``.

    Returns 0.0 when ``P + R`` is not greater than 0.
    """
    p_at_k = precision(query_relevancy_labels, k)
    r_at_k = recall(query_relevancy_labels, k)
    combined = p_at_k + r_at_k
    if not combined > 0:
        return 0.0
    return 2 * p_at_k * r_at_k / combined

In [None]:
def DCG(query_relevancy_labels, k):
    """Discounted cumulative gain over the first ``k`` labels.

    The label at 0-based position ``i`` is divided by ``log2(i + 2)``, so the
    first position is undiscounted. Only ``min(len(labels), k)`` positions are
    visited; with nothing to visit the result is 0.
    """
    # log2(i + 2) is at least 1 for every i >= 0, so the division is always safe.
    cutoff = min(len(query_relevancy_labels), k)
    return sum(
        query_relevancy_labels[position] / math.log(position + 2, 2)
        for position in range(cutoff)
    )

In [None]:
def NDCG(query_relevancy_labels, k):
    """Normalised DCG@k: ``DCG`` of the labels divided by ``DCG`` of their ideal order.

    The ideal order is the supplied labels sorted in descending order, so the
    normaliser only reflects the documents present in the list. Returns 0.0
    when the ideal DCG is not positive.
    """
    # asarray: a plain list multiplied by -1 is [], which made the previous
    # implementation return 0 for list input.
    labels = np.asarray(query_relevancy_labels)
    # Sort then reverse, rather than abs(sort(-labels)), which turned negative
    # labels into positive gains in the ideal ranking.
    ideal_labels = np.sort(labels)[::-1]
    ideal_dcg = DCG(ideal_labels, k)
    if ideal_dcg <= 0:
        return 0.0
    return DCG(labels, k) / ideal_dcg

In [None]:
def AP(query_relevancy_labels):
    """Average precision of one ranked label list.

    Precision is sampled at every position holding a relevant document (label
    greater than 0) and the samples are averaged over the number of relevant
    documents in the list. Graded labels are treated as binary, so the result
    stays within [0, 1]. Returns 0.0 when the list has no relevant document.
    """
    hits = 0
    precision_sum = 0.0
    for position, label in enumerate(query_relevancy_labels, start=1):
        if label > 0:
            hits += 1
            # hits / position is precision@position at this relevant document.
            precision_sum += hits / position
    if hits == 0:
        return 0.0
    return precision_sum / hits

In [None]:
def RR(query_relevancy_labels):
    """Reciprocal rank: ``1 / rank`` of the first relevant document, else 0.0.

    A document counts as relevant when its label is greater than 0, so a
    document with a graded label such as 2 is recognised as well.
    """
    # asarray makes the comparison element-wise for plain lists too.
    relevant_positions = np.where(np.asarray(query_relevancy_labels) > 0)[0]
    if relevant_positions.size == 0:
        return 0.0
    return 1.0 / (int(relevant_positions[0]) + 1)

In [None]:
def evaluate(qrel_path, results_path, cutoffs=(1, 5, 10, 25)):
    """Average every metric over the queries of a run file.

    Args:
        qrel_path: qrel file handed to ``process_files``.
        results_path: run file handed to ``process_files``.
        cutoffs: the ``k`` values at which the cutoff-based metrics are computed.

    Returns:
        dict mapping ``'<metric>@<k>'`` (precision, recall, F-score, DCG, NDCG
        for every cutoff) plus ``'MAP'`` and ``'MRR'`` to the mean of the
        per-query values.
    """
    # Metrics called as metric(labels, k); the order fixes the key order of the result.
    cutoff_metrics = (
        ('precision', precision),
        ('recall', recall),
        ('F-score', F_score),
        ('DCG', DCG),
        ('NDCG', NDCG),
    )
    # Metrics called as metric(labels).
    whole_list_metrics = (
        ('MAP', AP),
        ('MRR', RR),
    )

    results_per_query = {}
    for name, _ in cutoff_metrics:
        for k in cutoffs:
            results_per_query['{}@{}'.format(name, k)] = []
    for name, _ in whole_list_metrics:
        results_per_query[name] = []

    for labels in process_files(qrel_path, results_path):
        for name, metric in cutoff_metrics:
            for k in cutoffs:
                results_per_query['{}@{}'.format(name, k)].append(metric(labels, k))
        for name, metric in whole_list_metrics:
            results_per_query[name].append(metric(labels))

    return {key: np.mean(values) for key, values in results_per_query.items()}

In [None]:
# Score a run file against the qrels in trec-covid/qrels/test.tsv.
# NOTE(review): the BM25 search cell writes 'result_without_bm.txt', yet this
# cell scores 'run.sample.txt' — confirm this is the intended run file.
results = evaluate('trec-covid/qrels/test.tsv', 'run.sample.txt')

In [None]:
# Persist the averaged metrics held in `results` as JSON.
with open('evaluation_50_query_100_doc.json', 'w') as f:
      json.dump(results, f)

In [None]:
import ast
# Parse every line of the file as a Python literal and print its first element.
# NOTE: `data` stays bound after the loop and the next cell iterates over it,
# so that cell sees the value parsed from the LAST line of the file.
# NOTE(review): the handle is named `qrel_file` although the file opened is
# 'output_test_queries.txt' — the name looks like a leftover; confirm.
with open('output_test_queries.txt', 'r') as qrel_file:
  for data in qrel_file:
    data = ast.literal_eval(data)
    print(data[0])

In [None]:
# Turn the rankings held in `data` into space-separated run lines:
#   <query number> Q <pair[0]> <rank> <pair[1]>
# The query number is the 1-based position of the item inside `data`.
with open('output_test_queries_formatted.txt', 'w') as f:
    rows = []
    for count, item in enumerate(data, start=1):
        rank = 0
        for pair in item:
            rank += 1
            # NOTE(review): whenever the counter reaches a multiple of 10 it is
            # reset to 10, so an item with more than 10 pairs produces repeated
            # ranks (…, 19, 10, 11, …). Kept as-is; confirm this is intended.
            if rank % 10 == 0:
                rank = 10
            rows.append(
                ' '.join([str(count), 'Q', str(pair[0]), str(rank), str(pair[1])]) + '\n')
    # Join once instead of growing a string with += for every row, and write
    # the text with write(): writelines() on a str emits it character by character.
    lines = ''.join(rows)
    f.write(lines)

In [None]:
#with open('trec-covid/corpus/corpus.jsonl', 'rb') as f:
    #for item in json_lines.reader(f):
      #file = jsonlines.open(f'output/{i}.jsonl','w')
      #item["id"] = item.pop("_id")
      #item["contents"] = item.pop("text")
      #item_new = {"id": item["_id"], "contents": item["text"]}
      #jsonlines.Writer.write(file,item_new)
        #if item['_id'] == 'kjjljbl5':
            #print(item)
    #f.close()

{'_id': 'kjjljbl5', 'title': 'Existence theory and numerical analysis of three species prey–predator model under Mittag-Leffler power law', 'text': 'In this manuscript, the fractional Atangana–Baleanu–Caputo model of prey and predator is studied theoretically and numerically. The existence and Ulam–Hyers stability results are obtained by applying fixed point theory and nonlinear analysis. The approximation solutions for the considered model are discussed via the fractional Adams Bashforth method. Moreover, the behavior of the solution to the given model is explained by graphical representations through the numerical simulations. The obtained results play an important role in developing the theory of fractional analytical dynamic of many biological systems.', 'metadata': {'url': 'https://doi.org/10.1186/s13662-020-02709-7; https://www.ncbi.nlm.nih.gov/pubmed/32501396/', 'pubmed_id': '32501396'}}


In [None]:
# Score the reformatted run file 'output_test_queries_formatted.txt' against
# the same qrels and persist the averaged metrics as JSON.
results = evaluate('trec-covid/qrels/test.tsv', 'output_test_queries_formatted.txt')
with open('evaluation_test_queries.json', 'w') as f:
      json.dump(results, f)

# 新段落