In [1]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os
#### Logging setup: print debug information to stdout
# INFO-level records are timestamped and routed through the LoggingHandler
# imported from beir.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
#### /logging setup

  from tqdm.autonotebook import tqdm


In [3]:
#### Download the selected BEIR dataset archive and unzip it under ./datasets
# Change `dataset` (e.g. to "fiqa") to fetch a different BEIR dataset.
dataset = "scidocs"
url_template = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
url = url_template.format(dataset)
# The target folder is anchored at the current working directory rather than
# at __file__, which is not available when running as a notebook.
out_dir = os.path.join(os.getcwd(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

2022-09-01 23:27:10 - Downloading scidocs.zip ...


/dstore/home/wang/xaiss/datasets/scidocs.zip: 100%|██████████| 136M/136M [00:02<00:00, 61.1MiB/s] 


2022-09-01 23:27:13 - Unzipping scidocs.zip ...


In [16]:
# Show the first corpus record as an example of the document format
# NOTE(review): `L` is never used in this cell -- this looks like an accidental
# editor auto-import; confirm nothing else needs it and drop it.
from re import L
import jsonlines

corpus = os.path.join(os.getcwd(), 'datasets/scidocs/corpus.jsonl')
with jsonlines.open(corpus) as reader:
    # Only the first record is read; None means the file held no records.
    first_doc = next(reader.iter(), None)
    if first_doc is not None:
        print(first_doc)

{'_id': '632589828c8b9fca2c3a59e97451fde8fa7d188d', 'title': 'A hybrid of genetic algorithm and particle swarm optimization for recurrent network design', 'text': 'An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is thus called HGAPSO. In HGAPSO, individuals in a new generation are created, not only by crossover and mutation operation as in GA, but also by PSO. The concept of elite strategy is adopted in HGAPSO, where the upper-half of the best-performing individuals in a population are regarded as elites. However, instead of being reproduced directly to the next generation, these elites are first enhanced. The group constituted by the elites is regarded as a swarm, and each elite corresponds to a particle within it. In this regard, the elit

In [17]:
# Show the first query record as an example of the query format
queries = os.path.join(os.getcwd(), 'datasets/scidocs/queries.jsonl')
with jsonlines.open(queries) as reader:
    # Only the first record is read; None means the file held no records.
    first_query = next(reader.iter(), None)
    if first_query is not None:
        print(first_query)

{'_id': '78495383450e02c5fe817e408726134b3084905d', 'text': 'A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect', 'metadata': {'authors': ['50306438', '15303316', '1976596'], 'year': 2014, 'cited_by': ['38e78343cfd5c013decf49e8cf008ddf6458200f'], 'references': ['632589828c8b9fca2c3a59e97451fde8fa7d188d', '4cf296b9d4ef79b838dc565e6e84ab9b089613de', '86e87db2dab958f1bd5877dc7d5b8105d6e31e46', '4b031fa8bf63e17e2100cf31ba6e11d8f80ff2a8', 'a718c6ca7a1db49bb2328d43f775783e8ec6f985', 'cf51cfb5b221500b882efee60b794bc11635267e', '6329874126a4e753f98c40eaa74b666d0f14eaba', 'a27b6025d147febb54761345eafdd73954467aca']}}


In [18]:
# Change the corpus format to match the pyserini allowable format (json):
# the output file is one JSON array of {"id": ..., "contents": ...} records.
# 1. title + contents
import json
import jsonlines

corpus = os.path.join(os.getcwd(), 'datasets/scidocs/corpus.jsonl')
with jsonlines.open(corpus) as f:
    # Build fresh records instead of deleting keys in place: only 'id' and
    # 'contents' are emitted, and a record without a 'metadata' key no longer
    # raises KeyError.
    text = [
        {'id': doc['_id'], 'contents': doc['title'] + ' ' + doc['text']}
        for doc in f
    ]
print(len(text))
print(text[0])

output_path = os.path.join(os.getcwd(), 'datasets/scidocs/corpus_json/corpus.json')
# Nothing earlier in this cell creates the corpus_json folder; without this
# call open(..., 'w') fails with FileNotFoundError when the folder is missing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
print()
with open(output_path, 'w') as f:
    json.dump(text, f)



25657
{'id': '632589828c8b9fca2c3a59e97451fde8fa7d188d', 'contents': 'A hybrid of genetic algorithm and particle swarm optimization for recurrent network design An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is thus called HGAPSO. In HGAPSO, individuals in a new generation are created, not only by crossover and mutation operation as in GA, but also by PSO. The concept of elite strategy is adopted in HGAPSO, where the upper-half of the best-performing individuals in a population are regarded as elites. However, instead of being reproduced directly to the next generation, these elites are first enhanced. The group constituted by the elites is regarded as a swarm, and each elite corresponds to a particle within it. In this regard, the elites 

In [15]:
# 2. only title
# Same conversion as the title + contents variant, but 'contents' holds just
# the document title. The output file is one JSON array of
# {"id": ..., "contents": ...} records.
import json
import jsonlines

# NOTE(review): the download step shown in this notebook only unpacks
# datasets/scidocs; presumably datasets/scidocs_title is a manual copy of that
# folder -- confirm, or create it before running this cell.
corpus = os.path.join(os.getcwd(), 'datasets/scidocs_title/corpus.jsonl')
with jsonlines.open(corpus) as f:
    # Build fresh records instead of deleting keys in place: only 'id' and
    # 'contents' are emitted, and a record without a 'text' or 'metadata' key
    # no longer raises KeyError.
    text = [{'id': doc['_id'], 'contents': doc['title']} for doc in f]
print(len(text))
print(text[0])

output_path = os.path.join(os.getcwd(), 'datasets/scidocs_title/corpus_json/corpus.json')
# Nothing earlier in this cell creates the corpus_json folder; without this
# call open(..., 'w') fails with FileNotFoundError when the folder is missing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
print()
with open(output_path, 'w') as f:
    json.dump(text, f)

25657
{'id': '632589828c8b9fca2c3a59e97451fde8fa7d188d', 'contents': 'A hybrid of genetic algorithm and particle swarm optimization for recurrent network design'}

