In [1]:
# Mount Google Drive at /content/gdrive; every dataset and result path used
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# BEIR retrieval

In [None]:
! pip install beir
! pip install -U sentence-transformers

In [4]:
from time import time
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os
import random
from sentence_transformers import SentenceTransformer
import json

#### Logging setup: print debug information to stdout via BEIR's handler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
#### /logging setup

#### Download the BEIR dataset archive (scifact) and unzip it into the Drive data folder
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "/content/gdrive/MyDrive/Independent study - Max & Carlos/data/beer_datasets")

#### Point the data loader at the folder the dataset was unzipped to
# The data folder is expected to contain these files:
# (1) scifact/corpus.jsonl  (format: jsonlines)
# (2) scifact/queries.jsonl (format: jsonlines)
# (3) scifact/qrels/test.tsv (format: tsv ("\t"))

corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

# The queries returned by the loader are intentionally discarded: retrieval is
# run on the claims from claims_test.jsonl instead, keyed by the stringified
# claim id and mapped to the claim text.
# TODO: file/path names will need to change (original author's note).
queries = {}
with open('/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_test.jsonl', encoding='utf-8') as claims_file:
    for line in claims_file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        claim = json.loads(line)
        queries[str(claim['id'])] = claim['claim']

#### Dense retrieval with an SBERT (Sentence-BERT) encoder ####
#### Any pretrained sentence-transformers model can be supplied here;
#### complete list - https://www.sbert.net/docs/pretrained_models.html
# NOTE(review): the inherited template comment said the model was fine-tuned
# using cosine-similarity, yet scoring below uses "dot" - confirm which is intended.
# NOTE(review): corpus_chunk_size=512*9999 is presumably meant to exceed the
# corpus size so it is handled as a single chunk - confirm.

sbert_encoder = models.SentenceBERT("pritamdeka/S-PubMedBert-MS-MARCO-SCIFACT")
model = DRES(sbert_encoder, batch_size=256, corpus_chunk_size=512*9999)
retriever = EvaluateRetrieval(model, score_function="dot")

#### Run retrieval and time it (results have the same format as qrels)
start_time = time()
results = retriever.retrieve(corpus, queries)
end_time = time()
elapsed_seconds = end_time - start_time
print("Time taken to retrieve: {:.2f} seconds".format(elapsed_seconds))
#### Evaluate your retrieval using NDCG@k, MAP@K ...

# logging.info("Retriever evaluation for k in: {}".format(retriever.k_values))
# ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

# mrr = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="mrr")
# recall_cap = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="r_cap")
# hole = retriever.evaluate_custom(qrels, results, retriever.k_values, metric="hole")

#### Log the top-k retrieved documents for one randomly chosen query ####
top_k = 10

query_id, ranking_scores = random.choice(list(results.items()))
# Highest score first.
scores_sorted = sorted(ranking_scores.items(), key=lambda item: item[1], reverse=True)
logging.info("Query : %s\n", queries[query_id])

# Slice instead of indexing with range(top_k): a query with fewer than top_k
# retrieved documents would otherwise raise IndexError.
for rank, (doc_id, _score) in enumerate(scores_sorted[:top_k], start=1):
    document = corpus[doc_id]
    # Format: Rank x: ID [Title] - Body
    logging.info("Rank %d: %s [%s] - %s\n", rank, doc_id, document.get("title"), document.get("text"))

  from tqdm.autonotebook import tqdm


  0%|          | 0/5183 [00:00<?, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/461k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Time taken to retrieve: 6937.20 seconds


In [14]:
# Persist the retrieval results as JSON on Drive. The file name was changed on
# purpose so that an earlier results file is not overwritten.
results_path = "/content/gdrive/MyDrive/Independent study - Max & Carlos/data/results/scifact_test/retriever_256b_chunk.json"
with open(results_path, mode='w') as results_file:
    json.dump(results, results_file)

# Post-BEIR processing

In [10]:
import json
# Load previously saved retrieval scores; each stringified claim id maps to a
# {doc id: score} mapping (that is how the next cell consumes it).
# NOTE(review): this reads retriever_256b_maxcorpchunk.json, which is a different
# file name from the one the save cell above writes - confirm this is intended.
with open('/content/gdrive/MyDrive/Independent study - Max & Carlos/data/results/scifact_test/retriever_256b_maxcorpchunk.json', encoding='utf-8') as scores_file:
    scores = json.load(scores_file)

# Read the test claims: one JSON object per line.
test = []
with open('/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_test.jsonl', encoding='utf-8') as claims_file:
    for line in claims_file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if line.strip():
            test.append(json.loads(line))


In [11]:
top_k = 10

# For every test claim, store the ids of its top_k highest-scoring documents
# under 'cited_doc_ids' (as ints, best first).
for item in test:
    doc_scores = scores[str(item['id'])]
    ranked = sorted(doc_scores.items(), key=lambda pair: pair[1], reverse=True)  # (doc id, score), best first

    # Slice instead of indexing with range(top_k): a claim with fewer than
    # top_k scored documents would otherwise raise IndexError.
    item['cited_doc_ids'] = [int(doc_id) for doc_id, _score in ranked[:top_k]]



In [12]:
# Write the augmented test claims as JSON Lines (one object per line).
# NOTE: mode "w" overwrites any existing file at this path.
output_path = '/content/gdrive/MyDrive/Independent study - Max & Carlos/data/results/scifact_test/test_with_doc_ids_top_10.jsonl'
with open(output_path, "w") as out_file:
    out_file.writelines(json.dumps(item) + "\n" for item in test)