SciFact is a dataset of 1.4K expert-written scientific claims paired with evidence-containing abstracts and annotated with labels and rationales.

In [44]:
import faiss
import pickle
import numpy as np
from elasticsearch import Elasticsearch
from datasets import load_dataset

### Import data

In [2]:
# Pickle files that hold the precomputed embeddings
evidence = "scifact_evidence_embeddings.pkl"
claim = "scifact_claim_embeddings.pkl"

# Deserialize the evidence embeddings first, then the claim embeddings.
# NOTE(review): pickle.load can execute arbitrary code, so these files must come from a trusted source.
with open(evidence, mode="rb") as evidence_file:
    evidence_embeddings = pickle.load(evidence_file)

with open(claim, mode="rb") as claim_file:
    claim_embeddings = pickle.load(claim_file)

In [3]:
# Split each embeddings dict into a list of its keys and a NumPy array of its values.
# Keys and values are read from the same dict, so entry i of the key list belongs to row i of the matrix.
claim_doc = list(claim_embeddings.keys())
claim_mat = np.array(list(claim_embeddings.values()))

evidence_doc = list(evidence_embeddings.keys())
evidence_mat = np.array(list(evidence_embeddings.values()))

In [4]:
# Report the (rows, columns) shape of both embedding matrices.
# `d` ends up holding the column count of claim_mat; the FAISS index cell reuses it.
n_evi, d = evidence_mat.shape
print(f"Evidence embedding dimension: {(n_evi, d)}")
n_claim, d = claim_mat.shape
print(f"Claim embedding dimension: {(n_claim, d)}")


Evidence embedding dimension: (5183, 1536)
Claim embedding dimension: (809, 1536)


In [78]:
# Load the "claims" and "corpus" configurations of allenai/scifact;
# they supply the claim -> evidence document links and the corpus doc ids used below.
scifact_claims = load_dataset("allenai/scifact", "claims")
scifact_evidence = load_dataset("allenai/scifact", "corpus")

In [115]:
# Build, for every claim in claim_doc, the corpus row index of its gold evidence document.
# The result is ordered like claim_doc (and therefore like the rows of claim_mat / I),
# independent of the order in which the claims appear in the dataset.
query_id = [doc[0] for doc in claim_doc]
evidence_doc_id = [str(i) for i in scifact_evidence["train"]['doc_id']]

# doc_id string -> first row position in the corpus (same answer as list.index, without the linear scan)
evidence_row_of = {}
for row, doc_id in enumerate(evidence_doc_id):
    evidence_row_of.setdefault(doc_id, row)

# claim id -> gold evidence row, taken from the FIRST dataset row seen for that claim.
# NOTE(review): a claim that appears in several dataset rows keeps only its first evidence document.
wanted_ids = set(query_id)
gold_by_claim = {}
for claim_row in scifact_claims['train']:
    claim_id = claim_row["id"]
    if claim_id not in wanted_ids or claim_id in gold_by_claim:
        continue
    if claim_row["evidence_doc_id"] == "":
        # -1 marks "this claim has no evidence document"
        gold_by_claim[claim_id] = [-1]
    else:
        gold_by_claim[claim_id] = [evidence_row_of[claim_row["evidence_doc_id"]]]

# One row per claim in claim_doc; a claim id missing from the dataset is recorded as -1
# so that the array never becomes shorter than, or shifted against, the retrieval results.
golden_evidence_id = np.array([gold_by_claim.get(claim_id, [-1]) for claim_id in query_id])

In [68]:
# Mean Reciprocal Rank @ K (MRR@K)
def mrr_at_k(actual, predicted, k):
    """
    Compute the Mean Reciprocal Rank over a collection of queries, truncated at K.

    Queries are paired positionally: entry i of `actual` is scored against entry i
    of `predicted`.

    Args:
    actual: Per-query collections of relevant items.
    predicted: Per-query rankings of predicted items, best first.
    k: Only the first k predictions of each ranking are inspected.

    Returns:
    float: Mean of the per-query reciprocal ranks; a query with no relevant item
    in its top k contributes 0.0.
    """
    def reciprocal_rank(relevant, ranking):
        # 1 / (1-based position of the first relevant prediction), or 0.0 if none is found
        for position, candidate in enumerate(ranking[:k], start=1):
            if candidate in relevant:
                return 1.0 / position
        return 0.0

    scores = [reciprocal_rank(act, pred) for act, pred in zip(actual, predicted)]
    return np.mean(scores)

def mean_average_precision_at_k(actual, predicted, k):
    """
    Calculate Mean Average Precision at K (MAP@K).

    Args:
    actual: List of lists containing actual relevant items for each query.
        A query whose list contains -1 (the "no evidence" marker) scores 0.
    predicted: List of lists containing predicted items for each query, best first.
    k: The number of top predictions to consider.

    Returns:
    float: The MAP@K score (nan when there are no queries, as np.mean of an empty list).
    """
    ap_at_k = []

    for act, pred in zip(actual, predicted):
        relevant = set(act)
        # Largest number of hits achievable inside the top K; used to normalise the AP
        max_hits = min(len(relevant), k)
        if -1 in relevant or max_hits <= 0:
            ap_at_k.append(0.0)
            continue

        hits = 0
        precision_sum = 0.0
        for rank, item in enumerate(pred[:k], 1):
            if item in relevant:
                hits += 1
                # precision measured at the rank of this hit
                precision_sum += hits / rank
        ap_at_k.append(precision_sum / max_hits)

    map_at_k = np.mean(ap_at_k)

    return map_at_k


## 5.2 Nearest Neighbor with FAISS
Reference: https://github.com/facebookresearch/faiss/wiki/Getting-started

### Build index with Evidence Embeddings

In [91]:
index = faiss.IndexFlatL2(d)   # flat L2 index for d-dimensional vectors
print(index.is_trained)
index.add(evidence_mat)  # store every evidence embedding row in the index
print(index.ntotal)

True
5183


### Search KNN

In [92]:
K = 50  # number of neighbours requested for every claim
D, I = index.search(claim_mat, K)     # per the FAISS docs: D holds distances, I the indices of the K nearest stored vectors per query row

### Evaluation of MAP and MRR

In [119]:
# Score the current rankings in I against the gold evidence at cutoffs 1, 10 and 50
mrr_at_1, mrr_at_10, mrr_at_50 = (
    mrr_at_k(actual=golden_evidence_id, predicted=I, k=cutoff)
    for cutoff in (1, 10, 50)
)
print(f"MRR@1:{mrr_at_1}, MRR@10: {mrr_at_10}, MRR@50: {mrr_at_50}")

MRR@1:0.453646477132262, MRR@10: 0.5063982576961563, MRR@50: 0.507288146659629


## 5.3 ElasticSearch

In [133]:
# Connect to Elasticsearch instance
es = Elasticsearch("http://localhost:9200")

# Check if Elasticsearch is running
if not es.ping():
    raise ValueError("Connection failed")

### Build Index with Evidence Text

In [138]:
import json
# Index documents in Elasticsearch
# Each evidence text is stored under its position in evidence_doc, so the `_id` of a hit
# can be read back as an index into evidence_doc.
for i, doc in enumerate(evidence_doc):
    es.index(index = "evidence-index", id = i, body ={"text": doc[1]})

# Newly indexed documents only become visible to search after a refresh; force one here so the
# search cells see the complete corpus even when the notebook is run straight through.
es.indices.refresh(index = "evidence-index")

In [150]:
# Run one full-text `match` query per claim and collect the ids of the returned hits.
# `query` and `response` stay bound to the last claim after the loop.
I = []
for claim in claim_doc:
    query = claim[1]
    search_body = {
        "query": {"match": {"text": query}},
        "size": 50,
    }
    response = es.search(index="evidence-index", body=search_body)
    hits = response["hits"]["hits"]
    # `_id` values are converted back to integers
    I.append([int(hit['_id']) for hit in hits])
I = np.array(I)


In [147]:
# Sanity check: dimensions of the Elasticsearch result matrix
np.shape(I)

(809, 50)

In [144]:
# One-off search limited to 10 hits, for manual inspection of the results.
# Relies on `query` already being defined by an earlier cell.
response = es.search(
    index="evidence-index",
    body={
        "query": {"match": {"text": query}},
        "size": 10,
    },
)

In [145]:
# Show id, relevance score and stored text of every hit in `response`
returned_hits = response["hits"]["hits"]
for hit in returned_hits:
    print(f"Document ID:{hit['_id']}, Score: {hit ['_score']}")
    print(f"Document: { hit ['_source']['text']}\n")

Document ID:1810, Score: 13.345161
Document: BACKGROUND Paralysis or amputation of an arm results in the loss of the ability to orient the hand and grasp, manipulate, and carry objects, functions that are essential for activities of daily living. Brain-machine interfaces could provide a solution to restoring many of these lost functions. We therefore tested whether an individual with tetraplegia could rapidly achieve neurological control of a high-performance prosthetic limb using this type of an interface.   
 METHODS We implanted two 96-channel intracortical microelectrodes in the motor cortex of a 52-year-old individual with tetraplegia. Brain-machine-interface training was done for 13 weeks with the goal of controlling an anthropomorphic prosthetic limb with seven degrees of freedom (three-dimensional translation, three-dimensional orientation, one-dimensional grasping). The participant's ability to control the prosthetic limb was assessed with clinical measures of upper limb funct

In [152]:
# Score the current rankings in I against the gold evidence at cutoffs 1, 10 and 50
mrr_at_1, mrr_at_10, mrr_at_50 = (
    mrr_at_k(actual=golden_evidence_id, predicted=I, k=cutoff)
    for cutoff in (1, 10, 50)
)
print(f"MRR@1:{mrr_at_1}, MRR@10: {mrr_at_10}, MRR@50: {mrr_at_50}")

MRR@1:0.4400494437577256, MRR@10: 0.4866423371985794, MRR@50: 0.4881666117724335


In [175]:
# MAP of the current rankings in I at cutoffs 1, 10 and 50, printed with four decimals
map_at_1, map_at_10, map_at_50 = (
    mean_average_precision_at_k(golden_evidence_id, I, cutoff)
    for cutoff in (1, 10, 50)
)
print(f"MAP@1: {map_at_1:.4f}")
print(f"MAP@10: {map_at_10:.4f}")
print(f"MAP@50: {map_at_50:.4f}")

MAP@1: 0.4400
MAP@10: 0.4866
MAP@50: 0.4882


In [158]:
# Spot check: is document 1810 among the first five results of the first query?
1810 in I[0, :5]

True

In [157]:
# First five retrieved ids for the first query
I[0, :5]

array([1810, 4721,  151,  209, 4035])

In [169]:
import numpy as np

def average_precision_at_k(relevant_docs, retrieved_docs, k):
    """
    Calculate Average Precision at K for a single query

    Args:
    relevant_docs (list): Indices of relevant documents; a list containing -1
        means "no relevant document" and always scores 0.0
    retrieved_docs (list): Indices of retrieved documents, in order of retrieval
    k (int): Number of top results to consider

    Returns:
    float: Average Precision at K, normalised by min(number of relevant docs, k);
    0.0 when there is nothing relevant or k is not positive
    """
    if -1 in relevant_docs:
        return 0.0

    relevant_docs = set(relevant_docs)

    # Largest number of hits that can occur inside the top k. Guarding it avoids the
    # division by zero for an empty relevant set or k == 0, and the wrong slice for k < 0.
    max_hits = min(len(relevant_docs), k)
    if max_hits <= 0:
        return 0.0

    precision_sum = 0.0
    num_relevant = 0

    for i, doc in enumerate(retrieved_docs[:k], 1):
        if doc in relevant_docs:
            num_relevant += 1
            # precision measured at the rank of this hit
            precision_sum += num_relevant / i

    return precision_sum / max_hits

def mean_average_precision_at_k(queries_relevant_docs, queries_retrieved_docs, k):
    """
    Average the per-query Average Precision at K over a set of queries (MAP@K)

    Queries are paired positionally: entry i of the relevant lists is scored
    against entry i of the retrieved lists.

    Args:
    queries_relevant_docs (list of lists): Relevant document indices, one list per query
    queries_retrieved_docs (list of lists): Retrieved document indices, one ranked list per query
    k (int): Number of top results to consider

    Returns:
    float: Mean of the per-query values returned by average_precision_at_k
    """
    per_query_ap = []
    for relevant, retrieved in zip(queries_relevant_docs, queries_retrieved_docs):
        per_query_ap.append(average_precision_at_k(relevant, retrieved, k))
    return np.mean(per_query_ap)

# Example usage: three toy queries with known relevant and retrieved documents
queries_relevant_docs = [
    [1, 3, 4],  # Relevant document indices for query 1
    [2, 3, 5],  # Relevant document indices for query 2
    [1, 2, 4]   # Relevant document indices for query 3
]

queries_retrieved_docs = [
    [1, 2, 3, 4, 5],  # Retrieved document indices for query 1
    [5, 3, 2, 1, 4],  # Retrieved document indices for query 2
    [1, 3, 2, 5, 4]   # Retrieved document indices for query 3
]

k = 3  # We want to evaluate MAP@3

map_score = mean_average_precision_at_k(queries_relevant_docs, queries_retrieved_docs, k)

# Display the score; without this the cell computes the value but shows nothing
print(f"MAP@{k}: {map_score:.4f}")


MAP@3: 0.7037


In [174]:
# Hand computation of the MAP@3 toy example: the mean over three queries of (sum of precisions at each hit) / 3
1/3*(1/3*(1+2/3)+1/3*(1+1+1)+1/3*(1+2/3))

0.7037037037037035

In [163]:
# AP@3 of the first toy query on its own
average_precision_at_k(relevant_docs=[1, 3, 4], retrieved_docs=[1, 2, 3, 4, 5], k=3)

0.5555555555555555

In [165]:
# Hand check of the single-query value: hits at ranks 1 and 3 give (1/1 + 2/3) / 3
1/3*(1+2/3)

0.5555555555555555