In [2]:
from datasets import load_dataset
import pandas as pd

# Load SciFact claims and corpus
claims_dataset = load_dataset('allenai/scifact', 'claims', split='train')
corpus_dataset = load_dataset('allenai/scifact', 'corpus', split='train')

# Extract claims and evidence texts
claims_id = []
claims_data = []
evidence_ids = []
evidence_titles = []
evidence_data = []

# Iterate through the claims dataset and extract the claim text and corresponding evidence (i.e., abstracts from the corpus)
for example in claims_dataset:
    claim_id = example['id']
    claim_text = example['claim']
    claims_id.append(claim_id)
    claims_data.append(claim_text)
    
for example in corpus_dataset:   
    # Collect all evidence title and text from the corpus based on doc_id
    evidence_id = example['doc_id']
    evidence_title = example['title']
    evidence_text = example['abstract']
    evidence_ids.append(evidence_id)
    evidence_titles.append(evidence_title)
    evidence_data.append(' '.join(evidence_text))  # All evidence is now a single combined string

# Create a DataFrames for claims and evidence texts
claims_df = pd.DataFrame({
    'Claim_id': claims_id,
    'Claim_text': claims_data,
})
evidence_df = pd.DataFrame({
    'id' : evidence_ids,
    'title' : evidence_titles,
    'abstract': evidence_data,
})

print(claims_df.head())  # Preview of the DataFrame

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


   Claim_id                                         Claim_text
0         0  0-dimensional biomaterials lack inductive prop...
1         2  1 in 5 million in UK have abnormal PrP positiv...
2         4  1-1% of colorectal cancer patients are diagnos...
3         6  10% of sudden infant death syndrome (SIDS) dea...
4         9  32% of liver transplantation programs required...


In [3]:
# Import necessary libraries
import os
import gc
import torch
import pandas as pd
from utils.dataUtils import DataUtils
from utils.modelUtils import ModelUtils
from utils.limeUtils import LIMEUtils
from utils.graphUtils import create_and_save_graph, draw_cluster_graph, draw_soi
from utils.soiUtils import SOIUtils
from utils.ragUtils_scifact import RAGUtils
from transformers import AutoTokenizer, AutoModelForCausalLM

# Parameters
model_name = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
llama_model_name = 'meta-llama/Llama-2-7b-chat-hf'

# Paths for RAGUtils
passages_path = '/home/qsh5523/Documents/factver_dev/scifact/dataset'
index_path = '/home/qsh5523/Documents/factver_dev/scifact/faiss/index.faiss'

#selected_claim_id = 'Claim_10'
similarity_threshold = 0.75  # delta for cosine similarity
alpha = 0.5  # parameter for weighted vector combination of thematic embedding
n_docs = 6  # number of docs to retrieve by RAG
n_components_carag_u = 10  # number of clusters for CARAG-U

# Initialize LLaMA model
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_name)

# Initialize utilities
#data_utils = DataUtils(dataset_name)
model_utils = ModelUtils(model_name, embedding_model_name)
lime_utils = LIMEUtils(model_utils)
soi_utils = SOIUtils(model_utils)
rag_utils = RAGUtils(passages_path, index_path, embedding_model_name)
print("RAGUtils initialized with embedding model:", rag_utils.embedding_model)

# Function to generate LLM-based explanation
def generate_llm_summary(claim, evidences):
    # Clear the GPU cache first
    gc.collect()
    torch.cuda.empty_cache()
    combined_evidence = ' '.join([evidence for evidence in evidences])
    prompt = f"Claim: {claim}\nEvidence: {combined_evidence}\nYou are a fact verification assistant. From the given Claim and its Evidence, determine if the claim is supported by the evidence and generate a concise explanation (two sentences max)."
    
    with torch.no_grad():
        inputs = llama_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        outputs = llama_model.generate(inputs['input_ids'], max_new_tokens=200)
    
    return llama_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
INFO:root:Classification model loaded on CUDA
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda

Please make sure the config includes `forced_bos_token_id=0` in future versions. The config can simply be saved and uploaded again to be fixed.

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It

RAGUtils initialized with embedding model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)


In [4]:
# --- Setup ---
selected_claim_id = 9  # Example ID from SciFact

# Step 1: Prepare full pool of texts for embedding (claims + evidence abstracts)
claims_pool = claims_df[['Claim_id', 'Claim_text']].copy()
evidence_pool = evidence_df[['id', 'title']].copy()

# Create a unified text pool
embedding_texts = []
embedding_sources = []  # To track whether it's a claim or evidence

# Add claim texts
for _, row in claims_pool.iterrows():
    embedding_texts.append(row['Claim_text'])
    embedding_sources.append({'type': 'claim', 'id': row['Claim_id']})

# Add evidence abstracts
for _, row in evidence_pool.iterrows():
    embedding_texts.append(row['title'])
    embedding_sources.append({'type': 'evidence_title', 'id': row['id']})

# Step 2: Generate embeddings
embeddings = model_utils.get_sent_embeddings(embedding_texts)

# Step 3: Clustering
labels_carag_u = model_utils.cluster_embeddings(embeddings, n_components=n_components_carag_u)
unique_labels_carag_u = set(labels_carag_u)
print(f"Unique clusters identified in the dataset: {unique_labels_carag_u}")


# Step 4: Map each item to its cluster
cluster_map = []
for idx, source in enumerate(embedding_sources):
    cluster_map.append({
        'index': idx,
        'type': source['type'],
        'id': source['id'],
        'text': embedding_texts[idx],
        'cluster': labels_carag_u[idx]
    })

cluster_map_df = pd.DataFrame(cluster_map)


Batches:   0%|          | 0/202 [00:00<?, ?it/s]

Unique clusters identified in the dataset: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_soi_scifact_carag_u(selected_claim_id, cluster_map_df, selected_cluster_id, model_utils, similarity_threshold=0.5):
    """
    Computes the Subset of Interest (SOI) for SciFact using CARAG-U logic, working directly with cluster_map_df.

    :param selected_claim_id: ID of the selected claim.
    :param cluster_map_df: DataFrame containing cluster mapping for claims and evidence titles.
    :param selected_cluster_id: The cluster ID of the selected claim.
    :param model_utils: Module for embedding and clustering utilities.
    :param similarity_threshold: Cosine similarity threshold for selecting relevant evidences.
    :return: Dictionary with claim, refined evidences, and similarity scores.
    """
    
    # Step 1: Get selected claim text
    selected_row = cluster_map_df[(cluster_map_df['type'] == 'claim') & (cluster_map_df['id'] == selected_claim_id)]
    if selected_row.empty:
        raise ValueError(f"Claim ID {selected_claim_id} not found in cluster map.")

    selected_claim_text = selected_row.iloc[0]['text']

    # Step 2: Extract evidence titles in the same cluster
    cluster_evidence_rows = cluster_map_df[
        (cluster_map_df['type'] == 'evidence_title') &
        (cluster_map_df['cluster'] == selected_cluster_id)
    ]

    cluster_evidences = [(row['text'], f"Evidence_{row['id']}") for _, row in cluster_evidence_rows.iterrows()]

    # Step 3: Compute claim embedding
    claim_embedding = model_utils.get_embeddings([selected_claim_text])[0]

    soi = {
        'claim_id': selected_claim_id,
        'claim': selected_claim_text,
        'refined_cluster_evidences': [],
        'similarities': []
    }

    # Step 4: Refine cluster evidences based on similarity
    for evidence_text, evidence_id in cluster_evidences:
        evidence_embedding = model_utils.get_embeddings([evidence_text])[0]
        similarity = cosine_similarity(
            claim_embedding.reshape(1, -1),
            evidence_embedding.reshape(1, -1)
        )[0][0]

        if similarity > similarity_threshold:
            soi['refined_cluster_evidences'].append((evidence_text, evidence_id))
            soi['similarities'].append((selected_claim_text, evidence_text, similarity))

    return soi

In [7]:
# Step 6: Find selected claim's cluster ID
selected_row = cluster_map_df[(cluster_map_df['type'] == 'claim') & (cluster_map_df['id'] == selected_claim_id)]

if not selected_row.empty:
    selected_cluster_id_carag_u = selected_row.iloc[0]['cluster']
    print(f"Selected claim (ID {selected_claim_id}) belongs to cluster {selected_cluster_id_carag_u}")

    # Step 7: Compute SOI
    soi_output = compute_soi_scifact_carag_u(
        selected_claim_id=selected_claim_id,
        cluster_map_df=cluster_map_df,
        selected_cluster_id=selected_cluster_id_carag_u,
        model_utils=model_utils,
        similarity_threshold=0.5  # or whatever threshold suits your use case
    )
    soi_evidences = soi_output['refined_cluster_evidences']

    """
    # Step 8: Display or use output
    print("\n--- Subset of Interest (SOI) ---")
    print(f"Claim: {soi_output['claim']}")
    print("Relevant evidences:")
    for evidence_text, evidence_id in soi_output['refined_cluster_evidences']:
        print(f" - ({evidence_id}) {evidence_text}")
    """
else:
    print(f"Claim ID {selected_claim_id} not found in cluster map.")



Selected claim (ID 9) belongs to cluster 8


In [12]:
# Compute aggregated embedding for CARAG-U SOI
aggregated_embedding = rag_utils.compute_aggregated_embedding([evidence for evidence, _ in soi_evidences])
# Retrieve evidence using CARAG-U's dataset-wide embedding
retrived_evidence, retrieved_doc_ids, retrieved_doc_ids_original  = rag_utils.retrieve_evidence(claim_text, n_docs, aggregated_embedding, alpha)
#explanation = generate_llm_summary(claim_text, retrived_evidence)
#print("\nPost Hoc Explanation:\n", explanation)
print(retrived_evidence)
#print(retrieved_doc_ids_original)

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

["Although drugs are intended to be selective, at least some bind to several physiological targets, explaining side effects and efficacy. Because many drug-target combinations exist, it would be useful to explore possible interactions computationally. Here we compared 3,665 US Food and Drug Administration (FDA)-approved and investigational drugs against hundreds of targets, defining each target by its ligands. Chemical similarities between drugs and ligand sets predicted thousands of unanticipated associations. Thirty were tested experimentally, including the antagonism of the beta(1) receptor by the transporter inhibitor Prozac, the inhibition of the 5-hydroxytryptamine (5-HT) transporter by the ion channel drug Vadilex, and antagonism of the histamine H(4) receptor by the enzyme inhibitor Rescriptor. Overall, 23 new drug-target associations were confirmed, five of which were potent (<100 nM). The physiological relevance of one, the drug N,N-dimethyltryptamine (DMT) on serotonergic re

In [15]:
def get_original_doc_ids(retrieved_titles, evidence_df):
    doc_ids = []
    for title in retrieved_titles:
        match = evidence_df[evidence_df['title'] == title]
        if not match.empty:
            doc_ids.append(match.iloc[0]['id'])
        else:
            doc_ids.append(None)
    return doc_ids

ids = get_original_doc_ids(retrived_evidence, evidence_df)
print(ids)

[None]
