In [1]:
# Import necessary libraries
import os
import gc
import torch
import pandas as pd
from utils.dataUtils import DataUtils
from utils.modelUtils import ModelUtils
from utils.limeUtils import LIMEUtils
from utils.graphUtils import create_and_save_graph, draw_cluster_graph, draw_soi
from utils.soiUtils import SOIUtils
from utils.ragUtils import RAGUtils
from transformers import AutoTokenizer, AutoModelForCausalLM

# Parameters
dataset_name = 'manjuvallayil/factver_master'
model_name = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
llama_model_name = 'meta-llama/Llama-2-7b-chat-hf'

selected_claim_id = 'Claim_59'
similarity_threshold = 0.75  # delta for cosine similarity
alpha = 0.5  # para for weighted vector combination of thematic embedding (1 for RAG only)
n_docs = 6  # number of docs to retrieve by RAG
n_components= 10 # number of clusters to from through gmm-em

# Paths for RAGUtils
passages_path = '/home/qsh5523/Documents/factver_dev/dataset'
index_path = '/home/qsh5523/Documents/factver_dev/faiss/index.faiss'

# Initialize LLaMA model
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_name)

# Initialize utilities
data_utils = DataUtils(dataset_name)
model_utils = ModelUtils(model_name, embedding_model_name)
lime_utils = LIMEUtils(model_utils)
soi_utils = SOIUtils(model_utils)
rag_utils = RAGUtils(passages_path, index_path, embedding_model_name)

# Function to generate LLM-based explanation
def generate_llm_summary(claim, evidences):
    # Clear the GPU cache first
    gc.collect()
    torch.cuda.empty_cache()
    combined_evidence = ' '.join([evidence for evidence in evidences])
    prompt = f"Claim: {claim}\nEvidence: {combined_evidence}\nYou are a fact verification assistant. From the given Claim and its Evidence, determine if the claim is supported by the evidence and generate a concise explanation (two sentences max)."
    
    with torch.no_grad():
        inputs = llama_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        outputs = llama_model.generate(inputs['input_ids'], max_new_tokens=200)
    
    return llama_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# Load data without theme-based filtering
grouped_data = data_utils.get_full_data(selected_claim_id)

# GMM-EM Clustering
# Check if data is available
if grouped_data.empty:
    print("No data found in the dataset.")
else:
    # Get embeddings
    all_texts = [row['Claim_text'] for _, row in grouped_data.iterrows()]
    for _, row in grouped_data.iterrows():
        all_texts.extend(row['Evidence_text'])

    embeddings = model_utils.get_sent_embeddings(all_texts)

    # Apply GMM-EM clustering to dataset
    labels = model_utils.cluster_embeddings(embeddings, n_components)
    unique_labels = set(labels)
    print(f"Unique clusters identified in the dataset: {unique_labels}")

    # Draw and save cluster graph
    graph_filepath = 'graph.pkl'
    create_and_save_graph(model_utils, grouped_data, graph_filepath)

    #for cluster_id in unique_labels:
        #draw_cluster_graph(grouped_data, labels, cluster_id=cluster_id, model_utils=model_utils, title=f'Cluster Visualization {cluster_id}')

    # Ensure the selected claim is in the identified cluster
    selected_cluster_id = None
    claim_text = None

    for index, row in grouped_data.iterrows():
        unique_id = row['Claim_topic_id'].split('_')[-1]
        if f"Claim_{unique_id}" == selected_claim_id:
            selected_cluster_id = labels[index]
            claim_text = row['Claim_text']
            break

    if selected_cluster_id is not None:
        print(f"The selected claim ({selected_claim_id}) belongs to cluster {selected_cluster_id}")

        # Compare RAG (alpha=1.0) vs CARAG_U (alpha=0.5)

        # 1. RAG-based retrieval and explanation
        rag_evidence = rag_utils.retrieve_evidence(claim_text, n_docs, aggregated_embedding=None, alpha=1.0)
        rag_explanation = generate_llm_summary(claim_text, rag_evidence)
        print("\nRAG Explanation:\n", rag_explanation)
        
        # Compute the SOI using CARAG_U
        soi = soi_utils.compute_soi_carag_u(selected_claim_id, grouped_data, labels, selected_cluster_id, similarity_threshold)
        soi_evidences = soi['refined_cluster_evidences']
        #draw_soi(soi, similarity_threshold, title=f'SOI Visualization for {selected_claim_id}')

        # Compute aggregated embedding for the SOI evidences
        aggregated_embedding = rag_utils.compute_aggregated_embedding([evidence for evidence, _ in soi_evidences])
        
        # 2. Generate explanation using CARAG_U (retrieved evidence with combined embedding)
        agg_evidence = rag_utils.retrieve_evidence(claim_text, n_docs, aggregated_embedding, alpha=0.5)
        carag_u_explanation = generate_llm_summary(claim_text, agg_evidence)
        print("\nCARAG_U Explanation:\n", carag_u_explanation)
    
    else:
        print(f"Selected claim {selected_claim_id} is not part of any identified cluster.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
INFO:root:Classification model loaded on CUDA

Please make sure the config includes `forced_bos_token_id=0` in future versions. The config can simply be saved and uploaded again to be fixed.

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The toke

Claim ID Claim_59 is valid and exists in the dataset.


Batches:   0%|          | 0/116 [00:00<?, ?it/s]

Unique clusters identified in the dataset: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
Graph created with 3685 nodes and 3030 edges.
The selected claim (Claim_59) belongs to cluster 8

RAG Explanation:
 Claim: The public is unconcerned about a climate emergency
Evidence: Failure will result in the country's once-successful car making industry being largely consigned to the scrap heap. They could drive back to Israel and get a ferry around the middle east to Qatar - but that is a two-week journey which would have meant missing Wales' first match. Shares fell as low as $6.50-apiece on Monday, down 97 percent from August 2021.BMW-branded cars, motorcycles, and Mini models sold since October 1 get the new warranty. Cho Tae-yong, ambassador of the Republic of Korea to the U.S., said Tuesday officials are discussing “several possible options” to correct what the country believes to be unfair policies that eliminated up to $7,500 of tax credits for EVs produced outside North America. White House press sec

Batches:   0%|          | 0/3 [00:00<?, ?it/s]


CARAG_U Explanation:
 Claim: The public is unconcerned about a climate emergency
Evidence: Failure will result in the country's once-successful car making industry being largely consigned to the scrap heap. This greenhouse gas trading scheme forms part of the UK government's ambition to achieve net zero emissions by 2050.Global business is increasingly familiar with the prospect of short-notice public investigatory attention, whether from regulators, law enforcement, political forces or as a consequence of sanctions, and this can  in in some cases  devastate individual and corporate reputation. Cho Tae-yong, ambassador of the Republic of Korea to the U.S., said Tuesday officials are discussing “several possible options” to correct what the country believes to be unfair policies that eliminated up to $7,500 of tax credits for EVs produced outside North America. They could drive back to Israel and get a ferry around the middle east to Qatar - but that is a two-week journey which would h

In [1]:
# Import necessary libraries
import os
import gc
import torch
import pandas as pd
from utils.dataUtils import DataUtils
from utils.modelUtils import ModelUtils
from utils.limeUtils import LIMEUtils
from utils.graphUtils import create_and_save_graph, draw_cluster_graph, draw_soi
from utils.soiUtils import SOIUtils
from utils.ragUtils import RAGUtils
from transformers import AutoTokenizer, AutoModelForCausalLM

# Parameters
dataset_name = 'manjuvallayil/factver_master'
model_name = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
llama_model_name = 'meta-llama/Llama-2-7b-chat-hf'

selected_claim_id = 'Claim_59'
similarity_threshold = 0.75  # delta for cosine similarity
alpha = 0.5  # parameter for weighted vector combination of thematic embedding
n_docs = 6  # number of docs to retrieve by RAG
n_components_carag = 3  # number of clusters for CARAG
n_components_carag_u = 10  # number of clusters for CARAG-U

# Paths for RAGUtils
passages_path = '/home/qsh5523/Documents/factver_dev/dataset'
index_path = '/home/qsh5523/Documents/factver_dev/faiss/index.faiss'

# Initialize LLaMA model
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_name)

# Initialize utilities
data_utils = DataUtils(dataset_name)
model_utils = ModelUtils(model_name, embedding_model_name)
lime_utils = LIMEUtils(model_utils)
soi_utils = SOIUtils(model_utils)
rag_utils = RAGUtils(passages_path, index_path, embedding_model_name)

# Function to generate LLM-based explanation
def generate_llm_summary(claim, evidences):
    # Clear the GPU cache first
    gc.collect()
    torch.cuda.empty_cache()
    combined_evidence = ' '.join([evidence for evidence in evidences])
    prompt = f"Claim: {claim}\nEvidence: {combined_evidence}\nYou are a fact verification assistant. From the given Claim and its Evidence, determine if the claim is supported by the evidence and generate a concise explanation (two sentences max)."
    
    with torch.no_grad():
        inputs = llama_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        outputs = llama_model.generate(inputs['input_ids'], max_new_tokens=200)
    
    return llama_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()



# Load themed data
theme, themed_data = data_utils.filter_by_theme(selected_claim_id)

# CARAG: Themed data clustering
if not themed_data.empty:
    # Get embeddings for themed data
    all_texts = [row['Claim_text'] for _, row in themed_data.iterrows()]
    for _, row in themed_data.iterrows():
        all_texts.extend(row['Evidence_text'])
    
    embeddings = model_utils.get_sent_embeddings(all_texts)
    
    # Apply GMM-EM clustering to themed data
    labels_carag = model_utils.cluster_embeddings(embeddings, n_components=n_components_carag)
    unique_labels_carag = set(labels_carag)
    print(f"Unique clusters identified within the theme {theme}: {unique_labels_carag}")

    # Ensure the selected claim is in the identified cluster
    selected_cluster_id_carag = None
    claim_text = None

    for index, row in themed_data.iterrows():
        unique_id = row['Claim_topic_id'].split('_')[-1]
        if f"Claim_{unique_id}" == selected_claim_id:
            selected_cluster_id_carag = labels_carag[index]
            claim_text = row['Claim_text']
            break

    if selected_cluster_id_carag is not None:
        print(f"The selected claim ({selected_claim_id}) belongs to cluster {selected_cluster_id_carag}")

        # 1. RAG-based retrieval and explanation (AS BASELINE)
        rag_evidence = rag_utils.retrieve_evidence(claim_text, n_docs, aggregated_embedding=None, alpha=1.0)
        rag_explanation = generate_llm_summary(claim_text, rag_evidence)
        print("\nRAG Explanation:\n", rag_explanation)
        
        # 2. CARAG-based retrieval and explanation
        carag_soi = soi_utils.compute_soi(selected_claim_id, themed_data, labels_carag, selected_cluster_id_carag, similarity_threshold)
        carag_soi_evidences = carag_soi['related_claims'] + carag_soi['annotated_evidences'] + carag_soi['thematic_cluster_evidences']

        # Compute aggregated embedding for CARAG SOI
        carag_aggregated_embedding = rag_utils.compute_aggregated_embedding([evidence for evidence, _ in carag_soi_evidences])
        
        # Retrieve evidence using CARAG's SOI-based embedding
        carag_evidence = rag_utils.retrieve_evidence(claim_text, n_docs, carag_aggregated_embedding, alpha=0.5)
        carag_explanation = generate_llm_summary(claim_text, carag_evidence)
        print("\nCARAG Explanation:\n", carag_explanation)
    else:
        print(f"Selected claim {selected_claim_id} is not part of any identified cluster in CARAG.")

else:
    print(f"No data found for the theme of claim {selected_claim_id}.")

# CARAG-U: Dataset-wide clustering
grouped_data = data_utils.get_full_data(selected_claim_id)

if not grouped_data.empty:
    # Get embeddings for dataset-wide clustering
    all_texts = [row['Claim_text'] for _, row in grouped_data.iterrows()]
    for _, row in grouped_data.iterrows():
        all_texts.extend(row['Evidence_text'])
    
    embeddings = model_utils.get_sent_embeddings(all_texts)
    
    # Apply GMM-EM clustering to the full dataset
    labels_carag_u = model_utils.cluster_embeddings(embeddings, n_components=n_components_carag_u)
    unique_labels_carag_u = set(labels_carag_u)
    print(f"Unique clusters identified in the dataset: {unique_labels_carag_u}")

    # Ensure the selected claim is in the identified cluster
    selected_cluster_id_carag_u = None

    for index, row in grouped_data.iterrows():
        unique_id = row['Claim_topic_id'].split('_')[-1]
        if f"Claim_{unique_id}" == selected_claim_id:
            selected_cluster_id_carag_u = labels_carag_u[index]
            break

    if selected_cluster_id_carag_u is not None:
        print(f"The selected claim ({selected_claim_id}) belongs to cluster {selected_cluster_id_carag_u}")

        # 3. CARAG-U-based retrieval and explanation
        carag_u_soi = soi_utils.compute_soi_carag_u(selected_claim_id, grouped_data, labels_carag_u, selected_cluster_id_carag_u, similarity_threshold)
        carag_u_soi_evidences = carag_u_soi['refined_cluster_evidences']

        # Compute aggregated embedding for CARAG-U SOI
        carag_u_aggregated_embedding = rag_utils.compute_aggregated_embedding([evidence for evidence, _ in carag_u_soi_evidences])
        
        # Retrieve evidence using CARAG-U's dataset-wide embedding
        carag_u_evidence = rag_utils.retrieve_evidence(claim_text, n_docs, carag_u_aggregated_embedding, alpha=0.5)
        carag_u_explanation = generate_llm_summary(claim_text, carag_u_evidence)
        print("\nCARAG_U Explanation:\n", carag_u_explanation)
    else:
        print(f"Selected claim {selected_claim_id} is not part of any identified cluster in CARAG-U.")

else:
    print("No data found in the dataset.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
INFO:root:Classification model loaded on CUDA

Please make sure the config includes `forced_bos_token_id=0` in future versions. The config can simply be saved and uploaded again to be fixed.

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The toke


 The selected Claim belongs to the theme: Climate


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Unique clusters identified within the theme Climate: {0, 1, 2}
The selected claim (Claim_59) belongs to cluster 1

RAG Explanation:
 Claim: The public is unconcerned about a climate emergency
Evidence: Failure will result in the country's once-successful car making industry being largely consigned to the scrap heap. They could drive back to Israel and get a ferry around the middle east to Qatar - but that is a two-week journey which would have meant missing Wales' first match. Shares fell as low as $6.50-apiece on Monday, down 97 percent from August 2021.BMW-branded cars, motorcycles, and Mini models sold since October 1 get the new warranty. Cho Tae-yong, ambassador of the Republic of Korea to the U.S., said Tuesday officials are discussing “several possible options” to correct what the country believes to be unfair policies that eliminated up to $7,500 of tax credits for EVs produced outside North America. White House press secretary Karine Jean-Pierre wrote Biden is ¡°asymptomatic, 

Batches:   0%|          | 0/3 [00:00<?, ?it/s]


CARAG Explanation:
 Claim: The public is unconcerned about a climate emergency
Evidence: Failure will result in the country's once-successful car making industry being largely consigned to the scrap heap. This greenhouse gas trading scheme forms part of the UK government's ambition to achieve net zero emissions by 2050.Global business is increasingly familiar with the prospect of short-notice public investigatory attention, whether from regulators, law enforcement, political forces or as a consequence of sanctions, and this can  in in some cases  devastate individual and corporate reputation. Cho Tae-yong, ambassador of the Republic of Korea to the U.S., said Tuesday officials are discussing “several possible options” to correct what the country believes to be unfair policies that eliminated up to $7,500 of tax credits for EVs produced outside North America. They could drive back to Israel and get a ferry around the middle east to Qatar - but that is a two-week journey which would hav

Batches:   0%|          | 0/116 [00:00<?, ?it/s]

Unique clusters identified in the dataset: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
The selected claim (Claim_59) belongs to cluster 8


Batches:   0%|          | 0/3 [00:00<?, ?it/s]


CARAG_U Explanation:
 Claim: The public is unconcerned about a climate emergency
Evidence: Failure will result in the country's once-successful car making industry being largely consigned to the scrap heap. This greenhouse gas trading scheme forms part of the UK government's ambition to achieve net zero emissions by 2050.Global business is increasingly familiar with the prospect of short-notice public investigatory attention, whether from regulators, law enforcement, political forces or as a consequence of sanctions, and this can  in in some cases  devastate individual and corporate reputation. Cho Tae-yong, ambassador of the Republic of Korea to the U.S., said Tuesday officials are discussing “several possible options” to correct what the country believes to be unfair policies that eliminated up to $7,500 of tax credits for EVs produced outside North America. They could drive back to Israel and get a ferry around the middle east to Qatar - but that is a two-week journey which would h

In [2]:
selected_claim_id = 'Claim_59'
# Load full grouped data and validate the claim ID
grouped_data = data_utils.get_full_data(selected_claim_id)

# Check if grouped data is available
if grouped_data.empty:
    print("No data found or the selected claim ID is invalid.")
else:
    # Prepare texts for clustering
    all_texts = []
    index_mapping = []  # Track indices for claims and evidences
    for index, row in grouped_data.iterrows():
        index_mapping.append((index, 'claim'))  # Mark as claim
        all_texts.append(row['Claim_text'])
        for i, evidence in enumerate(row['Evidence_text']):
            index_mapping.append((index, f'evidence_{i}'))  # Mark as evidence
            all_texts.append(evidence)

    # Generate embeddings
    embeddings = model_utils.get_sent_embeddings(all_texts)

    # Perform GMM-EM clustering
    cluster_labels = model_utils.perform_clustering_carag_u(embeddings,n_clusters=6)

    # Map cluster labels to grouped data
    cluster_mapping = {}
    for i, (row_index, text_type) in enumerate(index_mapping):
        if row_index not in cluster_mapping:
            cluster_mapping[row_index] = []
        cluster_mapping[row_index].append((text_type, cluster_labels[i]))

    # Identify the selected claim's cluster and text
    selected_cluster_id = None
    claim_text = None
    for index, row in grouped_data.iterrows():
        unique_id = row['Claim_topic_id'].split('_')[-1]
        if f"Claim_{unique_id}" == selected_claim_id:
            claim_clusters = cluster_mapping[index]
            selected_cluster_id = next(
                (label for label_type, label in claim_clusters if label_type == 'claim'), None
            )
            claim_text = row['Claim_text']
            print(f"Claim {selected_claim_id} assigned to cluster {selected_cluster_id} with text: {claim_text}")
            break

    if selected_cluster_id is None or claim_text is None:
        print(f"Claim {selected_claim_id} not found or not assigned to any cluster.")
    else:
        # Compute the Subset of Interest (SOI)
        soi = soi_utils.compute_soi_carag_u(
            selected_claim_id, grouped_data, cluster_labels, selected_cluster_id, similarity_threshold=0.75
        )
        print(f"SOI extracted for claim {selected_claim_id}:")
        print(f"- Related claims: {len(soi['related_claims'])}")
        print(f"- Cluster evidences: {len(soi['cluster_evidences'])}")

        # Compute aggregated embedding for the SOI
        aggregated_embedding = soi_utils.calculate_aggregate_embedding_carag_u(soi)
        print("Aggregated embedding computed successfully.")

Claim ID Claim_59 is valid and exists in the dataset.


Batches:   0%|          | 0/116 [00:00<?, ?it/s]

Clustering dataset using GMM-EM...
Clustering complete. 6 clusters identified.
Claim Claim_59 assigned to cluster 0 with text: The public is unconcerned about a climate emergency
SOI extracted with 65 related claims and 231 thematic evidences.
SOI extracted for claim Claim_59:
- Related claims: 65
- Cluster evidences: 231
Aggregated embedding computed successfully.


In [3]:
# Retrieve evidence using CARAG-U
print("Retrieving evidence using CARAG-U...")
carag_u_evidence = rag_utils.retrieve_evidence(
    claim=claim_text, 
    aggregated_embedding=aggregated_embedding,
    alpha=0.25
)
print(f"\nCARAG-U Retrieved Evidence:\n{carag_u_evidence}")

# Generate CARAG-U explanation
print("\nGenerating CARAG-U explanation...")
carag_u_explanation = generate_llm_summary(claim_text, carag_u_evidence)
print(f"\nCARAG-U Explanation:\n{carag_u_explanation}")



Retrieving evidence using CARAG-U...

CARAG-U Retrieved Evidence:
['with immunity levels against flu also lower in many people after two years of lower flu circulation Amazon initially invested $700 million in Rivian in 2019 and later introduced its first electric delivery van, designed and built with the EV startup. Such is the reality of some 5% of global Covid-19 survivors who have now developed long-lasting taste and smell problems, according to a 2022 study. Such is the reality of some 5% of global Covid-19 survivors who have now developed long-lasting taste and smell problems, according to a 2022 study. The renovated space ¡ª representing more than 150 years of Jacksonville¡¯s Black history ¡ª celebrated its grand reopening on Tuesday, Nov. 8, 2022. The renovated space ¡ª representing more than 150 years of Jacksonville¡¯s Black history ¡ª celebrated its grand reopening on Tuesday, Nov. 8, 2022.']

Generating CARAG-U explanation...

CARAG-U Explanation:
Claim: The public is uncon

# TRY 3D VISUALIZATIONS and DYNAMIC CLUSTERING