In [1]:
import evaluate
from sklearn.metrics.pairwise import cosine_similarity
import pprint
import json
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the evaluation records from JSON. Later cells read the "question",
# "answer", "LLM", "GT_NDCG" and "response_NDCG" keys of every record.
# The encoding is passed explicitly because open() otherwise decodes with the
# platform's locale encoding, which can mangle or reject non-ASCII text.
with open('results/QA_RESEARCH_Gemini.json', 'r', encoding='utf-8') as f:
    evaluation_data = json.load(f)

In [3]:
# Sentence-embedding model used for the cosine-similarity comparison
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Metric objects loaded through the `evaluate` library
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# One entry per evaluation record, appended in the order of evaluation_data
rouge_scores = []
bleu_scores = []
cosine_similarities = []

for data in evaluation_data:
    question, expected_answer, generated_answer = data["question"], data["answer"], data["LLM"]

    # Score the stored LLM answer against the reference answer.
    # ROUGE is given a flat list of references, BLEU a list of reference lists.
    rouge_result = rouge.compute(predictions=[generated_answer], references=[expected_answer])
    bleu_result = bleu.compute(predictions=[generated_answer], references=[[expected_answer]])

    # Embed both answers, then read the single value out of the pairwise result
    reference_embedding = embed_model.embed_query(expected_answer)
    generated_embedding = embed_model.embed_query(generated_answer)
    similarity_matrix = cosine_similarity([reference_embedding], [generated_embedding])
    cosine_sim = similarity_matrix[0][0]

    # Record all three scores only after every computation has succeeded
    rouge_scores.append(rouge_result)
    bleu_scores.append(bleu_result)
    cosine_similarities.append(cosine_sim)

  embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [4]:
# One dictionary per scored record; these become the rows of result_df
results = []

for data in evaluation_data:
    question, expected_answer = data["question"], data["answer"]
    generated_answer = data["LLM"]  # the stored LLM response is the answer being scored

    # Records whose response contains one of these marker phrases are not scored
    if any(marker in generated_answer for marker in ("User Mistake.", "ERROR: SAFETY Issue.")):
        continue

    # Text-overlap metrics between the generated and the expected answer
    rouge_result = rouge.compute(predictions=[generated_answer], references=[expected_answer])
    bleu_result = bleu.compute(predictions=[generated_answer], references=[[expected_answer]])

    # Embedding-based similarity between the two answers
    reference_embedding = embed_model.embed_query(expected_answer)
    generated_embedding = embed_model.embed_query(generated_answer)
    cosine_sim = cosine_similarity([reference_embedding], [generated_embedding])[0][0]

    results.append({
        "Question": question,
        "Generated Answer": generated_answer,
        "Expected Answer": expected_answer,
        "Cosine Similarity": cosine_sim,
        # ROUGE variants, one column each
        "ROUGE-1": rouge_result['rouge1'],
        "ROUGE-2": rouge_result['rouge2'],
        "ROUGE-L": rouge_result['rougeL'],
        "ROUGE-Lsum": rouge_result['rougeLsum'],
        # BLEU score followed by its components
        "BLEU": bleu_result['bleu'],
        # Precision_1 .. Precision_4; None when fewer precisions were returned
        **{
            f"Precision_{order + 1}": (
                bleu_result['precisions'][order] if len(bleu_result['precisions']) > order else None
            )
            for order in range(4)
        },
        "Brevity Penalty": bleu_result['brevity_penalty'],
        "Length Ratio": bleu_result['length_ratio'],
        "Translation Length": bleu_result['translation_length'],
        "Reference Length": bleu_result['reference_length'],
    })

# Tabular view of the per-record scores; the last expression displays the first rows
result_df = pd.DataFrame(results)
result_df.head()

Unnamed: 0,Question,Generated Answer,Expected Answer,Cosine Similarity,ROUGE-1,ROUGE-2,ROUGE-L,ROUGE-Lsum,BLEU,Precision_1,Precision_2,Precision_3,Precision_4,Brevity Penalty,Length Ratio,Translation Length,Reference Length
0,List the titles of papers authored by 'Jianmin...,Jianmin Chen authored the paper 'TensorFlow: A...,Jianmin Chen authored the paper 'TensorFlow: A...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16,16
1,What are the most cited papers in 'Physics'?,The most cited papers related to Physics are: ...,The most cited papers related to Physics are: ...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,120,120
2,Which authors have worked on the 'Network Scie...,The following authors have worked on the 'Netw...,The following authors have worked on the 'Netw...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,51,51
3,What venues have published papers in 'Environm...,PLoS ONE.,PLoS ONE has published papers in 'Environmenta...,0.189857,0.363636,0.222222,0.363636,0.363636,0.0,1.0,0.5,0.0,0.0,0.096972,0.3,3,10
4,How many papers authored by 'Roland Vollgraf'?,Roland Vollgraf authored 1 paper.,Roland Vollgraf authored 1 paper.,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6,6


In [5]:
# Print the average scores
print(f"Average ROUGE-1: {result_df['ROUGE-1'].mean()}")
print(f"Average ROUGE-2: {result_df['ROUGE-2'].mean()}")
print(f"Average ROUGE-L: {result_df['ROUGE-L'].mean()}")
print(f"Average ROUGE-Lsum: {result_df['ROUGE-Lsum'].mean()}")
print(f"Average BLEU: {result_df['BLEU'].mean()}")
print(f"Average Cosine Similarity: {result_df['Cosine Similarity'].mean()}")

Average ROUGE-1: 0.8008468932622385
Average ROUGE-2: 0.7698130380205725
Average ROUGE-L: 0.7697030804819576
Average ROUGE-Lsum: 0.7697030804819576
Average BLEU: 0.7216656980705826
Average Cosine Similarity: 0.8337880092016062


In [6]:
def dcg_at_k(relevance_scores, k):
    """
    Discounted Cumulative Gain (DCG) over the first k relevance scores.

    relevance_scores: relevance values in ranked order (binary or graded)
    k: rank cutoff; only relevance_scores[:k] contribute

    Each score contributes (2**score - 1) / log2(rank + 1), with ranks
    starting at 1. Returns 0. when no scores remain after the cutoff.
    """
    gains = np.asarray(relevance_scores, dtype=float)[:k]
    if not gains.size:
        return 0.

    # Ranks are 1-based, so the discount for rank r is log2(r + 1)
    ranks = np.arange(1, gains.size + 1)
    discounts = np.log2(ranks + 1)
    return np.sum((2**gains - 1) / discounts)

def ndcg_at_k(ground_truth, response, k):
    """
    Compute Normalized Discounted Cumulative Gain (NDCG) at rank k with binary relevance.

    ground_truth: list of expected (relevant) items
    response: ranked list of items returned by the LLM
    k: rank cutoff

    Returns DCG@k of `response` divided by the ideal DCG@k, or 0. when the
    ideal DCG is 0 (empty ground truth, or k <= 0).

    A relevant item is credited only the first time it appears in `response`,
    and duplicate ground-truth entries count once. Without this, a response
    that repeats one relevant item could score above 1, and repeated
    ground-truth entries would inflate the ideal DCG.
    """
    # Distinct relevant items, order preserved. List membership is used rather
    # than a set so that unhashable items keep working.
    relevant_items = []
    for item in ground_truth:
        if item not in relevant_items:
            relevant_items.append(item)

    # Binary relevance per response position: 1 for the first occurrence of a
    # relevant item, 0 for irrelevant items and for repeats.
    already_credited = []
    relevance_scores = []
    for item in response:
        is_new_hit = item in relevant_items and item not in already_credited
        if is_new_hit:
            already_credited.append(item)
        relevance_scores.append(1 if is_new_hit else 0)

    # DCG of the actual ranking (dcg_at_k applies the cutoff)
    dcg = dcg_at_k(relevance_scores, k)

    # The ideal ranking puts every distinct relevant item first, up to k of them
    ideal_relevance_scores = [1] * min(k, len(relevant_items))
    idcg = dcg_at_k(ideal_relevance_scores, k)

    # Guard against division by zero when there is nothing relevant to rank
    return dcg / idcg if idcg > 0 else 0.

In [7]:
# Mean NDCG over the evaluation records, reported once per rank cutoff
for k in (1, 3, 5, 10):
    NDCG_results = []
    for data in evaluation_data:
        ground_truth = data["GT_NDCG"]
        response = data["response_NDCG"]
        generated_answer = data["LLM"]  # stored LLM response, inspected for error markers

        # Records whose response contains an error marker are excluded from the mean
        is_error_response = (
            "User Mistake." in generated_answer
            or "ERROR: SAFETY Issue." in generated_answer
        )
        if not is_error_response:
            # NDCG of this record at the current cutoff k
            ndcg_score = ndcg_at_k(ground_truth, response, k)
            NDCG_results.append(ndcg_score)
    print("For k = ", k, "NDCG:", np.mean(NDCG_results))

For k =  1 NDCG: 0.8421052631578947
For k =  3 NDCG: 0.8421052631578947
For k =  5 NDCG: 0.8421052631578947
For k =  10 NDCG: 0.8421052631578947


In [8]:
def calculate_metrics_at_k(gt_list, response_list, k):
    """
    Compute set-based Precision@K, Recall@K, F1@K and Rate@K.

    gt_list: ground-truth items; only the first k are considered
    response_list: returned items; only the first k are considered
    k: rank cutoff

    Returns a tuple (precision_k, recall_k, f1_k, rate_k). Each metric is 0
    when its denominator would be zero. That covers an empty truncated list
    and, for Rate@K, a non-positive k, which previously raised
    ZeroDivisionError for k == 0.
    """
    # Limit both lists to the top K; sets drop duplicates within each slice
    gt_k = set(gt_list[:k])
    response_k = set(response_list[:k])

    # Items that are both expected and returned within the top K
    relevant_and_retrieved = gt_k.intersection(response_k)

    # Precision@K: share of the returned items that are relevant
    precision_k = len(relevant_and_retrieved) / len(response_k) if response_k else 0

    # Recall@K: share of the relevant items that were returned
    recall_k = len(relevant_and_retrieved) / len(gt_k) if gt_k else 0

    # F1@K: harmonic mean of precision and recall, 0 when both are 0
    if precision_k + recall_k == 0:
        f1_k = 0
    else:
        f1_k = 2 * (precision_k * recall_k) / (precision_k + recall_k)

    # Rate@K: hits divided by the cutoff itself, guarded like the
    # *_with_adjustment variant so k == 0 yields 0 instead of raising
    rate_k = len(relevant_and_retrieved) / k if k > 0 else 0

    return precision_k, recall_k, f1_k, rate_k

In [9]:
def calculate_metrics_at_k_with_adjustment(gt_list, response_list, k):
    """
    Precision@K, Recall@K, F1@K and Rate@K with the cutoff clamped to the data.

    gt_list: ground-truth items
    response_list: returned items
    k: requested rank cutoff; reduced to the shorter of the two lists when
       it exceeds either length

    Returns a tuple (precision_k, recall_k, f1_k, rate_k); a metric is 0
    whenever its denominator would be zero.
    """
    # Never look past the end of either list
    k = min(k, len(gt_list), len(response_list))

    truth_top = set(gt_list[:k])
    answer_top = set(response_list[:k])
    hits = len(truth_top.intersection(answer_top))

    # Precision and recall share the same numerator: the overlap size
    precision_k = hits / len(answer_top) if answer_top else 0
    recall_k = hits / len(truth_top) if truth_top else 0

    # Harmonic mean, defined as 0 when precision and recall are both 0
    pr_total = precision_k + recall_k
    f1_k = 2 * (precision_k * recall_k) / pr_total if pr_total != 0 else 0

    # Hits relative to the (adjusted) cutoff
    rate_k = hits / k if k > 0 else 0

    return precision_k, recall_k, f1_k, rate_k


In [10]:
# Size of the evaluation set loaded from the JSON file
num_responses = len(evaluation_data)
print("Number of responses:", num_responses)

Number of responses: 19


In [11]:
# Mean Precision/Recall/F1/Rate over all evaluation records, reported once per
# rank cutoff. The cutoffs are the values in the loop below; a standalone
# `k = 5` assignment was dropped because the loop overwrote it immediately.
for k in [1, 3, 5, 10]:
    precision_scores = []
    recall_scores = []
    f1_scores = []
    rate_scores = []
    for data in evaluation_data:
        ground_truth = data["GT_NDCG"]
        response = data["response_NDCG"]

        # Per-record metrics at the current cutoff
        precision_k, recall_k, f1_k, rate_k = calculate_metrics_at_k(ground_truth, response, k)
        precision_scores.append(precision_k)
        recall_scores.append(recall_k)
        f1_scores.append(f1_k)
        rate_scores.append(rate_k)
    print("For k = ", k, "Precision:", np.mean(precision_scores))
    print("For k = ", k, "Recall:", np.mean(recall_scores))
    print("For k = ", k, "F1:", np.mean(f1_scores))
    print("For k = ", k, "Rate:", np.mean(rate_scores))
    print("--------------------------------------------------")


For k =  1 Precision: 0.8421052631578947
For k =  1 Recall: 0.8421052631578947
For k =  1 F1: 0.8421052631578947
For k =  1 Rate: 0.8421052631578947
--------------------------------------------------
For k =  3 Precision: 0.8421052631578947
For k =  3 Recall: 0.8421052631578947
For k =  3 F1: 0.8421052631578947
For k =  3 Rate: 0.5789473684210527
--------------------------------------------------
For k =  5 Precision: 0.8421052631578947
For k =  5 Recall: 0.8421052631578947
For k =  5 F1: 0.8421052631578947
For k =  5 Rate: 0.5157894736842105
--------------------------------------------------
For k =  10 Precision: 0.8421052631578947
For k =  10 Recall: 0.8421052631578947
For k =  10 F1: 0.8421052631578947
For k =  10 Rate: 0.4052631578947368
--------------------------------------------------
