1. Jaccard Index: Measures the overlap between two sets of unique tokens. It gives a score based on how many words are shared between the two texts relative to the union of their words.
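2. Dice Coefficient: Also measures the overlap between two sets of unique tokens, but doubles the size of the intersection and divides it by the sum of the two set sizes, so shared words carry more weight than in the Jaccard Index.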
3. Cosine Similarity: Converts the text into vectorized representations (using word counts in this example) and calculates the cosine of the angle between the two vectors. It considers the frequency of words, which is useful for capturing subtle differences in text.

In [44]:
import os
jp = os.path.join
import sys
import datetime
import json
T_now = datetime.datetime.now
from openai import OpenAI

import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd

# Function to calculate Jaccard Index
def jaccard_index(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    Args:
        set1 (set): First set of tokens.
        set2 (set): Second set of tokens.

    Returns:
        float: Overlap score between 0.0 and 1.0. When both sets are empty
        the union is empty and the ratio is undefined; 0.0 is returned in
        that case instead of raising ZeroDivisionError.
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: nothing to compare, so report "no overlap".
        return 0.0
    intersection = len(set1.intersection(set2))
    return intersection / union

# Function to calculate Dice Coefficient
def dice_coefficient(set1, set2):
    """Return the Dice coefficient of two sets: 2 * |intersection| / (|A| + |B|).

    Args:
        set1 (set): First set of tokens.
        set2 (set): Second set of tokens.

    Returns:
        float: Overlap score between 0.0 and 1.0. When both sets are empty
        the denominator is zero; 0.0 is returned in that case instead of
        raising ZeroDivisionError.
    """
    total_size = len(set1) + len(set2)
    if total_size == 0:
        # Both sets are empty: nothing to compare, so report "no overlap".
        return 0.0
    intersection = len(set1.intersection(set2))
    return 2 * intersection / total_size

# Function to calculate Cosine Similarity
def cosine_similarity_custom(text1, text2):
    """Return the cosine similarity between the word-count vectors of two texts.

    A fresh CountVectorizer is fitted on the two texts together, the resulting
    count matrix is converted to a dense array, and the off-diagonal entry of
    the pairwise cosine-similarity matrix (row 0 vs. row 1) is returned.
    """
    count_matrix = CountVectorizer().fit_transform([text1, text2])
    dense_counts = count_matrix.toarray()
    pairwise = cosine_similarity(dense_counts)
    # Entry [0, 1] compares text1 (row 0) with text2 (row 1).
    return pairwise[0, 1]

Load data

In [13]:
def load_squad_data(url="https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
                    max_articles=21, timeout=30):
    """Download a SQuAD-format JSON file and return (question, answer) pairs.

    Args:
        url (str): Location of the SQuAD JSON file.
        max_articles (int | None): Number of leading articles to read.
            The default of 21 matches the original hard-coded cut-off;
            pass None to read every article.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list[tuple[str, str]]: One (question, first answer text) tuple per
        question in the selected articles. Questions without any answer
        are skipped.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    print(f"Number of records: {len(data['data']):,}")
    questions_answers = []
    for article in data['data'][:max_articles]:
        for paragraph in article['paragraphs']:
            for qas in paragraph['qas']:
                answers = qas['answers']
                if not answers:
                    # No reference answer to compare against.
                    continue
                # Take the first answer as the reference.
                questions_answers.append((qas['question'], answers[0]['text']))
    return questions_answers

In [14]:
qa = load_squad_data()

Number of records: 442


In [16]:
qa[:10]

[('To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
  'Saint Bernadette Soubirous'),
 ('What is in front of the Notre Dame Main Building?',
  'a copper statue of Christ'),
 ('The Basilica of the Sacred heart at Notre Dame is beside to which structure?',
  'the Main Building'),
 ('What is the Grotto at Notre Dame?',
  'a Marian place of prayer and reflection'),
 ('What sits on top of the Main Building at Notre Dame?',
  'a golden statue of the Virgin Mary'),
 ('When did the Scholastic Magazine of Notre dame begin publishing?',
  'September 1876'),
 ("How often is Notre Dame's the Juggler published?", 'twice'),
 ('What is the daily student paper at Notre Dame called?', 'The Observer'),
 ('How many student news papers are found at Notre Dame?', 'three'),
 ('In what year did the student paper Common Sense begin publication at Notre Dame?',
  '1987')]

In [22]:
# Read the OpenAI API key from a file under ~/.secrets. The context manager
# closes the file handle as soon as the key has been read.
with open(jp(os.path.expanduser("~"), ".secrets", "openai_pmolnar_gsu_edu_msa8700.apikey"), "r") as key_file:
    openai_api_key = key_file.read().strip()
# Export the key through the environment as well as passing it to the client.
os.environ["OPENAI_API_KEY"] = openai_api_key
client = OpenAI(api_key = openai_api_key)

In [45]:
def query_llm(openai_client, question, *, model="gpt-4o-mini", max_tokens=100, temperature=0):
    """Ask a chat model a single question and return its answer text.

    Args:
        openai_client: Client object exposing ``chat.completions.create``.
        question (str): The question to send to the model.
        model (str): Name of the chat model to query.
        max_tokens (int): Upper bound on the length of the generated answer.
        temperature (float): Sampling temperature passed to the API.

    Returns:
        str: The first choice's message content with surrounding whitespace
        removed, or an empty string when the response carries no content.
    """
    completion = openai_client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"Answer the following question:\n\n{question}\n"
            }
        ]
    )
    content = completion.choices[0].message.content
    # The message content can be None (no text returned); calling .strip()
    # on it would raise AttributeError, so treat it as an empty answer.
    if content is None:
        return ""
    return content.strip()

In [37]:
query_llm(client, "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?")

'The Virgin Mary allegedly appeared to a young girl named Bernadette Soubirous in 1858 in Lourdes, France.'

In [40]:
def evaluate_llm_on_qa(qa_data, openai_client):
    """Query the LLM for every question and score its answer against the reference.

    Args:
        qa_data: Iterable of (question, reference_answer) pairs.
        openai_client: Client object forwarded to ``query_llm``.

    Returns:
        list[dict]: One record per successfully answered question holding the
        question, both answers, and the Jaccard, Dice and cosine scores.
        Questions whose LLM query raised are reported on stdout and skipped.
    """
    records = []

    for question, reference_answer in qa_data:
        # A failed query is reported and the question is left out of the results.
        try:
            llm_answer = query_llm(openai_client, question)
        except Exception as e:
            print(f"Error querying LLM: {e}")
            continue

        # The set-based metrics work on lower-cased, whitespace-separated words.
        reference_words = set(reference_answer.lower().split())
        answer_words = set(llm_answer.lower().split())

        record = {
            "Question": question,
            "Reference Answer": reference_answer,
            "LLM Answer": llm_answer,
            "Jaccard Index": jaccard_index(reference_words, answer_words),
            "Dice Coefficient": dice_coefficient(reference_words, answer_words),
            # Cosine similarity is computed on the raw strings, not the word sets.
            "Cosine Similarity": cosine_similarity_custom(reference_answer, llm_answer),
        }
        records.append(record)

    return records

In [None]:
# Time the full evaluation run over all loaded question/answer pairs.
t_0 = T_now()
stats = evaluate_llm_on_qa(qa, client)
evaluation_df = pd.DataFrame.from_records(stats)
# Use the same lower-case name that was assigned above; "T_0" was undefined
# and raised NameError after the (slow) evaluation had already finished.
print(f"Elapsed time: {T_now()-t_0}")

# Persist the table without the row index, then preview the first rows.
evaluation_df.to_csv("token_based_metrics_example_table.csv", index=False)
display(evaluation_df.head())

# ROUGE (Recall-Oriented Understudy for Gisting Evaluation).

This example computes ROUGE-1 (unigrams), ROUGE-2 (bigrams), and ROUGE-L (longest common subsequence).

It calculates precision, recall, and F1-score for given reference and candidate texts.

In [49]:
from collections import Counter

# Function to calculate n-grams
def get_ngrams(text, n=1):
    """Count the word n-grams of ``text``.

    The text is lower-cased and split on whitespace; every run of ``n``
    consecutive words becomes a tuple key in the returned Counter.
    """
    words = text.lower().split()
    counts = Counter()
    # Zipping the word list against itself shifted by 0..n-1 positions
    # yields each window of n consecutive words exactly once.
    for gram in zip(*(words[start:] for start in range(n))):
        counts[gram] += 1
    return counts

# ROUGE-1 and ROUGE-2 implementation
def rouge_n(reference, candidate, n=1):
    """
    Score a candidate text against a reference using n-gram overlap (ROUGE-N).

    Args:
        reference (str): The reference text.
        candidate (str): The candidate text generated by the model.
        n (int): The size of n-grams (1 for ROUGE-1, 2 for ROUGE-2).

    Returns:
        dict: Keys "recall", "precision" and "f1_score". Each value is 0.0
        when its denominator would be zero.
    """
    reference_counts = get_ngrams(reference, n)
    candidate_counts = get_ngrams(candidate, n)

    # Counter intersection keeps, per n-gram, the smaller of the two counts.
    matched = sum((reference_counts & candidate_counts).values())
    reference_total = sum(reference_counts.values())
    candidate_total = sum(candidate_counts.values())

    recall = 0.0
    if reference_total > 0:
        recall = matched / reference_total

    precision = 0.0
    if candidate_total > 0:
        precision = matched / candidate_total

    f1_score = 0.0
    if (precision + recall) > 0:
        f1_score = 2 * ((precision * recall) / (precision + recall))

    return dict(recall=recall, precision=precision, f1_score=f1_score)

# Longest Common Subsequence (LCS) calculation
def lcs(X, Y):
    """
    Return the length of the Longest Common Subsequence of two sequences.

    Only the previous and the current row of the dynamic-programming table
    are kept, since each cell depends on nothing older than that.
    """
    previous_row = [0] * (len(Y) + 1)
    for x_item in X:
        current_row = [0]
        for col, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                # Matching items extend the best subsequence ending before both.
                current_row.append(previous_row[col - 1] + 1)
            else:
                # Otherwise carry over the better result of dropping one item.
                current_row.append(max(previous_row[col], current_row[col - 1]))
        previous_row = current_row
    return previous_row[-1]

# ROUGE-L implementation
def rouge_l(reference, candidate):
    """
    Score a candidate text against a reference using the longest common
    subsequence of their lower-cased, whitespace-separated words (ROUGE-L).

    Args:
        reference (str): The reference text.
        candidate (str): The candidate text generated by the model.

    Returns:
        dict: Keys "recall", "precision" and "f1_score". Each value is 0.0
        when its denominator would be zero.
    """
    reference_words = reference.lower().split()
    candidate_words = candidate.lower().split()
    shared_length = lcs(reference_words, candidate_words)

    recall = 0.0
    if len(reference_words) > 0:
        recall = shared_length / len(reference_words)

    precision = 0.0
    if len(candidate_words) > 0:
        precision = shared_length / len(candidate_words)

    f1_score = 0.0
    if (precision + recall) > 0:
        f1_score = 2 * ((precision * recall) / (precision + recall))

    return dict(recall=recall, precision=precision, f1_score=f1_score)


#     # Example reference and candidate texts
#     reference_text = "The cat sat on the mat and looked at the door."
#     candidate_text = "The cat sat on the mat by the door."

#     # Compute ROUGE-1, ROUGE-2, and ROUGE-L
#     rouge1 = rouge_n(reference_text, candidate_text, n=1)  # ROUGE-1 (unigrams)
#     rouge2 = rouge_n(reference_text, candidate_text, n=2)  # ROUGE-2 (bigrams)
#     rouge_l_result = rouge_l(reference_text, candidate_text)  # ROUGE-L (LCS)

#     # Print results
#     print("ROUGE-1:", rouge1)
#     print("ROUGE-2:", rouge2)
#     print("ROUGE-L:", rouge_l_result)


In [47]:
evaluation_df.head()

Unnamed: 0,Question,Reference Answer,LLM Answer,Jaccard Index,Dice Coefficient,Cosine Similarity
0,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,The Virgin Mary allegedly appeared to a young ...,0.117647,0.210526,0.272166
1,What is in front of the Notre Dame Main Building?,a copper statue of Christ,"In front of the Notre Dame Main Building, you ...",0.066667,0.125,0.19118
2,The Basilica of the Sacred heart at Notre Dame...,the Main Building,The Basilica of the Sacred Heart at Notre Dame...,0.045455,0.086957,0.35007
3,What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,"The Grotto at Notre Dame, officially known as ...",0.056604,0.107143,0.223782
4,What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,On top of the Main Building at the University ...,0.15625,0.27027,0.568711


## Calculate ROUGE Metrics

In [59]:
# Score every row with ROUGE-L: apply(..., axis=1) calls rouge_l once per row
# on the reference and LLM answers, and the per-row result dicts are assembled
# into a DataFrame. Each column name then gets a "RougeL_" prefix
# (RougeL_recall, RougeL_precision, RougeL_f1_score).
rouge_l_df = pd.DataFrame.from_records(
    evaluation_df.apply(
        lambda row: rouge_l(row['Reference Answer'], row['LLM Answer']),
        axis=1)
).rename(lambda c: f"RougeL_{c}", axis=1)

In [60]:
rouge_l_df

Unnamed: 0,RougeL_recall,RougeL_precision,RougeL_f1_score
0,0.666667,0.117647,0.2
1,0.6,0.047619,0.088235
2,0.333333,0.032258,0.058824
3,0.428571,0.038961,0.071429
4,0.571429,0.093023,0.16
5,0.0,0.0,0.0
6,1.0,0.111111,0.2
7,0.5,0.090909,0.153846
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [61]:
pd.concat([evaluation_df, rouge_l_df], axis=1)

Unnamed: 0,Question,Reference Answer,LLM Answer,Jaccard Index,Dice Coefficient,Cosine Similarity,RougeL_recall,RougeL_precision,RougeL_f1_score
0,To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,The Virgin Mary allegedly appeared to a young ...,0.117647,0.210526,0.272166,0.666667,0.117647,0.2
1,What is in front of the Notre Dame Main Building?,a copper statue of Christ,"In front of the Notre Dame Main Building, you ...",0.066667,0.125,0.19118,0.6,0.047619,0.088235
2,The Basilica of the Sacred heart at Notre Dame...,the Main Building,The Basilica of the Sacred Heart at Notre Dame...,0.045455,0.086957,0.35007,0.333333,0.032258,0.058824
3,What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,"The Grotto at Notre Dame, officially known as ...",0.056604,0.107143,0.223782,0.428571,0.038961,0.071429
4,What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,On top of the Main Building at the University ...,0.15625,0.27027,0.568711,0.571429,0.093023,0.16
5,When did the Scholastic Magazine of Notre dame...,September 1876,The Scholastic Magazine of Notre Dame began pu...,0.0,0.0,0.0,0.0,0.0,0.0
6,How often is Notre Dame's the Juggler published?,twice,"Notre Dame's ""The Juggler"" is published twice ...",0.111111,0.2,0.353553,1.0,0.111111,0.2
7,What is the daily student paper at Notre Dame ...,The Observer,The daily student paper at Notre Dame is calle...,0.083333,0.153846,0.588348,0.5,0.090909,0.153846
8,How many student news papers are found at Notr...,three,"At the University of Notre Dame, there are sev...",0.0,0.0,0.0,0.0,0.0,0.0
9,In what year did the student paper Common Sens...,1987,The student paper Common Sense began publicati...,0.0,0.0,0.0,0.0,0.0,0.0


Also compute ROUGE-1 and ROUGE-2.

In [62]:
# Build one DataFrame per n-gram size: n=1 gives ROUGE-1, n=2 gives ROUGE-2.
# Each frame holds the dicts returned by rouge_n for every row, and its
# columns are prefixed "Rouge1_" / "Rouge2_" so the two sets of metrics keep
# distinct names when the frames are concatenated side by side.
rouge_N_list = [
    pd.DataFrame.from_records(
        evaluation_df.apply(
            lambda row: rouge_n(row['Reference Answer'], row['LLM Answer'], n),
            axis=1)
    ).rename(lambda c: f"Rouge{n}_{c}", axis=1)
    for n in (1, 2)
]

In [64]:
evaluation_df2 = pd.concat([evaluation_df, rouge_l_df] + rouge_N_list, axis=1)
print(f"Shape of Evaluation Table: {evaluation_df2.shape}")

Shape of Evaluation Table: (10, 15)
