## Accuracy metrics (BLEU, ROUGE, METEOR, BERTScore)

### Concatenate triples from six reports

In [None]:
# CONFIGURATION 
import os
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sacrebleu.metrics import BLEU
from rouge_score import rouge_scorer
from evaluate import load  # Import the evaluate library for METEOR
from bert_score import score
import re
from collections import Counter

# Resolve the project root two directory levels above the anchor path and
# derive the data folders from it.
# NOTE(review): when `__file__` is undefined (e.g. in a notebook kernel) the
# anchor is the working directory itself rather than a file inside it, so the
# root lands one level higher than in the script case -- confirm this matches
# the intended layout.
_anchor = os.path.abspath(__file__ if '__file__' in globals() else '.')
BASE_DIR = os.path.dirname(os.path.dirname(_anchor))
EVAL_DIR, TRIPLE_DIR, REPORTS_DIR = (
    os.path.join(BASE_DIR, folder)
    for folder in ("eval", "Triple_preprocessing", "reports")
)

# Echo the resolved locations so a wrong root is noticed immediately.
for _label, _location in (
    ("Base directory", BASE_DIR),
    ("Evaluation directory", EVAL_DIR),
    ("Triple preprocessing directory", TRIPLE_DIR),
    ("Reports directory", REPORTS_DIR),
):
    print(f"{_label}: {_location}")

In [None]:
# Load the cleaned triple table of each of the six reports (for vicuna, llama,
# mistral, GPT-4o, llama3-70b) and stack them into one table.
_report_folders = (
    "2023-Cambodia-Art7Report-for2022",
    "Annual report 2023",
    "CAMBODIA_CLEARING_CMR_2023",
    "Cambodia_Clearing_the_Mines_2023",
    "IWP-2023",
    "IWP-2024",
)
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(os.path.join(TRIPLE_DIR, folder, "cleaned_output.csv"))
    for folder in _report_folders
)

# Row-wise concatenation; each report keeps its own original row index.
df_all_triples = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)

In [None]:
# Persist the combined triples inside the triple-preprocessing folder; the
# row index is not written to the file.
output_file = os.path.join(TRIPLE_DIR, "df_all_triples.csv")
df_all_triples.to_csv(path_or_buf=output_file, index=False)

In [None]:
# Read the annotation table and the A1 triple table.
annotation_df = pd.read_csv(os.path.join(EVAL_DIR, "Annotations.csv"))
all_triples_A1_df = pd.read_csv(os.path.join(TRIPLE_DIR, "All_triples_A1.csv"))

# Keep only the A1 rows whose 'Prompts' value also occurs in the annotation
# table, then attach the matching 'Annotation' text to each kept row.
_has_annotation = all_triples_A1_df['Prompts'].isin(annotation_df['Prompts'])
evalute_A1_df = all_triples_A1_df[_has_annotation].merge(
    annotation_df[['Prompts', 'Annotation']],
    on='Prompts',
    how='left',
)

# Show the matched rows.
evalute_A1_df

In [None]:
df_all_triples

### Preprocessing with stemming and lemmatizing

In [None]:
# The four report tables share one naming scheme inside the reports folder;
# the no-demo table sits directly under the base directory.
RP_df, RS_df, OP_df, OS_df = (
    pd.read_csv(os.path.join(REPORTS_DIR, f"{prefix}_cleaned_output.csv"))
    for prefix in ("RP", "RS", "OP", "OS")
)
no_demo_df = pd.read_csv(os.path.join(BASE_DIR, "no_demo_cleaned.csv"))

In [None]:
no_demo_df['llama3-70b'][0]

In [None]:
# Download required NLTK data.
# 'punkt' serves older NLTK releases; recent releases make word_tokenize look
# up the 'punkt_tab' resource instead, so both are fetched to keep the
# tokenizer working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Initialize stemmer and lemmatizer (shared by preprocess_text)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function to stem and lemmatize text, while preserving newlines
def preprocess_text(text, use_stemming=True):
    """Stem or lemmatize every token of ``text`` while keeping its line breaks.

    Args:
        text: The text to process. Values that are not ``str`` (for example
            the NaN that pandas yields for an empty CSV cell, or ``None``)
            are returned unchanged instead of raising.
        use_stemming: When true each token goes through the module-level
            ``stemmer``; otherwise through the module-level ``lemmatizer``.

    Returns:
        The processed text: the tokens of each input line joined by single
        spaces, and the lines joined by ``'\\n'``.
    """
    # Missing cells are not strings and have no lines to tokenize.
    if not isinstance(text, str):
        return text

    processed_lines = []
    # Work line by line so the newline structure survives tokenization.
    for line in text.splitlines():
        tokens = word_tokenize(line)
        if use_stemming:
            tokens = [stemmer.stem(token) for token in tokens]
        else:
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
        processed_lines.append(' '.join(tokens))

    return '\n'.join(processed_lines)

# Text columns to preprocess: the five model outputs plus the annotation.
columns_to_process = ['mistral-7b', 'llama3-8b', 'gemma2-9b', 'llama3-70b', 'GPT-4o', 'Annotation']

# Overwrite each listed column of every result table with its stemmed text
# (switch use_stemming to False to lemmatize instead); newlines are preserved
# by preprocess_text.
for column in columns_to_process:
    for _frame in (RP_df, RS_df, OP_df, OS_df, no_demo_df):
        _frame[column] = _frame[column].apply(lambda x: preprocess_text(x, use_stemming=True))


In [None]:
RP_df

### Annotation_based Accuracy Eval

In [None]:
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sacrebleu.metrics import BLEU
from rouge_score import rouge_scorer
from evaluate import load  # Import the evaluate library for METEOR
from bert_score import score
import pandas as pd
import re
from collections import Counter

# Scorer instances shared by calculate_detailed_metrics.
bleu_scorer = BLEU(effective_order=True)  # Enable effective_order for sentence-level BLEU
# NOTE: this rebinds the name `rouge_scorer` from the imported module to a
# RougeScorer instance, so the `from rouge_score import rouge_scorer` import
# has to run again before this line can be executed a second time.
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
meteor_scorer = load('meteor')

# Function to calculate metrics for each model against the Annotation column
def calculate_detailed_metrics(model_column, reference_column, df=None):
    """Score one model column against a reference column, row by row.

    Args:
        model_column: Name of the column holding the model output.
        reference_column: Name of the column holding the reference text.
        df: DataFrame to evaluate. Defaults to the module-level
            ``no_demo_df`` so existing two-argument calls keep working.

    Returns:
        A DataFrame with one row per input row and the columns BLEU, 1-gram,
        2-gram, 3-gram, hyp_len, ref_len, ROUGE-1, ROUGE-2, ROUGE-L, METEOR
        and BERTScore F1.
    """
    if df is None:
        df = no_demo_df

    # Cast to str so that non-string cells (e.g. NaN) can still be scored.
    candidates = [str(value) for value in df[model_column]]
    references = [str(value) for value in df[reference_column]]

    bleu_detailed_scores = {
        "BLEU": [],
        "1-gram": [],
        "2-gram": [],
        "3-gram": [],
        "hyp_len": [],
        "ref_len": []
    }
    meteor_scores = []
    rouge_detailed_scores = {
        "ROUGE-1": [], "ROUGE-2": [], "ROUGE-L": []
    }

    for candidate, reference in zip(candidates, references):
        # BLEU score (SacreBLEU) with detailed components
        bleu = bleu_scorer.sentence_score(candidate, [reference])
        bleu_detailed_scores["BLEU"].append(bleu.score)
        bleu_detailed_scores["1-gram"].append(bleu.precisions[0])
        bleu_detailed_scores["2-gram"].append(bleu.precisions[1])
        bleu_detailed_scores["3-gram"].append(bleu.precisions[2])
        bleu_detailed_scores["hyp_len"].append(bleu.sys_len)
        bleu_detailed_scores["ref_len"].append(bleu.ref_len)

        # METEOR score using the evaluate library
        meteor_result = meteor_scorer.compute(predictions=[candidate], references=[reference])
        meteor_scores.append(meteor_result['meteor'])

        # ROUGE scores (F-measure of each variant)
        rouge_result = rouge_scorer.score(reference, candidate)
        rouge_detailed_scores["ROUGE-1"].append(rouge_result['rouge1'].fmeasure)
        rouge_detailed_scores["ROUGE-2"].append(rouge_result['rouge2'].fmeasure)
        rouge_detailed_scores["ROUGE-L"].append(rouge_result['rougeL'].fmeasure)

    # BERTScore: a single batched call over all pairs replaces one call per
    # row; each pair still receives its own F1 value, in input order.
    if candidates:
        _, _, f1 = score(candidates, references, lang="en", verbose=False)
        bert_f1_scores = f1.tolist()
    else:
        bert_f1_scores = []

    # Create DataFrame to store all scores
    metrics_df = pd.DataFrame({
        **bleu_detailed_scores,
        **rouge_detailed_scores,
        "METEOR": meteor_scores,
        "BERTScore F1": bert_f1_scores
    })

    return metrics_df

# Score every model column against the 'Annotation' column, in a fixed order.
_metrics_by_model = {
    model: calculate_detailed_metrics(model, 'Annotation')
    for model in ('mistral-7b', 'llama3-8b', 'gemma2-9b', 'llama3-70b', 'GPT-4o')
}
metrics_mistral_7b_df = _metrics_by_model['mistral-7b']
metrics_llama3_8b_df = _metrics_by_model['llama3-8b']
metrics_gemma2_9b_df = _metrics_by_model['gemma2-9b']
metrics_llama3_70b_df = _metrics_by_model['llama3-70b']
metrics_GPT_4o_df = _metrics_by_model['GPT-4o']

# Column-wise mean of each per-row metrics table.
average_mistral_7b_df = metrics_mistral_7b_df.mean()
average_llama3_8b_df = metrics_llama3_8b_df.mean()
average_gemma2_9b_df = metrics_gemma2_9b_df.mean()
average_llama3_70b_df = metrics_llama3_70b_df.mean()
average_GPT_4o_df = metrics_GPT_4o_df.mean()

In [None]:
average_mistral_7b_df

In [None]:
average_llama3_8b_df

In [None]:
average_gemma2_9b_df

In [None]:
average_llama3_70b_df

In [None]:
average_GPT_4o_df

In [None]:
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sacrebleu.metrics import BLEU
from rouge_score import rouge_scorer
from evaluate import load  # Import the evaluate library for METEOR
from bert_score import score
import pandas as pd
import re
from collections import Counter

# Scorer instances shared by calculate_detailed_metrics.
bleu_scorer = BLEU(effective_order=True)  # Enable effective_order for sentence-level BLEU
# NOTE: this rebinds the name `rouge_scorer` from the imported module to a
# RougeScorer instance, so the `from rouge_score import rouge_scorer` import
# has to run again before this line can be executed a second time.
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
meteor_scorer = load('meteor')

# Function to calculate metrics for each model against the Annotation column
def calculate_detailed_metrics(model_column, reference_column, df=None):
    """Score one model column against a reference column, row by row.

    Args:
        model_column: Name of the column holding the model output.
        reference_column: Name of the column holding the reference text.
        df: DataFrame to evaluate. Defaults to the module-level ``OS_df`` so
            existing two-argument calls keep working.

    Returns:
        A DataFrame with one row per input row and the columns BLEU, 1-gram,
        2-gram, 3-gram, hyp_len, ref_len, ROUGE-1, ROUGE-2, ROUGE-L, METEOR
        and BERTScore F1.
    """
    if df is None:
        df = OS_df

    # Cast to str so that non-string cells (e.g. NaN) can still be scored.
    candidates = [str(value) for value in df[model_column]]
    references = [str(value) for value in df[reference_column]]

    bleu_detailed_scores = {
        "BLEU": [],
        "1-gram": [],
        "2-gram": [],
        "3-gram": [],
        "hyp_len": [],
        "ref_len": []
    }
    meteor_scores = []
    rouge_detailed_scores = {
        "ROUGE-1": [], "ROUGE-2": [], "ROUGE-L": []
    }

    for candidate, reference in zip(candidates, references):
        # BLEU score (SacreBLEU) with detailed components
        bleu = bleu_scorer.sentence_score(candidate, [reference])
        bleu_detailed_scores["BLEU"].append(bleu.score)
        bleu_detailed_scores["1-gram"].append(bleu.precisions[0])
        bleu_detailed_scores["2-gram"].append(bleu.precisions[1])
        bleu_detailed_scores["3-gram"].append(bleu.precisions[2])
        bleu_detailed_scores["hyp_len"].append(bleu.sys_len)
        bleu_detailed_scores["ref_len"].append(bleu.ref_len)

        # METEOR score using the evaluate library
        meteor_result = meteor_scorer.compute(predictions=[candidate], references=[reference])
        meteor_scores.append(meteor_result['meteor'])

        # ROUGE scores (F-measure of each variant)
        rouge_result = rouge_scorer.score(reference, candidate)
        rouge_detailed_scores["ROUGE-1"].append(rouge_result['rouge1'].fmeasure)
        rouge_detailed_scores["ROUGE-2"].append(rouge_result['rouge2'].fmeasure)
        rouge_detailed_scores["ROUGE-L"].append(rouge_result['rougeL'].fmeasure)

    # BERTScore: a single batched call over all pairs replaces one call per
    # row; each pair still receives its own F1 value, in input order.
    if candidates:
        _, _, f1 = score(candidates, references, lang="en", verbose=False)
        bert_f1_scores = f1.tolist()
    else:
        bert_f1_scores = []

    # Create DataFrame to store all scores
    metrics_df = pd.DataFrame({
        **bleu_detailed_scores,
        **rouge_detailed_scores,
        "METEOR": meteor_scores,
        "BERTScore F1": bert_f1_scores
    })

    return metrics_df

# Score every model column against the 'Annotation' column, in a fixed order.
_metrics_by_model = {
    model: calculate_detailed_metrics(model, 'Annotation')
    for model in ('mistral-7b', 'llama3-8b', 'gemma2-9b', 'llama3-70b', 'GPT-4o')
}
metrics_mistral_7b_df = _metrics_by_model['mistral-7b']
metrics_llama3_8b_df = _metrics_by_model['llama3-8b']
metrics_gemma2_9b_df = _metrics_by_model['gemma2-9b']
metrics_llama3_70b_df = _metrics_by_model['llama3-70b']
metrics_GPT_4o_df = _metrics_by_model['GPT-4o']

# Column-wise mean of each per-row metrics table.
average_mistral_7b_df = metrics_mistral_7b_df.mean()
average_llama3_8b_df = metrics_llama3_8b_df.mean()
average_gemma2_9b_df = metrics_gemma2_9b_df.mean()
average_llama3_70b_df = metrics_llama3_70b_df.mean()
average_GPT_4o_df = metrics_GPT_4o_df.mean()

In [None]:
average_mistral_7b_df

In [None]:
average_llama3_8b_df

In [None]:
average_gemma2_9b_df

In [None]:
average_llama3_70b_df

In [None]:
average_GPT_4o_df

### OC, Hallucinate Eval (hard-coded)

#### OC

In [None]:
# Reload the untouched tables from disk. The path expressions were previously
# wrapped in a stray quote, which made these lines unterminated string
# literals (or, for the last one, a literal file name that does not exist).
RP_df = pd.read_csv(os.path.join(REPORTS_DIR, "RP_cleaned_output.csv"))
RS_df = pd.read_csv(os.path.join(REPORTS_DIR, "RS_cleaned_output.csv"))
OP_df = pd.read_csv(os.path.join(REPORTS_DIR, "OP_cleaned_output.csv"))
OS_df = pd.read_csv(os.path.join(REPORTS_DIR, "OS_cleaned_output.csv"))
no_demo_df = pd.read_csv(os.path.join(BASE_DIR, "no_demo_cleaned.csv"))

In [None]:
no_demo_df

In [None]:
def extract_relation(triple):
    """Return the text preceding the first '(' of ``triple``, stripped.

    ``None`` is returned when the string does not start with at least one
    non-'(' character followed by '('.
    """
    found = re.match(r'([^\(]+)\(', triple)
    return found.group(1).strip() if found else None

# Calculate Ontology Conformance and identify non-conformant relations
def calculate_ontology_conformance(output_triples, allowed_relations):
    """Return the share of triples with an allowed relation and the offenders.

    Args:
        output_triples: Sequence of triple strings.
        allowed_relations: Container of permitted relation names.

    Returns:
        ``(oc_metric, non_conformant_relations)``: the percentage (0-100) of
        all triples whose relation is allowed (0 for an empty input), and the
        list of extracted relations that are not allowed. Triples from which
        no relation can be extracted count towards the total but are not
        listed.
    """
    # Keep only relations that could actually be extracted (non-empty).
    named_relations = [
        relation
        for relation in map(extract_relation, output_triples)
        if relation
    ]
    non_conformant_relations = [
        relation for relation in named_relations if relation not in allowed_relations
    ]
    conformant_triples = len(named_relations) - len(non_conformant_relations)

    total_triples = len(output_triples)
    if total_triples > 0:
        oc_metric = (conformant_triples / total_triples) * 100
    else:
        oc_metric = 0
    return oc_metric, non_conformant_relations

# Function to process the entire DataFrame
def process_dataframe(df):
    """Add ontology-conformance columns for every model to ``df``.

    For each row the allowed relations are read from the
    'prompts_with_no_demo' column and every model's output is split into one
    triple per non-blank line. Two columns per model are then written to
    ``df`` in place: ``'<model>_OC'`` (conformance percentage) and
    ``'<model>_NonConformantRelations'`` (list of offending relations).

    A model output that is not a string (e.g. the NaN pandas produces for an
    empty CSV cell) is treated as containing no triples instead of raising.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    models = ['mistral-7b', 'llama3-8b', 'gemma2-9b', 'llama3-70b', 'GPT-4o']
    oc_metrics = {model: [] for model in models}
    wrong_relations = {model: [] for model in models}

    for _, row in df.iterrows():
        # Extract allowed relations from the prompt (ontology)
        prompt_relations = extract_relations_from_prompt(row['prompts_with_no_demo'])

        for model in models:
            raw_output = row[model]
            # Missing outputs are not strings and have nothing to split.
            lines = raw_output.split('\n') if isinstance(raw_output, str) else []
            output_triples = [line.strip() for line in lines if line.strip()]

            oc_metric, non_conformant_relations = calculate_ontology_conformance(
                output_triples, prompt_relations
            )
            oc_metrics[model].append(oc_metric)
            wrong_relations[model].append(non_conformant_relations)

    # Add the results to the DataFrame
    for model in models:
        df[f'{model}_OC'] = oc_metrics[model]
        df[f'{model}_NonConformantRelations'] = wrong_relations[model]

    return df

# Function to extract allowed relations from the prompt
def extract_relations_from_prompt(prompt):
    """Return the set of relation names listed after 'relation_types:'.

    The rest of the line following the marker is split on commas and each
    piece is stripped of surrounding whitespace.

    Raises:
        ValueError: If the prompt contains no 'relation_types:' marker.
    """
    found = re.search(r'relation_types:\s*(.*)', prompt)
    if found is None:
        raise ValueError("Relation types not found in the prompt.")
    return {name.strip() for name in found.group(1).split(',')}

# Run the ontology-conformance pass over the no-demo table.
no_demo_df_processed = process_dataframe(no_demo_df)

# Mean of each model's per-row OC column.
average_oc_scores = {
    model: no_demo_df_processed[f'{model}_OC'].mean()
    for model in ('mistral-7b', 'llama3-8b', 'gemma2-9b', 'llama3-70b', 'GPT-4o')
}

# Report the averages with two decimals.
for model, avg_oc in average_oc_scores.items():
    print(f"Average OC for {model}: {avg_oc:.2f}%")

In [None]:
# NOTE(review): `RS_df_processed` is not assigned in any of the visible cells
# (only `no_demo_df_processed` is) -- confirm it is defined elsewhere before
# running this cell.
RS_df_processed

In [None]:
# Count frequency of non-conformant relation output by each model

# Names of the per-model columns that hold the non-conformant relation lists.
nonconformant_columns = [
    f'{model}_NonConformantRelations'
    for model in ('mistral-7b', 'llama3-8b', 'gemma2-9b', 'llama3-70b', 'GPT-4o')
]

# Function to count non-conformant relation frequencies
def count_nonconformant_frequencies(column_data):
    """Count every relation across an iterable of relation collections.

    Args:
        column_data: Iterable whose items are themselves iterables of
            relation names (one item per row).

    Returns:
        A ``Counter`` mapping each relation to its total number of
        occurrences.
    """
    return Counter(
        relation
        for relations in column_data
        for relation in relations
    )

# Frequency counter per non-conformant-relations column.
frequency_results = {
    column: count_nonconformant_frequencies(no_demo_df_processed[column])
    for column in nonconformant_columns
}

# Print the ten most frequent non-conformant relations of each column,
# most frequent first.
for model, counter in frequency_results.items():
    print(f"\nTop 10 most frequent non-conformant relations for {model} (sorted by frequency):")
    sorted_relations = counter.most_common(10)
    for relation, freq in sorted_relations:
        print(f"{relation}: {freq}")

#### Format Conformance and Hallucination Detection

In [None]:
# Reload the untouched tables from disk. The path expressions were previously
# wrapped in a stray quote, which made these lines unterminated string
# literals (or, for the last one, a literal file name that does not exist).
RP_df = pd.read_csv(os.path.join(REPORTS_DIR, "RP_cleaned_output.csv"))  # two Sentence
RS_df = pd.read_csv(os.path.join(REPORTS_DIR, "RS_cleaned_output.csv"))  # two Sentence
OP_df = pd.read_csv(os.path.join(REPORTS_DIR, "OP_cleaned_output.csv"))  # two Paragraph
OS_df = pd.read_csv(os.path.join(REPORTS_DIR, "OS_cleaned_output.csv"))  # two Sentence
no_demo_df = pd.read_csv(os.path.join(BASE_DIR, "no_demo_cleaned.csv"))  # one Sentence

In [None]:
no_demo_df

In [None]:
# Function to validate the triple format
def is_valid_triple(triple):
    """Tell whether ``triple`` looks like ``relation(arg, arg[, arg...])``.

    The relation may contain only ASCII letters and underscores; the
    arguments must be separated by ", " and may not contain parentheses.
    """
    triple_pattern = r'^[a-zA-Z_]+\([^()]+(?:, [^()]+)+\)$'
    return re.match(triple_pattern, triple) is not None

# Function to normalize text (lowercase and strip extra spaces)
def normalize_text(text):
    """Lowercase ``text``, trim it and collapse whitespace runs to one space."""
    flattened = text.replace('\n', ' ')
    trimmed = flattened.strip().lower()
    return re.sub(r'\s+', ' ', trimmed)

# Function to extract the second sentence from the prompt
def extract_context_from_prompt(prompt):  # RP_df  RS_df
    """Return the normalized context sentence embedded in ``prompt``.

    Every span following a "Sentence:" marker is collected. The second span
    is used when there are at least two, otherwise the only one; the chosen
    span is passed through ``normalize_text``.

    Raises:
        ValueError: If the prompt contains no "Sentence:" marker at all.
    """
    matches = re.findall(r'Sentence:\s*(.*?)\s*(?=Sentence:|\Z)', prompt, re.DOTALL)
    if not matches:
        raise ValueError("Second context sentence not found in the prompt.")

    # Prefer the second span; fall back to the single available one.
    chosen = matches[1] if len(matches) > 1 else matches[0]
    return normalize_text(chosen)

# Function to check if the subject and object in the triple are in the context
def detect_hallucinations(triples, context):
    """Count subjects/objects of well-formed triples that are absent from ``context``.

    Args:
        triples: Iterable of triple strings.
        context: Normalized context text; subjects and objects are looked up
            in it by substring test after ``normalize_text``.

    Returns:
        ``(subject_hallucinations, object_hallucinations, valid_triples,
        invalid_triples_list)``. A triple is valid only if it passes
        ``is_valid_triple`` AND splits into exactly one subject and one
        object; every other triple goes to ``invalid_triples_list`` and is
        not counted as valid (previously a triple with three or more
        arguments was counted as both valid and invalid).
    """
    subject_hallucinations = 0
    object_hallucinations = 0
    valid_triples = 0
    invalid_triples_list = []  # Ill-formatted and malformed triples

    for triple in triples:
        if not is_valid_triple(triple):
            print(f"Ill-formatted triple detected: {triple}")
            invalid_triples_list.append(triple)
            continue

        # Drop the relation and the closing ')' and split the arguments.
        _, entities = triple.split('(', 1)
        arguments = entities[:-1].split(', ')
        if len(arguments) != 2:
            # The format check admits more than two arguments, but only
            # subject/object pairs can be evaluated.
            print(f"Malformed triple detected: {triple}")
            invalid_triples_list.append(triple)
            continue

        # Count the triple as valid only once it has been parsed.
        valid_triples += 1
        subject_normalized = normalize_text(arguments[0])
        object_normalized = normalize_text(arguments[1])

        # Check for subject hallucinations
        if subject_normalized not in context:
            print(f"Subject Hallucination Detected: {subject_normalized} not found in context.")
            subject_hallucinations += 1

        # Check for object hallucinations
        if object_normalized not in context:
            print(f"Object Hallucination Detected: {object_normalized} not found in context.")
            object_hallucinations += 1

    return subject_hallucinations, object_hallucinations, valid_triples, invalid_triples_list

# Function to process the entire DataFrame and calculate hallucinations
def process_dataframe_for_hallucinations(df):
    """Aggregate hallucination and format statistics per model over ``df``.

    For each row the context is extracted from the 'prompts_with_no_demo'
    column (rows without one are skipped with a message) and every model's
    output is split into one triple per non-blank line, stripped of
    surrounding whitespace -- the same cleaning ``process_dataframe`` applies.
    Blank lines are therefore no longer counted as invalid triples, and a
    non-string output (e.g. NaN from an empty CSV cell) counts as no triples
    instead of raising.

    Returns:
        A dict keyed by model name. Each value holds 'subject_percentage' and
        'object_percentage' (relative to the valid triples),
        'invalid_triple_percentage' (relative to all triples),
        'invalid_triples' (count) and 'invalid_triples_list'.
    """
    models = ['mistral-7b', 'llama3-8b', 'gemma2-9b', 'llama3-70b', 'GPT-4o']
    hallucination_counts = {
        model: {'subject': 0, 'object': 0, 'total_triples': 0, 'invalid_triples': 0, 'all_triples': 0, 'invalid_triples_list': []}
        for model in models
    }

    for i, row in df.iterrows():
        prompt = row['prompts_with_no_demo']
        try:
            context = extract_context_from_prompt(prompt)
        except ValueError:
            print(f"Skipping row {i} due to missing context.")
            continue

        for model, counts in hallucination_counts.items():
            raw_output = row[model]
            # Missing outputs are not strings and have nothing to split.
            lines = raw_output.split('\n') if isinstance(raw_output, str) else []
            triples = [line.strip() for line in lines if line.strip()]

            counts['all_triples'] += len(triples)

            subject_hallucinations, object_hallucinations, valid_triple_count, invalid_triples = detect_hallucinations(triples, context)

            counts['subject'] += subject_hallucinations
            counts['object'] += object_hallucinations
            counts['total_triples'] += valid_triple_count
            counts['invalid_triples'] += len(invalid_triples)
            counts['invalid_triples_list'].extend(invalid_triples)

    # Calculate the percentage of hallucinations and invalid triples
    hallucination_percentages = {}
    for model, counts in hallucination_counts.items():
        total_valid_triples = counts['total_triples']
        total_triples = counts['all_triples']
        invalid_triples = counts['invalid_triples']

        if total_valid_triples > 0:
            subject_percentage = (counts['subject'] / total_valid_triples) * 100
            object_percentage = (counts['object'] / total_valid_triples) * 100
        else:
            subject_percentage = 0.0
            object_percentage = 0.0

        invalid_triple_percentage = (invalid_triples / total_triples) * 100 if total_triples > 0 else 0.0

        hallucination_percentages[model] = {
            'subject_percentage': subject_percentage,
            'object_percentage': object_percentage,
            'invalid_triple_percentage': invalid_triple_percentage,
            'invalid_triples': invalid_triples,
            'invalid_triples_list': counts['invalid_triples_list']
        }

    return hallucination_percentages

# Usage: evaluate the no-demo table.
hallucination_results = process_dataframe_for_hallucinations(no_demo_df)

# Per model: the three rates, the invalid-triple count, then each invalid
# triple on its own line.
for model, percentages in hallucination_results.items():
    for _line in (
        f"\nHallucination percentages for {model}:",
        f"Subject Hallucinations: {percentages['subject_percentage']:.2f}%",
        f"Object Hallucinations: {percentages['object_percentage']:.2f}%",
        f"Proportion of Invalid Triples: {percentages['invalid_triple_percentage']:.2f}%",
        f"Invalid Triples: {percentages['invalid_triples']} found",
        "Invalid Triples List:",
    ):
        print(_line)
    for invalid_triple in percentages['invalid_triples_list']:
        print(f" - {invalid_triple}")


In [None]:
# Same report, with the invalid-triple rate expressed as its complement
# (format conformance).
for model, percentages in hallucination_results.items():
    format_conformance = 100 - percentages['invalid_triple_percentage']
    for _line in (
        f"\nHallucination percentages for {model}:",
        f"Subject Hallucinations: {percentages['subject_percentage']:.2f}%",
        f"Object Hallucinations: {percentages['object_percentage']:.2f}%",
        f"Format Conformance: {format_conformance:.2f}%",
        f"Invalid Triples: {percentages['invalid_triples']} found",
    ):
        print(_line)


### LLM judges (GPT-4o, llama3-70b, llama3.1-70b)

In [None]:
import time
import os
import torch
from openai import AzureOpenAI
from groq import Groq

#### Accuracy Judge Prompts

In [None]:
# Load the A1 metrics table (the path expression used to sit inside a stray
# quote, which made the line an unterminated string literal).
df = pd.read_csv(os.path.join(EVAL_DIR, "A1_OC_metrics.csv"))

In [None]:
# Prompt template for the accuracy-ranking judge. The placeholders
# {entity_types}, {relation_types}, {context} and {model_1}..{model_5} are
# filled per row with str.format when the 'Accuracy_judge_prompts' column is
# built.
accuracy_judge_prompt_template = '''
You are a judge who ranks five models from 1 to 5 on a triple extraction task. You must assign 1 to the model with the best answer and 5 to the model with the worst answer. Your ranking should be provided directly in this format: [1: model x; 2: model x; 3: model x; 4: model x; 5: model x]. 
Ranking Criteria: 
Correctness of Triples: The triples must conform to the format relation(subject, object) and must accurately reflect relationships stated in the context. Models with significant formatting errors should be penalized. 
Coverage: The number of correct triples extracted. More accurate triples are better, but avoid penalizing slight redundancies unless they detract from the overall relevance. 
Relevance: The triples must be relevant to the specified entity and relation types and should align well with the specific context provided. 
Edge Cases: If a model extracts many triples but includes incorrect or redundant ones, balance accuracy and redundancy in your ranking. Correctness should be prioritized, followed by relevance, then coverage. 
Given Entity Types: {entity_types}
Given Relation Types: {relation_types} 
Context: {context}
Model Outputs: 
Model 1: {model_1}
Model 2: {model_2}
Model 3: {model_3}
Model 4: {model_4}
Model 5: {model_5}
Your ranking:
'''

# Function to extract specific parts from the 'Prompts' column
def extract_parts_from_prompt(prompt):
    """Pull the entity types, relation types and context out of a prompt.

    Args:
        prompt: Prompt text containing 'entity_types:', 'relation_types:',
            'Example:' and 'Sentence:' markers.

    Returns:
        ``(entity_types, relation_types, context)`` as stripped strings. The
        context is the text after the second 'Sentence:' marker, or after the
        only one if there is just one. Any part that cannot be found is
        returned as an empty string (a prompt without a 'Sentence:' marker
        previously raised IndexError).
    """
    entity_types = re.search(r'entity_types:\s*(.*?)\s*relation_types:', prompt, re.DOTALL)
    relation_types = re.search(r'relation_types:\s*(.*?)\s*Example:', prompt, re.DOTALL)
    sentences = re.findall(r'Sentence:\s*(.*?)\s*(?=Sentence:|\Z)', prompt, re.DOTALL)

    # Use the second sentence as the context; fall back to the first one,
    # and to an empty context when the prompt has no sentence at all.
    if len(sentences) > 1:
        context = sentences[1].strip()
    elif sentences:
        context = sentences[0].strip()
    else:
        context = ""

    return (entity_types.group(1).strip() if entity_types else "",
            relation_types.group(1).strip() if relation_types else "",
            context)

# Build the accuracy-judge prompt of one row: the prompt is parsed once and
# the five model outputs are slotted into the template.
def _accuracy_prompt_for(row):
    entity_types, relation_types, context = extract_parts_from_prompt(row['Prompts'])
    return accuracy_judge_prompt_template.format(
        entity_types=entity_types,
        relation_types=relation_types,
        context=context,
        model_1=row['llama3'],
        model_2=row['vicuna'],
        model_3=row['mistral-instruct'],
        model_4=row['GPT-4o'],
        model_5=row['llama3-70b'],
    )


# Create the 'Accuracy_judge_prompts' column, one filled template per row.
df['Accuracy_judge_prompts'] = df.apply(_accuracy_prompt_for, axis=1)

# Save the updated DataFrame to a new CSV file (the path expression used to
# sit inside a stray quote, which made the line an unterminated string).
df.to_csv(os.path.join(EVAL_DIR, "A1_OC_metrics_with_judge_prompts.csv"), index=False)

#### GPT judges Accuracy

In [None]:
# OpenAI: ask the GPT-4o deployment to rank the models for every prompt and
# store its answer in the 'GPT-Accuracy' column.
# The two paths used to sit inside stray quotes (unterminated string literals).
prompts = pd.read_csv(os.path.join(EVAL_DIR, "A1_OC_metrics_with_judge_prompts.csv"))
output_csv_file_path = os.path.join(EVAL_DIR, "GPT_judge_Accuracy.csv")

client = AzureOpenAI(
    azure_endpoint="https://cmac-openai-default.openai.azure.com/",
    # Prefer a key supplied through the environment; the placeholder is kept
    # as the fallback so the cell behaves as before when none is set.
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "<KEY TO BE ADDED>"),
    api_version="2024-02-15-preview"
)

# Define the maximum number of retries
max_retries = 5
retry_delay = 8  # seconds

try:
    # Iterate through each prompt
    # for i, row in prompts.iloc[170:].iterrows():
    for i, row in prompts.iterrows():
        prompt = row['Accuracy_judge_prompts']
        print(f"Processing prompt {i + 1} out of {len(prompts)}")

        retry_count = 0
        while retry_count < max_retries:
            try:
                response = client.chat.completions.create(
                    model="gpt-4o-default",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt}
                    ]
                )
            except Exception as e:
                print(f"An error occurred: {e}")
                # Only rate-limit errors (HTTP 429) are retried; anything
                # else aborts the run via the outer handler.
                if '429' not in str(e):
                    raise
                retry_count += 1
                print(f"Rate limit exceeded. Retrying in {retry_delay} seconds... (Attempt {retry_count}/{max_retries})")
                time.sleep(retry_delay)
                continue

            # Update the DataFrame with the response
            answer = response.choices[0].message.content
            prompts.at[i, 'GPT-Accuracy'] = answer

            print(f"Model's Answer for prompt {i + 1}:\n{answer}\n")

            # Save after each prompt so that progress survives an interruption
            prompts.to_csv(output_csv_file_path, index=False)
            break
        else:
            # All attempts were rate-limited: make the gap visible instead of
            # silently moving on to the next prompt.
            print(f"Giving up on prompt {i + 1} after {max_retries} rate-limited attempts.")

except Exception as e:
    print(f"An error occurred: {e}")

#### GPT judges OC

In [None]:
# OpenAI: ask the GPT-4o deployment to check ontology conformance for every
# prompt from row 74 onwards and store its answer in the 'GPT-OC' column.
# The two paths used to sit inside stray quotes (unterminated string literals).
prompts = pd.read_csv(os.path.join(EVAL_DIR, "A1_OC_metrics_with_judge_prompts.csv"))
output_csv_file_path = os.path.join(EVAL_DIR, "GPT_judge_OC74-.csv")

client = AzureOpenAI(
    azure_endpoint="https://cmac-openai-default.openai.azure.com/",
    # SECURITY: an API key used to be hard-coded here. It is now read from the
    # AZURE_OPENAI_API_KEY environment variable; the key that was committed
    # should be considered leaked and be rotated.
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-15-preview"
)

# Define the maximum number of retries
max_retries = 5
retry_delay = 8  # seconds

try:
    # Iterate through each prompt, starting at positional row 74
    for i, row in prompts.iloc[74:].iterrows():
    # for i, row in prompts.iterrows():
        prompt = row['OC_judge_prompts']
        print(f"Processing prompt {i + 1} out of {len(prompts)}")

        retry_count = 0
        while retry_count < max_retries:
            try:
                response = client.chat.completions.create(
                    model="gpt-4o-default",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt}
                    ]
                )
            except Exception as e:
                print(f"An error occurred: {e}")
                # Only rate-limit errors (HTTP 429) are retried; anything
                # else aborts the run via the outer handler.
                if '429' not in str(e):
                    raise
                retry_count += 1
                print(f"Rate limit exceeded. Retrying in {retry_delay} seconds... (Attempt {retry_count}/{max_retries})")
                time.sleep(retry_delay)
                continue

            # Update the DataFrame with the response
            answer = response.choices[0].message.content
            prompts.at[i, 'GPT-OC'] = answer

            print(f"Model's Answer for prompt {i + 1}:\n{answer}\n")

            # Save after each prompt so that progress survives an interruption
            prompts.to_csv(output_csv_file_path, index=False)
            break
        else:
            # All attempts were rate-limited: make the gap visible instead of
            # silently moving on to the next prompt.
            print(f"Giving up on prompt {i + 1} after {max_retries} rate-limited attempts.")

except Exception as e:
    print(f"An error occurred: {e}")

#### GPT judges FC

In [None]:
"Accuracy_GPT_judge_llama3"
"Accuracy_GPT_judge_vicuna"
"Accuracy_GPT_judge_mistral"
"Accuracy_GPT_judge_GPT"
"Accuracy_GPT_judge_llama3-70b"

In [None]:
"Accuracy_llama3-70b_judge_llama3"
"Accuracy_llama3-70b_judge_vicuna"
"Accuracy_llama3-70b_judge_mistral"
"Accuracy_llama3-70b_judge_GPT"
"Accuracy_llama3-70b_judge_llama3-70b"

In [None]:
"Accuracy_llama3.1-70b_judge_llama3"
"Accuracy_llama3.1-70b_judge_vicuna"
"Accuracy_llama3.1-70b_judge_mistral"
"Accuracy_llama3.1-70b_judge_GPT"
"Accuracy_llama3.1-70b_judge_llama3-70b"

#### OC Judge Prompts

In [None]:
# Load the table that already carries the judge prompts (the path expression
# used to sit inside a stray quote, which made the line an unterminated
# string literal).
df = pd.read_csv(os.path.join(EVAL_DIR, "A1_OC_metrics_with_judge_prompts.csv"))

In [None]:
# Prompt template for the ontology-conformance judge. The placeholders
# {relation_types} and {model_1}..{model_5} are filled per row with
# str.format when the 'OC_judge_prompts' column is built.
OC_judge_prompt_template = '''
You are a judge who checks whether the relation type in each triple is conformant to the given relation types. The triples are in the format relation(subject, object). You must assign 1 if the relation in a triple is conformant and assign 0 if it is not. Your check results should be provided directly in this format: [x1, x2, ..., xn] where x is a boolean value (1 or 0), and n is the number of triples. Do not include any additional explanations or text, just output the results in the format specified above.
Checking Criteria:
A relation is conformant if it is present in the given relation types; otherwise, it is not conformant.
Given Relation Types: {relation_types}
Model Outputs:
Model 1: {model_1}
Model 2: {model_2}
Model 3: {model_3}
Model 4: {model_4}
Model 5: {model_5}
Your output:
'''

# Function to extract specific parts from the 'Prompts' column
def extract_parts_from_prompt(prompt):
    """Return the text between 'relation_types:' and 'Example:', stripped.

    An empty string is returned when the prompt does not contain both
    markers in that order.
    """
    found = re.search(r'relation_types:\s*(.*?)\s*Example:', prompt, re.DOTALL)
    if not found:
        return ""
    return found.group(1).strip()

# Build one OC judge prompt per row: the relation types come from the row's
# 'Prompts' text and the five model-output columns fill the numbered slots.
def _render_oc_judge_prompt(row):
    """Fill OC_judge_prompt_template with the values of a single row."""
    slots = {
        'relation_types': extract_parts_from_prompt(row['Prompts']),
        'model_1': row['llama3'],
        'model_2': row['vicuna'],
        'model_3': row['mistral-instruct'],
        'model_4': row['GPT-4o'],
        'model_5': row['llama3-70b'],
    }
    return OC_judge_prompt_template.format(**slots)


df['OC_judge_prompts'] = df.apply(_render_oc_judge_prompt, axis=1)

# Save the updated DataFrame (now including 'OC_judge_prompts') to CSV.
# (A stray quote before os.path.join previously made this line a SyntaxError.)
df.to_csv(os.path.join(EVAL_DIR, "A1_OC_metrics_with_judge_prompts.csv"), index=False)


#### FC Judge Prompts

In [None]:
# Load the metrics table that the FC judge prompts are built from.
# (A stray quote before os.path.join previously made this line a SyntaxError.)
df = pd.read_csv(os.path.join(EVAL_DIR, "A1_OC_metrics_with_judge_prompts.csv"))

In [None]:
# Prompt template for the FC judge: asks the judge to mark, for every triple,
# whether it follows the relation(subject, object) format (1) or not (0).
# Placeholders filled via str.format: {model_1}..{model_5}.
FC_judge_prompt_template = '''
You are a judge who checks whether each triple is conformant to the format. A triple is considered conformant to the format if it strictly follows the pattern: relation(subject, object). In this pattern, relation is a string of alphabetic characters (e.g., hasLocation). Subject and object are strings enclosed in parentheses, separated by a comma, and may contain special characters, numbers, and spaces.
Checking Criteria:
If the triple is conformant to the format, return 1; otherwise, return 0. In the output, you must directly provide a list [x1, x2, ..., xn] where x is a boolean value (1 or 0), and n is the number of triples. Do not include any additional explanations or text; just output the results in the format specified above.
Edge Cases That Should Be Considered Conformant:
If the subject or object in the triple contains numbers with commas (e.g., 2,500,011 square meters), it should be considered conformant. This allows for large numbers to be formatted correctly.
If the subject or object contains an acronym or abbreviation within parentheses, it should be considered conformant. This allows for organization names, project names, etc., that include acronyms.
Model Outputs: 
Model 1: {model_1}
Model 2: {model_2}
Model 3: {model_3}
Model 4: {model_4}
Model 5: {model_5}
Your output:
'''

# Build one FC judge prompt per row by dropping the five model-output columns
# into the numbered model slots of the template.
_FC_SLOT_TO_COLUMN = (
    ('model_1', 'llama3'),
    ('model_2', 'vicuna'),
    ('model_3', 'mistral-instruct'),
    ('model_4', 'GPT-4o'),
    ('model_5', 'llama3-70b'),
)


def _render_fc_judge_prompt(row):
    """Fill FC_judge_prompt_template with the model outputs of a single row."""
    slots = {slot: row[column] for slot, column in _FC_SLOT_TO_COLUMN}
    return FC_judge_prompt_template.format(**slots)


df['FC_judge_prompts'] = df.apply(_render_fc_judge_prompt, axis=1)

# Save the updated DataFrame (now including 'FC_judge_prompts') to CSV.
# (A stray quote before os.path.join previously made this line a SyntaxError.)
df.to_csv(os.path.join(EVAL_DIR, "A1_OC_metrics_with_judge_prompts.csv"), index=False)
