In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import string
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# Download the NLTK data packages used by the cells below. Each call fetches the
# named resource if it is not already available locally (may need network access).
import nltk  # NOTE(review): repeats the `import nltk` earlier in this cell; harmless
nltk.download('punkt_tab')  # presumably the tokenizer data word_tokenize needs on newer NLTK releases — confirm
nltk.download('punkt')  # presumably the tokenizer data for older NLTK releases — confirm
nltk.download('stopwords')  # stopword lists read via stopwords.words('english') in preprocess_text

## Run the following cell to compute the QA evaluation metrics
##### (Make sure `merged_qa_flant5.csv`, `merged_qa_distilbert.csv`, and `merged_qa_phi2.csv` are present in the working directory.)

In [None]:
def preprocess_text(text):
    """Normalize text for token-level comparison.

    Lowercases ``text``, removes every character that is neither a word
    character nor whitespace, tokenizes the result with ``word_tokenize``,
    and drops NLTK English stopwords.

    Args:
        text: The string to normalize.

    Returns:
        The surviving tokens joined by single spaces. This is an empty string
        when nothing survives (e.g. the input consisted only of stopwords).
    """
    normalized = re.sub(r'[^\w\s]', '', text.lower())

    # Build the stopword set once per call. Evaluating
    # stopwords.words('english') inside the comprehension condition reloaded
    # the list for every token and then scanned it linearly.
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = [
        token for token in word_tokenize(normalized)
        if token not in english_stopwords
    ]
    return ' '.join(kept_tokens)

def compute_exact_match(pred_answers, true_answers):
    """Return the fraction of prediction/reference pairs that are equal once
    leading and trailing whitespace is trimmed (0 when there are no pairs)."""
    hits = 0
    total = 0
    for predicted, expected in zip(pred_answers, true_answers):
        total += 1
        if predicted.strip() == expected.strip():
            hits += 1

    if total == 0:
        return 0
    return hits / total

def compute_f1_score(pred_answers, true_answers):
    """Compute the mean word-level F1 between predicted and true answers.

    Each answer is normalized with ``preprocess_text`` and reduced to a *set*
    of tokens, so repeated words count once. Precision and recall are computed
    on the token-set overlap.

    Pairs whose reference has no tokens after preprocessing are skipped because
    F1 is undefined for them (the same policy ``compute_answer_recall`` uses).
    A pair whose prediction has no tokens but whose reference does is scored
    0.0 rather than skipped; skipping it would inflate the average for models
    that return blank or stopword-only answers.

    Args:
        pred_answers: Iterable of predicted answer strings.
        true_answers: Iterable of reference answer strings, aligned with
            ``pred_answers``.

    Returns:
        The mean F1 over the scored pairs, or 0 if no pair could be scored.
    """
    f1_scores = []

    for pred, true in zip(pred_answers, true_answers):
        pred_tokens = set(word_tokenize(preprocess_text(pred)))
        true_tokens = set(word_tokenize(preprocess_text(true)))

        # No reference tokens: the score is undefined, so leave the pair out.
        if not true_tokens:
            continue

        overlap = len(pred_tokens & true_tokens)

        # Covers both an empty prediction and a prediction with no shared
        # tokens; either way precision/recall are 0 and so is F1.
        if overlap == 0:
            f1_scores.append(0.0)
            continue

        precision = overlap / len(pred_tokens)
        recall = overlap / len(true_tokens)
        f1_scores.append(2 * precision * recall / (precision + recall))

    return sum(f1_scores) / len(f1_scores) if f1_scores else 0

def compute_similarity_score(pred_answers, true_answers, model=None):
    """Compute the mean cosine similarity between sentence embeddings of
    predicted and true answers.

    Pairs in which either string is empty or whitespace-only are skipped.

    Args:
        pred_answers: Iterable of predicted answer strings.
        true_answers: Iterable of reference answer strings, aligned with
            ``pred_answers``.
        model: Optional object exposing ``encode(list_of_str)`` that returns
            one embedding per input string. Pass an already-loaded model to
            avoid reloading it on every call. When omitted, the
            ``'all-MiniLM-L6-v2'`` SentenceTransformer is loaded.

    Returns:
        The mean similarity over the scored pairs, or 0 if there are none.
    """
    scorable_pairs = [
        (pred, true)
        for pred, true in zip(pred_answers, true_answers)
        if pred.strip() and true.strip()
    ]
    # Bail out before loading the model when there is nothing to embed.
    if not scorable_pairs:
        return 0

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode each side in a single batched call instead of one call per
    # sentence; encode() batches internally.
    pred_embeddings = model.encode([pred for pred, _ in scorable_pairs])
    true_embeddings = model.encode([true for _, true in scorable_pairs])

    # scipy's `cosine` is a distance, so similarity is its complement.
    similarities = [
        1 - cosine(pred_vec, true_vec)
        for pred_vec, true_vec in zip(pred_embeddings, true_embeddings)
    ]
    return sum(similarities) / len(similarities)

def compute_answer_recall(pred_answers, true_answers):
    """Return the average share of reference-answer tokens that also occur in
    the matching prediction.

    Both strings go through ``preprocess_text`` and are compared as token sets.
    Pairs whose reference has no tokens left are excluded from the average;
    the result is 0 when no pair remains.
    """
    per_example = []

    for predicted, reference in zip(pred_answers, true_answers):
        predicted_vocab = set(word_tokenize(preprocess_text(predicted)))
        reference_vocab = set(word_tokenize(preprocess_text(reference)))

        # Only references that still contain tokens can be scored.
        if reference_vocab:
            covered = reference_vocab & predicted_vocab
            per_example.append(len(covered) / len(reference_vocab))

    if not per_example:
        return 0
    return sum(per_example) / len(per_example)

def evaluate_qa_metrics(csv_path):
    """Evaluate QA performance on a merged predictions/ground-truth CSV.

    Metrics computed:
    - Exact Match
    - F1 Score
    - Semantic Similarity
    - Answer Recall

    The CSV must have the columns 'output' (model predictions) and
    'correct_answer' (ground truth).

    Args:
        csv_path: Path to the merged CSV file.

    Returns:
        A dict with the keys 'exact_match', 'f1_score', 'similarity_score'
        and 'answer_recall'.

    Raises:
        ValueError: If either required column is missing from the CSV.
    """
    # Read the merged CSV file
    df = pd.read_csv(csv_path)

    # Ensure required columns exist
    required_cols = ['output', 'correct_answer']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"CSV must contain these columns: {required_cols}")

    # Replace missing cells with '' BEFORE casting to str. Casting first can
    # turn NaN into the literal text 'nan', which would then be scored as if
    # it were a real answer. Empty strings are instead handled by each
    # metric's own empty-answer policy.
    pred_answers = df['output'].fillna('').astype(str).tolist()
    true_answers = df['correct_answer'].fillna('').astype(str).tolist()

    # Compute metrics
    return {
        'exact_match': compute_exact_match(pred_answers, true_answers),
        'f1_score': compute_f1_score(pred_answers, true_answers),
        'similarity_score': compute_similarity_score(pred_answers, true_answers),
        'answer_recall': compute_answer_recall(pred_answers, true_answers),
    }

if __name__ == '__main__':
    # (display label, path to merged QA CSV) for every model to evaluate.
    # Add a pair here to report on another model.
    model_runs = [
        ('FLAN-T5', 'merged_qa_flant5.csv'),
        ('DISTILBERT', 'merged_qa_distilbert.csv'),
        ('PHI-2', 'merged_qa_phi2.csv'),
    ]

    # One evaluation + report per model, replacing three copy-pasted blocks.
    for model_name, csv_path in model_runs:
        print("Evaluating QA metrics...")
        metrics = evaluate_qa_metrics(csv_path)

        print(f"\n=== QA Evaluation Results for {model_name} ===")
        print(f"Exact Match: {metrics['exact_match']:.4f}")
        print(f"F1 Score: {metrics['f1_score']:.4f}")
        print(f"Semantic Similarity: {metrics['similarity_score']:.4f}")
        print(f"Answer Recall: {metrics['answer_recall']:.4f}")

Evaluating QA metrics...

=== QA Evaluation Results ===
Exact Match: 0.0080
F1 Score: 0.2300
Semantic Similarity: 0.4451
Answer Recall: 0.2669
Evaluating QA metrics...

=== QA Evaluation Results ===
Exact Match: 0.0140
F1 Score: 0.2389
Semantic Similarity: 0.3974
Answer Recall: 0.2214
Evaluating QA metrics...


## Run the following cell to compute the inter-annotator agreement (IAA)
##### (Make sure `annotated_qsns_final.csv` is present in the working directory.)

In [None]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

def calculate_iaa_metrics(file_path):
    """Compute inter-annotator agreement between two annotators' answers.

    Reads a CSV with the columns 'annotator1_answer' and 'annotator2_answer'
    and compares the two columns row by row as strings.

    Args:
        file_path: Path to the annotation CSV.

    Returns:
        A dict with 'Percentage Agreement' (share of rows where both answers
        are identical, times 100) and "Cohen's Kappa" (from
        ``cohen_kappa_score`` on the two answer columns).
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Fill missing cells BEFORE casting to str: once astype(str) has turned
    # NaN into the text 'nan' there is nothing left for fillna to replace.
    annotator1 = df['annotator1_answer'].fillna('').astype(str)
    annotator2 = df['annotator2_answer'].fillna('').astype(str)

    # Percentage of rows on which both annotators gave the same string
    percentage_agreement = (annotator1 == annotator2).mean() * 100

    # Chance-corrected agreement, treating each distinct answer as a label
    kappa_score = cohen_kappa_score(annotator1, annotator2)

    return {
        'Percentage Agreement': percentage_agreement,
        "Cohen's Kappa": kappa_score
    }

# Example usage: compute the agreement metrics for the annotation file and
# print the resulting dict.
# NOTE(review): `metrics` is also assigned by the QA evaluation cell above;
# running this cell overwrites that value in the kernel.
file_path = 'annotated_qsns_final.csv'  # Replace with your actual file path
metrics = calculate_iaa_metrics(file_path)
print(metrics)