In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support
from collections import Counter

## Evaluation GM

In [2]:
def evaluate_predictions(golden_data_path, model_output_path):
    """
    Evaluate model predictions against a golden dataset.
    - Correct predictions: if predicted argument is in golden arguments
    - False positives: if predicted argument is not in golden arguments
    - False negatives: if golden argument is not in predicted arguments
    """

    golden_df = pd.read_csv(golden_data_path)
    model_df = pd.read_csv(model_output_path)

    # Store the golden arguments for each comment
    golden_dict = {}
    for _, row in golden_df.iterrows():
        if row['label'] != 3:  # Exclude arguments not used
            if row['comment_text'] not in golden_dict:
                golden_dict[row['comment_text']] = set()
            golden_dict[row['comment_text']].add(row['argument_text'])

    y_true = []
    y_pred = []
    total_predictions = 0
    correct_predictions = 0
    false_positives = 0
    false_negatives = 0

    # Evaluate model predictions
    for _, row in model_df.iterrows():
        comment = row['comment_text']
        predicted_argument = row['argument_text']

        if comment in golden_dict:
            total_predictions += 1
            golden_arguments = golden_dict[comment]

            # Check if the predicted argument is correct
            if predicted_argument in golden_arguments:
                y_true.append(predicted_argument)
                y_pred.append(predicted_argument)
                correct_predictions += 1
            else:
                # False positive: predicted argument does not match golden arguments
                y_true.append("")  # Append an empty string for no match
                y_pred.append(predicted_argument)
                false_positives += 1

            # False negative: if there are other golden arguments that weren't predicted
            for golden_arg in golden_arguments:
                if golden_arg != predicted_argument:
                    y_true.append(golden_arg)  # Append missed argument to y_true
                    y_pred.append("")  
                    false_negatives += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    precision = correct_predictions / (correct_predictions + false_positives) if (correct_predictions + false_positives) > 0 else 0
    recall = correct_predictions / (correct_predictions + false_negatives) if (correct_predictions + false_negatives) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "total_predictions": total_predictions,
        "correct_predictions": correct_predictions,
        "false_positives": false_positives,
        "false_negatives": false_negatives
    }

golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_gm_argument_identification.csv'

evaluate_predictions(golden_data_path, model_output_path)

{'accuracy': 0.6103448275862069,
 'precision': 0.6103448275862069,
 'recall': 0.23663101604278075,
 'f1_score': 0.34104046242774566,
 'total_predictions': 290,
 'correct_predictions': 177,
 'false_positives': 113,
 'false_negatives': 571}

## Evaluation UGIP

In [3]:
def evaluate_predictions(golden_data_path, model_output_path):
    """
    Evaluate model predictions against a golden dataset.
    - Correct predictions: if predicted argument is in golden arguments
    - False positives: if predicted argument is not in golden arguments
    - False negatives: if golden argument is not in predicted arguments
    """
    
    golden_df = pd.read_csv(golden_data_path)
    model_df = pd.read_csv(model_output_path)

    # Store the golden arguments for each comment
    golden_dict = {}
    for _, row in golden_df.iterrows():
        if row['label'] != 3:  # Exclude arguments not used
            if row['comment_text'] not in golden_dict:
                golden_dict[row['comment_text']] = set()
            golden_dict[row['comment_text']].add(row['argument_text'])

    y_true = []
    y_pred = []
    total_predictions = 0
    correct_predictions = 0
    false_positives = 0
    false_negatives = 0

    for _, row in model_df.iterrows():
        comment = row['comment_text']
        predicted_argument = row['argument_text']

        if comment in golden_dict:
            total_predictions += 1
            golden_arguments = golden_dict[comment]

            # Check if the predicted argument is correct
            if predicted_argument in golden_arguments:
                y_true.append(predicted_argument)
                y_pred.append(predicted_argument)
                correct_predictions += 1
            else:
                # False positive: predicted argument does not match golden arguments
                y_true.append("")  # Append an empty string for no match
                y_pred.append(predicted_argument)
                false_positives += 1

            # False negative: if there are other golden arguments that weren't predicted
            for golden_arg in golden_arguments:
                if golden_arg != predicted_argument:
                    y_true.append(golden_arg)  # Append missed argument to y_true
                    y_pred.append("")  
                    false_negatives += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    precision = correct_predictions / (correct_predictions + false_positives) if (correct_predictions + false_positives) > 0 else 0
    recall = correct_predictions / (correct_predictions + false_negatives) if (correct_predictions + false_negatives) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "total_predictions": total_predictions,
        "correct_predictions": correct_predictions,
        "false_positives": false_positives,
        "false_negatives": false_negatives
    }

golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_ugip_argument_identification.csv'
evaluate_predictions(golden_data_path, model_output_path)

{'accuracy': 0.4873417721518987,
 'precision': 0.4873417721518987,
 'recall': 0.21388888888888888,
 'f1_score': 0.29729729729729726,
 'total_predictions': 158,
 'correct_predictions': 77,
 'false_positives': 81,
 'false_negatives': 283}