In [5]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter
import json

## Evaluation GM

In [12]:
# Score the gpt4o-mini argument-identification predictions for GM against the gold labels.
ground_truth = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv')

with open('/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_gm_argument_identification.json', 'r') as f:
    predictions_json = json.load(f)

# Every entry of the loaded list is itself a JSON-encoded string; decode each one once.
predictions_data = [(record['id'], record['label']) for record in map(json.loads, predictions_json)]

predictions = pd.DataFrame(predictions_data, columns=['id', 'predicted_label'])

# Binarise the gold labels: 1, 2, 4, 5 -> 1 (argument used); every other value
# (label 3 in the annotation scheme) -> 0 (argument not used).
ground_truth['mapped_label'] = ground_truth['label'].isin([1, 2, 4, 5]).astype(int)

golden_comment_ids = ground_truth[['id', 'mapped_label']]
predictions_df = predictions[['id', 'predicted_label']]

# how='left' keeps every prediction row. A prediction whose id is missing from the
# gold file gets a NaN mapped_label: it lands in none of the four cells below but
# still counts in the accuracy denominator.
merged_df = predictions_df.merge(golden_comment_ids, on='id', how='left', suffixes=('_pred', '_true'))

# Boolean masks for each side of the confusion matrix.
predicted_used = merged_df['predicted_label'] == 1
predicted_unused = merged_df['predicted_label'] == 0
gold_used = merged_df['mapped_label'] == 1
gold_unused = merged_df['mapped_label'] == 0

tp = int((predicted_used & gold_used).sum())      # Correctly predicted positives
fp = int((predicted_used & gold_unused).sum())    # Incorrectly predicted positives
fn = int((predicted_unused & gold_used).sum())    # Missed positives
tn = int((predicted_unused & gold_unused).sum())  # Correctly predicted negatives

# Metrics for the positive class; each falls back to 0 when its denominator is empty.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
# Named `f1`, not `f1_score`, so the sklearn.metrics.f1_score import is not overwritten.
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy = (tp + tn) / len(merged_df) if len(merged_df) > 0 else 0

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Accuracy: {accuracy:.2f}')
print(f'Total Predictions: {len(predictions)}')
print(f'Correct Predictions (TP): {tp}')
print(f'False Positives (FP): {fp}')
print(f'False Negatives (FN): {fn}')
print(f'Correct Predictions (TN): {tn}')

Precision: 0.69
Recall: 0.50
F1 Score: 0.58
Accuracy: 0.75
Total Predictions: 1285
Correct Predictions (TP): 216
False Positives (FP): 96
False Negatives (FN): 220
Correct Predictions (TN): 753


## Evaluation UGIP

In [13]:
# Score the gpt4o-mini argument-identification predictions for UGIP against the gold labels.
ground_truth = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv')

with open('/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_ugip_argument_identification.json', 'r') as f:
    predictions_json = json.load(f)

# Every entry of the loaded list is itself a JSON-encoded string; decode each one once.
predictions_data = [(record['id'], record['label']) for record in map(json.loads, predictions_json)]

predictions = pd.DataFrame(predictions_data, columns=['id', 'predicted_label'])

# Binarise the gold labels: 1, 2, 4, 5 -> 1 (argument used); every other value
# (label 3 in the annotation scheme) -> 0 (argument not used).
ground_truth['mapped_label'] = ground_truth['label'].isin([1, 2, 4, 5]).astype(int)

golden_comment_ids = ground_truth[['id', 'mapped_label']]
predictions_df = predictions[['id', 'predicted_label']]

# how='left' keeps every prediction row. A prediction whose id is missing from the
# gold file gets a NaN mapped_label: it lands in none of the four cells below but
# still counts in the accuracy denominator.
merged_df = predictions_df.merge(golden_comment_ids, on='id', how='left', suffixes=('_pred', '_true'))

# Boolean masks for each side of the confusion matrix.
predicted_used = merged_df['predicted_label'] == 1
predicted_unused = merged_df['predicted_label'] == 0
gold_used = merged_df['mapped_label'] == 1
gold_unused = merged_df['mapped_label'] == 0

tp = int((predicted_used & gold_used).sum())      # Correctly predicted positives
fp = int((predicted_used & gold_unused).sum())    # Incorrectly predicted positives
fn = int((predicted_unused & gold_used).sum())    # Missed positives
tn = int((predicted_unused & gold_unused).sum())  # Correctly predicted negatives

# Metrics for the positive class; each falls back to 0 when its denominator is empty.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
# Named `f1`, not `f1_score`, so the sklearn.metrics.f1_score import is not overwritten.
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy = (tp + tn) / len(merged_df) if len(merged_df) > 0 else 0

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Accuracy: {accuracy:.2f}')
print(f'Total Predictions: {len(predictions)}')
print(f'Correct Predictions (TP): {tp}')
print(f'False Positives (FP): {fp}')
print(f'False Negatives (FN): {fn}')
print(f'Correct Predictions (TN): {tn}')

Precision: 0.54
Recall: 0.54
F1 Score: 0.54
Accuracy: 0.70
Total Predictions: 1013
Correct Predictions (TP): 175
False Positives (FP): 152
False Negatives (FN): 147
Correct Predictions (TN): 539


In [3]:
def map_gold_labels(labels):
    """
    Collapse the gold annotation labels into the binary scheme used for scoring.

    Original labels 1, 2, 4 and 5 become 1; original label 3 becomes 0.
    Any value without an entry in the lookup comes back as NaN, because
    that is what `Series.map` does with a plain dict.

    Parameters:
    - labels: pandas Series of original gold labels.

    Returns:
    - Series with the same index holding the binary labels.
    """
    binary_by_original = dict.fromkeys((1, 2, 4, 5), 1)
    binary_by_original[3] = 0
    return labels.map(binary_by_original)

In [20]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):
    """
    Print a classification report for a predictions file scored against gold data.

    Parameters:
    - golden_csv_path: Path to the gold CSV; its 'id' and 'label' columns are used.
    - predictions_json_path: Path to a JSON file holding a list whose items are
      JSON-encoded strings, each decoding to a record with 'id' and 'label'.

    Output:
    - Prints sklearn's classification report of the predicted labels against the
      gold labels after they are binarised by map_gold_labels. Returns nothing.
    """
    gold_frame = pd.read_csv(golden_csv_path)

    with open(predictions_json_path, 'r') as handle:
        encoded_records = json.load(handle)

    # Each list item is a JSON string of its own, hence the second decoding pass.
    predicted_frame = pd.DataFrame([json.loads(record) for record in encoded_records])

    # Default (inner) join: only ids present in both files are scored.
    paired = gold_frame[['id', 'label']].merge(
        predicted_frame[['id', 'label']],
        on='id',
        suffixes=('_gold', '_pred'),
    )

    # Replace the original gold labels with their binary equivalents.
    paired['label_gold'] = map_gold_labels(paired['label_gold'].astype(int))

    y_true = paired['label_gold'].values
    y_pred = paired['label_pred'].values

    print(classification_report(y_true, y_pred))

## GM

In [21]:
# Classification report for the gpt4o-mini predictions on the GM data.
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_gm_argument_identification.json'
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'

evaluate_predictions(
    golden_csv_path=golden_csv_path,
    predictions_json_path=predictions_json_path,
)

              precision    recall  f1-score   support

           0       0.77      0.89      0.83       849
           1       0.69      0.50      0.58       436

    accuracy                           0.75      1285
   macro avg       0.73      0.69      0.70      1285
weighted avg       0.75      0.75      0.74      1285



## UGIP

In [22]:
# Classification report for the gpt4o-mini predictions on the UGIP data.
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_ugip_argument_identification.json'
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'

evaluate_predictions(
    golden_csv_path=golden_csv_path,
    predictions_json_path=predictions_json_path,
)

              precision    recall  f1-score   support

           0       0.79      0.78      0.78       691
           1       0.54      0.54      0.54       322

    accuracy                           0.70      1013
   macro avg       0.66      0.66      0.66      1013
weighted avg       0.71      0.70      0.71      1013

