In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix, ConfusionMatrixDisplay
from collections import Counter, defaultdict
import json, csv
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def map_gold_labels(labels):
    """
    Collapse the fine-grained gold labels into two classes for evaluation.

    Labels 1 and 2 both become 1; labels 4 and 5 both become 5.
    The lookup is applied through ``labels.map``; for a pandas Series (which
    is what the caller visible in this notebook passes), any value without an
    entry in the lookup table comes back as NaN.
    """
    collapsed = {**dict.fromkeys((1, 2), 1), **dict.fromkeys((4, 5), 5)}
    return labels.map(collapsed)

In [3]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):
    """
    Print a classification report for the collapsed (two-class) labelling.

    The gold CSV's ``label`` column is passed through ``map_gold_labels``
    (1/2 -> 1, 4/5 -> 5). Predictions are read as JSON Lines. Both tables are
    inner-joined on ``id``, so only ids present in both files are scored.

    Gold rows whose label has no collapsed counterpart (``map_gold_labels``
    yields NaN for them) are removed before the join. Without that step, one
    such row that also has a prediction would make the integer cast fail.

    Args:
        golden_csv_path: Path to the gold CSV; needs ``id`` and ``label`` columns.
        predictions_json_path: Path to the JSON Lines predictions file; each
            record needs ``id`` and ``label``.

    Returns:
        None. The report is printed to stdout.
    """
    golden_df = pd.read_csv(golden_csv_path)

    golden_df['label'] = map_gold_labels(golden_df['label'])

    predictions_df = pd.read_json(predictions_json_path, lines=True)

    # Unmapped gold labels are NaN at this point; they cannot be cast to int
    # and are not part of the two-class task, so leave them out of the join.
    gold_labeled = golden_df[['id', 'label']].dropna(subset=['label'])

    merged_df = pd.merge(gold_labeled, predictions_df[['id', 'label']],
                        on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].astype(int)

    predictions = merged_df['label_pred'].astype(int)

    report = classification_report(ground_truth, predictions)
    print(report)

In [4]:
def evaluate_predictions_polarity(golden_csv_path: str, predictions_json_path: str):
    """
    Print a classification report using the gold labels exactly as stored.

    No label mapping is applied. The gold CSV and the JSON Lines predictions
    are inner-joined on ``id``, so only ids present in both files are scored.
    Both label columns are cast to ``int`` before scoring.

    Args:
        golden_csv_path: Path to the gold CSV; needs ``id`` and ``label`` columns.
        predictions_json_path: Path to the JSON Lines predictions file; each
            record needs ``id`` and ``label``.

    Returns:
        None. The report is printed to stdout.
    """
    gold_table = pd.read_csv(golden_csv_path)
    pred_table = pd.read_json(predictions_json_path, lines=True)

    # Same-named ``label`` columns are told apart by the suffixes after the join.
    paired = gold_table[['id', 'label']].merge(
        pred_table[['id', 'label']],
        how='inner',
        on='id',
        suffixes=('_gold', '_pred'),
    )

    y_true = paired['label_gold'].astype(int)
    y_pred = paired['label_pred'].astype(int)

    print(classification_report(y_true, y_pred))

## Evaluation ZERO SHOT

In [4]:
# No polarity: gold labels 1/2 are collapsed to 1 and 4/5 to 5 before scoring.

In [19]:
# GM inputs: gold CSV and the predictions JSONL to score against it.
# NOTE(review): these are absolute, machine-specific paths; the cell will not
# run elsewhere without editing them.
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/llama/zero-shot/comarg_gm_relation_identification_nopol_without3_llama.jsonl'

print("=== GM ====")
# evaluate_predictions collapses the gold labels to two classes before scoring.
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.29      0.32      0.30       161
           5       0.55      0.51      0.53       261

    accuracy                           0.44       422
   macro avg       0.42      0.42      0.42       422
weighted avg       0.45      0.44      0.44       422



In [20]:
# UGIP inputs: gold CSV and the predictions JSONL to score against it.
# NOTE(review): this cell reads UGIP_structured.csv, whereas the other
# evaluation cells in this notebook read the *_no_3.csv files. Confirm this is
# intended and not a leftover path.
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/llama/zero-shot/comarg_ugip_relation_identification_nopol_without3_llama.jsonl'

print("=== UGIP ====")
# evaluate_predictions collapses the gold labels to two classes before scoring.
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.62      0.39      0.48       134
           5       0.65      0.83      0.73       187

    accuracy                           0.64       321
   macro avg       0.64      0.61      0.60       321
weighted avg       0.64      0.64      0.63       321



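## Polarity

In this section the gold labels are scored as stored (classes 1, 2, 4 and 5); no label collapsing is applied.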

In [23]:
# GM inputs for the polarity run: gold CSV and the predictions JSONL.
# These names are rebound from the earlier cells, so this cell must be run
# immediately before the call below.
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/llama/zero-shot/comarg_gm_relation_identification_pol_without3_llama.jsonl'

print("=== GM ====")
# evaluate_predictions_polarity scores the gold labels without any mapping.
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.21      0.21      0.21        84
           2       0.18      0.13      0.15        69
           4       0.22      0.14      0.17        87
           5       0.42      0.54      0.47       162

    accuracy                           0.32       402
   macro avg       0.26      0.26      0.25       402
weighted avg       0.29      0.32      0.30       402



In [24]:
# UGIP inputs for the polarity run: gold CSV and the predictions JSONL.
# These names are rebound from the earlier cells, so this cell must be run
# immediately before the call below.
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/llama/zero-shot/comarg_ugip_relation_identification_pol_without3_llama.jsonl'

print("=== UGIP ====")
# evaluate_predictions_polarity scores the gold labels without any mapping.
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.45      0.11      0.17        47
           2       0.48      0.17      0.26        86
           4       0.26      0.36      0.30        55
           5       0.54      0.82      0.65       130

    accuracy                           0.46       318
   macro avg       0.43      0.36      0.34       318
weighted avg       0.46      0.46      0.41       318

