In [26]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter, defaultdict
import json, csv
import numpy as np

In [27]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2 are mapped to 1.
    Label 3 remains 3.
    Label 4, 5 are mapped to 5.
    """
    mapping = {1: 1, 2: 1, 3: 3, 4: 5, 5: 5}
    return labels.map(mapping)

In [28]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):

    golden_df = pd.read_csv(golden_csv_path)
    
    golden_df['label'] = map_gold_labels(golden_df['label'])

    predictions_df = pd.read_json(predictions_json_path, lines=True) 

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], 
                        on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].astype(int)

    predictions = merged_df['label_pred'].astype(int)

    report = classification_report(ground_truth, predictions)
    print(report)

In [35]:
def evaluate_predictions_polarity(golden_csv_path: str, predictions_json_path: str):

    golden_df = pd.read_csv(golden_csv_path)
    
    predictions_df = pd.read_json(predictions_json_path, lines=True) 

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], 
                        on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].astype(int)

    predictions = merged_df['label_pred'].astype(int)

    report = classification_report(ground_truth, predictions)
    print(report)

## Evaluation ZERO SHOT

# NO polarity

In [31]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/zero-shot/comarg_gm_relation_identification_3_gpt.jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.15      0.58      0.24       159
           3       0.93      0.23      0.37       856
           5       0.34      0.57      0.43       269

    accuracy                           0.35      1284
   macro avg       0.47      0.46      0.35      1284
weighted avg       0.71      0.35      0.37      1284



In [33]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/zero-shot/comarg_ugip_relation_identification_3_gpt.jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.22      0.89      0.35       132
           3       0.93      0.18      0.30       692
           5       0.41      0.73      0.53       188

    accuracy                           0.38      1012
   macro avg       0.52      0.60      0.39      1012
weighted avg       0.74      0.38      0.35      1012



# Polarity

In [36]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/zero-shot/comarg_gm_relation_identification_5_gpt.jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.07      0.51      0.13        90
           2       0.00      0.00      0.00        73
           3       0.91      0.13      0.23       850
           4       0.07      0.07      0.07        98
           5       0.14      0.35      0.20       173

    accuracy                           0.18      1284
   macro avg       0.24      0.21      0.13      1284
weighted avg       0.63      0.18      0.20      1284



In [10]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/comarg_ugip_relation_identification_3_gpt.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.08      0.93      0.15        46
           2       0.00      0.00      0.00        86
           3       0.93      0.18      0.30       692
           4       0.00      0.00      0.00        58
           5       0.31      0.80      0.45       130

    accuracy                           0.27      1012
   macro avg       0.26      0.38      0.18      1012
weighted avg       0.68      0.27      0.27      1012



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Evaluation ONE SHOT

# NO polarity

In [55]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_gm_relation_identification3way_gpt_1shot.jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.13      0.58      0.21       161
           3       0.83      0.12      0.21       846
           5       0.23      0.36      0.28       272

    accuracy                           0.23      1279
   macro avg       0.40      0.35      0.23      1279
weighted avg       0.62      0.23      0.22      1279



In [56]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_ugip_relation_identification3way_gpt_1shot.jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.22      0.83      0.34       134
           3       0.89      0.25      0.39       685
           5       0.44      0.71      0.55       188

    accuracy                           0.41      1007
   macro avg       0.52      0.59      0.42      1007
weighted avg       0.72      0.41      0.41      1007



# Polarity

In [49]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_gm_relation_identification5ways_gpt_1shot (1).jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.07      0.57      0.13        86
           2       0.04      0.01      0.02        70
           3       0.88      0.20      0.33       857
           4       0.05      0.02      0.03        97
           5       0.19      0.36      0.24       168

    accuracy                           0.22      1278
   macro avg       0.25      0.23      0.15      1278
weighted avg       0.62      0.22      0.26      1278



In [54]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_ugip_relation_identification5way_gpt_1shot (1).jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.09      0.79      0.17        48
           2       0.06      0.01      0.02        86
           3       0.90      0.26      0.41       685
           4       0.20      0.16      0.18        58
           5       0.22      0.58      0.32       130

    accuracy                           0.30      1007
   macro avg       0.30      0.36      0.22      1007
weighted avg       0.67      0.30      0.34      1007



## Few shot

# NO polarity

In [58]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_gm_relation_identification3way_gpt_5shot.jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.15      0.64      0.24       161
           3       0.92      0.38      0.54       845
           5       0.39      0.35      0.37       273

    accuracy                           0.41      1279
   macro avg       0.49      0.46      0.39      1279
weighted avg       0.71      0.41      0.47      1279



In [59]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_ugip_relation_identification3way_gpt_5shot (1).jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.21      0.65      0.32       134
           3       0.92      0.31      0.46       685
           5       0.36      0.71      0.48       188

    accuracy                           0.43      1007
   macro avg       0.50      0.55      0.42      1007
weighted avg       0.72      0.43      0.45      1007



## Polarity

In [63]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_gm_relation_identification5ways_gpt_5shot (1).jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.12      0.48      0.19        89
           2       0.05      0.25      0.09        73
           3       0.89      0.38      0.53       842
           4       0.00      0.00      0.00        97
           5       0.32      0.37      0.35       175

    accuracy                           0.35      1276
   macro avg       0.28      0.30      0.23      1276
weighted avg       0.64      0.35      0.42      1276



In [64]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_ugip_relation_identification5way_gpt_5shot (1).jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.11      0.77      0.19        48
           2       0.13      0.05      0.07        86
           3       0.90      0.33      0.48       685
           4       0.18      0.14      0.16        58
           5       0.24      0.62      0.35       130

    accuracy                           0.35      1007
   macro avg       0.31      0.38      0.25      1007
weighted avg       0.67      0.35      0.39      1007

