In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter, defaultdict
import json, csv
import numpy as np

In [5]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2 are mapped to 1.
    Label 4, 5 are mapped to 5.
    """
    mapping = {1: 1, 2: 1, 4: 5, 5: 5}
    return labels.map(mapping)

In [6]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):

    golden_df = pd.read_csv(golden_csv_path)
    
    golden_df['label'] = map_gold_labels(golden_df['label'])

    predictions_df = pd.read_json(predictions_json_path, lines=True) 

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], 
                        on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].astype(int)

    predictions = merged_df['label_pred'].astype(int)

    report = classification_report(ground_truth, predictions)
    print(report)

In [7]:
def evaluate_predictions_polarity(golden_csv_path: str, predictions_json_path: str):

    golden_df = pd.read_csv(golden_csv_path)
    
    predictions_df = pd.read_json(predictions_json_path, lines=True) 

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], 
                        on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].astype(int)

    predictions = merged_df['label_pred'].astype(int)

    report = classification_report(ground_truth, predictions)
    print(report)

## Evaluation ZERO SHOT

# NO polarity

In [78]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/zero-shot/comarg_gm_relation_identification_nopol_without3_gpt.jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.37      0.64      0.47       162
           5       0.63      0.36      0.46       274

    accuracy                           0.47       436
   macro avg       0.50      0.50      0.47       436
weighted avg       0.53      0.47      0.46       436



In [79]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/zero-shot/comarg_ugip_relation_identification_nopol_without3_gpt.jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.66      0.90      0.76       134
           5       0.90      0.68      0.77       188

    accuracy                           0.77       322
   macro avg       0.78      0.79      0.77       322
weighted avg       0.80      0.77      0.77       322



# Polarity

In [81]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/zero-shot/comarg_gm_relation_identification_polarity_without3_gpt.jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.19      0.49      0.27        90
           2       0.07      0.01      0.02        73
           4       0.36      0.08      0.14        96
           5       0.38      0.35      0.37       172

    accuracy                           0.26       431
   macro avg       0.25      0.24      0.20       431
weighted avg       0.28      0.26      0.24       431



In [82]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/zero-shot/comarg_ugip_relation_identification_polarity_without3_gpt.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.26      0.91      0.41        46
           2       0.00      0.00      0.00        86
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00        57
           5       0.67      0.79      0.73       126

    accuracy                           0.45       315
   macro avg       0.19      0.34      0.23       315
weighted avg       0.31      0.45      0.35       315



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Evaluation ONE SHOT

# NO polarity

In [8]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_gm_relation_identification2ways_gpt_1shot (1).jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.36      0.59      0.45       160
           3       0.00      0.00      0.00         0
           5       0.61      0.38      0.47       271

    accuracy                           0.46       431
   macro avg       0.33      0.32      0.31       431
weighted avg       0.52      0.46      0.46       431



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_ugip_relation_identification2way_gpt_1shot (1).jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.74      0.96      0.84       132
           3       0.00      0.00      0.00         0
           5       0.97      0.75      0.84       185

    accuracy                           0.84       317
   macro avg       0.57      0.57      0.56       317
weighted avg       0.87      0.84      0.84       317



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Polarity

In [10]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_gm_relation_identification4ways_gpt_1shot (1).jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.19      0.51      0.28        88
           2       0.10      0.01      0.02        72
           3       0.00      0.00      0.00         0
           4       0.14      0.03      0.05        96
           5       0.39      0.37      0.38       175

    accuracy                           0.26       431
   macro avg       0.17      0.19      0.15       431
weighted avg       0.25      0.26      0.23       431



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/one-shot/comarg_ugip_relation_identification4way_gpt_1shot.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.26      0.96      0.41        47
           2       0.45      0.06      0.10        85
           3       0.00      0.00      0.00         0
           4       0.50      0.07      0.12        57
           5       0.56      0.54      0.55       128

    accuracy                           0.39       317
   macro avg       0.35      0.33      0.24       317
weighted avg       0.48      0.39      0.33       317



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Few shot

# NO polarity

In [13]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_gm_relation_identification2ways_gpt_5shot (1).jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.35      0.51      0.41       159
           5       0.60      0.43      0.50       269

    accuracy                           0.46       428
   macro avg       0.47      0.47      0.46       428
weighted avg       0.50      0.46      0.47       428



In [14]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_ugip_relation_identification2way_gpt_5shot (1).jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.52      0.89      0.66       132
           5       0.85      0.42      0.56       185

    accuracy                           0.62       317
   macro avg       0.68      0.66      0.61       317
weighted avg       0.71      0.62      0.60       317



## Polarity

In [15]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_gm_relation_identification4ways_gpt_5shot (1).jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.17      0.34      0.22        88
           2       0.17      0.12      0.15        72
           4       0.16      0.03      0.05        96
           5       0.35      0.36      0.36       174

    accuracy                           0.24       430
   macro avg       0.21      0.21      0.19       430
weighted avg       0.24      0.24      0.23       430



In [16]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gpt/five-shots/comarg_ugip_relation_identification4way_gpt_5shot (1).jsonl'
print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.22      0.66      0.33        47
           2       0.36      0.31      0.33        85
           4       0.00      0.00      0.00        57
           5       0.54      0.42      0.47       128

    accuracy                           0.35       317
   macro avg       0.28      0.35      0.28       317
weighted avg       0.35      0.35      0.33       317

