In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix, ConfusionMatrixDisplay
from collections import Counter, defaultdict
import json, csv
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2 are mapped to 1.
    Label 4, 5 are mapped to 5.
    """
    mapping = {1: 1, 2: 1, 4: 5, 5: 5}
    return labels.map(mapping)

In [3]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):
    golden_df = pd.read_csv(golden_csv_path)
    
    golden_df['label'] = map_gold_labels(golden_df['label'])

    predictions_df = pd.read_json(predictions_json_path, lines=True) 

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], 
                        on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].astype(int)

    predictions = merged_df['label_pred'].astype(int)

    report = classification_report(ground_truth, predictions)
    print(report)

In [4]:
def evaluate_predictions_polarity(golden_csv_path: str, predictions_json_path: str):

    golden_df = pd.read_csv(golden_csv_path)
    
    predictions_df = pd.read_json(predictions_json_path, lines=True) 
 
    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], 
                        on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].astype(int)

    predictions = merged_df['label_pred'].astype(int)

    report = classification_report(ground_truth, predictions)
    print(report)

## Evaluation ZERO SHOT

In [4]:
# No polarity

In [5]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/zero-shot/comarg_gm_relation_identification_nopol_without3_gemini.jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.63      1.00      0.77       160
           5       1.00      0.66      0.79       271

    accuracy                           0.78       431
   macro avg       0.82      0.83      0.78       431
weighted avg       0.86      0.78      0.79       431



In [6]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/zero-shot/comarg_ugip_relation_identification_nopol_without3_gemini.jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.80      0.92      0.86       132
           5       0.94      0.84      0.89       185

    accuracy                           0.87       317
   macro avg       0.87      0.88      0.87       317
weighted avg       0.88      0.87      0.87       317



## Polarity

In [7]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/zero-shot/comarg_gm_relation_identification_pol_without3_gemini.jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.34      0.93      0.50        88
           2       0.39      0.12      0.19        72
           4       0.44      0.07      0.12        96
           5       0.68      0.59      0.63       175

    accuracy                           0.47       431
   macro avg       0.46      0.43      0.36       431
weighted avg       0.51      0.47      0.42       431



In [8]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/zero-shot/comarg_ugip_relation_identification_pol_without3_gemini.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.27      0.89      0.42        47
           2       0.27      0.05      0.08        85
           4       0.25      0.02      0.03        57
           5       0.66      0.74      0.70       128

    accuracy                           0.45       317
   macro avg       0.36      0.43      0.31       317
weighted avg       0.42      0.45      0.37       317



## ONE shot

# NO polarity

In [9]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/one-shot/comarg_gm_relation_identification2ways_gemini_1shot.jsonl'

print("=== GM ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.49      0.99      0.66       160
           5       0.98      0.39      0.56       271

    accuracy                           0.61       431
   macro avg       0.74      0.69      0.61       431
weighted avg       0.80      0.61      0.60       431



In [19]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/one-shot/comarg_ugip_relation_identification2ways_gemini_1shot.jsonl'

print("=== UGIP ====")
evaluate_predictions(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.56      0.92      0.70       132
           5       0.90      0.48      0.62       185

    accuracy                           0.66       317
   macro avg       0.73      0.70      0.66       317
weighted avg       0.76      0.66      0.65       317



## Polarity

In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/zero-shot/comarg_gm_relation_identification_pol_without3_gemini.jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.34      0.93      0.50        89
           2       0.39      0.12      0.19        73
           4       0.44      0.07      0.12        98
           5       0.68      0.59      0.63       176

    accuracy                           0.47       436
   macro avg       0.46      0.43      0.36       436
weighted avg       0.51      0.47      0.42       436



In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/zero-shot/comarg_ugip_relation_identification_pol_without3_gemini.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.27      0.90      0.42        48
           2       0.27      0.05      0.08        86
           4       0.25      0.02      0.03        58
           5       0.67      0.75      0.71       130

    accuracy                           0.45       322
   macro avg       0.36      0.43      0.31       322
weighted avg       0.43      0.45      0.37       322



# Polarity

In [12]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/one-shot/comarg_ugip_relation_identification5ways_gemini_1shot.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.21      0.96      0.35        47
           2       0.36      0.06      0.10        85
           4       0.33      0.02      0.03        57
           5       0.78      0.55      0.64       128

    accuracy                           0.38       317
   macro avg       0.42      0.40      0.28       317
weighted avg       0.50      0.38      0.34       317



In [13]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/one-shot/comarg_gm_relation_identification5ways_gemini_1shot.jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.33      0.84      0.48       177
           2       0.36      0.10      0.16       144
           3       0.00      0.00      0.00         0
           4       0.34      0.06      0.10       194
           5       0.65      0.57      0.61       350

    accuracy                           0.43       865
   macro avg       0.34      0.31      0.27       865
weighted avg       0.47      0.43      0.39       865



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## FEW shot

# NO Polarity

In [14]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/five-shot/comarg_gm_relation_identification2ways_gemini_5shot.jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.31      0.84      0.46       101
           2       0.34      0.22      0.27        87
           4       0.28      0.10      0.15       107
           5       0.71      0.46      0.56       204

    accuracy                           0.42       499
   macro avg       0.41      0.41      0.36       499
weighted avg       0.47      0.42      0.40       499



In [15]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/five-shot/comarg_ugip_relation_identification2ways_gemini_5shot.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.20      0.98      0.33        47
           2       0.40      0.02      0.04        85
           4       0.33      0.02      0.03        57
           5       0.76      0.46      0.57       128

    accuracy                           0.34       317
   macro avg       0.42      0.37      0.25       317
weighted avg       0.50      0.34      0.30       317



# Polarity 

In [20]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/five-shot/comarg_gm_relation_identification5ways_gemini_5shot.jsonl'

print("=== GM ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== GM ====
              precision    recall  f1-score   support

           1       0.39      0.81      0.53        88
           2       0.30      0.15      0.20        72
           4       0.17      0.06      0.09        96
           5       0.69      0.70      0.69       175

    accuracy                           0.49       431
   macro avg       0.39      0.43      0.38       431
weighted avg       0.45      0.49      0.44       431



In [21]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task2/output_files/gemini/five-shot/comarg_ugip_relation_identification5ways_gemini_5shot.jsonl'

print("=== UGIP ====")
evaluate_predictions_polarity(golden_csv_path, predictions_json_path)

=== UGIP ====
              precision    recall  f1-score   support

           1       0.19      0.96      0.32        47
           2       0.60      0.04      0.07        85
           4       0.00      0.00      0.00        59
           5       0.76      0.44      0.55       128

    accuracy                           0.33       319
   macro avg       0.39      0.36      0.23       319
weighted avg       0.49      0.33      0.29       319

