In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter
import json, jsonlines

## ZERO SHOT

## Evaluation GM

In [2]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2, 4, 5 are mapped to 1.
    Label 3 is mapped to 0.
    """
    mapping = {1: 1, 2: 1, 4: 1, 5: 1, 3: 0}
    return labels.map(mapping)

In [5]:
def evaluate_predictions_comarg(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """

    golden_df = pd.read_csv(golden_csv_path)
    golden_df['label'] = map_gold_labels(golden_df['label'])

    predictions_df = pd.read_json(predictions_json_path, lines=True)
    
    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], on='id', suffixes=('_gold', '_pred'))
    ground_truth = merged_df['label_gold'].values
    
    predictions = merged_df['label_pred'].values

    report = classification_report(ground_truth, predictions)
    print(report)

In [49]:

def evaluate_predictions_yru(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """
    
    golden_df = pd.read_csv(golden_csv_path)
    
    predictions_df = pd.read_json(predictions_json_path, lines=True)
    
    merged_df = pd.merge(
        golden_df[['uid', 'present']], 
        predictions_df[['id', 'label']], 
        left_on='uid', 
        right_on='id', 
        how='inner', 
        suffixes=('_gold', '_pred')
)
    print(merged_df)
    ground_truth = merged_df['present'].values
    predictions = merged_df['label'].values

    report = classification_report(ground_truth, predictions)
    print(report)

## GM

In [52]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/zero-shot/comarg_gm_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

           id  label_gold  label_pred
0     100arg1           0           0
1     100arg2           0           0
2     100arg3           0           0
3     100arg4           1           1
4     100arg5           1           0
...       ...         ...         ...
1381    9arg3           0           0
1382    9arg4           0           1
1383    9arg5           1           0
1384    9arg6           1           1
1385    9arg7           0           0

[1386 rows x 3 columns]
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       950
           1       0.67      0.50      0.58       436

    accuracy                           0.77      1386
   macro avg       0.73      0.70      0.71      1386
weighted avg       0.76      0.77      0.76      1386



## UGIP

In [48]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/zero-shot/comarg_ugip_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.90      0.71      0.79      1778
           1       0.26      0.56      0.35       322

    accuracy                           0.68      2100
   macro avg       0.58      0.63      0.57      2100
weighted avg       0.80      0.68      0.72      2100



======= YRU =========

## Abortion

In [27]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_abortion_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.98      0.85      0.91      5934
           1       0.40      0.83      0.54       726

    accuracy                           0.85      6660
   macro avg       0.69      0.84      0.72      6660
weighted avg       0.91      0.85      0.87      6660



## Gay Rights

In [28]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_gayRights_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.78      0.85      5080
           1       0.31      0.68      0.43       761

    accuracy                           0.77      5841
   macro avg       0.63      0.73      0.64      5841
weighted avg       0.86      0.77      0.80      5841



## Marijuana

In [29]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_marijuana_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.92      0.82      0.87      4317
           1       0.34      0.58      0.43       687

    accuracy                           0.79      5004
   macro avg       0.63      0.70      0.65      5004
weighted avg       0.84      0.79      0.81      5004



## Obama

In [30]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_obama_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.97      0.83      0.89      7261
           1       0.26      0.67      0.37       641

    accuracy                           0.81      7902
   macro avg       0.61      0.75      0.63      7902
weighted avg       0.91      0.81      0.85      7902



## ONE-SHOT

In [6]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/comarg_gm_identification_GPT_1shot.jsonl'

print(" === GM === ")
evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

 === GM === 
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       945
           1       0.68      0.47      0.56       434

    accuracy                           0.76      1379
   macro avg       0.73      0.69      0.70      1379
weighted avg       0.75      0.76      0.75      1379



In [10]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/comarg_ugip_identification_GPT_1shot.jsonl'

print(" === UGIP === ")
evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

 === UGIP === 
              precision    recall  f1-score   support

           0       0.89      0.67      0.76      1772
           1       0.24      0.57      0.33       322

    accuracy                           0.65      2094
   macro avg       0.57      0.62      0.55      2094
weighted avg       0.79      0.65      0.70      2094



## UGIP

In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/zero-shot/comarg_ugip_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.90      0.71      0.79      1778
           1       0.26      0.56      0.35       322

    accuracy                           0.68      2100
   macro avg       0.58      0.63      0.57      2100
weighted avg       0.80      0.68      0.72      2100



## Abortion

In [31]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/one-shot/yru_obama_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      7244
           1       0.29      0.54      0.38       640

    accuracy                           0.85      7884
   macro avg       0.62      0.71      0.65      7884
weighted avg       0.90      0.85      0.87      7884



## Gay Rights

In [32]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/one-shot/yru_gayrights_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.81      0.87      5080
           1       0.33      0.63      0.44       761

    accuracy                           0.79      5841
   macro avg       0.64      0.72      0.65      5841
weighted avg       0.86      0.79      0.81      5841



## Marijuana

In [33]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/one-shot/yru_marijuana_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.92      0.87      0.89      4317
           1       0.38      0.51      0.43       687

    accuracy                           0.82      5004
   macro avg       0.65      0.69      0.66      5004
weighted avg       0.84      0.82      0.83      5004



## Obama

In [34]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/one-shot/yru_obama_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      7244
           1       0.29      0.54      0.38       640

    accuracy                           0.85      7884
   macro avg       0.62      0.71      0.65      7884
weighted avg       0.90      0.85      0.87      7884



## K-SHOTS

In [8]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/comarg_gm_identification_GPT_5shot.jsonl'

print(" === GM ===")
evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

 === GM ===
              precision    recall  f1-score   support

           0       0.78      0.83      0.80       945
           1       0.56      0.48      0.52       434

    accuracy                           0.72      1379
   macro avg       0.67      0.66      0.66      1379
weighted avg       0.71      0.72      0.71      1379



## UGIP

In [9]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/comarg_ugip_identification_GPT_5shot.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.90      0.71      0.79      1772
           1       0.26      0.56      0.35       322

    accuracy                           0.68      2094
   macro avg       0.58      0.63      0.57      2094
weighted avg       0.80      0.68      0.72      2094



## Abortion

In [36]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/k-shots/yru_obama_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      7244
           1       0.29      0.56      0.39       640

    accuracy                           0.86      7884
   macro avg       0.63      0.72      0.65      7884
weighted avg       0.90      0.86      0.87      7884



## Gay Rights

In [37]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/k-shots/yru_gayrights_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.82      0.87      5080
           1       0.34      0.64      0.45       761

    accuracy                           0.79      5841
   macro avg       0.64      0.73      0.66      5841
weighted avg       0.86      0.79      0.82      5841



## Marijuana

In [38]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/k-shots/yru_marijuana_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.92      0.85      0.88      4317
           1       0.36      0.54      0.43       687

    accuracy                           0.80      5004
   macro avg       0.64      0.69      0.66      5004
weighted avg       0.84      0.80      0.82      5004



## Obama

In [39]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/k-shots/yru_obama_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      7244
           1       0.29      0.56      0.39       640

    accuracy                           0.86      7884
   macro avg       0.63      0.72      0.65      7884
weighted avg       0.90      0.86      0.87      7884

