In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter
import json, jsonlines

## ZERO SHOT

## Evaluation GM

In [2]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2, 4, 5 are mapped to 1.
    Label 3 is mapped to 0.
    """
    mapping = {1: 1, 2: 1, 4: 1, 5: 1, 3: 0}
    return labels.map(mapping)

In [45]:
def evaluate_predictions_comarg(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """

    golden_df = pd.read_csv(golden_csv_path)
    golden_df['label'] = map_gold_labels(golden_df['label'])
    
    predictions_df = pd.read_json(predictions_json_path, lines=True)
    
    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], on='id', suffixes=('_gold', '_pred'))
    ground_truth = merged_df['label_gold'].values

    predictions = merged_df['label_pred'].values

    report = classification_report(ground_truth, predictions)
    print(report)

In [46]:

def evaluate_predictions_yru(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """
    
    golden_df = pd.read_csv(golden_csv_path)

    predictions_df = pd.read_json(predictions_json_path, lines=True)

    merged_df = pd.merge(
        golden_df[['uid', 'present']], 
        predictions_df[['id', 'label']], 
        left_on='uid', 
        right_on='id', 
        how='inner'
)

    ground_truth = merged_df['present'].values
    predictions = merged_df['label'].values

    report = classification_report(ground_truth, predictions)
    print(report)

# ZERO shot

## GM

In [53]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/zero-shot/comarg_gm_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.80      0.89      0.84       945
           1       0.67      0.51      0.58       434

    accuracy                           0.77      1379
   macro avg       0.74      0.70      0.71      1379
weighted avg       0.76      0.77      0.76      1379



## UGIP

In [68]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/zero-shot/comarg_ugip_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.90      0.70      0.79      1772
           1       0.26      0.56      0.35       322

    accuracy                           0.68      2094
   macro avg       0.58      0.63      0.57      2094
weighted avg       0.80      0.68      0.72      2094



======= YRU ========= zero shot

## Abortion

In [54]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/zero-shot/yru_abortion_identification_GPT_negatives.jsonl'
evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.98      0.85      0.91      5932
           1       0.40      0.83      0.54       723

    accuracy                           0.85      6655
   macro avg       0.69      0.84      0.72      6655
weighted avg       0.91      0.85      0.87      6655



## Gay Rights

In [74]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/zero-shot/yru_gayRights_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.78      0.85      5078
           1       0.31      0.68      0.43       758

    accuracy                           0.77      5836
   macro avg       0.63      0.73      0.64      5836
weighted avg       0.86      0.77      0.80      5836



## Marijuana

In [73]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/zero-shot/yru_marijuana_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      4315
           1       0.34      0.58      0.43       684

    accuracy                           0.79      4999
   macro avg       0.63      0.70      0.65      4999
weighted avg       0.85      0.79      0.81      4999



## Obama

In [57]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/zero-shot/yru_obama_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.97      0.83      0.89      7259
           1       0.26      0.67      0.37       638

    accuracy                           0.81      7897
   macro avg       0.61      0.75      0.63      7897
weighted avg       0.91      0.81      0.85      7897



## ONE-SHOT

In [51]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/comarg_gm_identification_GPT_1shot.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       945
           1       0.68      0.47      0.56       434

    accuracy                           0.76      1379
   macro avg       0.73      0.69      0.70      1379
weighted avg       0.75      0.76      0.75      1379



## UGIP

In [79]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/comarg_ugip_identification_GPT_1shot.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.89      0.67      0.76      1772
           1       0.24      0.57      0.33       322

    accuracy                           0.65      2094
   macro avg       0.57      0.62      0.55      2094
weighted avg       0.79      0.65      0.70      2094



## Abortion

In [70]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/yru_abortion_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.72      0.82      5976
           1       0.24      0.77      0.37       707

    accuracy                           0.72      6683
   macro avg       0.60      0.74      0.60      6683
weighted avg       0.89      0.72      0.77      6683



## Gay Rights

In [59]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/yru_gayRights_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.81      0.87      5078
           1       0.33      0.63      0.44       758

    accuracy                           0.79      5836
   macro avg       0.63      0.72      0.65      5836
weighted avg       0.86      0.79      0.81      5836



## Marijuana

In [71]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/yru_marijuana_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.92      0.67      0.77      4324
           1       0.23      0.61      0.33       686

    accuracy                           0.66      5010
   macro avg       0.57      0.64      0.55      5010
weighted avg       0.82      0.66      0.71      5010



## Obama

In [61]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/one-shot/yru_obama_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      7242
           1       0.29      0.54      0.38       637

    accuracy                           0.85      7879
   macro avg       0.62      0.71      0.65      7879
weighted avg       0.90      0.85      0.87      7879



## K-SHOTS

In [62]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/comarg_gm_identification_GPT_5shot.jsonl'

print(" === GM ===")
evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

 === GM ===
              precision    recall  f1-score   support

           0       0.78      0.83      0.80       945
           1       0.56      0.48      0.52       434

    accuracy                           0.72      1379
   macro avg       0.67      0.66      0.66      1379
weighted avg       0.71      0.72      0.71      1379



## UGIP

In [63]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/comarg_ugip_identification_GPT_5shot.jsonl'

print(" === GM ===")
evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

 === GM ===
              precision    recall  f1-score   support

           0       0.90      0.71      0.79      1772
           1       0.26      0.56      0.35       322

    accuracy                           0.68      2094
   macro avg       0.58      0.63      0.57      2094
weighted avg       0.80      0.68      0.72      2094



## Abortion

In [64]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/yru_abortion_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.97      0.85      0.91      5932
           1       0.41      0.82      0.54       723

    accuracy                           0.85      6655
   macro avg       0.69      0.84      0.73      6655
weighted avg       0.91      0.85      0.87      6655



## Gay Rights

In [65]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/yru_gayRights_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.82      0.87      5078
           1       0.34      0.64      0.45       758

    accuracy                           0.79      5836
   macro avg       0.64      0.73      0.66      5836
weighted avg       0.86      0.79      0.82      5836



## Marijuana

In [66]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/yru_marijuana_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.92      0.85      0.88      4315
           1       0.36      0.54      0.43       684

    accuracy                           0.80      4999
   macro avg       0.64      0.69      0.66      4999
weighted avg       0.84      0.80      0.82      4999



## Obama

In [67]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gpt4o-mini/five-shot/yru_obama_identification_GPT_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      7242
           1       0.29      0.56      0.39       637

    accuracy                           0.86      7879
   macro avg       0.63      0.72      0.65      7879
weighted avg       0.90      0.86      0.87      7879

