In [19]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter
import json, jsonlines

## ZERO SHOT

## Evaluation GM

In [20]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2, 4, 5 are mapped to 1.
    Label 3 is mapped to 0.
    """
    mapping = {1: 1, 2: 1, 4: 1, 5: 1, 3: 0}
    return labels.map(mapping)

In [21]:
def evaluate_predictions_comarg(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """

    golden_df = pd.read_csv(golden_csv_path)
    golden_df['label'] = map_gold_labels(golden_df['label'])

    predictions_df = pd.read_json(predictions_json_path, lines=True)

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], on='id', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].values
    
    predictions = merged_df['label_pred'].values

    report = classification_report(ground_truth, predictions)
    print(report)

In [22]:

def evaluate_predictions_yru(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """
    
    golden_df = pd.read_csv(golden_csv_path)
    
    predictions_df = pd.read_json(predictions_json_path, lines=True)
    
    merged_df = pd.merge(
        golden_df[['uid', 'present']], 
        predictions_df[['id', 'label']], 
        left_on='uid', 
        right_on='id', 
        how='inner', 
        suffixes=('_gold', '_pred')
)
    ground_truth = merged_df['present'].values
    predictions = merged_df['label'].values

    report = classification_report(ground_truth, predictions)
    print(report)

## GM

In [23]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/zero-shot/comarg_gm_argument_identification_gemini.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.89      0.71      0.79       950
           1       0.56      0.81      0.66       436

    accuracy                           0.74      1386
   macro avg       0.73      0.76      0.73      1386
weighted avg       0.79      0.74      0.75      1386



## UGIP

In [25]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/zero-shot/comarg_ugip_argument_identification_gemini.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.51      0.66      1778
           1       0.23      0.81      0.36       322

    accuracy                           0.55      2100
   macro avg       0.58      0.66      0.51      2100
weighted avg       0.83      0.55      0.61      2100



======= YRU =========

## Abortion

In [54]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/zero-shot/yru_abortion_identification_gemini_0s.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.98      0.72      0.83      5935
           1       0.28      0.87      0.42       727

    accuracy                           0.74      6662
   macro avg       0.63      0.80      0.62      6662
weighted avg       0.90      0.74      0.78      6662



## Gay Rights

In [55]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/zero-shot/yru_gayRights_identification_gemini_0s.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.76      0.84      5080
           1       0.30      0.68      0.41       761

    accuracy                           0.75      5841
   macro avg       0.62      0.72      0.63      5841
weighted avg       0.86      0.75      0.78      5841



## Marijuana

In [56]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/zero-shot/yru_marijuana_identification_gemini_0s.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.92      0.81      0.87      4317
           1       0.33      0.57      0.42       687

    accuracy                           0.78      5004
   macro avg       0.63      0.69      0.64      5004
weighted avg       0.84      0.78      0.80      5004



## Obama

In [57]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/zero-shot/yru_obama_identification_gemini_0s.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.84      0.89      7261
           1       0.25      0.61      0.35       641

    accuracy                           0.82      7902
   macro avg       0.60      0.72      0.62      7902
weighted avg       0.90      0.82      0.85      7902



## ONE-SHOT 

In [58]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/one-shot/comarg_gm_identification_gemini_1shot.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.89      0.47      0.62       943
           1       0.43      0.87      0.58       432

    accuracy                           0.60      1375
   macro avg       0.66      0.67      0.60      1375
weighted avg       0.75      0.60      0.61      1375



In [59]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/one-shot/comarg_ugip_identification_gemini_1shot.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.98      0.36      0.52      1772
           1       0.21      0.95      0.35       322

    accuracy                           0.45      2094
   macro avg       0.59      0.65      0.43      2094
weighted avg       0.86      0.45      0.50      2094



## Abortion

In [10]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/one-shot/yru_abortion_identification_gemini_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.98      0.69      0.81      5747
           1       0.27      0.89      0.41       709

    accuracy                           0.72      6456
   macro avg       0.62      0.79      0.61      6456
weighted avg       0.90      0.72      0.77      6456



## Gay Rights

In [11]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayRights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/one-shot/yru_gayRights_identification_gemini_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.95      0.76      0.85      5085
           1       0.31      0.72      0.43       758

    accuracy                           0.76      5843
   macro avg       0.63      0.74      0.64      5843
weighted avg       0.86      0.76      0.79      5843



## Marijuana

In [12]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/one-shot/yru_marijuana_identification_gemini_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.92      0.71      0.80      4102
           1       0.26      0.64      0.37       670

    accuracy                           0.70      4772
   macro avg       0.59      0.68      0.59      4772
weighted avg       0.83      0.70      0.74      4772



## Obama

In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/one-shot/yru_obama_identification_GPT_1shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.88      0.92      7244
           1       0.29      0.54      0.38       640

    accuracy                           0.85      7884
   macro avg       0.62      0.71      0.65      7884
weighted avg       0.90      0.85      0.87      7884



## K-SHOTS

In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/few-shot/comarg_gm_identification_gemini_5shot.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.90      0.47      0.62       897
           1       0.43      0.88      0.57       401

    accuracy                           0.60      1298
   macro avg       0.66      0.68      0.60      1298
weighted avg       0.75      0.60      0.61      1298



In [None]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/few-shot/comarg_ugip_identification_gemini_5shot.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.98      0.53      0.69      1772
           1       0.26      0.93      0.41       322

    accuracy                           0.59      2094
   macro avg       0.62      0.73      0.55      2094
weighted avg       0.87      0.59      0.65      2094



## Abortion

In [14]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/few-shot/yru_abortion_identification_gemini_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.97      0.84      0.90      5380
           1       0.38      0.82      0.52       661

    accuracy                           0.83      6041
   macro avg       0.68      0.83      0.71      6041
weighted avg       0.91      0.83      0.86      6041



## Gay Rights

In [15]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/few-shot/yru_gayRights_identification_gemini_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.94      0.79      0.86      5034
           1       0.33      0.69      0.45       752

    accuracy                           0.78      5786
   macro avg       0.64      0.74      0.65      5786
weighted avg       0.86      0.78      0.81      5786



## Marijuana

In [16]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/few-shot/yru_marijuana_identification_gemini_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.95      0.63      0.75      4150
           1       0.25      0.79      0.38       668

    accuracy                           0.65      4818
   macro avg       0.60      0.71      0.57      4818
weighted avg       0.85      0.65      0.70      4818



## Obama

In [17]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/gemini1.5/few-shot/yru_obama_identification_gemini_5shot.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.96      0.84      0.90      7204
           1       0.25      0.60      0.35       631

    accuracy                           0.82      7835
   macro avg       0.60      0.72      0.62      7835
weighted avg       0.90      0.82      0.85      7835

