In [41]:
import json, csv
import warnings
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [26]:
def map_gold_labels(labels):
    """
    Collapse the five-way gold annotation into a binary evaluation target.

    Label 3 becomes 0; labels 1, 2, 4 and 5 become 1. A value with no entry
    in the lookup table comes back as NaN, because `Series.map` leaves
    unmatched keys missing.
    """
    binary_of = dict.fromkeys((1, 2, 4, 5), 1)
    binary_of[3] = 0
    return labels.map(binary_of)

In [38]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data and prints a classification report.

    Gold labels are binarized with `map_gold_labels` before scoring. Gold and
    predicted rows are paired on their shared `id` column with an inner join,
    so only ids present in both files are scored.

    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data
      (columns `id` and `label` are read).
    - predictions_json_path: Path to the JSON Lines file containing predictions
      (fields `id` and `label` are read).

    Output:
    - Prints the classification report comparing the predictions with the
      mapped gold labels. Nothing is returned.
    - Emits a warning when gold rows have no matching prediction, or when
      prediction ids are duplicated, because either one silently changes the
      sample the report is computed on.
    """
    golden_df = pd.read_csv(golden_csv_path)
    # Build the binarized gold frame without overwriting the loaded column.
    gold = golden_df[['id', 'label']].assign(label=map_gold_labels(golden_df['label']))

    predictions = pd.read_json(predictions_json_path, lines=True)[['id', 'label']]

    merged_df = pd.merge(gold, predictions, on='id', how='inner', suffixes=('_gold', '_pred'))

    # The inner join drops unmatched gold rows without a trace; surface that.
    has_prediction = gold['id'].isin(predictions['id'])
    n_unscored = int((~has_prediction).sum())
    if n_unscored:
        warnings.warn(
            f"{n_unscored} of {len(gold)} gold rows have no prediction and are "
            "excluded from the report.",
            stacklevel=2,
        )
    # Every matched gold row yields one merged row per prediction with that id,
    # so a surplus of merged rows means some prediction ids are repeated.
    if len(merged_df) > int(has_prediction.sum()):
        warnings.warn(
            "Duplicate ids in the predictions file: some gold rows are counted "
            "more than once in the report.",
            stacklevel=2,
        )

    ground_truth = merged_df['label_gold'].values
    predicted = merged_df['label_pred'].values

    report = classification_report(ground_truth, predicted)
    print(report)



In [31]:
def evaluate_predictions_yru(golden_csv_path: str, model_output_path: str):
    """
    Evaluates predictions against the golden data and prints a classification report.

    The gold column `present` is used as-is (no label remapping). Gold rows
    are paired with predictions by matching the gold `uid` column to the
    prediction `id` field with an inner join, so only ids present in both
    files are scored.

    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data
      (columns `uid` and `present` are read).
    - model_output_path: Path to the JSON Lines file containing predictions
      (fields `id` and `label` are read).

    Output:
    - Prints the classification report comparing the predictions with the
      gold `present` labels. Nothing is returned.
    - Emits a warning when gold rows have no matching prediction, or when
      prediction ids are duplicated, because either one silently changes the
      sample the report is computed on.
    """
    gold = pd.read_csv(golden_csv_path)[['uid', 'present']]
    predictions = pd.read_json(model_output_path, lines=True)[['id', 'label']]

    # The two frames share no column names, so no suffixes are needed.
    merged_df = pd.merge(
        gold,
        predictions,
        left_on='uid',
        right_on='id',
        how='inner',
    )

    # The inner join drops unmatched gold rows without a trace; surface that.
    has_prediction = gold['uid'].isin(predictions['id'])
    n_unscored = int((~has_prediction).sum())
    if n_unscored:
        warnings.warn(
            f"{n_unscored} of {len(gold)} gold rows have no prediction and are "
            "excluded from the report.",
            stacklevel=2,
        )
    # Every matched gold row yields one merged row per prediction with that id,
    # so a surplus of merged rows means some prediction ids are repeated.
    if len(merged_df) > int(has_prediction.sum()):
        warnings.warn(
            "Duplicate ids in the predictions file: some gold rows are counted "
            "more than once in the report.",
            stacklevel=2,
        )

    ground_truth = merged_df['present'].values
    predicted = merged_df['label'].values

    report = classification_report(ground_truth, predicted)
    print(report)


## Evaluation GM

In [39]:
# Score the llama3 output file comarg_gm_argument_identification.jsonl against
# the gold labels in GM_structured.csv (gold labels are binarized inside
# evaluate_predictions).
# NOTE(review): both paths are absolute and user-specific; consider deriving
# them from a configurable base directory.
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_gm_argument_identification.jsonl'

evaluate_predictions(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.76      0.83      0.79       849
           1       0.60      0.50      0.54       436

    accuracy                           0.72      1285
   macro avg       0.68      0.66      0.67      1285
weighted avg       0.71      0.72      0.71      1285



## Evaluation UGIP

In [40]:
# Score the llama3 output file comarg_ugip_argument_identification.jsonl
# against the gold labels in UGIP_structured.csv (gold labels are binarized
# inside evaluate_predictions).
# NOTE(review): both paths are absolute and user-specific; consider deriving
# them from a configurable base directory.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_ugip_argument_identification.jsonl'

evaluate_predictions(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.80      0.67      0.73       690
           1       0.48      0.64      0.55       322

    accuracy                           0.66      1012
   macro avg       0.64      0.66      0.64      1012
weighted avg       0.70      0.66      0.67      1012



# YRU Dataset

## Abortion

In [20]:
# Score the llama3 output for the YRU abortion file; the gold `present` column
# is compared directly with the predicted label in evaluate_predictions_yru.
# NOTE(review): both paths are absolute and user-specific; consider deriving
# them from a configurable base directory.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_abortion_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.95      0.84      0.89      5973
           1       0.31      0.63      0.42       702

    accuracy                           0.81      6675
   macro avg       0.63      0.73      0.65      6675
weighted avg       0.88      0.81      0.84      6675



## Gay Rights

In [25]:
# Score the llama3 output for the YRU gayRights file; the gold `present`
# column is compared directly with the predicted label in
# evaluate_predictions_yru.
# NOTE(review): both paths are absolute and user-specific; consider deriving
# them from a configurable base directory.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayRights_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_gayRights_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      5059
           1       0.35      0.61      0.44       782

    accuracy                           0.79      5841
   macro avg       0.64      0.71      0.66      5841
weighted avg       0.85      0.79      0.82      5841



## Marijuana

In [22]:
# Score the llama3 output for the YRU marijuana file; the gold `present`
# column is compared directly with the predicted label in
# evaluate_predictions_yru.
# NOTE(review): both paths are absolute and user-specific; consider deriving
# them from a configurable base directory.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_marijuana_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.92      0.82      0.87      4318
           1       0.32      0.52      0.39       685

    accuracy                           0.78      5003
   macro avg       0.62      0.67      0.63      5003
weighted avg       0.83      0.78      0.80      5003



## Obama

In [23]:
# Score the llama3 output for the YRU obama file; the gold `present` column is
# compared directly with the predicted label in evaluate_predictions_yru.
# NOTE(review): both paths are absolute and user-specific; consider deriving
# them from a configurable base directory.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_obama_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.96      0.84      0.89      7270
           1       0.23      0.55      0.32       632

    accuracy                           0.82      7902
   macro avg       0.59      0.69      0.61      7902
weighted avg       0.90      0.82      0.85      7902

