In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter
import json

## Evaluation GM

In [3]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2, 4, 5 are mapped to 1.
    Label 3 is mapped to 0.
    """
    mapping = {1: 1, 2: 1, 4: 1, 5: 1, 3: 0}
    return labels.map(mapping)

In [79]:
def evaluate_predictions_comarg(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """

    golden_df = pd.read_csv(golden_csv_path)

    with open(predictions_json_path, 'r') as f:
        predictions = json.load(f)
        
    predictions = [json.loads(pred) for pred in predictions]
    predictions_df = pd.DataFrame(predictions)

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], on='id', suffixes=('_gold', '_pred'))

    merged_df['label_gold'] = map_gold_labels(merged_df['label_gold'].astype(int))
    
    ground_truth = merged_df['label_gold'].values
    
    predictions = merged_df['label_pred'].values

    report = classification_report(ground_truth, predictions)
    print(report)

In [11]:
pd.set_option('display.max_colwidth', None)

def evaluate_predictions_yru(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """

    golden_df = pd.read_csv(golden_csv_path)
    golden_df = golden_df[golden_df['present'] == 0]
    golden_df.to_csv('check_abortion_0.csv')

    with open(predictions_json_path, 'r') as f:
        predictions = json.load(f)
        
    predictions = [json.loads(pred) for pred in predictions]
    predictions_df = pd.DataFrame(predictions)

    merged_df = pd.DataFrame({
        'id': golden_df['id'],
        'label_gold': golden_df['label'],
        'label_pred': predictions_df['label']  
    })


    ground_truth = merged_df['label_gold'].values
    predictions = merged_df['label_pred'].values

    report = classification_report(ground_truth, predictions)
    print(report)

## GM

In [40]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_gm_argument_identification.json'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.77      0.89      0.83       849
           1       0.69      0.50      0.58       436

    accuracy                           0.75      1285
   macro avg       0.73      0.69      0.70      1285
weighted avg       0.75      0.75      0.74      1285



## UGIP

In [22]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_ugip_argument_identification.json'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.79      0.78      0.78       691
           1       0.54      0.54      0.54       322

    accuracy                           0.70      1013
   macro avg       0.66      0.66      0.66      1013
weighted avg       0.71      0.70      0.71      1013



======= COMARG =========

## Abortion

In [12]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_abortion_identification_GPT.json'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_pred contains NaN.

## Gay Rights

In [86]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_gayRights_identification_GPT.json'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.67      0.80       772

    accuracy                           0.67       772
   macro avg       0.50      0.34      0.40       772
weighted avg       1.00      0.67      0.80       772



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Marijuana

In [84]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_marijuana_identification_GPT.json'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.60      0.75       691

    accuracy                           0.60       691
   macro avg       0.50      0.30      0.37       691
weighted avg       1.00      0.60      0.75       691



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Obama

In [85]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_obama_identification_GPT.json'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.66      0.79       646

    accuracy                           0.66       646
   macro avg       0.50      0.33      0.40       646
weighted avg       1.00      0.66      0.79       646



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
