In [13]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter
import json, jsonlines

## Evaluation GM

In [14]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2, 4, 5 are mapped to 1.
    Label 3 is mapped to 0.
    """
    mapping = {1: 1, 2: 1, 4: 1, 5: 1, 3: 0}
    return labels.map(mapping)

In [15]:
def evaluate_predictions_comarg(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """

    golden_df = pd.read_csv(golden_csv_path)

    predictions_df = pd.read_json(predictions_json_path, lines=True)
    
    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], on='id', suffixes=('_gold', '_pred'))
    
    ground_truth = merged_df['label_gold'].values
    
    predictions = merged_df['label_pred'].values

    report = classification_report(ground_truth, predictions)
    print(report)

In [8]:
def evaluate_predictions_yru(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """
    
    golden_df = pd.read_csv(golden_csv_path)
    
    predictions = []
    with open(predictions_json_path, 'r') as f:
        for line in f:
            predictions.append(json.loads(line.strip()))
    
    predictions_df = pd.DataFrame(predictions)
        
    predictions = [json.loads(pred) for pred in predictions]
    predictions_df = pd.DataFrame(predictions)

    merged_df = pd.merge(golden_df[['id', 'present']], predictions_df[['id', 'label']], on='id', suffixes=('_gold', '_pred'))
    print(merged_df)
    ground_truth = merged_df['present'].values
    predictions = merged_df['label'].values

    report = classification_report(ground_truth, predictions)
    print(report)

## GM

In [16]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_gm_argument_identification.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

  predictions_df = pd.read_json(predictions_json_path, lines=True)


ValueError: Expected object or value

## UGIP

In [43]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/comarg_ugip_argument_identification_gpt.jsonl'

evaluate_predictions_comarg(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.79      0.91      0.85       691
           1       0.72      0.47      0.56       322

    accuracy                           0.77      1013
   macro avg       0.75      0.69      0.71      1013
weighted avg       0.76      0.77      0.76      1013



======= YRU =========

## Abortion

In [9]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_abortion_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

         id  present  label
0       ab1        1      1
1       ab1        1      1
2       ab1        1      1
3       ab1        1      1
4       ab1        1      0
...     ...      ...    ...
99895  ab99        0      0
99896  ab99        0      0
99897  ab99        0      0
99898  ab99        0      0
99899  ab99        0      0

[99900 rows x 3 columns]
              precision    recall  f1-score   support

           0       0.90      0.76      0.82     89010
           1       0.12      0.27      0.17     10890

    accuracy                           0.71     99900
   macro avg       0.51      0.52      0.50     99900
weighted avg       0.81      0.71      0.75     99900



## Gay Rights

In [10]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_gayRights_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

         id  present  label
0       gr1        1      1
1       gr1        1      1
2       gr1        1      1
3       gr1        1      0
4       gr1        1      1
...     ...      ...    ...
64246  gr99        0      0
64247  gr99        0      0
64248  gr99        0      0
64249  gr99        0      0
64250  gr99        0      0

[64251 rows x 3 columns]
              precision    recall  f1-score   support

           0       0.87      0.72      0.79     55880
           1       0.14      0.31      0.20      8371

    accuracy                           0.67     64251
   macro avg       0.51      0.52      0.49     64251
weighted avg       0.78      0.67      0.71     64251



## Marijuana

In [11]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_marijuana_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

         id  present  label
0       ma1        0      0
1       ma1        0      0
2       ma1        0      0
3       ma1        0      0
4       ma1        0      0
...     ...      ...    ...
60043  ma99        0      0
60044  ma99        0      0
60045  ma99        0      0
60046  ma99        0      0
60047  ma99        0      0

[60048 rows x 3 columns]
              precision    recall  f1-score   support

           0       0.87      0.76      0.81     51804
           1       0.15      0.27      0.20      8244

    accuracy                           0.70     60048
   macro avg       0.51      0.52      0.51     60048
weighted avg       0.77      0.70      0.73     60048



## Obama

In [12]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/gpt4o-mini/yru_obama_identification_GPT_negatives.jsonl'

evaluate_predictions_yru(golden_csv_path, predictions_json_path)

           id  present  label
0        oba1        0      0
1        oba1        0      0
2        oba1        0      0
3        oba1        0      1
4        oba1        0      0
...       ...      ...    ...
142069  oba99        0      0
142070  oba99        0      0
142071  oba99        0      0
142072  oba99        0      0
142073  oba99        0      1

[142074 rows x 3 columns]
              precision    recall  f1-score   support

           0       0.92      0.79      0.85    130545
           1       0.10      0.25      0.14     11529

    accuracy                           0.75    142074
   macro avg       0.51      0.52      0.50    142074
weighted avg       0.86      0.75      0.80    142074

