In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix, ConfusionMatrixDisplay
from collections import Counter, defaultdict
import json, csv
import numpy as np
import matplotlib.pyplot as plt

In [9]:
def map_gold_labels(labels):
    """
    Maps the original labels in golden data to binary labels for evaluation.
    Labels 1, 2, 4, 5 are mapped to 1.
    Label 3 is mapped to 0.
    """
    mapping = {1: 1, 2: 1, 4: 1, 5: 1, 3: 0}
    return labels.map(mapping)

In [10]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """ 


    golden_df = pd.read_csv(golden_csv_path)

    golden_df['label'] = map_gold_labels(golden_df['label'])

    predictions_df = pd.read_json(predictions_json_path, lines=True)

    merged_df = pd.merge(golden_df[['id', 'label']], predictions_df[['id', 'label']], on='id', how='inner', suffixes=('_gold', '_pred'))

    ground_truth = merged_df['label_gold'].values
    predictions = merged_df['label_pred'].values

    report = classification_report(ground_truth, predictions)
    print(report)

In [11]:

def evaluate_predictions_yru(golden_csv_path: str, predictions_json_path: str):
    """
    Evaluates predictions against the golden data.
    
    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON file containing predictions.
    
    Output:
    - Classification report comparing the predictions with the mapped gold labels.
    """
    
    golden_df = pd.read_csv(golden_csv_path)
    
    predictions_df = pd.read_json(predictions_json_path, lines=True)
    
    merged_df = pd.merge(
        golden_df[['uid', 'present']], 
        predictions_df[['id', 'label']], 
        left_on='uid', 
        right_on='id', 
        how='inner', 
        suffixes=('_gold', '_pred')
)
    ground_truth = merged_df['present'].values
    predictions = merged_df['label'].values

    report = classification_report(ground_truth, predictions)
    print(report)

## Evaluation GM

In [12]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments_main.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/zero-shot/comarg_gm_argument_identification.jsonl'
 
evaluate_predictions(golden_csv_path, predictions_json_path)

              precision    recall  f1-score   support

           0       0.78      0.84      0.81       944
           1       0.58      0.50      0.54       434

    accuracy                           0.73      1378
   macro avg       0.68      0.67      0.67      1378
weighted avg       0.72      0.73      0.72      1378



## Evaluation UGIP

In [13]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/zero-shot/comarg_ugip_argument_identification.jsonl'

evaluate_predictions(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.90      0.61      0.73      1769
           1       0.23      0.63      0.34       322

    accuracy                           0.62      2091
   macro avg       0.57      0.62      0.53      2091
weighted avg       0.80      0.62      0.67      2091



====== YRU Dataset =======

## Abortion

In [14]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/zero-shot/yru_abortion_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.95      0.84      0.89      5971
           1       0.31      0.63      0.41       699

    accuracy                           0.81      6670
   macro avg       0.63      0.73      0.65      6670
weighted avg       0.88      0.81      0.84      6670



## Gay Rights

In [15]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/zero-shot/yru_gayRights_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      5057
           1       0.34      0.60      0.44       779

    accuracy                           0.79      5836
   macro avg       0.64      0.71      0.66      5836
weighted avg       0.85      0.79      0.82      5836



## Marijuana

In [16]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/zero-shot/yru_marijuana_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.92      0.82      0.87      4316
           1       0.32      0.52      0.39       682

    accuracy                           0.78      4998
   macro avg       0.62      0.67      0.63      4998
weighted avg       0.83      0.78      0.80      4998



## Obama

In [17]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/zero-shot/yru_obama_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.96      0.84      0.89      7268
           1       0.23      0.55      0.32       629

    accuracy                           0.82      7897
   macro avg       0.59      0.69      0.61      7897
weighted avg       0.90      0.82      0.85      7897



## One shot

In [18]:
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/llama3/zero-shot/comarg_gm_argument_identification.jsonl'

evaluate_predictions(golden_csv_path, predictions_json_path)

  predictions_df = pd.read_json(predictions_json_path, lines=True)


ValueError: Expected object or value

In [19]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/one-shot/yru_abortion_identification_llama_1shot.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.96      0.72      0.82      5976
           1       0.24      0.77      0.37       707

    accuracy                           0.72      6683
   macro avg       0.60      0.74      0.60      6683
weighted avg       0.89      0.72      0.77      6683



## Gay Rights

In [22]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/one-shot/yru_gayrights_identification_llama_1shot.jsonl'
evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.94      0.80      0.86      5079
           1       0.32      0.64      0.43       765

    accuracy                           0.78      5844
   macro avg       0.63      0.72      0.64      5844
weighted avg       0.86      0.78      0.80      5844



## Marijuana

In [23]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/one-shot/yru_marijuana_identification_llama_1shot.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.92      0.67      0.77      4324
           1       0.23      0.61      0.33       686

    accuracy                           0.66      5010
   macro avg       0.57      0.64      0.55      5010
weighted avg       0.82      0.66      0.71      5010



## Obama

In [24]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/one-shot/yru_obama_identification_llama_1shot.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.96      0.76      0.85      7276
           1       0.19      0.66      0.30       639

    accuracy                           0.75      7915
   macro avg       0.58      0.71      0.57      7915
weighted avg       0.90      0.75      0.81      7915



## Few shot

## Abortion

In [None]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/k-shots/yru_abortion_identification_llama_5shot.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.96      0.52      0.67      5987
           1       0.16      0.82      0.27       677

    accuracy                           0.55      6664
   macro avg       0.56      0.67      0.47      6664
weighted avg       0.88      0.55      0.63      6664



## Gay Rights

In [None]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/k-shots/yru_gayRights_identification_llama_5shot.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.93      0.68      0.78      1077
           1       0.29      0.72      0.41       195

    accuracy                           0.68      1272
   macro avg       0.61      0.70      0.60      1272
weighted avg       0.83      0.68      0.73      1272



## Marijuana

In [None]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/k-shots/yru_marijuana_identification_llama_5shot.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.95      0.60      0.74      4329
           1       0.24      0.78      0.36       682

    accuracy                           0.63      5011
   macro avg       0.59      0.69      0.55      5011
weighted avg       0.85      0.63      0.69      5011



## Obama

In [None]:
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/task1/output_files/llama3/zero-shot/yru_obama_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.96      0.84      0.89      7268
           1       0.23      0.55      0.32       629

    accuracy                           0.82      7897
   macro avg       0.59      0.69      0.61      7897
weighted avg       0.90      0.82      0.85      7897

