In [22]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report
from collections import Counter, defaultdict
import json, csv
import numpy as np

## Majority baseline

In [201]:
def process_dataset_and_calculate_majority_baseline(csv_file_path, label_column='label'):
    """
    Remap labels to a binary scheme, then print and return the majority baseline.

    Labels 1, 2, 4, and 5 are mapped to 1, while label 3 is mapped to 0.
    Any other value in the label column becomes NaN after the mapping: such
    rows are never counted towards a class, but they still count towards the
    total number of samples used as the accuracy denominator.

    Parameters:
    - csv_file_path (str): Path to the CSV file containing the dataset.
    - label_column (str): The name of the column containing labels (default is 'label').

    Returns:
    - majority_label (int): The label that occurs most frequently after remapping.
    - accuracy (float): The accuracy of always predicting the majority label.

    Raises:
    - ValueError: If no row carries a label covered by the mapping, so no
      majority class exists.
    """
    df = pd.read_csv(csv_file_path)

    mapping = {1: 1, 2: 1, 3: 0, 4: 1, 5: 1}
    binary_labels = df[label_column].map(mapping)

    # value_counts() ignores NaN, i.e. rows whose label is not in `mapping`.
    label_counts = binary_labels.value_counts()
    if label_counts.empty:
        raise ValueError(
            f"No rows with a mappable label found in column '{label_column}' of {csv_file_path}"
        )

    # Cast away numpy scalar types so the result matches the documented int/float.
    majority_label = int(label_counts.idxmax())
    majority_count = int(label_counts.max())
    total_samples = len(df)

    accuracy = majority_count / total_samples

    print(f"\nMajority Label: {majority_label} (Count: {majority_count})")
    print(f"Majority Baseline Accuracy: {accuracy:.2f}")

    return majority_label, accuracy

In [202]:
# Majority-class baseline for the GM data.
# The path is absolute and specific to the author's machine.
file_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
print("=== GM ===")
process_dataset_and_calculate_majority_baseline(file_path)

=== GM ===

Majority Label: 0 (Count: 849)
Majority Baseline Accuracy: 0.66


In [203]:
# Majority-class baseline for the UGIP data.
# The path is absolute and specific to the author's machine.
file_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
print("=== UGIP ===")
process_dataset_and_calculate_majority_baseline(file_path)

=== UGIP ===

Majority Label: 0 (Count: 691)
Majority Baseline Accuracy: 0.68


In [19]:
def map_gold_labels(labels):
    """
    Collapse the five-way gold labels into a binary target for evaluation.

    Label 3 becomes 0; labels 1, 2, 4 and 5 become 1. The lookup is applied
    through ``labels.map``, so for a pandas Series any value outside 1-5
    comes back as NaN.
    """
    binary_of = {original: (0 if original == 3 else 1) for original in range(1, 6)}
    return labels.map(binary_of)

In [68]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):
    """
    Print a classification report for predictions scored against binarised gold labels.

    Gold rows and predictions are inner-joined on ``id``, so only ids present
    in both files are scored. The gold ``label`` column is cast to int and
    passed through ``map_gold_labels``; predicted labels are used exactly as
    they appear in the JSON file.

    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data
      (must provide ``id`` and ``label`` columns).
    - predictions_json_path: Path to the JSON file containing predictions
      (records must provide ``id`` and ``label``).

    Output:
    - Prints the classification report comparing the predictions with the
      mapped gold labels. Nothing is returned.
    """
    golden_df = pd.read_csv(golden_csv_path)

    # JSON text is UTF-8 by specification; do not rely on the platform default encoding.
    with open(predictions_json_path, 'r', encoding='utf-8') as f:
        raw_predictions = json.load(f)

    predictions_df = pd.DataFrame(raw_predictions)

    merged_df = pd.merge(
        golden_df[['id', 'label']],
        predictions_df[['id', 'label']],
        on='id',
        how='inner',
        suffixes=('_gold', '_pred'),
    )

    # Ground truth vector (binarised gold labels)
    ground_truth = map_gold_labels(merged_df['label_gold'].astype(int)).values

    # Prediction vector; kept under its own name instead of rebinding the parsed JSON
    predicted = merged_df['label_pred'].values

    report = classification_report(ground_truth, predicted)
    print(report)


In [85]:
def evaluate_predictions_yru(golden_csv_path: str, predictions_json_path: str):
    """
    Print a classification report for predictions on a YRU file where every gold label is 1.

    The gold ``label`` column is set to 1 for every row of the CSV (any
    existing ``label`` column is overwritten in the loaded copy). Gold rows
    and predictions are inner-joined on ``id``, so only ids present in both
    files are scored.

    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data
      (must provide an ``id`` column).
    - predictions_json_path: Path to the JSON file containing predictions
      (records must provide ``id`` and ``label``).

    Output:
    - Prints the classification report comparing the predictions with the
      all-ones gold labels. Nothing is returned.
    """
    golden_df = pd.read_csv(golden_csv_path)
    golden_df['label'] = 1

    # JSON text is UTF-8 by specification; do not rely on the platform default encoding.
    with open(predictions_json_path, 'r', encoding='utf-8') as f:
        raw_predictions = json.load(f)

    predictions_df = pd.DataFrame(raw_predictions)

    # The inner join already discards predictions whose id is absent from the
    # gold file, so no separate isin() pre-filter is needed.
    merged_df = pd.merge(
        golden_df[['id', 'label']],
        predictions_df[['id', 'label']],
        on='id',
        how='inner',
        suffixes=('_gold', '_pred'),
    )

    ground_truth = merged_df['label_gold'].values
    predicted = merged_df['label_pred'].values

    # Gold never contains class 0, so its recall is always undefined.
    # zero_division=0 reports the same 0.00 that the default "warn" mode does,
    # without emitting UndefinedMetricWarning on every call.
    report = classification_report(ground_truth, predicted, zero_division=0)
    print(report)


## Evaluation GM

In [69]:
# Evaluate the GM predictions stored under output_files/llama3 against the GM gold CSV.
# Both paths are absolute and specific to the author's machine.
# NOTE(review): the saved output under this cell shows a printed frame and two arrays
# that the evaluate_predictions definition in this notebook does not print; the output
# looks stale — re-run to confirm the reported numbers.
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_gm_argument_identification.json'

evaluate_predictions(golden_csv_path, predictions_json_path)

           id  label_gold  label_pred
0       1arg2           1           1
1       1arg3           1           0
2       1arg4           1           1
3       1arg5           1           0
4       1arg6           1           1
...       ...         ...         ...
1262  198arg4           1           0
1263  198arg5           1           1
1264  198arg6           1           0
1265  198arg7           1           1
1266    1arg1           1           0

[1267 rows x 3 columns]
[1 1 1 ... 1 1 1]
[1 0 1 ... 0 1 0]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.29      0.45      1267

    accuracy                           0.29      1267
   macro avg       0.50      0.15      0.23      1267
weighted avg       1.00      0.29      0.45      1267



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Evaluation UGIP

In [57]:
# Evaluate the UGIP predictions stored under output_files/llama3 against the UGIP gold CSV.
# Both paths are absolute and specific to the author's machine.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_ugip_argument_identification.json'

evaluate_predictions(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.72      0.92      0.81       535
           1       0.70      0.36      0.47       291

    accuracy                           0.72       826
   macro avg       0.71      0.64      0.64       826
weighted avg       0.71      0.72      0.69       826



# YRU Dataset

## Abortion

In [86]:
# Evaluate the YRU abortion predictions stored under output_files/llama3 with evaluate_predictions_yru.
# Both paths are absolute and specific to the author's machine.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_abortion_identification.json'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.69      0.82      1604

    accuracy                           0.69      1604
   macro avg       0.50      0.35      0.41      1604
weighted avg       1.00      0.69      0.82      1604



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Gay Rights

In [87]:
# Evaluate the YRU gay-rights predictions stored under output_files/llama3 with evaluate_predictions_yru.
# Both paths are absolute and specific to the author's machine.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_gayRights_identification.json'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.62      0.77      1408

    accuracy                           0.62      1408
   macro avg       0.50      0.31      0.38      1408
weighted avg       1.00      0.62      0.77      1408



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Marijuana

In [91]:
# Evaluate the YRU marijuana predictions stored under output_files/llama3 with evaluate_predictions_yru.
# Both paths are absolute and specific to the author's machine.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_marijuana_identification.json'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.57      0.73      1491

    accuracy                           0.57      1491
   macro avg       0.50      0.29      0.36      1491
weighted avg       1.00      0.57      0.73      1491



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Obama

In [92]:
# Evaluate the YRU Obama predictions stored under output_files/llama3 with evaluate_predictions_yru.
# Both paths are absolute and specific to the author's machine.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_obama_identification.json'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.66      0.79      1308

    accuracy                           0.66      1308
   macro avg       0.50      0.33      0.40      1308
weighted avg       1.00      0.66      0.79      1308



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
