In [1]:

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix, ConfusionMatrixDisplay
from collections import Counter, defaultdict
import json, csv
import numpy as np
import matplotlib.pyplot as plt

## Majority baseline

In [153]:
def majority_baseline_comarg(csv_file_path, label_column='label'):
    """
    Print the majority-class baseline for a ComArg-style dataset.

    The labels read from ``label_column`` are collapsed to binary labels:
    1, 2, 4 and 5 are mapped to 1, while 3 is mapped to 0. Every sample is
    then "predicted" as the most frequent binary label, and the F1 score of
    that constant prediction is computed with the majority label as the
    positive class.

    Parameters:
    - csv_file_path (str): Path to the CSV file containing the dataset.
    - label_column (str): The name of the column containing labels (default is 'label').

    Returns:
    - None. The majority label, its count and the F1 score are printed.

    Raises:
    - ValueError: If the file contains no rows, or if ``label_column`` holds
      a value outside 1-5 (including missing values).
    """

    df = pd.read_csv(csv_file_path)

    mapping = {1: 1, 2: 1, 3: 0, 4: 1, 5: 1}

    # Keep the remapped labels in a local Series instead of writing them back
    # into the frame under a hard-coded 'label' column.
    binary_labels = df[label_column].map(mapping)

    if binary_labels.empty:
        raise ValueError(f"No rows found in {csv_file_path!r}.")

    # Anything the mapping does not cover becomes NaN; fail with a clear
    # message here rather than passing NaN targets on to f1_score.
    unmapped = binary_labels.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, label_column].unique().tolist()
        raise ValueError(
            f"Column {label_column!r} contains labels outside {sorted(mapping)}: {unexpected}"
        )

    label_counts = binary_labels.value_counts()

    majority_label = label_counts.idxmax()
    majority_count = label_counts.max()

    # Constant prediction: the majority label for every sample.
    majority_predictions = [majority_label] * len(binary_labels)

    f1 = f1_score(binary_labels, majority_predictions, pos_label=majority_label, zero_division=1)

    print(f"\nMajority Label: {majority_label} (Count: {majority_count})")
    print(f"F1 Score: {f1:.2f}")

In [175]:
def majority_baseline_yru(csv_file_path):
    """
    Print the majority-class baseline for a YRU-style dataset.

    Reads the CSV at ``csv_file_path``, finds the most frequent value of its
    'present' column, predicts that value for every row, and prints the
    value, its count and the F1 score of this constant prediction (with the
    majority value as the positive class).

    Parameters:
    - csv_file_path (str): Path to the CSV file; it must have a 'present' column.
    """
    gold = pd.read_csv(csv_file_path)['present']

    counts = gold.value_counts()
    top_label, top_count = counts.idxmax(), counts.max()

    # One copy of the majority label per row of the dataset.
    constant_guess = [top_label for _ in range(len(gold))]

    f1 = f1_score(gold, constant_guess, pos_label=top_label, zero_division=1)

    print(f"\nMajority Label: {top_label} (Count: {top_count})")
    print(f"F1 Score: {f1:.2f}")

In [154]:
# Majority baseline for GM; the 1-5 labels are collapsed to binary inside
# majority_baseline_comarg.
file_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
print("=== GM ===")
majority_baseline_comarg(file_path)

=== GM ===

Majority Label: 0 (Count: 849)
F1 Score: 0.80


In [155]:
# Majority baseline for UGIP; the 1-5 labels are collapsed to binary inside
# majority_baseline_comarg.
file_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
print("=== UGIP ===")
majority_baseline_comarg(file_path)

=== UGIP ===

Majority Label: 0 (Count: 691)
F1 Score: 0.81


In [176]:
# Majority baseline for the YRU abortion topic (uses the 'present' column).
file_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
print("=== ABORTION ===")
majority_baseline_yru(file_path)

=== ABORTION ===

Majority Label: 0 (Count: 5947)
F1 Score: 0.94


In [164]:
# Majority baseline for the YRU gay-rights topic (uses the 'present' column).
# NOTE(review): this path spells "gayrights" in lowercase, while the evaluation
# cell for the same topic uses "gayRights" - confirm which one matches the file
# on disk (it matters on case-sensitive filesystems).
file_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayrights_with_negatives_main.csv'
print("=== GAY RIGHTS ===")
majority_baseline_yru(file_path)

=== GAY RIGHTS ===

Majority Label: 0 (Count: 5080)
F1 Score: 0.93


In [165]:
# Majority baseline for the YRU marijuana topic (uses the 'present' column).
file_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
print("=== MARIJUANA ===")
majority_baseline_yru(file_path)

=== MARIJUANA ===

Majority Label: 0 (Count: 4317)
F1 Score: 0.93


In [166]:
# Majority baseline for the YRU Obama topic (uses the 'present' column).
file_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
print("=== OBAMA ===")
majority_baseline_yru(file_path)

=== OBAMA ===

Majority Label: 0 (Count: 7261)
F1 Score: 0.96


In [106]:
def map_gold_labels(labels):
    """
    Collapse the original gold labels into binary labels for evaluation.

    Labels 1, 2, 4 and 5 become 1; label 3 becomes 0. The lookup is done
    through ``labels.map``, so ``labels`` must offer a pandas-style ``map``
    method (e.g. a Series).
    """
    # Every non-neutral label shares the positive class; 3 is the lone negative.
    binary_of = dict.fromkeys((1, 2, 4, 5), 1)
    binary_of[3] = 0
    return labels.map(binary_of)

In [129]:
def evaluate_predictions(golden_csv_path: str, predictions_json_path: str):
    """
    Print a classification report for predictions scored against the golden data.

    The golden CSV must provide 'id' and 'present' columns. The predictions
    file is read as JSON Lines (one JSON object per line) whose records must
    provide 'id' and 'label'. Both are inner-joined on 'id', so only ids that
    occur in both files are scored.

    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data.
    - predictions_json_path: Path to the JSON Lines file containing predictions.

    Output:
    - Classification report (printed) comparing the predicted 'label' values
      with the golden 'present' values. No label remapping is applied here.
    """

    golden_df = pd.read_csv(golden_csv_path)

    # Parse the predictions one line at a time. Blank lines (e.g. a stray
    # empty line at the end of the file) are skipped instead of being handed
    # to json.loads, which would raise on an empty string.
    records = []
    with open(predictions_json_path, 'r', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if text:
                records.append(json.loads(text))

    predictions_df = pd.DataFrame(records)

    # The inner merge already keeps only ids present in both frames, so the
    # predictions do not need to be pre-filtered against the golden ids.
    merged_df = pd.merge(golden_df[['id', 'present']], predictions_df[['id', 'label']], on='id', how='inner')

    ground_truth = merged_df['present'].values
    predicted_labels = merged_df['label'].values

    report = classification_report(ground_truth, predicted_labels)
    print(report)



In [19]:
def evaluate_predictions_yru(golden_csv_path: str, model_output_path: str):
    """
    Print a classification report for YRU predictions scored against the golden data.

    Parameters:
    - golden_csv_path: Path to the CSV file containing golden data; its 'uid'
      and 'present' columns are used.
    - model_output_path: Path to the JSON Lines file containing predictions;
      its 'id' and 'label' fields are used.

    Output:
    - Classification report (printed) comparing 'label' with 'present' for
      every golden 'uid' that matches a prediction 'id' (inner join).
    """
    gold = pd.read_csv(golden_csv_path)
    predicted = pd.read_json(model_output_path, lines=True)

    # Join gold rows to predictions on gold 'uid' == prediction 'id'.
    joined = gold[['uid', 'present']].merge(
        predicted[['id', 'label']],
        how='inner',
        left_on='uid',
        right_on='id',
        suffixes=('_gold', '_pred'),
    )

    print(classification_report(joined['present'].values, joined['label'].values))


## Evaluation GM

In [69]:
# Score the GM predictions (stored under output_files/llama3) against the gold file.
# NOTE(review): the output saved under this cell (a frame with label_gold /
# label_pred columns and raw arrays) does not match what the current
# evaluate_predictions prints - presumably it predates the latest edit of
# that function; re-run the cell to confirm the numbers.
golden_csv_path = '/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv'
predictions_json_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_gm_argument_identification.json'

evaluate_predictions(golden_csv_path, predictions_json_path)

           id  label_gold  label_pred
0       1arg2           1           1
1       1arg3           1           0
2       1arg4           1           1
3       1arg5           1           0
4       1arg6           1           1
...       ...         ...         ...
1262  198arg4           1           0
1263  198arg5           1           1
1264  198arg6           1           0
1265  198arg7           1           1
1266    1arg1           1           0

[1267 rows x 3 columns]
[1 1 1 ... 1 1 1]
[1 0 1 ... 0 1 0]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.29      0.45      1267

    accuracy                           0.29      1267
   macro avg       0.50      0.15      0.23      1267
weighted avg       1.00      0.29      0.45      1267



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Evaluation UGIP

In [114]:
# Score the UGIP predictions (stored under output_files/llama3) against the gold file.
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_ugip_argument_identification.json'

evaluate_predictions(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.72      0.92      0.81       535
           1       0.70      0.36      0.47       291

    accuracy                           0.72       826
   macro avg       0.71      0.64      0.64       826
weighted avg       0.71      0.72      0.69       826



# YRU Dataset

## Abortion

In [20]:
# Score the YRU abortion predictions against the gold file (joined on uid == id).
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_abortion_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_abortion_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.95      0.84      0.89      5973
           1       0.31      0.63      0.42       702

    accuracy                           0.81      6675
   macro avg       0.63      0.73      0.65      6675
weighted avg       0.88      0.81      0.84      6675



## Gay Rights

In [25]:
# Score the YRU gay-rights predictions against the gold file (joined on uid == id).
# NOTE(review): the gold path here is spelled "gayRights", while the majority
# baseline cell for this topic uses "gayrights" - confirm both point at the
# same file (the spellings differ on case-sensitive filesystems).
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_gayRights_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_gayRights_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      5059
           1       0.35      0.61      0.44       782

    accuracy                           0.79      5841
   macro avg       0.64      0.71      0.66      5841
weighted avg       0.85      0.79      0.82      5841



## Marijuana

In [22]:
# Score the YRU marijuana predictions against the gold file (joined on uid == id).
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_marijuana_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_marijuana_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.92      0.82      0.87      4318
           1       0.32      0.52      0.39       685

    accuracy                           0.78      5003
   macro avg       0.62      0.67      0.63      5003
weighted avg       0.83      0.78      0.80      5003



## Obama

In [23]:
# Score the YRU Obama predictions against the gold file (joined on uid == id).
golden_data_path = '/Users/guida/llm_argument_tasks/clean_data/yru_obama_with_negatives_main.csv'
model_output_path = '/Users/guida/llm_argument_tasks/output_files/llama3/yru_obama_identification_with_negatives.jsonl'

evaluate_predictions_yru(golden_data_path, model_output_path)

              precision    recall  f1-score   support

           0       0.96      0.84      0.89      7270
           1       0.23      0.55      0.32       632

    accuracy                           0.82      7902
   macro avg       0.59      0.69      0.61      7902
weighted avg       0.90      0.82      0.85      7902

