In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from collections import Counter, defaultdict
import json, csv
import numpy as np
import xmltodict
from sklearn.dummy import DummyClassifier
import random
import os
from statistics import mean, stdev
import warnings 
from scipy.stats import ttest_rel, levene
from collections import defaultdict
from typing import List, Dict, Tuple
from scipy import stats
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from rouge_score import rouge_scorer

warnings.filterwarnings('ignore') 

Task 3, GPT-4o, 0-shot (strongest setting): false positives (FP) and false negatives (FN)

In [2]:
def get_kshot_examples(dataset, shots, split,
                       kshot_dir="/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots"):
    """Return the ids of the k-shot examples for one dataset / shot count / split.

    Args:
        dataset: Dataset name; it is upper-cased to build the file name.
        shots: Number of shots, as a string or an int. Zero ("0" or 0) means
            there are no k-shot examples and nothing is read from disk.
        split: Split identifier inserted into the file name.
        kshot_dir: Directory holding the k-shot CSV files.

    Returns:
        The values of the CSV's 'id' column as a list (empty for zero shots).

    Raises:
        FileNotFoundError: If the expected CSV file does not exist.
    """
    # Compare as text so that both 0 and "0" take the zero-shot shortcut.
    if str(shots) == "0":
        return []

    filename = f"{dataset.upper()}_all_arguments_main_{shots}shot_split_{split}.csv"
    # The original code joined a relative prefix onto an already absolute
    # path, which os.path.join silently discards; build the path once here.
    filepath = os.path.join(kshot_dir, filename)

    kshot_df = pd.read_csv(filepath)
    return kshot_df['id'].tolist()

In [17]:
def get_gold(binarize, task, data, kshot_examples, data_dir='/Users/guida/llm_argument_tasks/clean_data/'):
    """Load gold labels from ``<data_dir>/<DATA>.xml``, skipping k-shot examples.

    Args:
        binarize: Only used when ``task == 2``. If true, labels '1'/'2' are
            merged into 1 and labels '4'/'5' into 5.
        task: 1 -> binary labels (label '3' => 0, everything else => 1).
              2 -> relation labels as ints; units labelled '3' are dropped.
              Any other value yields an empty dict.
        data: Dataset name; it is upper-cased to build the XML file name.
        kshot_examples: Ids of units to leave out of the gold set
            (``None`` is treated as empty).
        data_dir: Directory that contains the XML files.

    Returns:
        dict mapping unit '@id' to its integer gold label.
    """
    xml_path = os.path.join(data_dir, f'{data.upper()}.xml')
    # Context manager so the file handle is closed right after reading.
    with open(xml_path, 'r') as handle:
        parsed = xmltodict.parse(handle.read())

    units = parsed['document']['unit']
    # xmltodict yields a single mapping (not a list) when there is only one
    # <unit> element; normalise so the loops below always see a list.
    if isinstance(units, dict):
        units = [units]

    # Set membership keeps the per-unit exclusion check constant-time.
    excluded = set(kshot_examples or ())

    gold = {}
    # binary labels (3 => 0; all else => 1)
    if task == 1:
        for unit in units:
            unit_id = unit['@id']
            if unit_id in excluded:
                continue
            gold[unit_id] = 0 if unit['label'] == '3' else 1

    # types of use
    # binary: 4/5 = 5 and 1/2 = 1 and 3 = delete
    # 4-way: 1, 2, 4, 5, 3 = delete
    if task == 2:
        for unit in units:
            unit_id = unit['@id']
            label = unit['label']
            if label == '3' or unit_id in excluded:
                continue
            if binarize:
                if label in ('1', '2'):
                    label = 1
                elif label in ('4', '5'):
                    label = 5
            gold[unit_id] = int(label)
    return gold


In [18]:
def get_predictions(fname, gold):
    """Align model predictions with gold labels on their shared ids.

    Args:
        fname: Path to a JSON-lines file whose records have 'id' and 'label'
            fields.
        gold: dict mapping id -> gold label.

    Returns:
        (gold_list, pred_list, ids): three parallel lists covering only the
        ids present in both ``gold`` and the prediction file, in the
        iteration order of ``gold``.
    """
    pred_df = pd.read_json(fname, lines=True)
    # If an id occurs more than once, the last record wins.
    preds = dict(zip(pred_df['id'].tolist(), pred_df['label'].tolist()))

    # Walk gold in its own order rather than over a set union: set ordering
    # of strings varies between interpreter runs, which made the row order
    # of every downstream table irreproducible.
    ids = [key for key in gold if key in preds]
    gold_list = [gold[key] for key in ids]
    pred_list = [preds[key] for key in ids]

    return gold_list, pred_list, ids

def evaluate(gold, predicted):
    """Compute macro precision/recall, an F1 derived from them, support and CM.

    Note that F1 here is the harmonic mean of the macro-averaged precision
    and macro-averaged recall; it is not obtained by averaging per-class F1
    scores.

    Returns:
        (precision, recall, f1, support, confusion_matrix), where support is
        the number of gold labels.
    """
    macro_options = {'average': 'macro', 'zero_division': 0.0}
    prec = precision_score(gold, predicted, **macro_options)
    rec = recall_score(gold, predicted, **macro_options)

    # Guard the division: with both scores at zero the F1 is defined as 0.0.
    if (prec + rec) > 0:
        f1 = (2 * prec * rec) / (prec + rec)
    else:
        f1 = 0.0

    return prec, rec, f1, len(gold), confusion_matrix(gold, predicted)

In [20]:
# Print the kernel's working directory; relative paths used in this notebook
# (such as the CSV export location) are resolved against it.
!pwd

/Users/guida/llm_argument_tasks/evaluation-lea/error_analysis


In [47]:
# Parsed corpora keyed by XML path, so each file is read and parsed only once
# per kernel session (re-running this cell clears the cache).
_ARGUMENT_UNIT_CACHE = {}


def _load_units_by_id(xml_path):
    """Parse one corpus XML file (once) and index its units by '@id'."""
    if xml_path not in _ARGUMENT_UNIT_CACHE:
        with open(xml_path, 'r') as handle:
            parsed = xmltodict.parse(handle.read())
        units = parsed['document']['unit']
        # xmltodict yields a single mapping when there is only one <unit>.
        if isinstance(units, dict):
            units = [units]
        units_by_id = {}
        for unit in units:
            # setdefault keeps the first unit for a repeated id, matching a
            # front-to-back linear search.
            units_by_id.setdefault(unit['@id'], unit)
        _ARGUMENT_UNIT_CACHE[xml_path] = units_by_id
    return _ARGUMENT_UNIT_CACHE[xml_path]


def get_full_argument_context(dataset, unit_id, data_dir='/Users/guida/llm_argument_tasks/clean_data/'):
    """Look up the argument text, stance and comment text of one unit.

    Args:
        dataset: Dataset name; it is upper-cased to build the XML file name.
        unit_id: Value of the unit's '@id' attribute.
        data_dir: Directory that contains the XML files.

    Returns:
        dict with keys 'argument', 'stance' and 'comment', or ``None`` when
        no unit with that id exists in the file.
    """
    xml_path = os.path.join(data_dir, f'{dataset.upper()}.xml')
    unit = _load_units_by_id(xml_path).get(unit_id)
    if unit is None:
        return None
    return {
        'argument': unit['argument']['text'],
        'stance': unit['argument']['stance'],
        'comment': unit['comment']['text']
    }

def evaluate_all_splits_with_errors():
    results = []
    error_cases = []
    correct_cases = []
    for dataset in ["gm", "ugip"]:
        for task in ["binary"]:
            print(f"\n===== {task} {dataset} =====")
            dataset_results = []
            if task == "binary":
                binarize = True
                base_path = "/Users/guida/llm_argument_tasks/evaluation-lea/task2_binary_outputs"
                subtask_folder = "SubtaskA"
                tag = "2ways"
            else:
                binarize = False
                base_path = "/../task2_full_scale_outputs"
                subtask_folder = "SubtaskB"
                tag = "5ways"
            gold_raw = get_gold(binarize=binarize, task=2, data=dataset, kshot_examples=[])
            for model in ["gpt4"]:
                pred_file_0 = f"comarg_{dataset}_relation_identification_{tag}_{model}_0shot.jsonl"
                pred_path_0 = os.path.join(base_path, model, "0-s", pred_file_0)
                try:
                    gold, pred, ids = get_predictions(pred_path_0, gold_raw)
                    labels = np.unique(np.concatenate([gold, pred]))
                    pr, re, f1, supp, cm = evaluate(gold, pred)
                    result = {
                        'dataset': dataset,
                        'model': model,
                        'split': 0,
                        'shot': "0",
                        'precision': pr,
                        'recall': re,
                        'f1': f1,
                        'support': supp,
                        'confusion_matrix': cm
                    }
                    results.append(result)
                    dataset_results.append(result)
                    for i, (g, p, id_) in enumerate(zip(gold, pred, ids)):

                        context = get_full_argument_context(dataset, id_)
                        entry = {
                                'dataset': dataset,
                                'split': 0,
                                'id': id_,
                                'gold': g,
                                'pred': p,
                                'error_type': 'FP' if p == 1 and g == 0 else 'FN',
                                'argument': context['argument'],
                                'stance': context['stance'],
                                'comment': context['comment']
                        }

                        if g != p:
                            entry['error_type'] = 'FP' if p == 1 and g == 0 else 'FN'
                            error_cases.append(entry)
                        else:
                            entry['error_type'] = 'correct'
                            correct_cases.append(entry)
                except FileNotFoundError:
                    print(f"Missing file: {pred_path_0}")
                except Exception as e:
                    print(f"Error processing {pred_path_0}: {str(e)}")
        print(f"\n{'='*20} Aggregate Results for {model} {'='*20}")
        dataset_df = pd.DataFrame(dataset_results)
        agg_results = dataset_df.groupby(['model', 'shot']).agg({
            'precision': ['mean', 'std'],
            'recall': ['mean', 'std'],
            'f1': ['mean', 'std']
        }).round(3)
        print(agg_results)
    return agg_results, pd.DataFrame(error_cases), pd.DataFrame(correct_cases)

# Run the evaluation: keeps the aggregate metric table plus two DataFrames,
# one with the misclassified examples and one with the correctly classified ones.
agg_results, error_comarg, correct_comarg = evaluate_all_splits_with_errors()


===== binary gm =====

           precision     recall         f1    
                mean std   mean std   mean std
model shot                                    
gpt4  0        0.932 NaN  0.954 NaN  0.943 NaN

===== binary ugip =====

           precision     recall         f1    
                mean std   mean std   mean std
model shot                                    
gpt4  0        0.966 NaN  0.971 NaN  0.969 NaN


In [48]:
# Stack the misclassified rows on top of the correctly classified rows and
# renumber the index from 0 so it is unique across both parts.
all_cases = pd.concat([error_comarg, correct_comarg], ignore_index=True)
all_cases

Unnamed: 0,dataset,split,id,gold,pred,error_type,argument,stance,comment
0,gm,0,107arg1,5,1,FN,Gay couples can declare their union without re...,Con,Marriage is a union between MAN and WIFE in th...
1,gm,0,95arg2,5,1,FN,Gay couples should be able to take advantage o...,Pro,It is not possible for a gay couple to enter i...
2,gm,0,137arg5,5,1,FN,Major world religions are against gay marriages,Con,So I want to know whats next. Are we going to ...
3,gm,0,148arg1,5,1,FN,Gay couples can declare their union without re...,Con,The ideal setting in which to raise a child is...
4,gm,0,60arg7,5,1,FN,Marriage should be between a man and a woman,Con,I do believe that if you love each other that ...
...,...,...,...,...,...,...,...,...,...
751,ugip,0,414721730arg1,1,1,correct,Separation of state and religion,Con,The words Under God should not be removed from...
752,ugip,0,414721661arg2,5,5,correct,Removing under god would promote religious tol...,Con,It should be taken out. This is NOT a christia...
753,ugip,0,414721876arg2,5,5,correct,Removing under god would promote religious tol...,Con,I am against having the statement Under God in...
754,ugip,0,414721809arg6,5,5,correct,America is based on democracy and the pledge s...,Pro,Since the majority of the people in America be...


In [45]:
# Count how many examples fall into each outcome category.
all_cases['error_type'].value_counts()

error_type
correct    721
FN          35
Name: count, dtype: int64

In [46]:
# Inspect the misclassified examples returned by evaluate_all_splits_with_errors().
error_comarg

Unnamed: 0,dataset,split,id,gold,pred,error_type,argument,stance,comment
0,gm,0,107arg1,5,1,FN,Gay couples can declare their union without re...,Con,Marriage is a union between MAN and WIFE in th...
1,gm,0,95arg2,5,1,FN,Gay couples should be able to take advantage o...,Pro,It is not possible for a gay couple to enter i...
2,gm,0,137arg5,5,1,FN,Major world religions are against gay marriages,Con,So I want to know whats next. Are we going to ...
3,gm,0,148arg1,5,1,FN,Gay couples can declare their union without re...,Con,The ideal setting in which to raise a child is...
4,gm,0,60arg7,5,1,FN,Marriage should be between a man and a woman,Con,I do believe that if you love each other that ...
5,gm,0,119arg2,5,1,FN,Gay couples should be able to take advantage o...,Pro,"No, gay marriage will never be legal ,marriage..."
6,gm,0,85arg5,5,1,FN,Major world religions are against gay marriages,Con,If we address gay marriage on the basis of rel...
7,gm,0,127arg1,5,1,FN,Gay couples can declare their union without re...,Con,People need to realize that gay marriage is wr...
8,gm,0,42arg1,5,1,FN,Gay couples can declare their union without re...,Con,Hell to the No. Marriage has always been defi...
9,gm,0,7arg1,5,1,FN,Gay couples can declare their union without re...,Con,Absolutely No. Who are we to rewrite the creat...


In [43]:
# Swap the numeric relation codes in the gold and predicted columns for
# readable names: 5 -> "Support", 1 -> "Attack".
label_names = {5: "Support", 1: "Attack"}
all_cases = all_cases.assign(
    gold=all_cases['gold'].replace(label_names),
    pred=all_cases['pred'].replace(label_names),
)

all_cases

Unnamed: 0,dataset,split,id,gold,pred,error_type,argument,stance,comment
0,gm,0,107arg1,Support,Attack,FN,Gay couples can declare their union without re...,Con,Marriage is a union between MAN and WIFE in th...
1,gm,0,95arg2,Support,Attack,FN,Gay couples should be able to take advantage o...,Pro,It is not possible for a gay couple to enter i...
2,gm,0,137arg5,Support,Attack,FN,Major world religions are against gay marriages,Con,So I want to know whats next. Are we going to ...
3,gm,0,148arg1,Support,Attack,FN,Gay couples can declare their union without re...,Con,The ideal setting in which to raise a child is...
4,gm,0,60arg7,Support,Attack,FN,Marriage should be between a man and a woman,Con,I do believe that if you love each other that ...
...,...,...,...,...,...,...,...,...,...
751,ugip,0,414721730arg1,Attack,Attack,correct,Separation of state and religion,Con,The words Under God should not be removed from...
752,ugip,0,414721661arg2,Support,Support,correct,Removing under god would promote religious tol...,Con,It should be taken out. This is NOT a christia...
753,ugip,0,414721876arg2,Support,Support,correct,Removing under god would promote religious tol...,Con,I am against having the statement Under God in...
754,ugip,0,414721809arg6,Support,Support,correct,America is based on democracy and the pledge s...,Pro,Since the majority of the people in America be...


In [49]:
# Export every example (errors and correct predictions) for qualitative analysis.
# The path is relative to the kernel's working directory.
output_path = '../qualitative_analysis/error_and_correct_t3.csv'
# Create the target folder if needed so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
all_cases.to_csv(output_path, index=False)