In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from collections import Counter, defaultdict
import json, csv
import numpy as np
import xmltodict
from sklearn.dummy import DummyClassifier
import random
import os
from statistics import mean, stdev
import warnings 
from scipy.stats import ttest_rel, levene
from collections import defaultdict
from typing import List, Dict, Tuple
from scipy import stats
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from rouge_score import rouge_scorer

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' filter also hides warnings that may matter
# for this analysis — consider narrowing it to specific categories/modules.
warnings.filterwarnings('ignore') 

Task 1, Gemini 1.5 with 5-shot prompting (strongest) -- false positives (FP) and false negatives (FN)

Task 1 YRU

In [5]:
# Lookup table: topic name -> {label code -> human-readable argument text}.
# Label codes are prefixed with "p-" or "c-"; compute_rouge() maps those
# prefixes to the stances "Pro" and "Con" respectively.
# NOTE(review): the catch-all keys are spelled "p-other"/"c-other" (lowercase)
# under "abortion" but "p-Other"/"c-Other" under the other three topics —
# confirm this matches the label spelling used in the data files.
topic_label_to_argument = {
    # Topic: abortion
    "abortion": {
        "p-right": "Abortion is a woman’s right.",
        "p-rape": "Rape victims need it to be legal.",
        "p-not_human": "A fetus is not a human yet, so it's okay to abort.",
        "p-mother_danger": "Abortion should be allowed when a mother's life is in danger.",
        "p-baby_ill_treatment": "Unwanted babies are ill-treated by parents and/or not always adopted.",
        "p-birth_ctrl": "Birth control fails at times and abortion is one way to deal with it.",
        "p-not_murder": "Abortion is not murder.",
        "p-sick_mom": "Mother is not healthy/financially solvent.",
        "p-other": "Others",
        "c-adopt": "Put baby up for adoption.",
        "c-kill": "Abortion kills a life.",
        "c-baby_right": "An unborn baby is a human and has the right to live.",
        "c-sex": "Be willing to have the baby if you have sex.",
        "c-bad_4_mom": "Abortion is harmful for women.",
        "c-other": "Others"
    },
    # Topic: gay rights
    "gayRights": {
        "p-normal": "Gay marriage is like any other marriage.",
        "p-right_denied": "Gay people should have the same rights as straight people.",
        "p-no_threat_for_child": "Gay parents can adopt and ensure a happy life for a baby.",
        "p-born": "People are born gay.",
        "p-religion": "Religion should not be used against gay rights.",
        "p-Other": "Others",
        "c-religion": "Religion does not permit gay marriages.",
        "c-abnormal": "Gay marriages are not normal/against nature.",
        "c-threat_to_child": "Gay parents cannot raise kids properly.",
        "c-gay_problems": "Gay people have problems and create social issues.",
        "c-Other": "Others"
    },
    # Topic: Obama
    "obama": {
        "p-economy": "Fixed the economy.",
        "p-War": "Ending the wars.",
        "p-republicans": "Better than the republican candidates.",
        "p-decision_policies": "Makes good decisions/policies.",
        "p-quality": "Has qualities of a good leader.",
        "p-health": "Ensured better healthcare.",
        "p-foreign_policies": "Executed effective foreign policies.",
        "p-job": "Created more jobs.",
        "p-Other": "Others",
        "c-economy": "Destroyed our economy.",
        "c-War": "Wars are still on.",
        "c-job": "Unemployment rate is high.",
        "c-health": "Healthcare bill is a failure.",
        "c-decision_policies": "Poor decision-maker.",
        "c-republicans": "We have better republicans than Obama.",
        "c-quality": "Not eligible as a leader.",
        "c-foreign_policies": "Ineffective foreign policies.",
        "c-Other": "Others"
    },
    # Topic: marijuana
    "marijuana": {
        "p-not_addictive": "Not addictive.",
        "p-medicine": "Used as a medicine for its positive effects.",
        "p-legal": "Legalized marijuana can be controlled and regulated by the government.",
        "p-right": "Prohibition violates human rights.",
        "p-no_damage": "Does not cause any damage to our bodies.",
        "p-Other": "Others",
        "c-health": "Damages our bodies.",
        "c-mind": "Responsible for brain damage.",
        "c-illegal": "If legalized, people will use marijuana and other drugs more.",
        "c-crime": "Causes crime.",
        "c-addiction": "Highly addictive.",
        "c-Other": "Others"
    }
}

In [10]:
def get_shot_examples(topic, n_shot, split,
                      shot_dir='/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots'):
    """Return the uids of the k-shot examples used for a topic/split.

    Parameters
    ----------
    topic : str
        Topic name used in the shot file name (e.g. "abortion").
    n_shot : str or int
        Number of shots. Only 1 and 5 have shot files; any other value
        yields an empty list. Integers are accepted as well as strings so
        that ``5`` and ``"5"`` behave the same.
    split : str or int
        Split identifier interpolated into the file name.
    shot_dir : str, optional
        Directory holding the shot CSV files. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    list
        Values of the ``uid`` column of the shot file, or ``[]`` when the
        shot count is unsupported or the file does not exist.
    """
    # Normalise so an integer shot count is not silently treated as "no shots".
    n_shot = str(n_shot)
    if n_shot not in ("1", "5"):
        return []

    shot_file = f'{shot_dir}/yru_{topic}_with_negatives_main_{n_shot}shot_split_{split}.csv'
    try:
        shot_df = pd.read_csv(shot_file)
    except FileNotFoundError:
        # Best effort: a missing shot file means nothing is excluded.
        print(f"Warning: Shot file not found: {shot_file}")
        return []
    return shot_df['uid'].tolist()

def compute_rouge(predictions_file, golden_data_file, topic, shot, split_number):
    """Score each gold span against its best-matching predicted span (ROUGE-L F1).

    Parameters
    ----------
    predictions_file : str
        Path to a JSONL file; each record must provide ``id`` and ``span``.
    golden_data_file : str
        Path to a CSV with at least ``id`` and ``line`` columns; ``label`` and
        ``text`` are used when present.
    topic : str
        Key into ``topic_label_to_argument``; also forwarded to
        ``get_shot_examples`` and stored as ``dataset`` in the output.
    shot, split_number
        Forwarded to ``get_shot_examples`` to find the ids to exclude.

    Returns
    -------
    list of dict
        One record per gold row (excluding k-shot examples) with keys
        ``dataset``, ``split``, ``id``, ``gold``, ``pred``, ``rouge_score``,
        ``argument``, ``stance`` and ``comment``. ``pred`` is ``""`` and
        ``rouge_score`` is 0 when no prediction scores above zero.
    """
    golden_data = pd.read_csv(golden_data_file)
    # A set gives constant-time membership checks in the per-id loop below.
    # NOTE(review): shot files expose `uid` while gold rows are grouped by `id`
    # — presumably the same identifier; confirm.
    shot_examples = set(get_shot_examples(topic, shot, split_number))

    # Read explicitly as UTF-8 and skip blank lines, which would otherwise make
    # json.loads raise.
    with open(predictions_file, 'r', encoding='utf-8') as f:
        predictions = [json.loads(line) for line in f if line.strip()]

    # Group predictions by ID since there may be multiple spans per ID
    predictions_dict = defaultdict(list)
    for pred in predictions:
        predictions_dict[pred['id']].append(pred['span'])

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    label_to_argument = topic_label_to_argument.get(topic, {})
    example_data = []

    # Group golden data by ID to match prediction structure
    for example_id, group in golden_data.groupby('id'):
        if example_id in shot_examples:
            continue

        # .get avoids inserting empty entries into the defaultdict.
        pred_spans = predictions_dict.get(example_id, [])

        # For each golden span, find best matching prediction
        for _, row in group.iterrows():
            best_score = 0
            best_pred = ""

            for pred_span in pred_spans:
                score = scorer.score(row['line'], pred_span)['rougeL'].fmeasure
                # Strict '>' keeps the first prediction on ties and leaves
                # best_pred empty when every score is zero.
                if score > best_score:
                    best_score = score
                    best_pred = pred_span

            label = row.get("label", "")
            # An empty CSV cell is read as NaN (a float); treat any non-string
            # label as missing instead of crashing on .startswith below.
            if not isinstance(label, str):
                label = ""
            argument = label_to_argument.get(label, "")
            stance = "Pro" if label.startswith("p-") else "Con" if label.startswith("c-") else ""

            example_data.append({
                'dataset': topic,
                'split': split_number,
                'id': example_id,
                'gold': row['line'],
                'pred': best_pred,
                'rouge_score': round(best_score, 4),
                'argument': argument,
                'stance': stance,
                'comment': row.get('text', '')
            })

    return example_data

def evaluate_all_splits(
    base_path="/Users/guida/llm_argument_tasks/run_all_k_shots/task3",
    golden_dir="/Users/guida/llm_argument_tasks/clean_data",
    datasets=("abortion", "gayRights", "marijuana", "obama"),
    models=("gemini",),
    splits=range(1, 6),
    shots=("5",),
    output_csv='rouge_error_analysis_detailed.csv',
):
    """Run ``compute_rouge`` over every dataset/model/split/shot combination.

    Called with no arguments this behaves exactly like the original
    hard-coded version; the parameters only expose the former constants.

    Parameters
    ----------
    base_path : str
        Directory containing the ``Results_T3_{model}_Split_{split}`` folders.
    golden_dir : str
        Directory containing the ``yru_{dataset}_main.csv`` gold files.
    datasets, models, splits, shots : iterable
        Values to iterate over. Shots are interpolated into the prediction
        file name and passed on to ``compute_rouge``.
    output_csv : str or None
        Where to write the combined rows; ``None`` skips writing.

    Returns
    -------
    pandas.DataFrame
        All rows returned by ``compute_rouge``, concatenated. Combinations
        whose files are missing or that raise are reported via ``print`` and
        skipped.
    """
    all_results = []

    for dataset in datasets:
        print(f"\n{'='*20} {dataset.upper()} {'='*20}")
        # The gold file depends only on the dataset, so build its path once.
        golden_data_file = f'{golden_dir}/yru_{dataset}_main.csv'

        for model in models:
            for split in splits:
                results_dir = f'{base_path}/Results_T3_{model}_Split_{split}'

                for shot in shots:
                    pred_file = f'yru_{dataset}_span_identification_{model}_{shot}shot.jsonl'
                    pred_path = os.path.join(results_dir, pred_file)

                    try:
                        rouge_rows = compute_rouge(
                            pred_path,
                            golden_data_file,
                            dataset,
                            shot,
                            split
                        )
                        all_results.extend(rouge_rows)

                    except FileNotFoundError:
                        print(f"Missing file: {pred_path}")
                    except Exception as e:
                        # Deliberate best effort: one bad file must not abort
                        # the sweep, but the failure is reported.
                        print(f"Error processing {pred_path}: {str(e)}")

    results_df = pd.DataFrame(all_results)
    if output_csv is not None:
        results_df.to_csv(output_csv, index=False)
    return results_df

# Run the full sweep with the default settings; this also writes
# 'rouge_error_analysis_detailed.csv' to the current working directory.
all_cases = evaluate_all_splits()







In [13]:
# Create the output directory first: to_csv fails when the parent directory
# of the target path does not exist.
os.makedirs('qualitative_analysis', exist_ok=True)
all_cases.to_csv('qualitative_analysis/error_analysis_t2.csv', index=False)

In [9]:
# NOTE(review): this repeats the export from the previous cell (same target
# path), and the execution counts are out of order (In [13] precedes In [9])
# — consider keeping only one of the two cells.
# Create the output directory first: to_csv fails when the parent directory
# of the target path does not exist.
os.makedirs("qualitative_analysis", exist_ok=True)
all_cases.to_csv("qualitative_analysis/error_analysis_t2.csv", index=False)