# Analysis of Annotations - Both Human (Validation) and LLMs



## Part 0: Pipeline Function definitions.

In [175]:
import json
import os
import numpy as np
import re
from typing import List, Tuple, Dict, Any, Union, Optional
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import krippendorff
import irrCAC 
from transform_annotations_functions import (
    mapping, 
    EXPANDED_MAPPING, 
    get_edit_quality,
    get_edit_subtype,
    Quality,           
    Error,
    ReorderLevel,
    Structure,
    Information
)

from helper_functions import QUALITY_EXPANDED_MAPPING
from helper_functions import df_to_enhanced_latex

np.set_printoptions(threshold=np.inf)

In [176]:
def load_json_file(file_path: str) -> Any:
    """
    Loads a JSON file and returns the parsed data.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def parse_annotations(file_path: str, format_type: str) -> List[Dict]:
    """
    Loads and parses an annotation file into a standardized structure.
    """
    data = load_json_file(file_path)
    
    if format_type.lower() in ["default", "old"]:
        for item in data:
            for edit in item.get("edits", []):
                # If the edit contains an "annotation", look for a sub-dict that has a "val" key.
                annotation = edit.get("annotation") or {}
                quality = None
                for key, value in annotation.items():
                    if isinstance(value, dict) and "val" in value:
                        quality = value["val"]
                        break  # use the first quality we find
                edit["quality"] = quality
                # Also, if a significance value is missing, set it to None (or a default)
                if "significance" not in edit:
                    edit["significance"] = None
        return data

    elif format_type.lower() in ["llm", "new"]:
        # For the new LLM format, the file already contains keys
        # "source", "target", "metadata", and edits with "input_text", "output_text", "quality", "significance".
        return data

    else:
        raise ValueError(f"Unknown format_type: {format_type}")

In [177]:
# ------------------------------
# Tokenization Function
# ------------------------------
def tokenize_with_char_spans_minimal(sentence: str):
    """
    Splits a sentence on whitespace and returns a list of tuples:
      (token, start_index, end_index)
    """
    tokens = []
    for match in re.finditer(r'\S+', sentence):
        tok = match.group()
        start = match.start()
        end = match.end()
        tokens.append((tok, start, end))
    return tokens

# ------------------------------
# Mapping Functions
# ------------------------------
def map_word_sequence_to_char_span(word_seq: List[str], sentence: str) -> List[List[int]]:
    """
    Joins a list of words into a substring and returns its [start, end] character indices in the sentence.
    If the substring is not found, returns an empty list.
    """
    if not word_seq:
        return []
    substring = " ".join(word_seq)
    pos = sentence.find(substring)
    if pos == -1:
        return []
    return [[pos, pos + len(substring)]]

def flatten_char_spans(char_spans: List) -> List[List[int]]:
    """
    Recursively flattens nested lists of character spans into a list of [start, end] pairs.
    """
    flat_spans = []
    for span in char_spans:
        if span is None:
            continue
        if isinstance(span, list) and len(span) == 2 and all(isinstance(x, int) for x in span):
            flat_spans.append(span)
        elif isinstance(span, list):
            flat_spans.extend(flatten_char_spans(span))
    return flat_spans

def map_char_spans_to_token_indices(char_spans: List[List[int]], tokens: List[Tuple[str, int, int]]) -> List[int]:
    """
    Given a list of character spans and a list of tokens (with their char indices),
    returns a sorted list of token indices that fall completely within any of the spans.
    """
    char_spans = flatten_char_spans(char_spans)
    token_indices = []
    for (span_start, span_end) in char_spans:
        for idx, (tok, tstart, tend) in enumerate(tokens):
            # If the token is fully within the span, add its index.
            if tstart >= span_start and tend <= span_end:
                token_indices.append(idx)
    return sorted(set(token_indices))

# ------------------------------
# Unification Function for One Item
# ------------------------------
def unify_edit_tokens_in_item(item: Dict) -> Dict:
    """
    For a single annotation item, tokenizes its 'source' and 'target', and adds
    'input_tokens' and 'output_tokens' to each edit.
    
    It handles two cases:
      - Edits with provided "input_idx" / "output_idx" or "input_text" / "output_text".
      - For splits/structure edits with "constituent_edits", extracts the token indices
        from each constituent edit and merges them.
    """
    source = item.get("source", "")
    target = item.get("target", "")
    source_tokens = tokenize_with_char_spans_minimal(source)
    target_tokens = tokenize_with_char_spans_minimal(target)
    
    for edit in item.get("edits", []):
        # Initialize token lists for this edit
        edit["input_tokens"] = []
        edit["output_tokens"] = []
        
        # Process parent's own spans (default case)
        if "input_idx" in edit and edit["input_idx"]:
            edit["input_tokens"] = map_char_spans_to_token_indices(edit["input_idx"], source_tokens)
        elif "input_text" in edit and edit["input_text"]:
            # LLM format: use input_text
            input_substring = " ".join(edit["input_text"])
            pos = source.find(input_substring)
            char_spans = map_word_sequence_to_char_span(edit["input_text"], source)
            edit["input_tokens"] = map_char_spans_to_token_indices(char_spans, source_tokens)
        
        if "output_idx" in edit and edit["output_idx"]:
            edit["output_tokens"] = map_char_spans_to_token_indices(edit["output_idx"], target_tokens)
        elif "output_text" in edit and edit["output_text"]:
            output_substring = " ".join(edit["output_text"])
            pos = target.find(output_substring)
            char_spans = map_word_sequence_to_char_span(edit["output_text"], target)
            edit["output_tokens"] = map_char_spans_to_token_indices(char_spans, target_tokens)
        
        if "constituent_edits" in edit and edit["constituent_edits"]:
            cons_input_tokens = []
            cons_output_tokens = []
            for cons_edit in edit["constituent_edits"]:
                # Process constituent edit input tokens
                if "input_idx" in cons_edit and cons_edit["input_idx"]:
                    cons_input_tokens.extend(map_char_spans_to_token_indices(cons_edit["input_idx"], source_tokens))
                elif "input_text" in cons_edit and cons_edit["input_text"]:
                    cons_char_spans = map_word_sequence_to_char_span(cons_edit["input_text"], source)
                    cons_input_tokens.extend(map_char_spans_to_token_indices(cons_char_spans, source_tokens))
                
                # Process constituent edit output tokens
                if "output_idx" in cons_edit and cons_edit["output_idx"]:
                    cons_output_tokens.extend(map_char_spans_to_token_indices(cons_edit["output_idx"], target_tokens))
                elif "output_text" in cons_edit and cons_edit["output_text"]:
                    cons_char_spans = map_word_sequence_to_char_span(cons_edit["output_text"], target)
                    cons_output_tokens.extend(map_char_spans_to_token_indices(cons_char_spans, target_tokens))
            
            # Merge the constituent tokens with the parent's tokens.
            edit["input_tokens"] = sorted(set(edit["input_tokens"] + cons_input_tokens))
            edit["output_tokens"] = sorted(set(edit["output_tokens"] + cons_output_tokens))
    
    # Also, add the full token lists to the item for debugging
    item["source_tokens"] = source_tokens
    item["target_tokens"] = target_tokens
    
    return item

# ------------------------------
# Batch Processing Function
# ------------------------------
def unify_edit_tokens_in_annotations(annotations: List[Dict]) -> List[Dict]:
    """
    Processes a list of annotation items, adding token mappings for each edit.
    Returns the updated list.
    """
    return [unify_edit_tokens_in_item(item) for item in annotations]

In [178]:
def safe_krippendorff_alpha(data_matrix, debug=False):
    """
    A helper that calls krippendorff.alpha, but returns 1.0 if the matrix
    has no variation (i.e. only one unique value).
    """
    unique_values = np.unique(data_matrix)
    if len(unique_values) <= 1:
        if debug:
            print("Data matrix has only one unique value. Interpreting as perfect agreement = 1.0")
        return 1.0
    return krippendorff.alpha(reliability_data=data_matrix, level_of_measurement="nominal")

In [179]:
def get_category_label(
    edit: Dict,
    quality_split: bool,
    heineman_typology: bool = False,
    debug: bool = False
) -> Optional[str]:
    """
    If `heineman_typology=True`, parse custom annotation fields
    (e.g., 'deletion_type', 'substitution_info_change', 'reorder_level', etc.)
    to figure out the appropriate enumerations for get_edit_subtype.
    --> only for Substitution, other types are clear.

    Otherwise, fall back to the old logic (quality_split + get_edit_quality).
    """

    if debug:
        print(f"DEBUG: startet get_category_label with edit={edit}, looking for annotation next")

    annotation = edit.get("annotation")

    if heineman_typology:
        # special additional lookup for Heineman's typology
        if not annotation:
            return None
       
        #  1) Figure out the base edit type:
        edit_type_str = (edit.get("category") or "").lower()
        if debug:
            print(f"Debug: category={edit_type_str}, annotation={annotation}")

        eq = None
        err_type = None
        reorder_lvl = None
        struct_type = None
        info_impact = None

        if debug:
            print(f"Debug: about to parse annotation={annotation}")
        
        #  5) Parse 'substitution_info_change', if present
        if "substitution_info_change" in annotation:
            sub_obj = annotation["substitution_info_change"]
            if isinstance(sub_obj, dict):
                base_val = sub_obj.get("val", "").lower()
                if base_val == "same":
                    info_impact = Information.SAME
                elif base_val == "less":
                    info_impact = Information.LESS
                elif base_val == "more":
                    info_impact = Information.MORE
                elif base_val == "different":
                    pass

                # See if sub_obj[base_val] is a dict with a "val": "good"/"bad"/"trivial"
                if base_val in sub_obj:
                    sub_info = sub_obj[base_val]
                    if isinstance(sub_info, dict):
                        sub2_val = sub_info.get("val", "").lower()
                        if sub2_val == "good":
                            eq = Quality.QUALITY
                        elif sub2_val == "bad_deletion":
                            eq = Quality.ERROR
                            err_type = Error.BAD_DELETION
                        elif sub2_val == "trivial":
                            eq = Quality.TRIVIAL

        if "reorder_level" in annotation:
            reorder_lvl = annotation["reorder_level"].get("val", "").lower()



    # Old Logic Branch (quality_split-based)
    category_str = (edit.get("category") or "").strip().lower()

    # Special case for "substitution" with Heineman's typology
    if heineman_typology:
        # If we have a specific info_impact, adjust the return category label
        if category_str == "substitution" and info_impact:
            if info_impact == Information.SAME:
                return f"Substitution - Same Information"
            elif info_impact == Information.LESS:
                return f"Substitution - Less Information"
            elif info_impact == Information.MORE:
                return f"Substitution - More Information"
            elif info_impact == Information.DIFFERENT:
                return f"Substitution - Different Information"
            else:
                # Should not happen
                return None
        elif category_str == "substitution":
            # if no info_impact, return nothing
            return None
        elif category_str == "deletion":
            return "Deletion - Less Information"
        elif category_str == "insertion":
            return "Insertion - More Information"
        elif category_str == "split":
            return "Split - Sentence Split"
        elif category_str == "structure":
            return "Structure - Structure"

        elif category_str == "reorder":
            if reorder_lvl == "word_level":
                return "Reordering - Word-level"
            elif reorder_lvl == "component_level":
                return "Reordering - Component-level"
            else:
                return None
       
    if not quality_split:
        return category_str.capitalize()

    # If quality_split = True => call get_edit_quality
    label = get_edit_quality(annotation, category_str)
    if label is None or (isinstance(label, str) and label.startswith("ERROR")):
        return category_str.capitalize()
    else:
        return label

In [180]:
# ------------------------------
# Build Token Presence Matrices
# ------------------------------
def build_token_presence_matrices(
    annotators_annotations: List[List[Dict]], 
    quality_split: bool = False,
    heineman_typology: bool = False,        
    debug: bool = False
) -> Dict[str, np.ndarray]:
    """
    Builds a token-level binary presence matrix for each label.
    If heineman_typology is True, we use get_edit_subtype. Otherwise, use the normal logic.
    The matrix has shape (n_annotators, total_tokens_across_sentences).
    """
    n_annotators = len(annotators_annotations)
    if n_annotators == 0:
        return {}
    
    n_sentences = len(annotators_annotations[0])
    results = [[{} for _ in range(n_sentences)] for _ in range(n_annotators)]
    labels_set = set()
    
    for sent_idx in range(n_sentences):
        first_item = annotators_annotations[0][sent_idx]
        source_len = len(first_item.get("source_tokens", []))
        target_len = len(first_item.get("target_tokens", []))
        sentence_length = source_len + target_len
        
        # Iterate Over Annotators and their Edits in the Sentence
        for a_idx in range(n_annotators):
            for edit in annotators_annotations[a_idx][sent_idx].get("edits", []):
                # differentiate between LLM format and default format
                if "input_text" in edit:
                    if quality_split:
                        quality = edit.get("quality")
                        category = edit.get("category")
                        # If either is missing, fallback to category (if available), else return None.
                        if quality is None or category is None:
                            label = None
                        else:
                            label = quality.capitalize() + " " + category.capitalize()
                    else: # quality_split=False
                        category = edit.get("category")
                        label = category.capitalize() if category is not None else None
                else:
                    label = get_category_label(edit, quality_split, heineman_typology, debug=debug)

                if label is None:
                    continue  # Skip edits with no annotation
                labels_set.add(label)
                if label not in results[a_idx][sent_idx]:
                    results[a_idx][sent_idx][label] = np.zeros(sentence_length, dtype=int)
                for tok_idx in edit.get("input_tokens", []):
                    if 0 <= tok_idx < sentence_length:
                        results[a_idx][sent_idx][label][tok_idx] = 1
                for tok_idx in edit.get("output_tokens", []):
                    if 0 <= tok_idx < sentence_length:
                        results[a_idx][sent_idx][label][tok_idx] = 1
    
    presence_by_label = {label: [] for label in labels_set}
    for label in labels_set:
        for a_idx in range(n_annotators):
            sentence_vecs = []
            for sent_idx in range(n_sentences):
                first_item = annotators_annotations[0][sent_idx]
                sent_len = len(first_item.get("source_tokens", [])) + len(first_item.get("target_tokens", []))
                vec = results[a_idx][sent_idx].get(label, np.zeros(sent_len, dtype=int))
                sentence_vecs.append(vec)
            full_vec = np.hstack(sentence_vecs)
            presence_by_label[label].append(full_vec)
    
    for label in presence_by_label:
        presence_by_label[label] = np.vstack(presence_by_label[label])
        if debug:
            print(f"\n[DEBUG] Label '{label}' token-level presence matrix shape: {presence_by_label[label].shape}")
            print(presence_by_label[label])
    
    return presence_by_label

In [181]:
# ------------------------------
# 1. Agreement Calculation for a Single Matrix
# ------------------------------
def calculate_token_agreement(matrix: np.ndarray, metric: str = 'cohen', debug: bool = False) -> float:
    debug=True
    """
    Calculates an agreement statistic for a given binary matrix.
    """
    
    # Check if all rows are identical; if so, return 1.0.
    if matrix.shape[0] > 0 and np.all(np.diff(matrix, axis=0) == 0):
        if debug:
            print("[DEBUG] All annotator rows are identical. Returning 1.0")
        return 1.0
    
    metric = metric.lower()
    if metric == 'cohen':
        if matrix.shape[0] == 2:
            result = cohen_kappa_score(matrix[0], matrix[1])
            if debug:
                print(f"[DEBUG] Cohen's Kappa calculated: {result}")
            return result
        else:
            primary = matrix[0]
            scores = []
            for i in range(1, matrix.shape[0]):
                s = cohen_kappa_score(primary, matrix[i])
                scores.append(s)
            avg_score = np.mean(scores) if scores else np.nan
            if debug:
                print(f"[DEBUG] Average Cohen's Kappa for >2 annotators: {avg_score}")
            return avg_score
    elif metric == 'krippendorff':
        try:
            result = safe_krippendorff_alpha(matrix)
            return result
        except Exception as e:
            if debug:
                print(f"[DEBUG] Error computing Krippendorff's Alpha: {e}")
            return np.nan

    elif metric == 'fleiss':
        return calculate_fleiss_kappa(matrix, debug=debug)

    else:
        raise ValueError(f"Unsupported metric: {metric}")


# ------------------------------
# 2. Agreement Calculation for All Labels
# ------------------------------

def calculate_agreement_for_all_labels(
    presence_matrices: Dict[str, np.ndarray], 
    metric: str = 'cohen', 
    debug: bool = False
) -> pd.DataFrame:
    """
    """
    if debug:
        print("DEBUG: runnning calculate_agreement_for_all_labels with input presence_matrices:")
        print(presence_matrices)
    
    results = []
    for label, mat in presence_matrices.items():
        # mat shape is (n_annotators, n_items)
        main_agree = calculate_token_agreement(mat, metric=metric, debug=debug)
        pa = calculate_percent_agreement(mat, debug=debug)
        ac1 = calculate_gwets_ac1(mat, debug=debug)

        if debug:
            print(f"DEBUG: label={label}, main_agree={main_agree}, pa={pa}, ac1={ac1}")
        
        # 4) Count how many items (columns) have at least one annotator marking 1
        presence_any = np.any(mat > 0, axis=0)  # True if any annotator has a 1 in that item
        n_present_any = np.sum(presence_any)   # count of items labeled by at least one annotator
        pct_present_any = n_present_any / mat.shape[1] if mat.shape[1] else 0.0
        
        results.append({
            'label': label,
            'main_agreement': main_agree,      # Cohen/Krippendorff
            'percent_agreement': pa,           # fraction of columns with identical coding
            'gwets_ac1': ac1,                  # Gwet's AC1
            'n_present_any': n_present_any,    # how many items had the label at least once
            'pct_present_any': pct_present_any # fraction of items that had the label
        })
    return pd.DataFrame(results).sort_values(by='label')


In [182]:
def build_sentence_presence_matrices(
    annotators_annotations: List[List[Dict]],
    quality_split: bool = False,
    heineman_typology: bool = False,   
    debug: bool = False
) -> Dict[str, np.ndarray]:
    """
    Builds sentence-level presence matrices of shape (n_annotators, n_sentences) for each label.
    
    - assume annotators_annotations[a][s] is the item for annotator 'a' and sentence index 's'.
    - if, in that sentence, an edit of label L is present, we set presence_matrix[a, s] = 1, otherwise 0.
    """
    
    n_annotators = len(annotators_annotations)
    if n_annotators == 0:
        return {}
    n_sentences = len(annotators_annotations[0])
    
    # We'll collect labels from whichever approach we are using
    if heineman_typology:
        label_set = set()
    else:
        if quality_split:
            label_set = QUALITY_EXPANDED_MAPPING
        else:
            label_set = {key.capitalize() for key in mapping.keys()}

    presence_by_label = {}

    for a_idx in range(n_annotators):
        if debug:
            print(f"   -- Looking at another number: {a_idx}  -- ")
        for s_idx in range(n_sentences):
            item = annotators_annotations[a_idx][s_idx]

            # Track which labels appear in this sentence
            labels_in_sentence = set()
            for edit in item.get("edits", []):
                # differentiate between LLM format and default format
                if "input_text" in edit:
                    if quality_split:
                        quality = edit.get("quality")
                        category = edit.get("category")
                        # If either is missing, fallback to category (if available), else return None.
                        if quality is None or category is None:
                            label = None
                        else:
                            label = quality.capitalize() + " " + category.capitalize()
                    else: # quality_split=False
                        category = edit.get("category")
                        label = category.capitalize() if category is not None else None
                else:
                    label = get_category_label(edit, quality_split, heineman_typology, debug=debug)
                if debug:
                    print(f"   --> [DEBUG] Extracted label: {label}")
                if label is not None:
                    labels_in_sentence.add(label)

            if debug:
                print(f"   [DEBUG] Sentence {s_idx} labels: {labels_in_sentence}")
                print(f"   [DEBUG] ... now about to iterate over these labels")
            for label in labels_in_sentence:
                if label not in presence_by_label:
                    if debug:
                        print(f"[   DEBUG] label not in presence_by_label: {label}, filling up with zeros")
                    presence_by_label[label] = np.zeros((n_annotators, n_sentences), dtype=int)
                
                presence_by_label[label][a_idx, s_idx] = 1
            if debug:
                print(f"=> [DEBUG] Sentence presence matrices built: {presence_by_label}")
    if debug:
        print(f"[DEBUG] Sentence presence matrices built for labels: {presence_by_label.keys()}")
    
    return presence_by_label

In [183]:
def calculate_percent_agreement(matrix: np.ndarray, debug: bool = False) -> float:
    """
    Given a binary matrix of shape (n_annotators, n_items), calculates the percentage
    of items (columns) for which all annotators agree.
    """
    # For each item, check if all raters agree (compare each column to the first row)
    all_agree = np.all(matrix == matrix[0, :], axis=0)
    pa = np.mean(all_agree)
    if debug:
        print(f"[DEBUG] Percent Agreement: {pa} (over {matrix.shape[1]} items)")
    return pa

def calculate_gwets_ac1(matrix: np.ndarray, debug: bool = False) -> float:
    """
    Given a binary matrix of shape (n_annotators, n_items), calculates Gwet's AC1.
    
    This function transposes the matrix (so rows=items, columns=annotators) and then
    creates a CAC object from the irrCAC.raw module.
    """
    debug = False # disable debugging for Gwet's AC1
    
    try:
        # Import the CAC class from irrCAC.raw
        from irrCAC.raw import CAC
    except ImportError:
        print("Error: irrCAC package not installed. Please install it via 'pip install irrCAC'")
        return np.nan
    
    # Transpose matrix: now rows = items, columns = raters
    data = np.transpose(matrix)
    # Create a DataFrame with column names "Rater1", "Rater2", etc.
    df = pd.DataFrame(data, columns=[f"Rater{i+1}" for i in range(matrix.shape[0])])
    if debug:
        print(f"[DEBUG] DataFrame shape for Gwet's AC1: {df.shape}")
        print(df)
    try:
        cac_obj = CAC(df)
        gwet_results = cac_obj.gwet()
        ac1 = gwet_results['est']['coefficient_value']
        if debug:
            print(f"[DEBUG] Gwet's AC1: {ac1}")
        return ac1
    except Exception as e:
        if debug:
            print(f"[DEBUG] Error computing Gwet's AC1: {e}")
        return np.nan

def calculate_fleiss_kappa(matrix: np.ndarray, debug: bool = False) -> float:
    """
    Given a binary matrix of shape (n_annotators, n_items), calculates Fleiss' Kappa.
    
    We transpose the matrix (so rows=items, columns=annotators) and then
    create a CAC object from the irrCAC.raw module and call its fleiss() method.
    """
    try:
        from irrCAC.raw import CAC
    except ImportError:
        print("Error: irrCAC package not installed. Please install it via 'pip install irrCAC'.")
        return float('nan')
    
    # Transpose so that rows=items, columns=annotators
    data = np.transpose(matrix)
    df = pd.DataFrame(data, columns=[f"Rater{i+1}" for i in range(matrix.shape[0])])

    try:
        cac_obj = CAC(df)
        fleiss_results = cac_obj.fleiss()
        fleiss_kappa = fleiss_results['est']['coefficient_value']
        return fleiss_kappa
    except Exception as e:
        return float('nan')

In [184]:
def run_pairwise_agreement_pipeline(
    primary_file: str,
    secondary_files: List[str],
    primary_format: str,
    secondary_format: str,
    level: str = "token",
    quality_split: bool = True,
    metric: str = "cohen",
    heineman_typology: bool = False,    
    debug: bool = False,
    return_pairwise: bool = False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Runs a pairwise agreement pipeline that always compares the primary annotator (from primary_file)
    against each secondary annotator.
    
    Parameters:
      primary_file: Path to the primary annotator file.
      secondary_files: List of paths to secondary annotator files.
      primary_format: Format for the primary file ("default" or "llm").
      secondary_format: Format for the secondary files ("default" or "llm").
      level: "token" or "sentence".
      quality_split: Whether to produce detailed labels (e.g., "Good Deletion") or just main types ("Deletion").
      metric: "cohen" or "krippendorff".
      debug: If True, prints debug statements.
      return_pairwise: If True, returns both the combined DataFrame (averaged across pairs) and a copy
                       of the non-aggregated (big) DataFrame.
    
    If heineman_typology=True, we switch from normal get_edit_quality to get_edit_subtype.
    """
    # 1. Parse & unify primary data
    primary_data = parse_annotations(primary_file, format_type=primary_format)
    primary_data = unify_edit_tokens_in_annotations(primary_data)

    if debug:
        print(f"DEBUG:  primary_data = {primary_data}")
    
    pairwise_results_dfs = []
    
    # 2. Loop over secondary files
    for sec_file in secondary_files:
        sec_data = parse_annotations(sec_file, format_type=secondary_format)
        sec_data = unify_edit_tokens_in_annotations(sec_data)
        
        annotators = [primary_data, sec_data]
        
        if level.lower() == "token":
            pres_matrices = build_token_presence_matrices(
                annotators, 
                quality_split=quality_split, 
                heineman_typology=heineman_typology, 
                debug=debug
            )
        elif level.lower() == "sentence":
            if debug:
                print(f"DEBUG: Running sentence-level agreement pipeline (build_sentence_presence_matrices)")
            pres_matrices = build_sentence_presence_matrices(
                annotators,
                quality_split=quality_split,
                heineman_typology=heineman_typology,
                debug=debug
            )
        else:
            raise ValueError(f"Unsupported level: {level}")
        
        df_agreement = calculate_agreement_for_all_labels(pres_matrices, metric=metric, debug=debug)
        df_agreement["primary"] = primary_file.split("/")[-1]
        df_agreement["secondary"] = sec_file.split("/")[-1]
        
        pairwise_results_dfs.append(df_agreement)
    
    # 3. Combine the pairwise results into one DataFrame
    combined_df_big = pd.concat(pairwise_results_dfs, ignore_index=True)

    # if there is a column where n_present_any is 0, we should remove it
    combined_df_big = combined_df_big[combined_df_big["n_present_any"] > 0]

    # 4. Average the numeric columns across comparisons.
    numeric_cols = ["main_agreement", "percent_agreement", "gwets_ac1", "n_present_any", "pct_present_any"]
    numeric_cols = ["main_agreement", "gwets_ac1"]
    combined_df_avg = combined_df_big.groupby("label", as_index=False)[numeric_cols].mean()
    
    # 5. Sorting the final aggregated DataFrame.
    if quality_split:
        # For quality-split labels (e.g., "Good Deletion", "Trivial Deletion", "Bad Deletion"),
        # sort by the base edit type alphabetically and then by quality in the order: Good, Trivial, Bad.
        combined_df_avg['quality'] = combined_df_avg['label'].apply(lambda x: x.split()[0] if len(x.split()) > 1 else "")
        combined_df_avg['base'] = combined_df_avg['label'].apply(lambda x: " ".join(x.split()[1:]) if len(x.split()) > 1 else x)
        quality_order = ["Good", "Trivial", "Bad"]
        combined_df_avg['quality'] = pd.Categorical(combined_df_avg['quality'], categories=quality_order, ordered=True)
        combined_df_avg = combined_df_avg.sort_values(by=["base", "quality"])
        combined_df_avg = combined_df_avg.drop(columns=["base", "quality"])
    else:
        combined_df_avg = combined_df_avg.sort_values(by="label")
    
    if return_pairwise:
        return combined_df_avg, combined_df_big
    else:
        return combined_df_avg

---

## Part 1: Validate Human Annotations by calculating IAA on a subsample

### Evaluating per-Token Agreement

This first analysis step looks at each word (token) and the by the annotators identified edit types (following Heineman et al.'s own methodology, but simplifying it slightly).

In [None]:
# Load files
primary_annotator_file = '../data/salsa_peer_annotations/final_peer_annotations/annotations_M_Gold.json'

# based on directory, load all json file names into a list
peer_dir = '../data/salsa_peer_annotations/final_peer_annotations/'

peer_files = [peer_dir + f for f in os.listdir(peer_dir) if f.endswith('.json')]
# remove from peer_files any files containing "LLM" (LLM's own annotations)
peer_files = [f for f in peer_files if "LLM" not in f]

# remove gold standard file from peer_files
peer_files = [f for f in peer_files if "Gold" not in f]

secondary_annotator_files = peer_files

secondary_annotator_files

['../data/salsa_peer_annotations/final_peer_annotations/annotations_DO.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_D.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_E.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_S.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_R.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_G.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_L.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_T.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_A.json',
 '../data/salsa_peer_annotations/final_peer_annotations/annotations_M.json']

#### Token: By Edit Type

In [186]:
results_token_general, results_token_general_pairWise = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file,
    secondary_files=secondary_annotator_files,
    primary_format="default",
    secondary_format="default",
    level="token",
    quality_split=False,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

# Exporting raw Data for R Graphs
#results_token_general_pairWise.to_csv("../data/charts_data/IAA_tokenlevel_NoQuali.csv", index=False)

# create a df with human vs LLM information (for charts)
df_perToken_MB_vs_LLM = results_token_general.copy()

display(results_token_general)

# df_to_enhanced_latex(results_token_general,
#     caption = "Token-Level IAA without Quality Split",
#     label = "tab:iaa_token_noquali"
#     )

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0


Unnamed: 0,label,main_agreement,gwets_ac1
0,Deletion,0.843361,0.952302
1,Insertion,0.376332,0.994835
2,Reorder,0.526138,0.952617
3,Split,0.77542,0.992042
4,Structure,0.111699,0.95567
5,Substitution,0.679386,0.890626


In [187]:
results_token_withQuality, results_token_withQuality_pairWise = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file,
    secondary_files=secondary_annotator_files,
    primary_format="default",
    secondary_format="default",
    level="token",
    quality_split=True,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

# Exporting raw Data for R Graphs
results_token_withQuality_pairWise.to_csv("../data/charts_data/IAA_tokenlevel_WithQuali.csv", index=False)

display(results_token_withQuality)

# df_to_enhanced_latex(results_token_withQuality,
#     caption = "Token-Level IAA with Quality Split",
#     label = "tab:iaa_token_quali"
#     )

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0


Unnamed: 0,label,main_agreement,gwets_ac1
5,Good Deletion,0.341206,0.904734
13,Trivial Deletion,0.059268,0.973993
0,Bad Deletion,0.468416,0.9461
6,Good Insertion,0.298991,0.996689
14,Trivial Insertion,0.166331,0.9978
1,Bad Insertion,-0.000305,0.997557
7,Good Reorder,0.518947,0.953109
2,Bad Reorder,-0.005521,0.987035
8,Good Split,0.684931,0.990757
15,Trivial Split,-0.002295,0.993555


In [188]:
# Combine into one table, ready for LaTeX Export

import re

# Create a new DataFrame to hold the combined data
combined_df = pd.DataFrame(columns=['Edit Type', 'Quality Split', 'main_agreement', 'gwets_ac1'])
# Get unique main edit types and sort them alphabetically
main_edit_types = sorted(results_token_general['label'].unique())
print(main_edit_types)

# Function to determine the main edit type of each quality split
def get_main_type(quality_label):
    for edit_type in main_edit_types:
        if edit_type in quality_label:
            print(f": Found {edit_type} in {quality_label}")
            return edit_type
    return None

# Add a main type column to quality splits data
df_quality = results_token_withQuality.copy()
print(df_quality)
df_quality['main_type'] = df_quality['label'].apply(get_main_type)
print(df_quality)



# Loop through each main edit type
for edit_type in main_edit_types:
    # Add the main edit type row
    main_row = results_token_general[results_token_general['label'] == edit_type]
    print(f": main_row: {main_row}")
    combined_df = pd.concat([combined_df, pd.DataFrame({
        'Edit Type': edit_type,
        'Quality Split': '',
        'main_agreement': main_row['main_agreement'].values[0],
        'gwets_ac1': main_row['gwets_ac1'].values[0]
    }, index=[0])], ignore_index=True)
    
    # Find all quality split rows for this edit type
    quality_rows = df_quality[df_quality['main_type'] == edit_type]
    
    # If there are quality splits for this edit type, add them
    for _, row in quality_rows.iterrows():
        combined_df = pd.concat([combined_df, pd.DataFrame({
            'Edit Type': '',
            'Quality Split': row['label'],
            'main_agreement': row['main_agreement'],
            'gwets_ac1': row['gwets_ac1']
        }, index=[0])], ignore_index=True)

# Display the result
display(combined_df)

# df_to_enhanced_latex(combined_df,
#     caption = "Token-Level IAA with Quality Split - XXX",
#     label = "tab:iaa_token_quali_comparison"
#     )

['Deletion', 'Insertion', 'Reorder', 'Split', 'Structure', 'Substitution']
                   label  main_agreement  gwets_ac1
5          Good Deletion        0.341206   0.904734
13      Trivial Deletion        0.059268   0.973993
0           Bad Deletion        0.468416   0.946100
6         Good Insertion        0.298991   0.996689
14     Trivial Insertion        0.166331   0.997800
1          Bad Insertion       -0.000305   0.997557
7           Good Reorder        0.518947   0.953109
2            Bad Reorder       -0.005521   0.987035
8             Good Split        0.684931   0.990757
15         Trivial Split       -0.002295   0.993555
11                 Split        0.000000   0.998170
9         Good Structure        0.096790   0.964144
16     Trivial Structure       -0.003679   0.990757
3          Bad Structure       -0.002752   0.992630
10     Good Substitution        0.300699   0.920880
17  Trivial Substitution        0.515042   0.926657
4       Bad Substitution        0.485945 

  combined_df = pd.concat([combined_df, pd.DataFrame({


Unnamed: 0,Edit Type,Quality Split,main_agreement,gwets_ac1
0,Deletion,,0.843361,0.952302
1,,Good Deletion,0.341206,0.904734
2,,Trivial Deletion,0.059268,0.973993
3,,Bad Deletion,0.468416,0.9461
4,Insertion,,0.376332,0.994835
5,,Good Insertion,0.298991,0.996689
6,,Trivial Insertion,0.166331,0.9978
7,,Bad Insertion,-0.000305,0.997557
8,Reorder,,0.526138,0.952617
9,,Good Reorder,0.518947,0.953109


#### Token: Heineman Typology
For 1:1 comparison of achieved IAA

In [189]:
results_token_withQuality_HM, results_token_withQuality_pairWise_HM = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file,
    secondary_files=secondary_annotator_files,
    primary_format="default",
    secondary_format="default",
    level="token",
    heineman_typology = True,   # <-- NEW!
    quality_split = False,      # Usually  ignore quality if using Heineman’s
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)


# Define the correct order based on Heineman Table
correct_order = [
    "Insertion - More Information",
    "Deletion - Less Information",
    "Substitution - More Information",
    "Substitution - Less Information",
    "Reordering - Word-level",
    "Reordering - Component-level",
    "Split - Sentence Split",
    "Structure - Structure",
    "Substitution - Same Information"
]

# Reorder the DataFrame
results_token_withQuality_HM['label'] = pd.Categorical(
    results_token_withQuality_HM['label'], 
    categories=correct_order, 
    ordered=True
)

# Sort the DataFrame
results_token_withQuality_HM = results_token_withQuality_HM.sort_values('label')

# Exporting raw Data for R Graphs
results_token_withQuality_pairWise.to_csv("../data/charts_data/IAA_tokenlevel_WithQuali.csv", index=False)

display(results_token_withQuality_HM)

# df_to_enhanced_latex(results_token_withQuality_HM,
#     caption = "Token-Level IAA with Quality Split",
#     label = "tab:iaa_token_quali"
#     )

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0


Unnamed: 0,label,main_agreement,gwets_ac1
1,Insertion - More Information,0.376332,0.994835
0,Deletion - Less Information,0.843361,0.952302
7,Substitution - More Information,-0.003674,0.99077
6,Substitution - Less Information,0.346859,0.951347
3,Reordering - Word-level,0.081874,0.970355
2,Reordering - Component-level,0.281625,0.955547
4,Split - Sentence Split,0.767981,0.991855
5,Structure - Structure,0.111699,0.95567
8,Substitution - Same Information,0.598201,0.903268


### Evaluating per-Sentence Agreement
#### (no quality split))

The previous analysis looked at per-token identification of edit types.

Inspired by Heineman et al (2023), we look at per-sentence level identification of bad or good edits of each type.


In [190]:
# First just for completene the non-quality split version

results_sentence_general, results_sentence_general_pairWise = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file,
    secondary_files=secondary_annotator_files,
    primary_format="default",
    secondary_format="default",
    level="sentence",
    quality_split=False,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

display(results_sentence_general)

# df_to_enhanced_latex(results_sentence_general,
#     caption = "Sentence-Level IAA without Quality Split",
#     label = "tab:iaa_sentence_noquali"
#     )

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Return

Unnamed: 0,label,main_agreement,gwets_ac1
0,Deletion,0.7,0.96685
1,Insertion,0.280189,0.736903
2,Reorder,0.895604,0.908255
3,Split,0.961616,0.960396
4,Structure,0.100388,0.328073
5,Substitution,0.136152,0.754567


#### (with quality split))

In [191]:
# Now with quality split
results_sentence_withQuality, results_sentence_withQuality_pairWise = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file,
    secondary_files=secondary_annotator_files,
    primary_format="default",
    secondary_format="default",
    level="sentence",
    quality_split=True,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

display(results_sentence_withQuality)

# df_to_enhanced_latex(results_sentence_withQuality,
#     caption = "Sentence-Level IAA with Quality Split",
#     label = "tab:iaa_sentence_quali"
#     )

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0


Unnamed: 0,label,main_agreement,gwets_ac1
5,Good Deletion,0.251044,0.300101
13,Trivial Deletion,0.036759,0.473365
0,Bad Deletion,0.134033,0.176685
6,Good Insertion,0.28125,0.874876
14,Trivial Insertion,0.151634,0.871497
1,Bad Insertion,0.0,0.8895
7,Good Reorder,0.871245,0.892121
2,Bad Reorder,0.0,0.8895
8,Good Split,0.749518,0.777344
15,Trivial Split,-0.080556,0.683775


In [192]:
# Now with quality split
# --> Fleiss Kappa to match Heineman
results_sentence_withQuality_FK, results_sentence_withQuality_pairWise_FK = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file,
    secondary_files=secondary_annotator_files,
    primary_format="default",
    secondary_format="default",
    level="sentence",
    quality_split=True,
    metric="fleiss",
    debug=False,
    return_pairwise=True
)

display(results_sentence_withQuality_FK)

# df_to_enhanced_latex(results_sentence_withQuality_FK,
#     caption = "Sentence-Level IAA with Quality Split - Fleiss' Kappa",
#     label = "tab:iaa_sentence_quali_fleiss"
#     )

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0


Unnamed: 0,label,main_agreement,gwets_ac1
5,Good Deletion,0.211626,0.300101
13,Trivial Deletion,-0.013937,0.473365
0,Bad Deletion,0.088457,0.176685
6,Good Insertion,0.243422,0.874876
14,Trivial Insertion,0.106984,0.871497
1,Bad Insertion,-0.05263,0.8895
7,Good Reorder,0.864469,0.892121
2,Bad Reorder,-0.05263,0.8895
8,Good Split,0.736335,0.777344
15,Trivial Split,-0.137425,0.683775


---
## Part 2: Validate LLM Annotations by calculating IAA

Just as before, we can compare the LLM-generated annotations like we compared the human annotations.

Two analyses: One entire dataset (or random subset), and one pair-wise comparison with peer annotations (10x10 sentences).

### Evaluating per-Token Agreement
#### Main Analysis

In [193]:
# Load files

# GPT-4o:
#primary_annotator_file_LLM = '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_GPT4o_FULL.json'

# DeepSeek R1;
primary_annotator_file_LLM = '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_DeepSeekR1_FULL.json'

In [194]:
results_token_general_LLM, results_token_general_pairWise_LLM = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file_LLM,
    secondary_files=secondary_annotator_files,
    primary_format="llm",
    secondary_format="default",
    level="token",
    quality_split=False,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

display(results_token_general_LLM)

Unnamed: 0,label,main_agreement,gwets_ac1
0,Deletion,0.643498,0.916507
1,Insertion,0.23475,0.99186
2,Reorder,0.601899,0.960269
3,Split,0.736026,0.990129
4,Structure,-0.002215,0.836499
5,Substitution,0.515935,0.799942


In [195]:
# merge on label
df_perToken_MB_vs_LLM2 = pd.merge(df_perToken_MB_vs_LLM, results_token_general_LLM, on="label", suffixes=("_MB", "_LLM"))


In [196]:
results_token_withQuali_LLM, results_token_withQuali_LLM_pairWise = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file_LLM,
    secondary_files=secondary_annotator_files,
    primary_format="llm",
    secondary_format="default",
    level="token",
    quality_split=True,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

display(results_token_withQuali_LLM)

Unnamed: 0,label,main_agreement,gwets_ac1
5,Good Deletion,0.178779,0.936687
13,Trivial Deletion,-0.011784,0.974025
0,Bad Deletion,0.519227,0.944595
6,Good Insertion,0.238128,0.994846
14,Trivial Insertion,-0.00055,0.997066
1,Bad Insertion,-0.000305,0.997557
7,Good Reorder,0.618326,0.963175
2,Bad Reorder,-0.005521,0.987035
8,Good Split,0.649014,0.988846
15,Trivial Split,-0.002295,0.993555


In [197]:
# merge on label for export
df_perTokenWithQuali_MB_vs_LLM2 = pd.merge(df_perToken_MB_vs_LLM, results_token_general_LLM, on="label", suffixes=("_MB", "_LLM"))

#### PerToken: Finding best model

In [198]:
# LLM Files in Directory
directory_path = "../data/LLM_annotations"
llm_files = os.listdir(directory_path)

# those containing "PeerSet" are the ones we want to use
llm_files = [file for file in llm_files if "PeerSet" in file]

llm_files = [os.path.join(directory_path, file) for file in llm_files]

llm_files

['../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_LLAMA70b_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_ClaudeSonnet_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_LLAMA70b_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_ClaudeSonnet_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_LLAMA8b_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_O1Prev_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_GPT4o_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_LLAMA8b_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_DeepSeekR1_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_GPT4omini_FULL.json',
 '../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_o1Prev_FULL.json',
 '../data/LLM_annotation

In [199]:
df_results_all = pd.DataFrame()

df_results_means = pd.DataFrame()

for file in llm_files:
    print(f"Processing file: {file}")

    # run the pipeline for each file, conncatonating the results (and indicating the used model in a column)

    results_token_general_LLM = run_pairwise_agreement_pipeline(
        primary_file=file,
        secondary_files=secondary_annotator_files,
        primary_format="llm",
        secondary_format="default",
        level="token",
        quality_split=False,
        metric="krippendorff",
        debug=False,
        return_pairwise=False
    )

    # calculate overall average
    avg_main_agreement = results_token_general_LLM["main_agreement"].median() ## CHANGE THIS MEDIAN
    print(f"Average Main Agreement: {avg_main_agreement}")

    modelName = re.search(r"Heineman_(.*?)_FULL", file).group(1)

    results_token_general_LLM["model"] = modelName

    df_results_means.loc[modelName, "main_agreement"] = avg_main_agreement


    df_results_all = pd.concat([df_results_all, results_token_general_LLM])

# order by main agreement
df_results_means = df_results_means.sort_values(by="main_agreement", ascending=False)
display(df_results_means)

Processing file: ../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_LLAMA70b_FULL.json
Average Main Agreement: 0.4203563751360424
Processing file: ../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_ClaudeSonnet_FULL.json
Average Main Agreement: 0.4735499239647381
Processing file: ../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_LLAMA70b_FULL.json
Average Main Agreement: 0.19935251688093608
Processing file: ../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_ClaudeSonnet_FULL.json
Average Main Agreement: 0.5254886642005994
Processing file: ../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_LLAMA8b_FULL.json
Average Main Agreement: 0.30557168517721667
Processing file: ../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_ZeroShot_O1Prev_FULL.json
Average Main Agreement: 0.3882767109616878
Processing file: ../data/LLM_annotations/LLM_annotations_PeerSet_Heineman_FewShot_GPT4o_FULL.json
Average Main Agreement: 0.36

Unnamed: 0,main_agreement
FewShot_DeepSeekR1,0.558917
ZeroShot_ClaudeSonnet,0.525489
ZeroShot_GPT4o,0.504333
FewShot_ClaudeSonnet,0.47355
ZeroShot_GPT4omini,0.425705
FewShot_LLAMA70b,0.420356
ZeroShot_O1Prev,0.388277
ZeroShot_DeepSeekR1,0.383012
FewShot_GPT4o,0.36429
FewShot_GPT4omini,0.34052


In [200]:
df_results_means

# separate the index into two columns by using "_" as separator
df_results_means2 = df_results_means.reset_index()
df_results_means2[["Prompt Type", "model"]] = df_results_means2["index"].str.split("_", expand=True)

# reorder columns
df_results_means2 = df_results_means2[["model","Prompt Type",  "main_agreement"]]

display(df_results_means2)


Unnamed: 0,model,Prompt Type,main_agreement
0,DeepSeekR1,FewShot,0.558917
1,ClaudeSonnet,ZeroShot,0.525489
2,GPT4o,ZeroShot,0.504333
3,ClaudeSonnet,FewShot,0.47355
4,GPT4omini,ZeroShot,0.425705
5,LLAMA70b,FewShot,0.420356
6,O1Prev,ZeroShot,0.388277
7,DeepSeekR1,ZeroShot,0.383012
8,GPT4o,FewShot,0.36429
9,GPT4omini,FewShot,0.34052


### Evaluating per-Sentence Agreement

In [201]:
results_sentence_NoQuali_LLM, results_sentence_NoQuali_LLM_PairWise = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file_LLM,
    secondary_files=secondary_annotator_files,
    primary_format="llm",
    secondary_format="default",
    level="sentence",
    quality_split=False,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

display(results_sentence_NoQuali_LLM)

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Return

Unnamed: 0,label,main_agreement,gwets_ac1
0,Deletion,0.7,0.96685
1,Insertion,0.220814,0.707491
2,Reorder,0.805128,0.839289
3,Split,0.961616,0.960396
4,Structure,0.016872,0.254679
5,Substitution,0.046078,0.847107


In [202]:
# now with quality split
results_sentence_WithQuali_LLM, results_sentence_WithQuali_LLM_PairWise = run_pairwise_agreement_pipeline(
    primary_file=primary_annotator_file_LLM,
    secondary_files=secondary_annotator_files,
    primary_format="llm",
    secondary_format="default",
    level="sentence",
    quality_split=True,
    metric="krippendorff",
    debug=False,
    return_pairwise=True
)

display(results_sentence_WithQuali_LLM)

[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0
[DEBUG] All annotator rows are identical. Returning 1.0


Unnamed: 0,label,main_agreement,gwets_ac1
5,Good Deletion,0.096548,0.191511
13,Trivial Deletion,-0.164979,0.465486
0,Bad Deletion,0.311421,0.385211
6,Good Insertion,0.28125,0.874876
14,Trivial Insertion,0.046078,0.847107
1,Bad Insertion,0.0,0.8895
7,Good Reorder,0.774366,0.825772
2,Bad Reorder,0.0,0.8895
8,Good Split,0.749518,0.777344
15,Trivial Split,-0.080556,0.683775


In [203]:
# concat all results and export to csv
all_results_human_per_token = pd.concat([
    results_token_general, 
    results_token_withQuality, 
    ])

# add a column for the type of annotation
all_results_human_per_token["detail"] = "token"

all_results_human_per_sentence = pd.concat([
    results_sentence_general, 
    results_sentence_withQuality
    ])
# add detail
all_results_human_per_sentence["detail"] = "sentence"

all_results_human = pd.concat([all_results_human_per_token, all_results_human_per_sentence])
# add a column for the type of annotation
all_results_human["annotation_type"] = "human"

all_results_LLM = pd.concat([
    results_token_general_LLM, 
    results_token_withQuali_LLM, 
    results_sentence_NoQuali_LLM, 
    results_sentence_WithQuali_LLM
    ])

# add a column for the type of annotation
all_results_LLM["annotation_type"] = "LLM"

all_results = pd.concat([all_results_human, all_results_LLM])

# if label contains Good, Bad or Trivial: new column "quality_split"
all_results["quality_split"] = all_results["label"].str.contains("Good|Bad|Trivial")

#all_results.to_csv("../data/charts_data/IAA_all.csv", index=False)

all_results


Unnamed: 0,label,main_agreement,gwets_ac1,detail,annotation_type,model,quality_split
0,Deletion,0.843361,0.952302,token,human,,False
1,Insertion,0.376332,0.994835,token,human,,False
2,Reorder,0.526138,0.952617,token,human,,False
3,Split,0.775420,0.992042,token,human,,False
4,Structure,0.111699,0.955670,token,human,,False
...,...,...,...,...,...,...,...
3,Bad Structure,0.000000,0.889500,,LLM,,True
10,Good Substitution,0.149419,0.303824,,LLM,,True
17,Trivial Substitution,0.048129,0.039428,,LLM,,True
4,Bad Substitution,0.414213,0.449500,,LLM,,True


### Creating tables for comparison to human annotator

In [204]:
#df_perToken_MB_vs_LLM2.to_csv("../data/charts_data/IAA_tokenlevel_NoQuali_MBLLM.csv", index=False)

# keep only the relevant columns
df_perToken_MB_vs_LLM2 = df_perToken_MB_vs_LLM2[["label", "main_agreement_MB", "main_agreement_LLM"]]

display(df_perToken_MB_vs_LLM2)

Unnamed: 0,label,main_agreement_MB,main_agreement_LLM
0,Deletion,0.843361,0.643498
1,Insertion,0.376332,0.23475
2,Reorder,0.526138,0.601899
3,Split,0.77542,0.736026
4,Structure,0.111699,-0.002215
5,Substitution,0.679386,0.515935


In [205]:
# Token-Level with Quality Split
df_perTokenWithQuality_MB_vs_LLM = pd.merge(results_token_withQuality, results_token_withQuali_LLM, on="label", suffixes=("_MB", "_LLM"))

# filter out labels "Split" and "Substitution"
df_perTokenWithQuality_MB_vs_LLM = df_perTokenWithQuality_MB_vs_LLM[df_perTokenWithQuality_MB_vs_LLM["label"]!="Split"]
df_perTokenWithQuality_MB_vs_LLM = df_perTokenWithQuality_MB_vs_LLM[df_perTokenWithQuality_MB_vs_LLM["label"]!="Substitution"]

# keep columns label, main_agreement_MB, main_agreement_LLM
df_perTokenWithQuality_MB_vs_LLM = df_perTokenWithQuality_MB_vs_LLM[["label", "main_agreement_MB", "main_agreement_LLM"]]

display(df_perTokenWithQuality_MB_vs_LLM)

Unnamed: 0,label,main_agreement_MB,main_agreement_LLM
0,Good Deletion,0.341206,0.178779
1,Trivial Deletion,0.059268,-0.011784
2,Bad Deletion,0.468416,0.519227
3,Good Insertion,0.298991,0.238128
4,Trivial Insertion,0.166331,-0.00055
5,Bad Insertion,-0.000305,-0.000305
6,Good Reorder,0.518947,0.618326
7,Bad Reorder,-0.005521,-0.005521
8,Good Split,0.684931,0.649014
9,Trivial Split,-0.002295,-0.002295


In [206]:
# Last: Sentence-Level (first without quality split)
df_perSentence_MB_vs_LLM = pd.merge(results_sentence_general, results_sentence_NoQuali_LLM, on="label", suffixes=("_MB", "_LLM"))

# keep only the relevant columns
df_perSentence_MB_vs_LLM = df_perSentence_MB_vs_LLM[["label", "main_agreement_MB", "main_agreement_LLM"]]

df_perSentence_MB_vs_LLM

Unnamed: 0,label,main_agreement_MB,main_agreement_LLM
0,Deletion,0.7,0.7
1,Insertion,0.280189,0.220814
2,Reorder,0.895604,0.805128
3,Split,0.961616,0.961616
4,Structure,0.100388,0.016872
5,Substitution,0.136152,0.046078


In [207]:
# Sentence-Level WITH   Quality Split
df_perSentenceWithQuality_MB_vs_LLM = pd.merge(results_sentence_withQuality, results_sentence_WithQuali_LLM, on="label", suffixes=("_MB", "_LLM"))

# filter out labels "Split" and "Substitution"
df_perSentenceWithQuality_MB_vs_LLM = df_perSentenceWithQuality_MB_vs_LLM[df_perSentenceWithQuality_MB_vs_LLM["label"]!="Split"]
df_perSentenceWithQuality_MB_vs_LLM = df_perSentenceWithQuality_MB_vs_LLM[df_perSentenceWithQuality_MB_vs_LLM["label"]!="Substitution"]

# keep columns label, main_agreement_MB, main_agreement_LLM
df_perSentenceWithQuality_MB_vs_LLM = df_perSentenceWithQuality_MB_vs_LLM[["label", "main_agreement_MB", "main_agreement_LLM"]]

df_perSentenceWithQuality_MB_vs_LLM

Unnamed: 0,label,main_agreement_MB,main_agreement_LLM
0,Good Deletion,0.251044,0.096548
1,Trivial Deletion,0.036759,-0.164979
2,Bad Deletion,0.134033,0.311421
3,Good Insertion,0.28125,0.28125
4,Trivial Insertion,0.151634,0.046078
5,Bad Insertion,0.0,0.0
6,Good Reorder,0.871245,0.774366
7,Bad Reorder,0.0,0.0
8,Good Split,0.749518,0.749518
9,Trivial Split,-0.080556,-0.080556


## Part (3): Validate Taxonomy
### Option A: Using LLM Annotations

#### Function Definitions

In [208]:
# --- Parsing and Token Labeling (Per-Token) ---

def parse_new_format_annotations(file_path: str):
    """
    Parses the new JSON format annotations.
    Assumes each JSON file is a list of items (sentences).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def compute_token_labels_for_item(item: dict, attribute_extractor) -> list:
    """
    Given a single annotation item (with keys 'source', 'target', and 'edits'),
    return a list of labels (one per token) for the concatenated tokens.
    Tokens not covered by any edit are labeled as 'NoEdit'.
    """
    source = item.get("source", "")
    target = item.get("target", "")
    
    source_tokens = tokenize_with_char_spans_minimal(source)
    target_tokens = tokenize_with_char_spans_minimal(target)
    total_tokens = len(source_tokens) + len(target_tokens)
    
    labels = ["No"] * total_tokens
    
    for edit in item.get("edits", []):
        label = attribute_extractor(edit)
        
        if "input_text" in edit and edit["input_text"]:
            input_char_spans = map_word_sequence_to_char_span(edit["input_text"], source)
            input_token_indices = map_char_spans_to_token_indices(input_char_spans, source_tokens)
            for idx in input_token_indices:
                labels[idx] = label
        
        if "output_text" in edit and edit["output_text"]:
            output_char_spans = map_word_sequence_to_char_span(edit["output_text"], target)
            output_token_indices = map_char_spans_to_token_indices(output_char_spans, target_tokens)
            offset = len(source_tokens)
            for idx in output_token_indices:
                labels[offset + idx] = label
                
    return labels

def compute_taxonomy_krippendorff_alpha(annotations_by_annotator: list, attribute_extractor, debug: bool = False) -> float:
    """
    Computes Krippendorff α at the token level for a given attribute.
    """
    annotator_label_lists = []
    for ann_idx, annotator_annotations in enumerate(annotations_by_annotator):
        token_labels = []
        for item in annotator_annotations:
            labels = compute_token_labels_for_item(item, attribute_extractor)
            token_labels.extend(labels)
        annotator_label_lists.append(token_labels)
        if debug:
            print(f"Annotator {ann_idx} - Total tokens: {len(token_labels)}")
    
    unique_labels = set()
    for labels in annotator_label_lists:
        unique_labels.update(labels)
    label_to_int = {label: i for i, label in enumerate(sorted(unique_labels))}
    if debug:
        print("Unique labels and mapping:", label_to_int)
    
    numeric_arrays = [np.array([label_to_int[lbl] for lbl in labels]) for labels in annotator_label_lists]
    n_tokens = [len(arr) for arr in numeric_arrays]
    if len(set(n_tokens)) != 1:
        raise ValueError("Mismatch in the number of token labels among annotators. Alignment required.")
    
    data_matrix = np.vstack(numeric_arrays)
    #alpha = krippendorff.alpha(reliability_data=data_matrix, level_of_measurement='nominal')
    alpha = safe_krippendorff_alpha(data_matrix, debug=debug)
    return alpha

# --- Alignment Functions (Per-Sentence) ---
# Hint: Need to do this a bit differently, since each position in the matric (now a sentence, not a single token anymore) can have multiple labels.
# Therefore need to do it on a per-category basis and then combine the results by taking the average.

def align_annotations_by_thresh_id(annotations_by_annotator: list) -> list:
    """
    Aligns annotation items (sentences) across annotators using their _thresh_id.
    Returns a list of groups; each group is a list of items (one per annotator) sharing the same _thresh_id.
    """
    annotator_dicts = [
        { item["_thresh_id"]: item for item in annotations }
        for annotations in annotations_by_annotator
    ]
    common_keys = set(annotator_dicts[0].keys())
    for d in annotator_dicts[1:]:
        common_keys.intersection_update(d.keys())
    common_keys = sorted(common_keys)
    aligned_items = []
    for key in common_keys:
        group = [d[key] for d in annotator_dicts]
        aligned_items.append(group)
    return aligned_items
    

# --- Dynamic Attribute Discovery and Extractor Creation ---

def get_all_attribute_keys(annotations_by_annotator: list):
    """
    Discovers attribute keys present in the annotations.
    Returns two sets:
      - taxonomy_keys: keys in edits that start with "edit_type"
      - orthogonal_keys: keys inside the "orthogonal_data" dictionary.
    """
    taxonomy_keys = set()
    orthogonal_keys = set()
    for annotations in annotations_by_annotator:
        for item in annotations:
            for edit in item.get("edits", []):
                for key in edit.keys():
                    if key.lower().startswith("edit_type"):
                        taxonomy_keys.add(key)
                if "orthogonal_data" in edit and isinstance(edit["orthogonal_data"], dict):
                    for key in edit["orthogonal_data"].keys():
                        orthogonal_keys.add(key)
    return taxonomy_keys, orthogonal_keys

def make_extractor(key, nested=False):
    """
    Returns an extractor function for the given key.
    If nested is True, the key is extracted from the "orthogonal_data" dictionary.
    If the value is None, the function returns the string "None".
    """
    def extractor(edit):
        if nested:
            value = edit.get("orthogonal_data", {}).get(key)
        else:
            value = edit.get(key)
        return value if value is not None else "None"
    return extractor

# --- Dynamic Agreement Computation Functions ---

def compute_token_agreements_for_extractor(annotations_by_annotator, attribute_extractor, debug: bool = False):
    """
    Aligns items, computes token-level labels, builds a numeric data matrix,
    and then calculates both Krippendorff's α and Gwet's AC1.
    
    Returns:
        (alpha, ac1): A tuple containing the token-level Krippendorff's alpha 
                      and Gwet's AC1 computed on the data matrix.
    """
    # Align items across annotators.
    aligned_items = align_annotations_by_thresh_id(annotations_by_annotator)
    num_annotators = len(aligned_items[0])
    token_labels_by_annotator = [[] for _ in range(num_annotators)]

    if debug:
        print(f"[DEBUG] Number of aligned items: {len(aligned_items)}")
        print(f"[DEBUG] Number of annotators: {num_annotators}")
        print(f"[DEBUG] Number of items per annotator: {[len(annotations) for annotations in annotations_by_annotator]}")
        print(f"[DEBUG] Number of items per group: {[len(group) for group in aligned_items]}")
    
    # Compute token labels for each aligned item.
    for group in aligned_items:
        for i, item in enumerate(group):
            labels = compute_token_labels_for_item(item, attribute_extractor)
            token_labels_by_annotator[i].extend(labels)
    
    # Map unique labels to integers.
    unique_labels = set()
    for labels in token_labels_by_annotator:
        unique_labels.update(labels)
    label_to_int = {label: i for i, label in enumerate(sorted(unique_labels))}
    
    # Build numeric arrays.
    numeric_arrays = [np.array([label_to_int[lbl] for lbl in labels]) for labels in token_labels_by_annotator]
    n_tokens = [len(arr) for arr in numeric_arrays]
    if len(set(n_tokens)) != 1:
        raise ValueError("Mismatch in token count after alignment; check your data.")
    
    data_matrix = np.vstack(numeric_arrays)
    if debug:
        print(f"[DEBUG] Data matrix shape: {data_matrix.shape}")
        print(f"[DEBUG] Data Matrix:\n{data_matrix}")
    
    # Compute Krippendorff's Alpha.
    #alpha = krippendorff.alpha(reliability_data=data_matrix, level_of_measurement='nominal')
    alpha = safe_krippendorff_alpha(data_matrix, debug=debug)
    # Compute Gwet's AC1.
    ac1 = calculate_gwets_ac1(data_matrix, debug=debug)
    
    return alpha, ac1


def compute_dynamic_attribute_agreements(annotations_by_annotator: list, debug: bool = False) -> pd.DataFrame:
    """
    Dynamically discovers all attribute keys and computes token-level Krippendorff’s α and
    Gwet's AC1 for each.
    
    Returns:
        A DataFrame with one row per attribute containing both metrics.
    """
    taxonomy_keys, orthogonal_keys = get_all_attribute_keys(annotations_by_annotator)
    attributes = {}
    for key in taxonomy_keys:
        attributes[key] = make_extractor(key, nested=False)
    for key in orthogonal_keys:
        attributes[f"Orthogonal_{key}"] = make_extractor(key, nested=True)
    
    results = []
    for attr_name, extractor in attributes.items():
        try:
            alpha, ac1 = compute_token_agreements_for_extractor(annotations_by_annotator, extractor, debug=debug)
        except Exception as e:
            if debug:
                print(f"[DEBUG] Error computing token-level agreement for {attr_name}: {e}")
            alpha = None
            ac1 = None
        results.append({
            "Attribute": attr_name,
            "Token-level Krippendorff Alpha": alpha,
            "Token-level Gwet's AC1": ac1
        })
    
    return pd.DataFrame(results)

def compute_attribute_krippendorff_alpha_aligned(annotations_by_annotator: list, attribute_extractor, debug: bool = False) -> float:
    """
    Helper function that aligns items and computes token-level Krippendorff α.
    """
    # We re-use the previously defined compute_taxonomy_krippendorff_alpha but on aligned items.
    # Align items first.
    aligned_items = align_annotations_by_thresh_id(annotations_by_annotator)
    # Flatten token labels from aligned items.
    num_annotators = len(aligned_items[0])
    token_labels_by_annotator = [[] for _ in range(num_annotators)]
    for group in aligned_items:
        for i, item in enumerate(group):
            labels = compute_token_labels_for_item(item, attribute_extractor)
            token_labels_by_annotator[i].extend(labels)
    unique_labels = set()
    for labels in token_labels_by_annotator:
        unique_labels.update(labels)
    label_to_int = {label: i for i, label in enumerate(sorted(unique_labels))}
    numeric_arrays = [np.array([label_to_int[lbl] for lbl in labels]) for labels in token_labels_by_annotator]
    n_tokens = [len(arr) for arr in numeric_arrays]
    if len(set(n_tokens)) != 1:
        raise ValueError("Mismatch in token count after alignment; check your data.")
    data_matrix = np.vstack(numeric_arrays)
    if debug:
        print(f"Data matrix shape: {data_matrix.shape}")
        print(f"Data Matrix: {data_matrix}")
    #alpha = krippendorff.alpha(reliability_data=data_matrix, level_of_measurement='nominal')
    alpha = safe_krippendorff_alpha(data_matrix, debug=debug)
    return alpha


def compute_missing_statistics(annotations_by_annotator: list, debug: bool = False) -> pd.DataFrame:
    """
    Computes missing-value statistics for dynamic attributes across all edits.
    
    It uses the same key discovery method as before to determine:
      - Taxonomy keys (those starting with "edit_type")
      - Orthogonal keys (keys inside the "orthogonal_data" dictionary)
    
    For each attribute, it counts the total number of edits (across all items and annotators)
    and the number (and percentage) that are missing (None or empty string).
    
    Returns:
      A pandas DataFrame with columns: Attribute, Missing Count, Total, Missing Percentage.
    """
    # Discover keys.
    taxonomy_keys, orthogonal_keys = get_all_attribute_keys(annotations_by_annotator)
    
    results = []
    
    # Process taxonomy keys.
    for key in sorted(taxonomy_keys):
        total = 0
        missing = 0
        for annotations in annotations_by_annotator:
            for item in annotations:
                for edit in item.get("edits", []):
                    total += 1
                    value = edit.get(key, None)
                    if value is None or (isinstance(value, str) and value.strip() == ""):
                        missing += 1
        perc = missing / total if total > 0 else None
        results.append({
            "Attribute": key,
            "Missing Count": missing,
            "Total": total,
            "Missing Percentage": perc
        })
    
    # Process orthogonal keys.
    for key in sorted(orthogonal_keys):
        total = 0
        missing = 0
        for annotations in annotations_by_annotator:
            for item in annotations:
                for edit in item.get("edits", []):
                    total += 1
                    value = edit.get("orthogonal_data", {}).get(key, None)
                    if value is None or (isinstance(value, str) and value.strip() == ""):
                        missing += 1
        perc = missing / total if total > 0 else None
        results.append({
            "Attribute": f"Orthogonal_{key}",
            "Missing Count": missing,
            "Total": total,
            "Missing Percentage": perc
        })
    
    df_missing = pd.DataFrame(results)
    if debug:
        print(df_missing)
    return df_missing

In [209]:
def gather_all_categories_for_attribute(annotations_by_annotator, attribute_extractor):
    """
    Scans all edits from all annotators and collects every distinct non-empty, non-"None" value 
    returned by attribute_extractor(edit).

    Returns a sorted list of unique categories.
    """
    category_set = set()
    for ann in annotations_by_annotator:
        for item in ann:
            for edit in item.get("edits", []):
                val = attribute_extractor(edit) or ""
                # Skip empty or "None"
                if val.strip() and val != "None":
                    category_set.add(val)
    
    return sorted(category_set)


def compute_sentence_agreement_per_category(
    annotations_by_annotator, attribute_extractor, align_func, debug=False
):
    """
    For a chosen attribute, first gather all distinct category labels. 
    Then, for each category, compute both:
      - Krippendorff's alpha (presence/absence)
      - Gwet's AC1 (presence/absence)
    Returns a list of dicts, each with:
       { 
         "Category": <cat>, 
         "Sentence-level Krippendorff Alpha": <alpha>, 
         "Sentence-level Gwet's AC1": <ac1> 
       }
    """
    all_categories = gather_all_categories_for_attribute(annotations_by_annotator, attribute_extractor)
    if debug:
        print("All discovered categories for this attribute:", all_categories)

    aligned_items = align_func(annotations_by_annotator)
    num_annotators = len(aligned_items[0])
    num_sentences = len(aligned_items)

    results = []
    
    for category in all_categories:
        if debug:
            print(f"\n[DEBUG] Computing presence/absence for category '{category}'")
        
        # shape: (num_annotators x num_sentences)
        matrix = np.zeros((num_annotators, num_sentences), dtype=int)
        
        for j, group in enumerate(aligned_items):
            # group: list of items, one per annotator
            for i, item in enumerate(group):
                for edit in item.get("edits", []):
                    val = attribute_extractor(edit)
                    if val == category:
                        matrix[i, j] = 1
                        break  # Found presence, no need to check other edits for this item

        # Calculate Krippendorff’s alpha on this binary matrix
        if debug:
            print(f"[DEBUG] Reliability matrix for category '{category}':\n{matrix}")
        alpha_val = safe_krippendorff_alpha(matrix, debug=debug)
        if debug:
            print(f"[DEBUG] Krippendorff's Alpha for category '{category}': {alpha_val}")
        
        # Calculate Gwet’s AC1 on the same binary matrix
        ac1_val = calculate_gwets_ac1(matrix, debug=debug)

        results.append({
            "Category": category,
            "Sentence-level Krippendorff Alpha": alpha_val,
            "Sentence-level Gwet's AC1": ac1_val
        })
    
    return results


#### Read in Data

In [210]:
# LLM Files in Directory

directory_path = "../data/LLM_annotations"

llm_TaxEval_files = os.listdir(directory_path)

# those containing "PeerSet" are the ones we want to use
llm_TaxEval_files = [file for file in llm_TaxEval_files if "TaxVal" in file]

llm_TaxEval_files = [os.path.join(directory_path, file) for file in llm_TaxEval_files]

taxonomies = []
annotatorModels = []

for file in llm_TaxEval_files:
    datasetAndModel = re.search(r"TaxVal_(.*?).json", file).group(1)
    taxonomyName, modelName = datasetAndModel.split("_")

    # append if not already in list
    if taxonomyName not in taxonomies:
        taxonomies.append(taxonomyName)
    if modelName not in annotatorModels:    
        annotatorModels.append(modelName)

print(taxonomies)
print(annotatorModels)

llm_TaxEval_files

['HuidromBelz', 'MB2025V001']
['GPT4o', 'ClaudeSonnet', 'LLAMA8b']


['../data/LLM_annotations/LLM_annotations_TaxVal_HuidromBelz_GPT4o.json',
 '../data/LLM_annotations/LLM_annotations_TaxVal_MB2025V001_GPT4o.json',
 '../data/LLM_annotations/LLM_annotations_TaxVal_MB2025V001_ClaudeSonnet.json',
 '../data/LLM_annotations/LLM_annotations_TaxVal_MB2025V001_LLAMA8b.json',
 '../data/LLM_annotations/LLM_annotations_TaxVal_HuidromBelz_LLAMA8b.json',
 '../data/LLM_annotations/LLM_annotations_TaxVal_HuidromBelz_ClaudeSonnet.json']

#### Taxonomy 1: Huidzom (Meta)

In [211]:
def run_taxval_custom(taxonomy_name, annotator_files, debug=False, compute_missing_stats=False):
    print(f"Processing Taxonomy: {taxonomy}")

    # Treat all models as independent annotators
    # Load annotations for each annotator (each file is assumed to contain a list of items)
    annotations_by_annotator = [parse_new_format_annotations(f) for f in annotator_files]
    print(f"Annotations by annotator: {[len(ann) for ann in annotations_by_annotator]}")

    if compute_missing_stats:
        print("Missing Attributes Statistics:")
        df_missing_stats = compute_missing_statistics(annotations_by_annotator, debug=True)
        display(df_missing_stats)
        print(df_missing_stats)

    if debug:
        print(f"Annotations peak 1: {annotations_by_annotator}")

    # Compute token-level dynamic agreements.
    df_token_agreements = compute_dynamic_attribute_agreements(annotations_by_annotator, debug=debug)
    print("Token-level Agreement:")
    print(df_token_agreements)
    display(df_token_agreements)

    edit_type_levels = ["edit_type_level1", "edit_type_level2", "edit_type_level3"]
    rows = []

    for attr_key in edit_type_levels:
        if debug:
            print(f"\n----- Now processing presence/absence for {attr_key} -----")
        
        def extractor(edit):
            return edit.get(attr_key, "None")

        cat_results = compute_sentence_agreement_per_category(
            annotations_by_annotator=annotations_by_annotator,
            attribute_extractor=extractor,
            align_func=align_annotations_by_thresh_id,
            debug=debug
        )

        # Add the attribute name to each row
        for row_data in cat_results:
            row_data["Attribute"] = attr_key
            rows.append(row_data)

    df_sentence_pa = pd.DataFrame(rows)
    # Reorder columns if desired
    df_sentence_pa = df_sentence_pa[
        ["Attribute", "Category", "Sentence-level Krippendorff Alpha", "Sentence-level Gwet's AC1"]
    ]

    print("\nSentence-level Presence/Absence Agreement for each Category (Krippendorff & Gwet’s AC1):")
    print(df_sentence_pa)
    display(df_sentence_pa)

In [212]:
taxonomy = "HuidromBelz"

annotator_files_HB = [
    '../data/LLM_annotations/LLM_annotations_TaxVal_HuidromBelz_GPT4o.json',
    '../data/LLM_annotations/LLM_annotations_TaxVal_HuidromBelz_LLAMA8b.json',
    '../data/LLM_annotations/LLM_annotations_TaxVal_HuidromBelz_ClaudeSonnet.json'
    ]


run_taxval_custom(taxonomy, annotator_files_HB, debug=False)

Processing Taxonomy: HuidromBelz
Annotations by annotator: [49, 50, 49]
Token-level Agreement:
                       Attribute  Token-level Krippendorff Alpha  \
0               edit_type_level2                        0.187905   
1               edit_type_level3                        0.237792   
2               edit_type_level1                        0.277279   
3    Orthogonal_context_function                        0.194917   
4           Orthogonal_num_words                        0.249980   
5  Orthogonal_syntactic_category                        0.243258   
6            Orthogonal_severity                        0.208436   
7   Orthogonal_meaning_deviation                        0.259065   

   Token-level Gwet's AC1  
0                 0.38427  
1                 0.55673  
2                 0.47361  
3                 0.38611  
4                 0.47118  
5                 0.48577  
6                 0.35203  
7                 0.51969  


Unnamed: 0,Attribute,Token-level Krippendorff Alpha,Token-level Gwet's AC1
0,edit_type_level2,0.187905,0.38427
1,edit_type_level3,0.237792,0.55673
2,edit_type_level1,0.277279,0.47361
3,Orthogonal_context_function,0.194917,0.38611
4,Orthogonal_num_words,0.24998,0.47118
5,Orthogonal_syntactic_category,0.243258,0.48577
6,Orthogonal_severity,0.208436,0.35203
7,Orthogonal_meaning_deviation,0.259065,0.51969



Sentence-level Presence/Absence Agreement for each Category (Krippendorff & Gwet’s AC1):
           Attribute                            Category  \
0   edit_type_level1                            Addition   
1   edit_type_level1                            Omission   
2   edit_type_level1                          Reordering   
3   edit_type_level1                        Substitution   
4   edit_type_level2                            Addition   
5   edit_type_level2                         Duplication   
6   edit_type_level2                               Exact   
7   edit_type_level2                       Lexical Error   
8   edit_type_level2                                NULL   
9   edit_type_level2                            Omission   
10  edit_type_level2                               Other   
11  edit_type_level2             Other Meaning Deviation   
12  edit_type_level2          Other Wrong Lexical Choice   
13  edit_type_level2        Other Wrongly Rendered Input   
14  edit_t

Unnamed: 0,Attribute,Category,Sentence-level Krippendorff Alpha,Sentence-level Gwet's AC1
0,edit_type_level1,Addition,0.006742,0.10543
1,edit_type_level1,Omission,0.361607,0.41765
2,edit_type_level1,Reordering,-0.007042,0.97144
3,edit_type_level1,Substitution,-0.007042,0.97144
4,edit_type_level2,Addition,-0.007042,0.97144
5,edit_type_level2,Duplication,-0.028777,0.92557
6,edit_type_level2,Exact,0.0,0.98592
7,edit_type_level2,Lexical Error,0.043666,0.07343
8,edit_type_level2,,0.0,0.98592
9,edit_type_level2,Omission,0.323877,0.97104


#### Taxonomy 2: Heineman SALSA

In [213]:
taxonomy = "SALSA"

annotator_files_SALSA = [
    '../data/LLM_annotations/LLM_annotations_N50_Heineman_ClaudeSonnet_FULL_LEVELSformat.json',
    '../data/LLM_annotations/LLM_annotations_N50_Heineman_GPT4o_FULL_LEVELSformat.json',
    '../data/LLM_annotations/LLM_annotations_N50_Heineman_LLAMA8b_FULL_LEVELSformat.json'
    ]


run_taxval_custom(taxonomy, annotator_files_SALSA, debug=False)

Processing Taxonomy: SALSA
Annotations by annotator: [50, 50, 50]
Token-level Agreement:
                             Attribute  Token-level Krippendorff Alpha  \
0                     edit_type_level2                        0.257309   
1                     edit_type_level3                        0.214294   
2                     edit_type_level1                        0.305982   
3           Orthogonal_polarity_switch                        0.401560   
4  Orthogonal_simplification_direction                        0.401560   
5        Orthogonal_factual_dependence                        0.401560   
6                  Orthogonal_severity                        0.214294   
7        Orthogonal_domain_sensitivity                        0.401560   
8                     Orthogonal_scope                        0.401560   

   Token-level Gwet's AC1  
0                 0.46634  
1                 0.38074  
2                 0.49921  
3                 0.40193  
4                 0.40193  
5 

Unnamed: 0,Attribute,Token-level Krippendorff Alpha,Token-level Gwet's AC1
0,edit_type_level2,0.257309,0.46634
1,edit_type_level3,0.214294,0.38074
2,edit_type_level1,0.305982,0.49921
3,Orthogonal_polarity_switch,0.40156,0.40193
4,Orthogonal_simplification_direction,0.40156,0.40193
5,Orthogonal_factual_dependence,0.40156,0.40193
6,Orthogonal_severity,0.214294,0.38074
7,Orthogonal_domain_sensitivity,0.40156,0.40193
8,Orthogonal_scope,0.40156,0.40193



Sentence-level Presence/Absence Agreement for each Category (Krippendorff & Gwet’s AC1):
           Attribute              Category  Sentence-level Krippendorff Alpha  \
0   edit_type_level1              Deletion                          -0.013605   
1   edit_type_level1             Insertion                          -0.006757   
2   edit_type_level1               Reorder                          -0.006757   
3   edit_type_level1                 Split                          -0.013605   
4   edit_type_level1          Substitution                          -0.027586   
5   edit_type_level1              deletion                           0.317265   
6   edit_type_level1             insertion                           0.047399   
7   edit_type_level1               reorder                           0.148571   
8   edit_type_level1                 split                           0.710173   
9   edit_type_level1            structural                           0.000000   
10  edit_type_level

Unnamed: 0,Attribute,Category,Sentence-level Krippendorff Alpha,Sentence-level Gwet's AC1
0,edit_type_level1,Deletion,-0.013605,0.95837
1,edit_type_level1,Insertion,-0.006757,0.97261
2,edit_type_level1,Reorder,-0.006757,0.97261
3,edit_type_level1,Split,-0.013605,0.95837
4,edit_type_level1,Substitution,-0.027586,0.92874
5,edit_type_level1,deletion,0.317265,0.35278
6,edit_type_level1,insertion,0.047399,0.3521
7,edit_type_level1,reorder,0.148571,0.52249
8,edit_type_level1,split,0.710173,0.75446
9,edit_type_level1,structural,0.0,0.98649


#### Taxonomy X: MB 2025

In [214]:
taxonomy = "MB2025"

annotator_files_MB = [
    '../data/LLM_annotations/LLM_annotations_TaxVal_MB2025V001_ClaudeSonnet.json',
    '../data/LLM_annotations/LLM_annotations_TaxVal_MB2025V001_GPT4o.json',
    '../data/LLM_annotations/LLM_annotations_TaxVal_MB2025V001_LLAMA8b.json'
    ]


run_taxval_custom(taxonomy, annotator_files_MB, debug=False)

Processing Taxonomy: MB2025
Annotations by annotator: [48, 50, 50]
Token-level Agreement:
                             Attribute  Token-level Krippendorff Alpha  \
0                     edit_type_level2                        0.267754   
1                     edit_type_level3                        0.227384   
2                     edit_type_level1                        0.277208   
3           Orthogonal_polarity_switch                        0.352074   
4  Orthogonal_simplification_direction                        0.196369   
5        Orthogonal_factual_dependence                        0.187713   
6                  Orthogonal_severity                        0.223069   
7        Orthogonal_domain_sensitivity                        0.187715   
8                     Orthogonal_scope                        0.254544   

   Token-level Gwet's AC1  
0                 0.49137  
1                 0.47148  
2                 0.42927  
3                 0.56758  
4                 0.39421  
5

Unnamed: 0,Attribute,Token-level Krippendorff Alpha,Token-level Gwet's AC1
0,edit_type_level2,0.267754,0.49137
1,edit_type_level3,0.227384,0.47148
2,edit_type_level1,0.277208,0.42927
3,Orthogonal_polarity_switch,0.352074,0.56758
4,Orthogonal_simplification_direction,0.196369,0.39421
5,Orthogonal_factual_dependence,0.187713,0.38344
6,Orthogonal_severity,0.223069,0.46298
7,Orthogonal_domain_sensitivity,0.187715,0.37341
8,Orthogonal_scope,0.254544,0.47408



Sentence-level Presence/Absence Agreement for each Category (Krippendorff & Gwet’s AC1):
           Attribute                                 Category  \
0   edit_type_level1                 Content / Meaning Errors   
1   edit_type_level1                    Form / Fluency Errors   
2   edit_type_level2                                 Addition   
3   edit_type_level2          Coherence and Structural Issues   
4   edit_type_level2                                 Omission   
5   edit_type_level2                                    Split   
6   edit_type_level2                         Stylistic Errors   
7   edit_type_level2                             Substitution   
8   edit_type_level2                         Syntactic Errors   
9   edit_type_level3                         Awkward Phrasing   
10  edit_type_level3                    Bad Structure / Split   
11  edit_type_level3          Coherence and Structural Issues   
12  edit_type_level3                      Contextual Omission   


Unnamed: 0,Attribute,Category,Sentence-level Krippendorff Alpha,Sentence-level Gwet's AC1
0,edit_type_level1,Content / Meaning Errors,-0.036232,0.90943
1,edit_type_level1,Form / Fluency Errors,-0.008818,0.01538
2,edit_type_level2,Addition,0.06291,0.45063
3,edit_type_level2,Coherence and Structural Issues,0.225232,0.22462
4,edit_type_level2,Omission,0.111267,0.17026
5,edit_type_level2,Split,0.0,0.98592
6,edit_type_level2,Stylistic Errors,-0.043796,0.89287
7,edit_type_level2,Substitution,0.486166,0.81015
8,edit_type_level2,Syntactic Errors,0.022556,0.83829
9,edit_type_level3,Awkward Phrasing,0.075099,0.65827


In [215]:
# ALL IN ONE for simplified comparison

taxonomy_dict = {
    "HuidromBelz": annotator_files_HB,
    "SALSA": annotator_files_SALSA,
    "MB2025": annotator_files_MB
}

token_agreements_results = []        # For token-level dynamic agreements
sentence_pa_results = []             # For sentence-level presence/absence agreements
sentence_taxlevel_results = []       # For sentence-level taxonomy level agreements

# Loop over each taxonomy.
for taxonomy_name, annotator_files in taxonomy_dict.items():
    print(f"\nProcessing Taxonomy: {taxonomy_name}")
    
    # Load annotations for each annotator.
    annotations_by_annotator = [parse_new_format_annotations(f) for f in annotator_files]
    print(f"Annotations by annotator: {[len(ann) for ann in annotations_by_annotator]}")
    
    # -------------------------------
    # (1) Token-Level Dynamic Agreements
    # -------------------------------
    df_token = compute_dynamic_attribute_agreements(annotations_by_annotator, debug=False)
    df_token["Taxonomy"] = taxonomy_name
    token_agreements_results.append(df_token)
    
    # -------------------------------
    # (2) Sentence-Level Presence/Absence Agreements
    # For each of the three edit_type levels, compute per-category metrics.
    # -------------------------------
    edit_type_levels = ["edit_type_level1", "edit_type_level2", "edit_type_level3"]
    rows = []
    for attr_key in edit_type_levels:
        # Create a simple extractor for the given attribute key.
        def extractor(edit, key=attr_key):
            return edit.get(key, "None")
        
        cat_results = compute_sentence_agreement_per_category(
            annotations_by_annotator=annotations_by_annotator,
            attribute_extractor=extractor,
            align_func=align_annotations_by_thresh_id,
            debug=False
        )
        # Append attribute key info to each row.
        for row_data in cat_results:
            row_data["Attribute"] = attr_key
            rows.append(row_data)
    
    df_sentence_pa = pd.DataFrame(rows)
    df_sentence_pa = df_sentence_pa[["Attribute", "Category", "Sentence-level Krippendorff Alpha", "Sentence-level Gwet's AC1"]]
    df_sentence_pa["Taxonomy"] = taxonomy_name
    sentence_pa_results.append(df_sentence_pa)

# --------------------------------------------------
# Aggregate the results from all taxonomies.
# --------------------------------------------------
df_token_agreements_all = pd.concat(token_agreements_results, ignore_index=True)
df_sentence_pa_all = pd.concat(sentence_pa_results, ignore_index=True)
#df_sentence_taxlevel_all = pd.concat(sentence_taxlevel_results, ignore_index=True)

print(f"DEBUG: rows before filtering: {df_token_agreements_all.shape[0]}")
# Focus on Level 1, Level 2, and Level 3 and severity (Filter out) for Taxonomy = SALSA only, since that taxonomy doesn't contain those categories.
df_token_agreements_all = df_token_agreements_all[
    ~((df_token_agreements_all["Taxonomy"] == "SALSA") & 
      ~(df_token_agreements_all["Attribute"].str.contains("level|severity", case=False)))
]
print(f"DEBUG: rows after filtering: {df_token_agreements_all.shape[0]}")
# same for sentence-level presence/absence
df_sentence_pa_all = df_sentence_pa_all[
    ~((df_sentence_pa_all["Taxonomy"] == "SALSA") &
        ~(df_sentence_pa_all["Attribute"].str.contains("level|severity", case=False)))
    ]


# Optionally reorder columns.
df_token_agreements_all = df_token_agreements_all[["Taxonomy", "Attribute", "Token-level Krippendorff Alpha", "Token-level Gwet's AC1"]]
#df_sentence_taxlevel_all = df_sentence_taxlevel_all[["Taxonomy", "Attribute", "Sentence-Level TaxLevel-Agreement Krippendorff Alpha", "Sentence-Level TaxLevel-Agreement Gwet's AC1"]]

print("\nAggregated Token-level Dynamic Attribute Agreements:")
display(df_token_agreements_all)

print("\nAggregated Sentence-level Presence/Absence Agreements:")
display(df_sentence_pa_all)


Processing Taxonomy: HuidromBelz
Annotations by annotator: [49, 50, 49]

Processing Taxonomy: SALSA
Annotations by annotator: [50, 50, 50]

Processing Taxonomy: MB2025
Annotations by annotator: [48, 50, 50]
DEBUG: rows before filtering: 26
DEBUG: rows after filtering: 21

Aggregated Token-level Dynamic Attribute Agreements:


Unnamed: 0,Taxonomy,Attribute,Token-level Krippendorff Alpha,Token-level Gwet's AC1
0,HuidromBelz,edit_type_level2,0.187905,0.38427
1,HuidromBelz,edit_type_level3,0.237792,0.55673
2,HuidromBelz,edit_type_level1,0.277279,0.47361
3,HuidromBelz,Orthogonal_context_function,0.194917,0.38611
4,HuidromBelz,Orthogonal_num_words,0.24998,0.47118
5,HuidromBelz,Orthogonal_syntactic_category,0.243258,0.48577
6,HuidromBelz,Orthogonal_severity,0.208436,0.35203
7,HuidromBelz,Orthogonal_meaning_deviation,0.259065,0.51969
8,SALSA,edit_type_level2,0.257309,0.46634
9,SALSA,edit_type_level3,0.214294,0.38074



Aggregated Sentence-level Presence/Absence Agreements:


Unnamed: 0,Attribute,Category,Sentence-level Krippendorff Alpha,Sentence-level Gwet's AC1,Taxonomy
0,edit_type_level1,Addition,0.006742,0.10543,HuidromBelz
1,edit_type_level1,Omission,0.361607,0.41765,HuidromBelz
2,edit_type_level1,Reordering,-0.007042,0.97144,HuidromBelz
3,edit_type_level1,Substitution,-0.007042,0.97144,HuidromBelz
4,edit_type_level2,Addition,-0.007042,0.97144,HuidromBelz
...,...,...,...,...,...
101,edit_type_level3,Stylistic Errors,-0.028777,0.92557,MB2025
102,edit_type_level3,Subject-Verb Agreement Error,-0.014184,0.95656,MB2025
103,edit_type_level3,Syntactic Errors,0.000000,0.98592,MB2025
104,edit_type_level3,Tense Inconsistency,-0.021429,0.94127,MB2025


### Option B: 2 Human Annotators


#### Convert Data format from AnnoTool to standard

In [216]:
import json
import os
import re

def convert_thresh_to_salsa(thresh_data, annotator="claude-3-5-sonnet-20241022", system="anthropic"):
    """
    Convert annotation data from THRESH format to SALSA format
    
    Args:
        thresh_data (list): List of annotation objects in THRESH format
        annotator (str): Name of the annotator
        system (str): System identifier
        
    Returns:
        list: List of annotation objects in SALSA format
    """
    salsa_data = []
    
    # Helper function to convert camelCase to snake_case
    def camel_to_snake(name):
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
        return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
    
    for item in thresh_data:
        salsa_item = {
            "source": item["complexSentence"],
            "target": item["simplifiedSentence"],
            "metadata": {
                "annotator": annotator,
                "system": system
            },
            "edits": [],
        }
        
        # Preserve the original thresh_id if available
        if "_thresh_id" in item:
            salsa_item["_thresh_id"] = item["_thresh_id"]
        else:
            salsa_item["_thresh_id"] = item["id"]
        
        # Convert annotations to edits
        for annotation in item.get("annotations", []):
            # Convert orthogonal data keys to snake_case
            orthogonal_data = {}
            for k, v in annotation.get("orthogonalData", {}).items():
                orthogonal_data[camel_to_snake(k)] = v
            
            edit = {
                "edit_type_level1": annotation["level1"],
                "edit_type_level2": annotation["level2"],
                "edit_type_level3": annotation["level3"],
                "input_text": [span["text"] for span in annotation.get("complexSpan", [])],
                "output_text": [span["text"] for span in annotation.get("simplifiedSpan", [])],
                "orthogonal_data": orthogonal_data
            }
            salsa_item["edits"].append(edit)
        
        salsa_data.append(salsa_item)
    
    return salsa_data

def convert_file(input_file, output_file, annotator="claude-3-5-sonnet-20241022", system="anthropic"):
    """
    Convert a THRESH format file to a SALSA format file
    
    Args:
        input_file (str): Path to the input file
        output_file (str): Path to the output file
        annotator (str): Name of the annotator
        system (str): System identifier
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        thresh_data = json.load(f)
    
    salsa_data = convert_thresh_to_salsa(thresh_data, annotator, system)
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    # Write the converted data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(salsa_data, f, ensure_ascii=False, indent=4)
    
    print(f"Converted {len(thresh_data)} annotations and saved to {output_file}")

In [217]:
#  usage
input_file = "../data/tax_validation_annotations/Annotations_TaxVal_MB.json"
output_file = "../data/tax_validation_annotations/Annotations_TaxVal_MB_CONV.json"

convert_file(input_file, output_file)

# Convert the other files
input_file = "../data/tax_validation_annotations/Annotations_TaxVal_LP.json"
output_file = "../data/tax_validation_annotations/Annotations_TaxVal_LP_CONV.json"
convert_file(input_file, output_file)

Converted 50 annotations and saved to ../data/tax_validation_annotations/Annotations_TaxVal_MB_CONV.json
Converted 50 annotations and saved to ../data/tax_validation_annotations/Annotations_TaxVal_LP_CONV.json


#### Run Analysis

In [218]:
### Read in Data

human_TaxEval_files = [
    "../data/tax_validation_annotations/Annotations_TaxVal_LP_CONV.json",
    "../data/tax_validation_annotations/Annotations_TaxVal_MB_CONV.json",
]

human_TaxEval_files

['../data/tax_validation_annotations/Annotations_TaxVal_LP_CONV.json',
 '../data/tax_validation_annotations/Annotations_TaxVal_MB_CONV.json']

In [219]:
# ALL IN ONE for simplified comparison

taxonomy_name = "MB_2025_Human"
token_agreements_results = []        # For token-level dynamic agreements
sentence_pa_results = []             # For sentence-level presence/absence agreements
sentence_taxlevel_results = []       # For sentence-level taxonomy level agreements


print(f"\nProcessing Taxonomy: {taxonomy_name}")

# Load annotations for each annotator.
annotations_by_annotator = [parse_new_format_annotations(f) for f in human_TaxEval_files]
print(f"Annotations by annotator: {[len(ann) for ann in annotations_by_annotator]}")

# -------------------------------
# (1) Token-Level Dynamic Agreements
# -------------------------------
df_token = compute_dynamic_attribute_agreements(annotations_by_annotator, debug=False)
df_token["Taxonomy"] = taxonomy_name
token_agreements_results.append(df_token)

# -------------------------------
# (2) Sentence-Level Presence/Absence Agreements
# For each of the three edit_type levels, compute per-category metrics.
# -------------------------------
edit_type_levels = ["edit_type_level1", "edit_type_level2", "edit_type_level3"]
rows = []
for attr_key in edit_type_levels:
    # Create a simple extractor for the given attribute key.
    def extractor(edit, key=attr_key):
        return edit.get(key, "None")
    
    cat_results = compute_sentence_agreement_per_category(
        annotations_by_annotator=annotations_by_annotator,
        attribute_extractor=extractor,
        align_func=align_annotations_by_thresh_id,
        debug=False
    )
    # Append attribute key info to each row.
    for row_data in cat_results:
        row_data["Attribute"] = attr_key
        rows.append(row_data)

df_sentence_pa = pd.DataFrame(rows)
df_sentence_pa = df_sentence_pa[["Attribute", "Category", "Sentence-level Krippendorff Alpha", "Sentence-level Gwet's AC1"]]
df_sentence_pa["Taxonomy"] = taxonomy_name
sentence_pa_results.append(df_sentence_pa)


# --------------------------------------------------
# Aggregate the results from all taxonomies.
# --------------------------------------------------
df_token_agreements_HumanEval = pd.concat(token_agreements_results, ignore_index=True)
df_sentence_pa_HumanEval = pd.concat(sentence_pa_results, ignore_index=True)
#df_sentence_taxlevel_all = pd.concat(sentence_taxlevel_results, ignore_index=True)

# Optionally reorder columns.
df_token_agreements_HumanEval = df_token_agreements_HumanEval[["Attribute", "Token-level Krippendorff Alpha", "Token-level Gwet's AC1"]]

print("\nAggregated Token-level Dynamic Attribute Agreements:")
display(df_token_agreements_HumanEval)
# export to csv
df_token_agreements_HumanEval.to_csv("../data/charts_data/IAA_tokenlevel_TaxVal_HUMAN.csv", index=False)

print("\nAggregated Sentence-level Presence/Absence Agreements:")
display(df_sentence_pa_HumanEval)
# export to csv
df_sentence_pa_HumanEval.to_csv("../data/charts_data/IAA_sentencelevel_TaxVal_HUMAN.csv", index=False)


Processing Taxonomy: MB_2025_Human
Annotations by annotator: [50, 50]

Aggregated Token-level Dynamic Attribute Agreements:


Unnamed: 0,Attribute,Token-level Krippendorff Alpha,Token-level Gwet's AC1
0,edit_type_level2,0.657138,0.88884
1,edit_type_level3,0.56036,0.86118
2,edit_type_level1,0.652457,0.88019
3,Orthogonal_polarity_switch,0.66812,0.88981
4,Orthogonal_simplification_direction,0.659188,0.88681
5,Orthogonal_factual_dependence,0.627076,0.88174
6,Orthogonal_severity,0.475977,0.82927
7,Orthogonal_domain_sensitivity,0.671651,0.89061
8,Orthogonal_scope,0.648288,0.88909



Aggregated Sentence-level Presence/Absence Agreements:


Unnamed: 0,Attribute,Category,Sentence-level Krippendorff Alpha,Sentence-level Gwet's AC1,Taxonomy
0,edit_type_level1,Content / Meaning Errors,0.582278,0.79048,MB_2025_Human
1,edit_type_level1,Form / Fluency Errors,0.832298,0.90709,MB_2025_Human
2,edit_type_level2,Addition,0.671096,0.89463,MB_2025_Human
3,edit_type_level2,Coherence and Structural Issues,0.7525,0.88235,MB_2025_Human
4,edit_type_level2,Omission,0.717143,0.7931,MB_2025_Human
5,edit_type_level2,Substitution,0.7624,0.76,MB_2025_Human
6,edit_type_level2,Syntactic Errors,1.0,1.0,MB_2025_Human
7,edit_type_level3,Awkward Phrasing,-0.042105,0.8895,MB_2025_Human
8,edit_type_level3,Bad Structure / Split,0.731707,0.88649,MB_2025_Human
9,edit_type_level3,Contextual Omission,0.538462,0.75639,MB_2025_Human


#### Per Token Category Deep-Dive

In [221]:
# PER TOKEN PER CATEGORY

def binary_extractor_factory(target_value, attribute_key="edit_type_2"):
    """
    Returns an extractor function that marks tokens as target_value if edit[attribute_key] == target_value,
    else 'No'.
    """
    def extractor(edit):
        val = edit.get(attribute_key, None)
        if val == target_value:
            return target_value
        else:
            return "No"
    return extractor

def compute_token_binary_matrix(annotations_by_annotator, attribute_extractor, debug=False):
    """
    Aligns items across annotators and then computes a binary matrix for token-level labels.
    The extractor should return target_value for a positive label and "No" for negative.
    Returns a numpy array of shape (n_annotators, total_tokens) with 1 for match and 0 for "No".
    """
    aligned_items = align_annotations_by_thresh_id(annotations_by_annotator)
    num_annotators = len(aligned_items[0])
    binary_token_labels_by_annotator = [[] for _ in range(num_annotators)]
    
    for group in aligned_items:
        for i, item in enumerate(group):
            # Compute token labels using your existing function; these will be either target_value or "No"
            labels = compute_token_labels_for_item(item, attribute_extractor)
            # Map target_value -> 1 and "No" -> 0
            binary_labels = [1 if lbl != "No" else 0 for lbl in labels]
            binary_token_labels_by_annotator[i].extend(binary_labels)
    
    # Verify that each annotator has the same number of tokens
    lengths = [len(x) for x in binary_token_labels_by_annotator]
    if len(set(lengths)) != 1:
        raise ValueError("Mismatch in token count among annotators.")
    
    data_matrix = np.vstack(binary_token_labels_by_annotator)
    if debug:
        print(f"[DEBUG] Token binary matrix shape: {data_matrix.shape}")
    return data_matrix

def compute_per_category_binary_agreements(annotations_by_annotator, attribute_key="edit_type_level2"):
    """
    Discovers all sub-category values under 'attribute_key' across the data,
    then computes a separate Krippendorff α and Gwet's AC1 (binary labeling)
    for each sub-category. Returns a DataFrame with one row per sub-category.
    """
    # 1) Gather possible sub-categories
    all_subcats = set()
    for annotator_items in annotations_by_annotator:
        for item in annotator_items:
            for edit in item.get("edits", []):
                val = edit.get(attribute_key, None)
                if val not in [None, "None", ""]:
                    all_subcats.add(val)
    all_subcats = sorted(all_subcats)
    print(f"Extracted sub-categories: {all_subcats}")
    
    # 2) For each sub-category, build a binary extractor & compute metrics
    results = []
    for sc in all_subcats:
        extractor = binary_extractor_factory(sc, attribute_key=attribute_key)
        try:
            print(f"Running compute for sub-category '{sc}' with attribute key {attribute_key}")
            # Compute token-level Krippendorff's α (using your alignment-based function)
            alpha_sc = compute_attribute_krippendorff_alpha_aligned(
                annotations_by_annotator,
                attribute_extractor=extractor,
                debug=True
            )
            # Build binary matrix and compute Gwet's AC1
            binary_matrix = compute_token_binary_matrix(
                annotations_by_annotator, extractor, debug=True
            )
            ac1_sc = calculate_gwets_ac1(binary_matrix, debug=True)
            results.append({
                "SubCategory": sc,
                "Krippendorff Alpha": alpha_sc,
                "Gwet's AC1": ac1_sc
            })
        except Exception as e:
            results.append({
                "SubCategory": sc,
                "Krippendorff Alpha": None,
                "Gwet's AC1": None,
                "Error": str(e)
            })

    # order by krippendorff alpha
    results.sort(key=lambda x: x["Krippendorff Alpha"] if x["Krippendorff Alpha"] is not None else -1, reverse=True)
    
    return pd.DataFrame(results)



print(f"Processing Taxonomy: {taxonomy}")

# Treat all models as independent annotators.
# Load annotations for each annotator (each file is assumed to contain a list of items)
annotations_by_annotator = [parse_new_format_annotations(f) for f in human_TaxEval_files]
print(f"Annotations by annotator: {[len(ann) for ann in annotations_by_annotator]}")

binary_df_level3 = compute_per_category_binary_agreements(
    annotations_by_annotator,
    attribute_key="edit_type_level3"   # or "edit_type_level1" or another dimension
)
binary_df_level2 = compute_per_category_binary_agreements(
    annotations_by_annotator,
    attribute_key="edit_type_level2"   # or "edit_type_level1" or another dimension
)
binary_df_level1 = compute_per_category_binary_agreements(
    annotations_by_annotator,
    attribute_key="edit_type_level1"   # or "edit_type_level1" or another dimension
)

print("\nBinary Token-Level Agreements for each Category (Krippendorff & Gwet’s AC1):")
display(binary_df_level3)
display(binary_df_level2)
display(binary_df_level1)

# combine all 3 dataframes with a "Level" column
binary_df_level3["Level"] = "Level 3"
binary_df_level2["Level"] = "Level 2"
binary_df_level1["Level"] = "Level 1"

# move "Level" column to the front


binary_df_combined = pd.concat([
    binary_df_level1,
    binary_df_level2, 
    binary_df_level3, 
], ignore_index=True)

binary_df_combined = binary_df_combined[["Level", "SubCategory", "Krippendorff Alpha", "Gwet's AC1"]]

display(binary_df_combined)

Processing Taxonomy: MB2025
Annotations by annotator: [50, 50]
Extracted sub-categories: ['Awkward Phrasing', 'Bad Structure / Split', 'Contextual Omission', 'Coreference / Anaphora Resolution', 'Essential Omission', 'Factual Distortion', 'Factual Hallucination', 'Lack of Simplicity / Lexical Complexity', 'Lexical Inaccuracy & Semantic Drift', 'Repetitive Addition', 'Subject-Verb Agreement Error', 'Tense Inconsistency', 'Unnecessary Expansion']
Running compute for sub-category 'Awkward Phrasing' with attribute key edit_type_level3
Data matrix shape: (2, 2730)
Data Matrix: [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Unnamed: 0,SubCategory,Krippendorff Alpha,Gwet's AC1
0,Subject-Verb Agreement Error,0.749679,0.99927
1,Factual Distortion,0.702683,0.98459
2,Factual Hallucination,0.618734,0.99593
3,Lexical Inaccuracy & Semantic Drift,0.601489,0.96406
4,Contextual Omission,0.528397,0.97574
5,Bad Structure / Split,0.390259,0.95728
6,Coreference / Anaphora Resolution,0.362469,0.99743
7,Repetitive Addition,0.166892,0.98242
8,Essential Omission,0.110622,0.97823
9,Tense Inconsistency,-0.000183,0.99927


Unnamed: 0,SubCategory,Krippendorff Alpha,Gwet's AC1
0,Omission,0.81073,0.9854
1,Substitution,0.712873,0.95608
2,Syntactic Errors,0.599339,0.99853
3,Addition,0.451542,0.97431
4,Coherence and Structural Issues,0.435097,0.95576


Unnamed: 0,SubCategory,Krippendorff Alpha,Gwet's AC1
0,Content / Meaning Errors,0.685744,0.90695
1,Form / Fluency Errors,0.440945,0.95401


Unnamed: 0,Level,SubCategory,Krippendorff Alpha,Gwet's AC1
0,Level 1,Content / Meaning Errors,0.685744,0.90695
1,Level 1,Form / Fluency Errors,0.440945,0.95401
2,Level 2,Omission,0.81073,0.9854
3,Level 2,Substitution,0.712873,0.95608
4,Level 2,Syntactic Errors,0.599339,0.99853
5,Level 2,Addition,0.451542,0.97431
6,Level 2,Coherence and Structural Issues,0.435097,0.95576
7,Level 3,Subject-Verb Agreement Error,0.749679,0.99927
8,Level 3,Factual Distortion,0.702683,0.98459
9,Level 3,Factual Hallucination,0.618734,0.99593
