In [14]:
#@title Alt-test for generative clause graphs (3 humans vs LLM) — Colab-ready
# Mount & basics
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, json, glob, re, unicodedata, math
from collections import defaultdict, Counter
from typing import Dict, List, Tuple, Optional
import numpy as np
import pandas as pd

!pip -q install scipy
from scipy.optimize import linear_sum_assignment
from scipy.stats import ttest_1samp

# ================== CONFIG ==================
# One directory of .json/.jsonl annotation records per annotator.
# Note: the folder suffixes are zero-based (h0, h1, h2) while the variable
# names are one-based (H1, H2, H3).
H1_DIR = "/content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_h0"
H2_DIR = "/content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_h1"
H3_DIR = "/content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_h2"
# Records loaded as the "LLM" side of the comparison.
LLM_DIR= "/content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_mar"

# scoring knobs
EDGE_WEIGHT = 0.5               # weight of edge F1 in the combined score S
NODE_WEIGHT = 0.5               # weight of node F1 in the combined score S
SIM_THRESHOLD = 0.85            # node match threshold (0..1)
EPSILON_MARGIN = 0.15           # epsilon (cost-benefit margin)
Q_FDR = 0.05                    # FDR level
INCLUDE_NODE_TYPES = None       # e.g., {"PARTY","DEFINED_TERM","VALUE"}; None => include all except "CLAUSE"

# ================ HELPERS ====================
def list_records(path: str) -> List[dict]:
    """Load annotation records from a directory tree or from a single file.

    A directory is searched recursively for ``.json`` / ``.jsonl`` files
    (extension compared case-insensitively) and the files are read in sorted
    path order. A single file is read whatever its extension is.

    * ``.jsonl`` files contribute one record per non-blank line. A line that
      fails to parse is reported and skipped; the rest of the file is kept.
    * Other files are parsed as one JSON document: a dict is kept only when it
      has a ``"prompt"`` key, a list contributes its dict elements.

    Only dict records are returned, because callers access them with
    ``record.get(...)``. Unreadable files are reported with ``print`` and
    skipped (best-effort loading).

    Args:
        path: Directory or file to load.

    Returns:
        The list of record dicts; empty when ``path`` does not exist.
    """
    if os.path.isdir(path):
        candidates = sorted(glob.glob(os.path.join(path, "**/*"), recursive=True))
        paths = [
            p for p in candidates
            if os.path.isfile(p) and os.path.splitext(p)[1].lower() in {".json", ".jsonl"}
        ]
    elif os.path.isfile(path):
        paths = [path]
    else:
        return []

    recs: List[dict] = []
    for p in paths:
        # Same case-insensitive rule as the directory filter above, so that
        # e.g. "x.JSONL" is not mistakenly parsed as a single JSON document.
        is_jsonl = os.path.splitext(p)[1].lower() == ".jsonl"
        try:
            with open(p, "r", encoding="utf-8") as f:
                if is_jsonl:
                    for lineno, line in enumerate(f, start=1):
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            obj = json.loads(line)
                        except ValueError as e:
                            # One malformed line must not discard the rest of the file.
                            print(f"⚠️ failed to read {p} (line {lineno}): {e}")
                            continue
                        if isinstance(obj, dict):
                            recs.append(obj)
                else:
                    obj = json.load(f)
                    if isinstance(obj, dict) and "prompt" in obj:
                        recs.append(obj)
                    elif isinstance(obj, list):
                        recs.extend(r for r in obj if isinstance(r, dict))
        except Exception as e:
            # Deliberate best-effort boundary: report the file and move on.
            print(f"⚠️ failed to read {p}: {e}")
    return recs

def salvage_json(s: str):
    """Parse ``s`` as JSON, tolerating junk around a single JSON object.

    A dict is returned unchanged. A string is first parsed as-is; if that
    fails, the span from the first ``{`` to the last ``}`` is parsed instead.

    Args:
        s: A dict, or a string that may contain JSON.

    Returns:
        The parsed value (whatever JSON type the text encodes), or ``None``
        when ``s`` is neither a dict nor a string, or nothing could be parsed.
    """
    if isinstance(s, dict):
        return s
    if not isinstance(s, str):
        return None
    try:
        return json.loads(s)
    except (ValueError, RecursionError):
        pass  # fall through to the brace-delimited salvage attempt

    start, end = s.find("{"), s.rfind("}")
    if start < 0 or end <= start:
        return None
    try:
        return json.loads(s[start:end + 1])
    except (ValueError, RecursionError):
        # Narrow handler: a bare ``except`` would also swallow KeyboardInterrupt.
        return None

PUNCT_RE = re.compile(r"[^\w\s\.\-]")  # keep letters/digits/_ whitespace . -
WS_RE    = re.compile(r"\s+")
def norm_text(s: str) -> str:
    """Canonicalize a label for comparison.

    Steps: NFKD-decompose, drop combining marks, lower-case, replace every
    character outside ``PUNCT_RE``'s keep-set with a space, collapse
    whitespace and strip.

    Args:
        s: The label; ``None`` yields ``""`` and non-string values are
            converted with ``str()``.

    Returns:
        The normalized text.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        s = str(s)  # e.g. numeric attribute values
    s = unicodedata.normalize("NFKD", s)
    # NFKD splits "é" into "e" + a combining accent. The accent is not a word
    # character, so without this step PUNCT_RE would turn it into a space and
    # break the word in two ("résumé" -> "re sume").
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower()
    s = PUNCT_RE.sub(" ", s)
    s = WS_RE.sub(" ", s).strip()
    return s

def node_label(n: dict) -> Tuple[str,str]:
    """Build the ``(type, normalized_text)`` pair used to compare nodes.

    Which field supplies the text depends on the node type:
    ``DEFINED_TERM`` prefers ``attrs["term"]`` then ``attrs["text"]``;
    ``CLAUSE`` uses the node id, else ``attrs["title"]``; the listed entity
    types use ``attrs["text"]``; any other type tries text, term, then the id.
    The chosen text is passed through ``norm_text``.
    """
    kind = n.get("type", "")
    attrs = n.get("attrs", {}) or {}

    if kind == "DEFINED_TERM":
        raw = attrs.get("term") or attrs.get("text") or ""
    elif kind == "CLAUSE":
        # clause text can be noisy, so identify clauses by id/title instead
        raw = str(n.get("id") or attrs.get("title") or "")
    elif kind in {"PARTY","VALUE","JURISDICTION","PRODUCT","LOCATION","RIGHT","OBLIGATION","PROHIBITION"}:
        raw = attrs.get("text") or ""
    else:
        raw = attrs.get("text") or attrs.get("term") or str(n.get("id") or "")

    return kind, norm_text(raw)

def filter_nodes(nodes: List[dict]) -> List[dict]:
    """Select the nodes that take part in scoring.

    With ``INCLUDE_NODE_TYPES`` unset (``None``) every node except those of
    type ``"CLAUSE"`` is kept; otherwise only nodes whose type is listed in
    ``INCLUDE_NODE_TYPES`` are kept. Input order is preserved.
    """
    if INCLUDE_NODE_TYPES is not None:
        return [node for node in nodes if node.get("type") in INCLUDE_NODE_TYPES]
    return [node for node in nodes if node.get("type") != "CLAUSE"]

def build_graph(completion) -> Tuple[List[dict], List[dict]]:
    """Parse a completion into ``(nodes, edges)``.

    Args:
        completion: A graph dict, or a string handed to ``salvage_json``.

    Returns:
        Two lists holding the dict entries of ``"nodes"`` and ``"edges"``.
        Both are empty when the completion does not parse to a dict; a field
        that is missing or is not a list/tuple also yields an empty list.
    """
    g = completion if isinstance(completion, dict) else salvage_json(completion)
    # The parser can legitimately return a non-dict (e.g. a JSON list); such a
    # value has no nodes/edges and must not reach ``g.get``.
    if not isinstance(g, dict):
        return [], []

    def _dict_items(value) -> List[dict]:
        # Keep only dict entries; tolerate a missing or malformed field.
        if not isinstance(value, (list, tuple)):
            return []
        return [item for item in value if isinstance(item, dict)]

    return _dict_items(g.get("nodes")), _dict_items(g.get("edges"))

def similarity(a: Tuple[str,str], b: Tuple[str,str]) -> float:
    """Score how alike two ``(type, label)`` pairs are, in ``[0, 1]``.

    Different types score 0.0 and identical labels score 1.0. Otherwise the
    score is the larger of the token-set Jaccard overlap and a flat 0.8 that
    is awarded when both labels are at least 15 characters long and share
    their first 15 characters. If either label has no tokens the score is 0.0.
    """
    type_a, text_a = a
    type_b, text_b = b

    if type_a != type_b:
        return 0.0
    if text_a == text_b:
        return 1.0

    tokens_a = set(text_a.split())
    tokens_b = set(text_b.split())
    if not (tokens_a and tokens_b):
        return 0.0

    jaccard = len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

    # long shared prefix earns a fixed score, independent of token overlap
    shares_prefix = (
        len(text_a) >= 15 and len(text_b) >= 15 and text_a[:15] == text_b[:15]
    )
    prefix_score = 0.8 if shares_prefix else 0.0
    return max(jaccard, prefix_score)

def hungarian_match(A: List[dict], B: List[dict], thr=0.85) -> Dict[int,int]:
    """Find a one-to-one node matching between ``A`` and ``B``.

    Pairwise similarity comes from ``similarity(node_label(a), node_label(b))``.
    Pairs below ``thr`` get a prohibitive cost, ``linear_sum_assignment``
    minimizes the total cost, and only assigned pairs with similarity
    ``>= thr`` are reported.

    Args:
        A: Nodes whose indices become the keys of the result.
        B: Nodes whose indices become the values of the result.
        thr: Minimum similarity for a pair to count as a match.

    Returns:
        ``{index_in_A: index_in_B}`` with plain ``int`` keys and values; empty
        when either list is empty.
    """
    if not A or not B:
        return {}

    labels_a = [node_label(n) for n in A]
    labels_b = [node_label(n) for n in B]
    sim = np.array(
        [[similarity(la, lb) for lb in labels_b] for la in labels_a],
        dtype=float,
    )

    # Test the threshold on the similarity itself. Recovering it as
    # ``1 - cost`` is not exact in floating point for every value.
    allowed = sim >= thr
    big = 1e6  # cost that makes a below-threshold pairing a last resort
    cost = np.where(allowed, 1.0 - sim, big)

    rows, cols = linear_sum_assignment(cost)
    # The solver always returns a full assignment; drop the forced pairings.
    return {int(r): int(c) for r, c in zip(rows, cols) if allowed[r, c]}

def f1_counts(pred_pos: int, ref_pos: int, tp: int) -> Tuple[float,float,float]:
    """Turn raw counts into ``(precision, recall, f1)``.

    Two empty sets count as perfect agreement ``(1.0, 1.0, 1.0)``. A zero
    denominator otherwise gives 0.0 for the affected metric.
    """
    if pred_pos == 0 and ref_pos == 0:
        return (1.0, 1.0, 1.0)

    precision = tp / pred_pos if pred_pos > 0 else 0.0
    recall = tp / ref_pos if ref_pos > 0 else 0.0

    denom = precision + recall
    f1 = 0.0 if denom == 0 else 2*precision*recall/denom
    return precision, recall, f1

def edge_f1(pred_nodes, pred_edges, ref_nodes, ref_edges, thr=SIM_THRESHOLD) -> Tuple[float,float,float]:
    """Compute edge-level ``(precision, recall, f1)`` of a prediction vs. a reference.

    Nodes of both graphs are filtered with ``filter_nodes`` and matched with
    ``hungarian_match``. Each predicted edge whose two endpoints were matched
    is rewritten as ``(ref_src_key, type, ref_tgt_key)`` and compared with the
    reference edges whose endpoints both survive filtering.

    NOTE(review): predicted edges with an unmatched or filtered-out endpoint
    are dropped before counting, so they do not lower precision. Confirm this
    is the intended scoring rule.

    Args:
        pred_nodes, pred_edges: The predicted graph.
        ref_nodes, ref_edges: The reference graph.
        thr: Node similarity threshold passed to ``hungarian_match``.

    Returns:
        ``(precision, recall, f1)`` as produced by ``f1_counts``.
    """
    A = filter_nodes(pred_nodes)
    B = filter_nodes(ref_nodes)
    amap = hungarian_match(A, B, thr=thr)  # index in A -> index in B

    # Canonical key per reference node; the index suffix keeps keys unique
    # even when two reference nodes share type and label.
    ref_ids = {j: f"{n.get('type','')}|{node_label(n)[1]}|{j}" for j, n in enumerate(B)}

    # Predicted edges, translated into reference-key space.
    pred_idx_by_id = {n.get("id"): i for i, n in enumerate(A)}
    pred_edge_set = set()
    for e in pred_edges:
        si = pred_idx_by_id.get(e.get("src"))
        ti = pred_idx_by_id.get(e.get("tgt"))
        if si is None or ti is None:
            continue  # endpoint missing or of a filtered-out type
        if si in amap and ti in amap:
            pred_edge_set.add((ref_ids[amap[si]], e.get("type"), ref_ids[amap[ti]]))

    # Reference edges, restricted to endpoints that survived filtering.
    ref_key_by_id = {n.get("id"): ref_ids[j] for j, n in enumerate(B)}
    ref_edge_set = set()
    for e in ref_edges:
        sid = e.get("src")
        tid = e.get("tgt")
        if sid in ref_key_by_id and tid in ref_key_by_id:
            ref_edge_set.add((ref_key_by_id[sid], e.get("type"), ref_key_by_id[tid]))

    tp = len(pred_edge_set & ref_edge_set)
    return f1_counts(len(pred_edge_set), len(ref_edge_set), tp)

def node_f1(pred_nodes, ref_nodes, thr=SIM_THRESHOLD) -> Tuple[float,float,float]:
    """Compute node-level ``(precision, recall, f1)``.

    Both node lists are filtered with ``filter_nodes``; the number of pairs
    returned by ``hungarian_match`` is the true-positive count.
    """
    pred_kept = filter_nodes(pred_nodes)
    ref_kept = filter_nodes(ref_nodes)
    n_matched = len(hungarian_match(pred_kept, ref_kept, thr=thr))
    return f1_counts(len(pred_kept), len(ref_kept), n_matched)

def consensus_graph(gA: dict, gB: dict, thr=SIM_THRESHOLD) -> dict:
    """Build the edge-level intersection of two graphs, expressed in A's ids.

    B's filtered nodes are matched onto A's filtered nodes; B's edges are
    rewritten with the matched A ids and intersected with A's edges. The
    consensus nodes are the A nodes that appear as an endpoint of a surviving
    edge.

    Args:
        gA: Graph dict with ``"nodes"`` and ``"edges"``; supplies the id space.
        gB: Second graph dict with the same keys.
        thr: Node similarity threshold passed to ``hungarian_match``.

    Returns:
        ``{"nodes": [...], "edges": [{"src", "type", "tgt"}, ...]}`` with the
        edges in sorted order.
    """
    A_nodes, A_edges = gA["nodes"], gA["edges"]
    B_nodes, B_edges = gB["nodes"], gB["edges"]

    A_filt = filter_nodes(A_nodes)
    B_filt = filter_nodes(B_nodes)
    match = hungarian_match(B_filt, A_filt, thr=thr)  # index in B_filt -> index in A_filt

    # B id -> A id. The filtered lists hold the node dicts themselves, so the
    # ids can be read directly; no quadratic ``list.index`` round-trip needed.
    id_map = {B_filt[i_b].get("id"): A_filt[j_a].get("id") for i_b, j_a in match.items()}

    A_edge_set = {(e.get("src"), e.get("type"), e.get("tgt")) for e in A_edges}

    B_edge_set_in_A = set()
    for e in B_edges:
        s_id = id_map.get(e.get("src"))
        t_id = id_map.get(e.get("tgt"))
        if s_id is None or t_id is None:
            continue  # an endpoint has no counterpart in A
        B_edge_set_in_A.add((s_id, e.get("type"), t_id))

    inter_edges = A_edge_set & B_edge_set_in_A

    # consensus nodes = endpoints of consensus edges, in A's node order
    keep_ids = {endpoint for (s_id, _rel, t_id) in inter_edges for endpoint in (s_id, t_id)}
    A_nodes_by_id = {n.get("id"): n for n in A_nodes}
    cons_nodes = [node for node_id, node in A_nodes_by_id.items() if node_id in keep_ids]

    try:
        ordered = sorted(inter_edges)
    except TypeError:
        # ids/types of mixed Python types (e.g. None vs str) are not orderable
        ordered = sorted(inter_edges, key=repr)
    cons_edges = [{"src": s, "type": r, "tgt": t} for (s, r, t) in ordered]

    return {"nodes": cons_nodes, "edges": cons_edges}

def score_against_ref(pred: dict, ref: dict) -> Dict[str,float]:
    """Score a predicted graph against a reference graph.

    Returns edge and node precision/recall/F1 plus the combined score
    ``S = EDGE_WEIGHT * edge_f1 + NODE_WEIGHT * node_f1``.
    """
    pred_nodes, pred_edges = pred["nodes"], pred["edges"]
    ref_nodes, ref_edges = ref["nodes"], ref["edges"]

    edge_p, edge_r, edge_f = edge_f1(pred_nodes, pred_edges, ref_nodes, ref_edges)
    node_p, node_r, node_f = node_f1(pred_nodes, ref_nodes)

    return {
        "edge_prec": edge_p,
        "edge_rec": edge_r,
        "edge_f1": edge_f,
        "node_prec": node_p,
        "node_rec": node_r,
        "node_f1": node_f,
        "S": EDGE_WEIGHT*edge_f + NODE_WEIGHT*node_f,
    }

def by_fdr(pvals: List[float], q=0.05):
    """Apply the Benjamini–Yekutieli step-up procedure at level ``q``.

    Returns a 3-tuple: the reject flags in the original order of ``pvals``,
    the per-rank thresholds ``(k/m) * q / c(m)`` with ``c(m) = sum(1/i)``,
    and the argsort order of ``pvals`` as a list.
    """
    m = len(pvals)
    order = np.argsort(pvals)
    harmonic = sum(1.0/i for i in range(1, m+1))
    thr = [ (rank+1)/m * (q/harmonic) for rank in range(m) ]

    # step-up rule: the largest rank whose p-value is under its threshold
    # decides; every smaller rank is rejected along with it
    ranked_p = [pvals[i] for i in order]
    passing = [k for k in range(m) if ranked_p[k] <= thr[k]]
    cutoff = passing[-1] if passing else -1

    reject = [False]*m
    for pos, original_idx in enumerate(order):
        reject[original_idx] = pos <= cutoff
    return reject, thr, order.tolist()

# ================ LOAD DATA =================
def load_map_orig(path) -> Dict[str, dict]:
    """Return ``{prompt: graph_dict}`` for a directory/file of records.

    Earlier version of ``load_map``, kept under its original name. A dict
    prompt is serialized to a key-sorted JSON string because a dict cannot be
    used as a dictionary key. Records without a prompt are skipped; a later
    record with the same prompt replaces an earlier one.
    """
    recs = list_records(path)
    out = {}
    for r in recs:
        pr = r.get("prompt")
        if isinstance(pr, dict):
            # dicts are unhashable; sort_keys makes the key order-independent
            pr = json.dumps(pr, sort_keys=True)
        comp = r.get("completion")
        gnodes, gedges = build_graph(comp)
        if pr is None or gnodes is None:
            continue
        out[pr] = {"nodes": gnodes, "edges": gedges}
    return out
# ================ LOAD DATA (CORRECTED) =================
def load_map(path) -> Dict[str, dict]:
    """Load records from ``path`` and index their graphs by prompt.

    The key is the prompt itself when it is a string, or its key-sorted JSON
    serialization when it is a dict (so equal prompts give equal keys whatever
    their key order). Records with any other prompt type are skipped. A later
    record with the same key replaces an earlier one.

    Returns:
        ``{prompt_key: {"nodes": [...], "edges": [...]}}``.
    """
    graphs = {}
    for record in list_records(path):
        prompt_val = record.get("prompt")

        if isinstance(prompt_val, dict):
            key = json.dumps(prompt_val, sort_keys=True)
        elif isinstance(prompt_val, str):
            key = prompt_val
        else:
            key = None

        nodes, edges = build_graph(record.get("completion"))

        if key is None or nodes is None:
            continue
        graphs[key] = {"nodes": nodes, "edges": edges}
    return graphs

# Load the three human annotation sets and the LLM set (in this order), then
# keep only the prompts that every one of the four sources has.
H1, H2, H3, LLM = (load_map(src_dir) for src_dir in (H1_DIR, H2_DIR, H3_DIR, LLM_DIR))

common_prompts = sorted(set(H1) & set(H2) & set(H3) & set(LLM))
print(f"Common prompts across H1, H2, H3, LLM: {len(common_prompts)}")

# ============== PER-CLAUSE STATS ============
# For every common prompt and every held-out human j, score both the LLM graph
# and human j's graph against the consensus of the two remaining humans.
rows = []
# j=0,1,2 for annotators 1..3
HUMANS = [H1, H2, H3]
names  = ["H1","H2","H3"]
TIE_TOL = 1e-9  # score differences below this count as a tie

for pr in common_prompts:
    gf = LLM[pr]
    # for each j, build leave-one-out consensus G_-j
    for j,(Hj,name) in enumerate(zip(HUMANS,names)):
        others = [HUMANS[k] for k in range(len(HUMANS)) if k!=j]
        # consensus between the two others (in space of the first 'other')
        # pick order stable: others[0] as A, others[1] as B
        A = others[0][pr]; B = others[1][pr]
        Gref = consensus_graph(A, B, thr=SIM_THRESHOLD)

        s_f = score_against_ref(gf, Gref)
        s_h = score_against_ref(Hj[pr], Gref)

        # outcomes: exactly one of win/tie/loss is 1, all using the same
        # tolerance, so a tiny float difference is not both a tie and a win
        diff = s_f["S"] - s_h["S"]
        tie = 1 if abs(diff) < TIE_TOL else 0
        win = 1 if (not tie and diff > 0) else 0
        loss= 1 if (not tie and diff < 0) else 0

        # margin-adjusted difference for t-test
        z = diff - EPSILON_MARGIN

        rows.append({
            "prompt": pr,
            "heldout": name,
            "S_llm": s_f["S"],
            "S_h": s_h["S"],
            "win": win,
            "tie": tie,
            "loss": loss,
            "z": z
        })

# Explicit columns keep the frame usable downstream even when there are no rows.
df = pd.DataFrame(rows, columns=["prompt", "heldout", "S_llm", "S_h", "win", "tie", "loss", "z"])
print(df.head())

# ============== PER-HUMAN TESTS =============
# For each held-out human j: build per-item indicators W_f (LLM at least as
# good as the human) and W_h (human at least as good as the LLM), then run a
# one-sample left-tailed t-test of mean(W_h - W_f) against EPSILON_MARGIN.
from scipy.stats import t as student_t  # imported once rather than on every loop pass

# Columns read below; a frame built from zero rows may have no columns at all.
_needed_cols = ["heldout", "S_llm", "S_h", "win", "tie"]
_df_rows = df if all(c in df.columns for c in _needed_cols) else pd.DataFrame(columns=_needed_cols)

pvals = []
ns = []
summ = []
rho_f = []   # AP components: P(LLM >= H_j)

for j, name in enumerate(names):
    d_rows = _df_rows[_df_rows["heldout"]==name].copy()
    # Per-item wins W_f, W_h from the saved per-clause S values
    Wf = (d_rows["S_llm"] >= d_rows["S_h"]).astype(int).values
    Wh = (d_rows["S_h"]   >= d_rows["S_llm"]).astype(int).values
    d  = (Wh - Wf).astype(float)              # element-wise in {-1,0,1}
    n  = len(d)
    ns.append(n)

    # Advantage probability (undefined without data)
    rho_f_j = float(Wf.mean()) if n else float("nan")
    rho_f.append(rho_f_j)

    # One-sample *left-tailed* t-test on (d_bar - EPSILON_MARGIN) < 0
    dbar = float(d.mean()) if n else float("nan")
    sd   = float(d.std(ddof=1)) if n > 1 else 0.0
    if n < 2:
        # No variance estimate is possible with fewer than two items, so the
        # test cannot be run: report p = 1 (never reject) rather than let
        # missing data look like a significant result.
        tstat = float("nan")
        p_one = 1.0
    elif sd > 0:
        tstat = (dbar - EPSILON_MARGIN) / (sd / np.sqrt(n))
        # left-tailed p-value:
        p_one = float(student_t.cdf(tstat, df=n-1))
    else:
        # zero variance (all d identical): decide deterministically
        tstat = np.inf if (dbar - EPSILON_MARGIN) > 0 else -np.inf
        p_one = 1.0 if tstat == np.inf else 0.0

    pvals.append(p_one)

    # Descriptives (optional)
    win_rate_half_ties = ((d_rows["win"].sum() + 0.5*d_rows["tie"].sum()) / n) if n else 0.0
    summ.append({
        "heldout": name,
        "n_clauses": n,
        "t_stat": float(tstat),
        "p_one_sided_left": float(p_one),
        "rho_f": float(rho_f_j),
        "mean_d": float(dbar),
        "std_d": float(sd),
        "win_rate_half_ties": float(win_rate_half_ties),
    })

summary = pd.DataFrame(summ)
print("\nPer-human tests (before FDR):")
display(summary)

# Multiple-testing correction (BY) over the per-human p-values; ω is the share
# of humans for which the null was rejected.
reject, thr, order = by_fdr(pvals, q=Q_FDR)
if reject:
    omega = sum(reject) / len(reject)
else:
    omega = 0.0

print("\nFDR (Benjamini–Yekutieli):")
print(f"p-values (H1: d_bar < eps): {pvals}")
print(f"Reject?                     {reject}")
print(f"ω (fraction rejected)       = {omega:.3f}  --> {'PASS' if omega>=0.5 else 'FAIL'} at q={Q_FDR}")

# Effect size: mean of the per-human advantage probabilities rho^f_j
rho = sum(rho_f)/len(rho_f)
print(f"\nEffect size ρ (Average Advantage Probability) = {rho:.3f}")
print("(Interpretation: probability the LLM is at least as good as a random human on a random clause.)")

# Persist the per-clause rows and the per-human summary as CSV files
OUT_DIR = "/content/drive/MyDrive/alt_test_results"
os.makedirs(OUT_DIR, exist_ok=True)
rows_csv = os.path.join(OUT_DIR, "per_clause_alt_test_rows.csv")
summary_csv = os.path.join(OUT_DIR, "per_human_summary.csv")
df.to_csv(rows_csv, index=False)
summary.to_csv(summary_csv, index=False)
print(f"\nSaved per-clause rows and per-human summary to: {OUT_DIR}")


Mounted at /content/drive
⚠️ failed to read /content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_h2/BELLICUMPHARMACEUTICALS_INC_05_07_2019-EX-10.1-Supply_Agreement_8.7.json: Expecting ',' delimiter: line 97 column 2 (char 6370)
⚠️ failed to read /content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_h2/WORLDWIDESTRATEGIESINC_11_02_2005-EX-10-RESELLER_AGREEMENT_5.1.json: Expecting property name enclosed in double quotes: line 12 column 5 (char 4155)
⚠️ failed to read /content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_mar/BELLICUMPHARMACEUTICALS_INC_05_07_2019-EX-10.1-Supply_Agreement_e.json: Expecting ',' delimiter: line 105 column 7 (char 6279)
⚠️ failed to read /content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_mar/NETGEAR_INC_04_21_2003-EX-10.16-DISTRIBUTOR_AGREEMENT_16.json: Expecting ',' delimiter: line 61 column 9 (char 7011)
⚠️ failed to read /content/drive/MyDrive/minigraph__alt_test_clean/manual_labeling_mar/StaarSurgicalCompany

Unnamed: 0,heldout,n_clauses,t_stat,p_one_sided_left,rho_f,mean_d,std_d,win_rate_half_ties
0,H1,43,-3.17585,0.001399,0.813953,-0.209302,0.741881,0.604651
1,H2,43,-inf,0.0,1.0,0.0,0.0,0.5
2,H3,43,-3.748868,0.000268,0.906977,-0.186047,0.587805,0.593023



FDR (Benjamini–Yekutieli):
p-values (H1: d_bar < eps): [0.001399173157613514, 0.0, 0.0002684186256635553]
Reject?                     [True, True, True]
ω (fraction rejected)       = 1.000  --> PASS at q=0.05

Effect size ρ (Average Advantage Probability) = 0.907
(Interpretation: probability the LLM is at least as good as a random human on a random clause.)

Saved per-clause rows and per-human summary to: /content/drive/MyDrive/alt_test_results


In [11]:
# ========= HUMAN↔HUMAN PAIRWISE AGREEMENT (symmetrized) =========

from itertools import combinations

def sym_pair_scores(A: dict, B: dict, prompts: List[str]):
    """Score two annotators against each other in both directions.

    For each prompt, ``A`` is scored with ``B`` as reference and vice versa.
    The two edge F1 values and the two node F1 values are averaged, and the
    averages are combined with ``EDGE_WEIGHT`` / ``NODE_WEIGHT`` into ``S_sym``.

    Returns:
        One dict per prompt holding the symmetrized values and both
        directional values.
    """
    out = []
    for pr in prompts:
        graph_a, graph_b = A[pr], B[pr]
        a_vs_b = score_against_ref(graph_a, graph_b)
        b_vs_a = score_against_ref(graph_b, graph_a)

        edge_sym = 0.5*(a_vs_b["edge_f1"] + b_vs_a["edge_f1"])
        node_sym = 0.5*(a_vs_b["node_f1"] + b_vs_a["node_f1"])

        record = {
            "prompt": pr,
            "edge_f1_sym": edge_sym,
            "node_f1_sym": node_sym,
            "S_sym": EDGE_WEIGHT*edge_sym + NODE_WEIGHT*node_sym,
        }
        # directional values, kept alongside for inspection
        record["edge_f1_ab"] = a_vs_b["edge_f1"]
        record["edge_f1_ba"] = b_vs_a["edge_f1"]
        record["node_f1_ab"] = a_vs_b["node_f1"]
        record["node_f1_ba"] = b_vs_a["node_f1"]
        record["S_ab"] = a_vs_b["S"]
        record["S_ba"] = b_vs_a["S"]
        out.append(record)
    return out

# Score every unordered pair of humans over the prompts the two have in common
human_maps = {"H1": H1, "H2": H2, "H3": H3}
pairs = list(combinations(human_maps.keys(), 2))

all_rows = []
for a, b in pairs:
    map_a, map_b = human_maps[a], human_maps[b]
    prompts_ab = sorted(set(map_a) & set(map_b))
    rows = sym_pair_scores(map_a, map_b, prompts_ab)
    for r in rows:
        r["pair"] = f"{a}-{b}"  # tag each row with the pair it belongs to
    all_rows += rows

hh_rows_df = pd.DataFrame(all_rows)
print("Per-clause human↔human rows:")
display(hh_rows_df.head())

# ---- Summary tables: mean ± std per pair ----
def mean_std(df, col):
    """Format column ``col`` of ``df`` as ``"mean ± std"`` (sample std, 3 decimals)."""
    values = df[col]
    avg = values.mean()
    spread = values.std(ddof=1)
    return f"{avg:.3f} ± {spread:.3f}"

# One summary row per human pair: clause count plus mean ± std of each score
summ_rows = []
for pair, g in hh_rows_df.groupby("pair"):
    entry = {"pair": pair, "n_clauses": g.shape[0]}
    entry["edge_f1_sym (mean±std)"] = mean_std(g, "edge_f1_sym")
    entry["node_f1_sym (mean±std)"] = mean_std(g, "node_f1_sym")
    entry["S_sym (mean±std)"] = mean_std(g, "S_sym")
    summ_rows.append(entry)
hh_summary = pd.DataFrame(summ_rows).sort_values("pair")
print("\nHuman↔human symmetrized agreement summary:")
display(hh_summary)

# ---- Agreement matrix (mean S_sym) for quick glance ----
# Symmetric humans x humans table: diagonal fixed at 1.0, off-diagonal cells
# filled with the pair's mean S_sym.
pairs_means = hh_rows_df.groupby("pair")["S_sym"].mean()
humans = ["H1","H2","H3"]
mat = pd.DataFrame(index=humans, columns=humans, dtype=float)
for h in humans:
    mat.loc[h,h] = 1.0  # perfect self-agreement by convention
for a,b in pairs:
    m = pairs_means[f"{a}-{b}"]
    for row_key, col_key in ((a, b), (b, a)):
        mat.loc[row_key, col_key] = m
print("\nMean S_sym agreement matrix (humans only):")
display(mat)


Per-clause human↔human rows:


Unnamed: 0,prompt,edge_f1_sym,node_f1_sym,S_sym,edge_f1_ab,edge_f1_ba,node_f1_ab,node_f1_ba,S_ab,S_ba,pair
0,"{""input"": {""clause"": {""id"": ""1.2"", ""text"": ""1....",1.0,0.875,0.9375,1.0,1.0,0.875,0.875,0.9375,0.9375,H1-H2
1,"{""input"": {""clause"": {""id"": ""10"", ""text"": ""10 ...",0.5,0.666667,0.583333,0.0,1.0,0.666667,0.666667,0.333333,0.833333,H1-H2
2,"{""input"": {""clause"": {""id"": ""10.13"", ""text"": ""...",0.9,0.857143,0.878571,0.8,1.0,0.857143,0.857143,0.828571,0.928571,H1-H2
3,"{""input"": {""clause"": {""id"": ""10.14"", ""text"": ""...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,H1-H2
4,"{""input"": {""clause"": {""id"": ""10.2"", ""text"": ""1...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,H1-H2



Human↔human symmetrized agreement summary:


Unnamed: 0,pair,n_clauses,edge_f1_sym (mean±std),node_f1_sym (mean±std),S_sym (mean±std)
0,H1-H2,48,0.914 ± 0.140,0.895 ± 0.127,0.905 ± 0.126
1,H1-H3,47,0.881 ± 0.170,0.873 ± 0.141,0.877 ± 0.142
2,H2-H3,48,0.939 ± 0.149,0.923 ± 0.125,0.931 ± 0.121



Mean S_sym agreement matrix (humans only):


Unnamed: 0,H1,H2,H3
H1,1.0,0.904797,0.877261
H2,0.904797,1.0,0.930803
H3,0.877261,0.930803,1.0
