# Part II — Reliability Testing (3 annotators)

This notebook computes **pairwise inter-annotator agreement (IAA)** using **Cohen’s κ** for:

- **Partner 1 vs Partner 2** (human–human)
- **Partner 1 vs Assistant label** (human–assistant)
- **Partner 2 vs Assistant label** (human–assistant)

It also produces **confusion matrices** and supports **error analysis**.

**Input file:** `shared_memory_for_labeling.xlsx`


In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import cohen_kappa_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Annotation workbook. The path is relative, so it is resolved against the
# kernel's current working directory.
FILE_PATH = "shared_memory_for_labeling.xlsx"
# No sheet_name is given, so pandas reads its default (first) worksheet.
df = pd.read_excel(FILE_PATH)
print("Loaded:", FILE_PATH)
print("Rows, Cols:", df.shape)
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()


## 1) Set your column names

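Run the next cell to list the columns in your file, then set the three annotator column variables (`PARTNER_1_COL`, `PARTNER_2_COL`, `ASSISTANT_COL`) in the cell after it.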


In [None]:
# Show every column header so the annotator column names below can be checked.
df.columns.tolist()


In [None]:
# ✅ EDIT THESE THREE if needed
PARTNER_1_COL = "Partner 1"
PARTNER_2_COL = "Partner 2"
ASSISTANT_COL = "Assistant label"

# Stop right here if any annotator column name does not match a header in df.
for col in (PARTNER_1_COL, PARTNER_2_COL, ASSISTANT_COL):
    assert col in df.columns, f"Column not found: {col}"

# Optional: the first header from this list that exists in df supplies the text
# shown next to disagreement examples; TEXT_COL stays None when none matches.
TEXT_COL = next(
    (
        header
        for header in (
            "Turns",
            "Turns (2–3 utterances, real-time order)",
            "Turns (real-time order)",
            "Turns (real-time order; one cue)",
            "Block",
            "Text",
            "Utterance",
        )
        if header in df.columns
    ),
    None,
)
print("Using TEXT_COL =", TEXT_COL)


## 2) Clean labels + create pairwise datasets


In [None]:
def clean_series(s: pd.Series) -> pd.Series:
    """Normalise a column of annotation labels to stripped text.

    Parameters
    ----------
    s : pd.Series
        Raw label column as loaded from the spreadsheet.

    Returns
    -------
    pd.Series
        Same index as ``s``. Every non-missing value is converted to ``str``
        with surrounding whitespace removed. Missing values stay missing
        instead of becoming the literal label ``"nan"``.

    Notes
    -----
    Integer-valued floats are written without the trailing ``.0`` so that a
    numeric code reads the same whether its column was loaded as integers
    (``1`` -> ``"1"``) or as floats (``1.0`` -> ``"1"``). Otherwise two
    annotators using the same code would be counted as disagreeing.
    """

    def _as_label(value):
        # np.float64 subclasses float, so this also covers float64 columns.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    # na_action="ignore" passes missing cells through untouched.
    return s.map(_as_label, na_action="ignore")

def _make_pair_frame(frame, col_a, col_b, text_col=None):
    """Return the rows of ``frame`` on which both annotators gave a label.

    Keeps ``col_a``, ``col_b`` and, when given, ``text_col``. Rows with a
    missing label in either column are dropped, both label columns are passed
    through ``clean_series``, and rows whose label is empty after cleaning
    (for example a cell that held only spaces) are dropped as well, because an
    empty string is not a label and would otherwise be counted as its own
    category.
    """
    wanted = [col_a, col_b] + ([text_col] if text_col else [])
    pair = frame[wanted].dropna(subset=[col_a, col_b]).copy()
    pair[col_a] = clean_series(pair[col_a])
    pair[col_b] = clean_series(pair[col_b])
    # Blank cells survive dropna() and only become "" once stripped.
    has_both = pair[col_a].ne("") & pair[col_b].ne("")
    return pair.loc[has_both].copy()


df_pair_p1_p2 = _make_pair_frame(df, PARTNER_1_COL, PARTNER_2_COL, TEXT_COL)
df_pair_p1_asst = _make_pair_frame(df, PARTNER_1_COL, ASSISTANT_COL, TEXT_COL)
df_pair_p2_asst = _make_pair_frame(df, PARTNER_2_COL, ASSISTANT_COL, TEXT_COL)

print("Rows with both labels:")
print("- Partner 1 vs Partner 2:", len(df_pair_p1_p2))
print("- Partner 1 vs Assistant:", len(df_pair_p1_asst))
print("- Partner 2 vs Assistant:", len(df_pair_p2_asst))


## 3) Cohen’s κ (pairwise)


In [None]:
def _kappa_for(pair_df, rater_a, rater_b):
    """Cohen's kappa between two label columns of one pairwise frame."""
    return cohen_kappa_score(pair_df[rater_a], pair_df[rater_b])


# Each score is computed on the rows where both raters of that pair gave a label.
kappa_p1_p2 = _kappa_for(df_pair_p1_p2, PARTNER_1_COL, PARTNER_2_COL)
kappa_p1_asst = _kappa_for(df_pair_p1_asst, PARTNER_1_COL, ASSISTANT_COL)
kappa_p2_asst = _kappa_for(df_pair_p2_asst, PARTNER_2_COL, ASSISTANT_COL)

print("Cohen’s κ scores:")
print("------------------")
print(f"Partner 1 vs Partner 2 : {kappa_p1_p2:.3f}")
print(f"Partner 1 vs Assistant : {kappa_p1_asst:.3f}")
print(f"Partner 2 vs Assistant : {kappa_p2_asst:.3f}")


## 4) Confusion matrices

We generate confusion matrices for:
- **Partner 1 vs Partner 2**
- **Partner 2 vs Assistant**


In [None]:
# Every label value seen in the two pairwise frames that get a confusion
# matrix, so both matrices share the same (sorted) axes.
label_columns = [
    df_pair_p1_p2[PARTNER_1_COL],
    df_pair_p1_p2[PARTNER_2_COL],
    df_pair_p2_asst[PARTNER_2_COL],
    df_pair_p2_asst[ASSISTANT_COL],
]
all_labels = set().union(*label_columns)
labels = sorted(all_labels)
labels


In [None]:
def show_confusion(a, b, title, row_name=None, col_name=None):
    """Plot and return the confusion matrix between two annotators' labels.

    Parameters
    ----------
    a, b : array-like of labels
        Labels from the first rater (matrix rows) and second rater (columns).
    title : str
        Figure title.
    row_name, col_name : str, optional
        Axis captions for the row and column rater. When omitted, the
        ``.name`` attribute of ``a`` / ``b`` is used if present (a DataFrame
        column carries its header there).

    Returns
    -------
    The count matrix produced by ``confusion_matrix``, with rows and columns
    ordered by the module-level ``labels`` list.

    Notes
    -----
    Reads the module-level ``labels`` list, so the cell that defines it must
    have been run first.
    """
    cm = confusion_matrix(a, b, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(ax=ax, values_format='d')

    # scikit-learn captions the axes "True label" / "Predicted label", which is
    # misleading here: neither rater is ground truth. Name the raters instead.
    if row_name is None:
        row_name = getattr(a, "name", None)
    if col_name is None:
        col_name = getattr(b, "name", None)
    if row_name is not None:
        ax.set_ylabel(str(row_name))
    if col_name is not None:
        ax.set_xlabel(str(col_name))

    ax.set_title(title)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
    return cm

# One figure per pair, drawn in this order; the returned count matrices are
# kept for the export step.
cm_p1_p2, cm_p2_asst = (
    show_confusion(pair_frame[first_col], pair_frame[second_col], heading)
    for pair_frame, first_col, second_col, heading in (
        (df_pair_p1_p2, PARTNER_1_COL, PARTNER_2_COL, "Confusion Matrix: Partner 1 vs Partner 2"),
        (df_pair_p2_asst, PARTNER_2_COL, ASSISTANT_COL, "Confusion Matrix: Partner 2 vs Assistant"),
    )
)


## 5) Error analysis

### 5a) Most frequent disagreements


In [None]:
def disagreement_table(df_pair, col_a, col_b):
    """Split out the rows where two raters differ and count each label pair.

    Returns a tuple ``(disagree, pairs)``:

    * ``disagree`` - copy of the rows of ``df_pair`` whose ``col_a`` value
      differs from its ``col_b`` value, original index kept.
    * ``pairs`` - one row per distinct ``(col_a, col_b)`` combination among
      those rows, with a ``count`` column, sorted by ``count`` descending.
    """
    mismatch = df_pair[col_a] != df_pair[col_b]
    disagree = df_pair.loc[mismatch].copy()

    pair_sizes = disagree.groupby([col_a, col_b]).size()
    pairs = pair_sizes.reset_index(name='count')
    pairs = pairs.sort_values(by='count', ascending=False)
    return disagree, pairs

disagree_p1_p2, pairs_p1_p2 = disagreement_table(
    df_pair_p1_p2, PARTNER_1_COL, PARTNER_2_COL
)
disagree_p2_asst, pairs_p2_asst = disagreement_table(
    df_pair_p2_asst, PARTNER_2_COL, ASSISTANT_COL
)

# For each pair: the number of disagreeing rows, then the 15 most frequent
# label combinations.
for heading, mismatched_rows, pair_counts in (
    ("Partner 1 vs Partner 2 — disagreements:", disagree_p1_p2, pairs_p1_p2),
    ("Partner 2 vs Assistant — disagreements:", disagree_p2_asst, pairs_p2_asst),
):
    print(heading, len(mismatched_rows))
    display(pair_counts.head(15))


### 5b) Disagreement examples (qualitative)


In [None]:
if not TEXT_COL:
    print("No TEXT_COL detected. Set TEXT_COL manually if your file includes turn text.")
else:
    # First 15 disagreeing rows per pair, with the text next to both labels.
    for heading, examples, label_cols in (
        ("Examples: Partner 1 vs Partner 2", disagree_p1_p2, (PARTNER_1_COL, PARTNER_2_COL)),
        ("Examples: Partner 2 vs Assistant", disagree_p2_asst, (PARTNER_2_COL, ASSISTANT_COL)),
    ):
        print(heading)
        display(examples[[TEXT_COL, *label_cols]].head(15))


## 6) Optional exports


In [None]:
# Confusion matrices: the label list supplies both the row and column headers.
for matrix, out_name in (
    (cm_p1_p2, "cm_partner1_partner2.csv"),
    (cm_p2_asst, "cm_partner2_assistant.csv"),
):
    pd.DataFrame(matrix, index=labels, columns=labels).to_csv(out_name)

# Label-pair disagreement counts.
for pair_counts, out_name in (
    (pairs_p1_p2, "top_disagreements_partner1_partner2.csv"),
    (pairs_p2_asst, "top_disagreements_partner2_assistant.csv"),
):
    pair_counts.to_csv(out_name, index=False)

# Row-level examples need the text column, so they are written only when one
# was detected.
if TEXT_COL:
    for examples, label_cols, out_name in (
        (disagree_p1_p2, (PARTNER_1_COL, PARTNER_2_COL), "disagreement_examples_partner1_partner2.csv"),
        (disagree_p2_asst, (PARTNER_2_COL, ASSISTANT_COL), "disagreement_examples_partner2_assistant.csv"),
    ):
        examples[[TEXT_COL, *label_cols]].to_csv(out_name, index=False)

print("Saved CSV outputs.")
