In [5]:
import os, json, re
from datetime import datetime
from collections import Counter
import pandas as pd

# ======= Absolute paths (Fix A) =======
BASE_DIR = r"C:\CALDARIUM\Caldarium"   

ANN_A_DIR = os.path.join(BASE_DIR, "bench", "ground_truth")
ANN_B_DIR = os.path.join(BASE_DIR, "bench", "parser_outputs")
OUTPUT_DIR = os.path.join(BASE_DIR, "bench", "outputs")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# (Optional sanity check)
for p in [ANN_A_DIR, ANN_B_DIR, OUTPUT_DIR]:
    print(p, "->", "OK" if os.path.isdir(p) else "MISSING")


# ===================== Paths =====================
# ANN_A_DIR = "bench/ground_truth"
# ANN_B_DIR = "bench/parser_outputs"
# OUTPUT_DIR = "bench/outputs"
# os.makedirs(OUTPUT_DIR, exist_ok=True)

#pat = re.compile(r"^(consent_T\d+_gen\d+)\.json$", re.IGNORECASE)
FILE_ID_PAT = re.compile(r"^(consent_T\d+_gen\d+)", re.IGNORECASE)

# ===================== ID Extraction =====================

# def list_ids(folder):
#     ids, bad = [], []
#     for fn in os.listdir(folder):
#         if not fn.endswith(".json"): continue
#         m = pat.match(fn)
#         if m: ids.append(m.group(1))
#         else: bad.append(fn)
#     return set(ids), bad

# ids_a, bad_a = list_ids(ANN_A_DIR)
# ids_b, bad_b = list_ids(ANN_B_DIR)

# overlap = sorted(ids_a & ids_b)
# missing_in_b = sorted(ids_a - ids_b)
# missing_in_a = sorted(ids_b - ids_a)

# print(f"A count: {len(ids_a)} | B count: {len(ids_b)} | Overlap pairs: {len(overlap)}")
# if bad_a: print("[WARN] A bad names:", bad_a)
# if bad_b: print("[WARN] B bad names:", bad_b)
# if missing_in_b: print("[WARN] Missing in B:", missing_in_b)
# if missing_in_a: print("[WARN] Missing in A:", missing_in_a)

def extract_short_id(filename):
    m = FILE_ID_PAT.match(os.path.splitext(filename)[0])
    return m.group(1) if m else None

def list_ids(folder):
    ids, bad = [], []
    for fn in os.listdir(folder):
        if not fn.endswith(".json"): 
            continue
        key = extract_short_id(fn)
        if key: ids.append(key)
        else:   bad.append(fn)
    return set(ids), bad

ids_a, bad_a = list_ids(ANN_A_DIR)
ids_b, bad_b = list_ids(ANN_B_DIR)

overlap = sorted(ids_a & ids_b)
missing_in_b = sorted(ids_a - ids_b)
missing_in_a = sorted(ids_b - ids_a)

print(f"A count: {len(ids_a)} | B count: {len(ids_b)} | Overlap pairs: {len(overlap)}")
if bad_a: print("[WARN] A bad names:", bad_a)
if bad_b: print("[WARN] B bad names:", bad_b)
if missing_in_b: print("[WARN] Missing in B:", missing_in_b)
if missing_in_a: print("[WARN] Missing in A:", missing_in_a)

C:\CALDARIUM\Caldarium\bench\ground_truth -> OK
C:\CALDARIUM\Caldarium\bench\parser_outputs -> OK
C:\CALDARIUM\Caldarium\bench\outputs -> OK
A count: 16 | B count: 16 | Overlap pairs: 16


In [6]:
# ===================== Config =====================
DATE_FORMATS = ["%Y-%m-%d", "%m/%d/%Y", "%Y/%m/%d", "%m-%d-%Y", "%d-%b-%Y", "%b %d, %Y"]
#FNAME_PAT = re.compile(r"^(consent_T\d+_gen\d+)\.json$", re.IGNORECASE)

# FIELDS = [
#     "patient_name", "patient_first_name", "patient_middle_name", "patient_last_name","patient_address_name",
#     "patient_id", "patient_dob", "patient_signature", "patient_state", "patient_city", "patient_zip_code",
#     "provider_name", "provider_address_name", "provider_phone", "provider_fax", "provider_state", "provider_city",
#     "provider_zip_code", "family_name", "family_relation", "family_phone", "family_address_name", "family_state",
#     "family_city", "family_zip_code", "guardian_name", "guardian_signature", "guardian_relation", "date", "expiration_date",
#     "expiration_event", "translator_name", "translator_signature"
# ]

# # ===================== Helpers =====================
# _ws = re.compile(r"\s+")
# def norm_str(s):
#     if s is None: return None
#     s = str(s).strip().casefold()
#     return _ws.sub(" ", s) or None

# def norm_phone(s):
#     if s is None: return None
#     ds = re.sub(r"\D+", "", str(s))
#     return ds or None

# def parse_date(s):
#     if s is None: return None
#     s = str(s).strip()
#     if not s: return None
#     for fmt in DATE_FORMATS:
#         try: return datetime.strptime(s, fmt).date()
#         except: pass
#     try: return datetime.fromisoformat(s).date()
#     except: return None

# def norm_date(s):
#     d = parse_date(s)
#     return d.isoformat() if d else None


# Canonical field names (align with your consent schema)
FIELDS = [
    "patient_name","patient_first_name","patient_middle_name","patient_last_name","patient_address_name",
    "patient_id","patient_dob","patient_signature","patient_state","patient_city","patient_zip_code",
    "provider_name","provider_address_name","provider_phone","provider_fax","provider_state","provider_city",
    "provider_zip_code","family_name","family_relation","family_number","family_address_name","family_state",
    "family_city","family_zip_code","guardian_name","guardian_signature","guardian_relation","date","expiration_date",
    "expiration_event","translator_name","translator_signature"
]

# Map variant keys to canonical keys (edit if you spot more)
ALIASES = {
    "family_phone": "family_number",
    "fam_number": "family_number",
    "fam_phone": "family_number",
}

_ws = re.compile(r"\s+")
def norm_str(s):
    if s is None: return None
    s = str(s).strip().casefold()
    return _ws.sub(" ", s) or None

def norm_phone(s):
    if s is None: return None
    ds = re.sub(r"\D+", "", str(s))
    return ds or None

def parse_date(s):
    if s is None: return None
    s = str(s).strip()
    if not s: return None
    for fmt in DATE_FORMATS:
        try: return datetime.strptime(s, fmt).date()
        except: pass
    try: return datetime.fromisoformat(s).date()
    except: return None

def norm_date(s):
    d = parse_date(s)
    return d.isoformat() if d else None

# ===================== Record Normalization =====================
# def normalize_record(j, forced_id=None):
#     out = dict(j)
#     for f in FIELDS:
#         val = j.get(f)
#         if "date" in f or "dob" in f:
#             out[f] = norm_date(val)
#         elif "phone" in f:
#             out[f] = norm_phone(val)
#         else:
#             out[f] = norm_str(val)
#     # force id consistency
#     if forced_id:
#         out["consent_id"] = forced_id
#     return out

def normalize_record(raw, forced_id=None):
    # apply aliasing
    j = {}
    for k,v in raw.items():
        k2 = ALIASES.get(k, k)
        j[k2] = v

    out = {}
    for f in FIELDS:
        val = j.get(f)
        if f.endswith("date") or f.endswith("dob") or f in ("date","expiration_date","patient_dob"):
            out[f] = norm_date(val)
        elif f.endswith("number") or f.endswith("phone"):
            out[f] = norm_phone(val)
        else:
            out[f] = norm_str(val)
    if forced_id:
        out["consent_id"] = forced_id
    return out

def load_by_short_id(folder):
    data = {}
    for fn in os.listdir(folder):
        if not fn.endswith(".json"): 
            continue
        short_id = extract_short_id(fn)
        if not short_id:
            continue
        with open(os.path.join(folder, fn), "r", encoding="utf-8") as f:
            raw = json.load(f)
        data[short_id] = normalize_record(raw, forced_id=short_id)
    return data

data_a = load_by_short_id(ANN_A_DIR)   # GT
data_b = load_by_short_id(ANN_B_DIR)   # Parser
overlap = sorted(set(data_a) & set(data_b))
print(f"Comparing {len(overlap)} consent forms...")



# ===================== Load =====================
# def load_by_full_id(folder):
#     data = {}
#     for fn in os.listdir(folder):
#         if not fn.endswith(".json"): continue
#         m = FNAME_PAT.match(fn)
#         if not m: continue
#         key = m.group(1)
#         with open(os.path.join(folder, fn), "r", encoding="utf-8") as f:
#             j = json.load(f)
#         data[key] = normalize_record(j, forced_id=key)
#     return data

# ===================== Metrics =====================
def kappa(a,b):
    A = ["<MISSING>" if v is None else str(v) for v in a]
    B = ["<MISSING>" if v is None else str(v) for v in b]
    n = len(A)
    if n == 0: return None
    po = sum(1 for x,y in zip(A,B) if x==y) / n
    ca, cb = Counter(A), Counter(B)
    pe = sum((ca[k]/n)*(cb[k]/n) for k in set(ca)|set(cb))
    return 1.0 if pe == 1 else (po - pe) / (1 - pe)

def exact(a,b):
    n = len(a)
    return None if n==0 else sum(1 for x,y in zip(a,b) if x==y)/n

# ===================== Compare =====================
# data_a = load_by_full_id(ANN_A_DIR)
# data_b = load_by_full_id(ANN_B_DIR)

# ids_a, ids_b = set(data_a), set(data_b)
# overlap = sorted(ids_a & ids_b)
# print(f"Comparing {len(overlap)} consent forms...")

# summary = []
# def collect(field):
#     A = [data_a[i].get(field) for i in overlap]
#     B = [data_b[i].get(field) for i in overlap]
#     return A,B

# for f in FIELDS:
#     A,B = collect(f)
#     summary += [
#         {"field": f, "metric": "exact_match_rate", "value": exact(A,B)},
#         {"field": f, "metric": "cohens_kappa",     "value": kappa(A,B)},
#     ]

# df_summary = pd.DataFrame(summary).sort_values(["field","metric"]).reset_index(drop=True)
# out_path = os.path.join(OUTPUT_DIR, "qa_report_consent.csv")
# df_summary.to_csv(out_path, index=False)

# print("Saved:", out_path)
# display(df_summary.head(30))

summary = []
def collect(field):
    A = [data_a[i].get(field) for i in overlap]
    B = [data_b[i].get(field) for i in overlap]
    return A,B

for f in FIELDS:
    A,B = collect(f)
    summary += [
        {"field": f, "metric": "exact_match_rate", "value": exact(A,B)},
        {"field": f, "metric": "cohens_kappa",     "value": kappa(A,B)},
    ]

df_summary = pd.DataFrame(summary).sort_values(["field","metric"]).reset_index(drop=True)
out_path = os.path.join(OUTPUT_DIR, "qa_report_consent.csv")
df_summary.to_csv(out_path, index=False)
print("Saved:", out_path)
display(df_summary.head(30))

Comparing 16 consent forms...
Saved: C:\CALDARIUM\Caldarium\bench\outputs\qa_report_consent.csv


Unnamed: 0,field,metric,value
0,date,cohens_kappa,1.0
1,date,exact_match_rate,1.0
2,expiration_date,cohens_kappa,0.777778
3,expiration_date,exact_match_rate,0.875
4,expiration_event,cohens_kappa,0.676768
5,expiration_event,exact_match_rate,0.75
6,family_address_name,cohens_kappa,1.0
7,family_address_name,exact_match_rate,1.0
8,family_city,cohens_kappa,1.0
9,family_city,exact_match_rate,1.0


In [7]:
# ===================== Hybrid QA Score =====================
# mean_kappa = df_summary[df_summary['metric']=="cohens_kappa"]["value"].mean()
# mean_exact = df_summary[df_summary['metric']=="exact_match_rate"]["value"].mean()

# # Simpler weight since no numeric or line items
# hybrid_score = 0.6 * mean_exact + 0.4 * mean_kappa

# def interpret(score):
#     if score >= 0.9: return "Excellent (≥ 90%)"
#     elif score >= 0.8: return "Good (80–89%)"
#     elif score >= 0.7: return "Moderate (70–79%)"
#     else: return "Poor (< 70%)"

# interpretation = interpret(hybrid_score)
# print("\n==================== CONSENT QA SUMMARY ====================")
# print(f"Hybrid QA Score: {hybrid_score:.3f} → {hybrid_score*100:.2f}%")
# print(f"Interpretation: {interpretation}")
# print("============================================================")

mean_kappa = df_summary[df_summary['metric']=="cohens_kappa"]["value"].mean()
mean_exact = df_summary[df_summary['metric']=="exact_match_rate"]["value"].mean()
hybrid_score = 0.6 * mean_exact + 0.4 * mean_kappa

def interpret(score):
    if score >= 0.9: return "Excellent (≥ 90%)"
    elif score >= 0.8: return "Good (80–89%)"
    elif score >= 0.7: return "Moderate (70–79%)"
    else: return "Poor (< 70%)"

print("\n==================== CONSENT QA SUMMARY ====================")
print(f"Hybrid QA Score: {hybrid_score:.3f} → {hybrid_score*100:.2f}%")
print(f"Interpretation: {interpret(hybrid_score)}")
print("============================================================")

# === Additional export: field-level TP/FP/FN + precision/recall/F1 ===
CRITICAL = {"patient_name","consent_type","provider_signature"}  # adjust if needed

def eq(a,b):
    a = "" if a is None else str(a)
    b = "" if b is None else str(b)
    return a == b

rows=[]
tp=fp=fn=0
crit_f1_scores=[]

for cid in overlap:
    ga = data_a[cid]; gb = data_b[cid]
    for f in FIELDS:
        A = ga.get(f); B = gb.get(f)
        if A is None and B is None:
            tpi,fpi,fni = 1,0,0
            prc,rec,f1 = 1.0,1.0,1.0
        elif eq(A,B):
            tpi,fpi,fni = 1,0,0
            prc,rec,f1 = 1.0,1.0,1.0
        else:
            tpi = 0
            fpi = 0 if (B is None or B=="") else 1
            fni = 0 if (A is None or A=="") else 1
            prc=rec=f1=0.0
        rows.append({"document_id": cid, "field": f, "tp": tpi, "fp": fpi, "fn": fni,
                     "precision": prc, "recall": rec, "f1": f1})
        tp += tpi; fp += fpi; fn += fni
        if f in CRITICAL: crit_f1_scores.append(f1)

micro_p = tp/(tp+fp) if (tp+fp) else 0.0
micro_r = tp/(tp+fn) if (tp+fn) else 0.0
micro_f1 = (2*micro_p*micro_r)/(micro_p+micro_r) if (micro_p+micro_r) else 0.0
crit_f1 = sum(crit_f1_scores)/len(crit_f1_scores) if crit_f1_scores else 0.0

df_f1 = pd.DataFrame(rows)
out_f1 = os.path.join(OUTPUT_DIR, "consent_benchmark_results_v0.1.csv")
df_f1.to_csv(out_f1, index=False)
print(f"Saved F1 table → {out_f1}")
print(f"Micro P/R/F1: {micro_p:.4f}/{micro_r:.4f}/{micro_f1:.4f}")
print(f"Critical fields avg F1: {crit_f1:.4f}")


# ===================== Disagreement Report =====================
def disagreement_report(overlap_ids, data_a, data_b, out_csv):
    rows = []
    for cid in overlap_ids:
        na, nb = data_a[cid], data_b[cid]
        for f in FIELDS:
            va, vb = na.get(f), nb.get(f)
            if va != vb:
                rows.append({"consent_id": cid, "field": f, "A": va, "B": vb})
    df = pd.DataFrame(rows)
    if not df.empty:
        df.to_csv(out_csv, index=False)
    return df

diff_path = os.path.join(OUTPUT_DIR, "disagreements_consent.csv")
df_diff = disagreement_report(overlap, data_a, data_b, diff_path)
# print("Disagreement rows:", len(df_diff))
# display(df_diff.head(25))
print("Disagreement rows:", 0 if df_diff is None else len(df_diff))
display(df_diff.head(25) if df_diff is not None else df_diff)


Hybrid QA Score: 0.944 → 94.37%
Interpretation: Excellent (≥ 90%)
Saved F1 table → C:\CALDARIUM\Caldarium\bench\outputs\consent_benchmark_results_v0.1.csv
Micro P/R/F1: 0.9843/0.9488/0.9662
Critical fields avg F1: 0.7500
Disagreement rows: 28


Unnamed: 0,consent_id,field,A,B
0,consent_T1_gen1,patient_first_name,courtney,
1,consent_T1_gen1,patient_last_name,wilcox,
2,consent_T1_gen2,patient_first_name,chelsey,
3,consent_T1_gen2,patient_last_name,moody,
4,consent_T1_gen3,patient_first_name,john,
5,consent_T1_gen3,patient_last_name,graham,
6,consent_T1_gen3,expiration_event,period of six months from date of signature,
7,consent_T1_gen4,patient_first_name,mary,
8,consent_T1_gen4,patient_last_name,hodge,
9,consent_T1_gen4,provider_fax,539.748.7873,539


In [8]:
import shutil
from google.colab import files

zip_path = os.path.join(OUTPUT_DIR, "consent_qa_results.zip")
shutil.make_archive(zip_path.replace(".zip", ""), "zip", OUTPUT_DIR)
files.download(zip_path)


ModuleNotFoundError: No module named 'google'