In [4]:
import os, json, re
from datetime import datetime
from collections import Counter
import pandas as pd

# ===================== DEBUG: Check current directory =====================
print("Current working directory:", os.getcwd())
print("Files in current directory:", os.listdir('.'))
print()
# ===================== Paths =====================
#ANN_A_DIR = "/content/data/ann_A"
# ANN_A_DIR = "json_intakes"
# #ANN_B_DIR = "/content/data/ann_B"
# ANN_B_DIR = "output_intake_forms"
# OUTPUT_DIR = "/bench/output"
# os.makedirs(OUTPUT_DIR, exist_ok=True)
ANN_A_DIR = os.path.join("..", "json_intakes")
ANN_B_DIR = os.path.join("..", "output_intake_forms")
OUTPUT_DIR = "bench/output"  # This stays in labeler_tools folder
os.makedirs(OUTPUT_DIR, exist_ok=True)

pat = re.compile(r"^(intake_T\d+_gen\d+)(?:_.*)?\.json$", re.IGNORECASE)


#pat = re.compile(r"^(intake_T\d+_gen\d+)\.json$", re.IGNORECASE)
pat = re.compile(r"^(intake_T\d+_gen\d+)(?:_.*)?\.json$", re.IGNORECASE)

# ===================== ID Extraction =====================
# def list_ids(folder):
#     ids, bad = [], []
#     for fn in os.listdir(folder):
#         if not fn.endswith(".json"): continue
#         m = pat.match(fn)
#         if m: ids.append(m.group(1))
#         else: bad.append(fn)
#     return set(ids), bad

def list_ids(folder):
    id_to_fn = {}
    bad = []
    for fn in os.listdir(folder):
        if not fn.endswith(".json"):
            continue
        m = pat.match(fn)
        if m:
            cid = m.group(1)          # canonical id: intake_T1_gen1
            id_to_fn[cid] = fn        # store the real filename
        else:
            bad.append(fn)
    return id_to_fn, bad

# ids_a, bad_a = list_ids(ANN_A_DIR)
# ids_b, bad_b = list_ids(ANN_B_DIR)

# overlap = sorted(ids_a & ids_b)
# missing_in_b = sorted(ids_a - ids_b)
# missing_in_a = sorted(ids_b - ids_a)

# print(f"A count: {len(ids_a)} | B count: {len(ids_b)} | Overlap pairs: {len(overlap)}")
# if bad_a: print("[WARN] A bad names:", bad_a)
# if bad_b: print("[WARN] B bad names:", bad_b)
# if missing_in_b: print("[WARN] Missing in B:", missing_in_b)
# if missing_in_a: print("[WARN] Missing in A:", missing_in_a)

ids_a, bad_a = list_ids(ANN_A_DIR)   # parser outputs
ids_b, bad_b = list_ids(ANN_B_DIR)   # labeler GT

overlap = sorted(set(ids_a.keys()) & set(ids_b.keys()))
missing_in_b = sorted(set(ids_a.keys()) - set(ids_b.keys()))
missing_in_a = sorted(set(ids_b.keys()) - set(ids_a.keys()))

print(f"A count: {len(ids_a)} | B count: {len(ids_b)} | Overlap pairs: {len(overlap)}")
if bad_a: print("[WARN] A bad names:", bad_a)
if bad_b: print("[WARN] B bad names:", bad_b)
if missing_in_b: print("[WARN] Missing in B:", missing_in_b)
if missing_in_a: print("[WARN] Missing in A:", missing_in_a)

# ===================== ID Extraction =====================
ids_a, bad_a = list_ids(ANN_A_DIR)      # dict: {"intake_T1_gen1": "intake_T1_gen1_hmgs_intake.json"}
ids_b, bad_b = list_ids(ANN_B_DIR)      # dict: {"intake_T1_gen1": "intake_T1_gen1.json"}

overlap = sorted(set(ids_a.keys()) & set(ids_b.keys()))
missing_in_b = sorted(set(ids_a.keys()) - set(ids_b.keys()))
missing_in_a = sorted(set(ids_b.keys()) - set(ids_a.keys()))

print(f"A count: {len(ids_a)} | B count: {len(ids_b)} | Overlap: {len(overlap)}")
print("Missing in A:", missing_in_a)
print("Missing in B:", missing_in_b)

# ===================== NOW ADD YOUR K-HYBRID LOOP HERE =====================
results = []   # example — store your disagreement rows here

for cid in overlap:
    # parser JSON (A)
    with open(os.path.join(ANN_A_DIR, ids_a[cid]), "r") as fa:
        a = json.load(fa)

    # labeler ground truth (B)
    with open(os.path.join(ANN_B_DIR, ids_b[cid]), "r") as fb:
        b = json.load(fb)

    # =====================================
    # ADD YOUR FIELD-BY-FIELD COMPARISON
    # =====================================

    # Example (replace with your real comparison function):
    for field in set(a.keys()) | set(b.keys()):
        va = a.get(field)
        vb = b.get(field)

        if va != vb:
            results.append({
                "consent_id": cid,
                "field": field,
                "A": va,
                "B": vb,
            })

# Convert to dataframe
df = pd.DataFrame(results)
df.to_csv(os.path.join(OUTPUT_DIR, "intake_k_test_results.csv"), index=False)


Current working directory: c:\CALDARIUM\Caldarium\labeler_tools
Files in current directory: ['bench', 'consent_json_converter.ipynb', 'consent_k_test.ipynb', 'intake_json_converter.ipynb', 'intake_k_test.ipynb', 'invoice_json_converter_v2.0.ipynb', 'invoice_k_test.ipynb']

A count: 4 | B count: 4 | Overlap pairs: 4
A count: 4 | B count: 4 | Overlap: 4
Missing in A: []
Missing in B: []


In [8]:
# # ===================== Config =====================
# DATE_FORMATS = ["%Y-%m-%d", "%m/%d/%Y", "%Y/%m/%d", "%m-%d-%Y", "%d-%b-%Y", "%b %d, %Y"]
# FNAME_PAT = re.compile(r"^(intake_T\d+_gen\d+)\.json$", re.IGNORECASE)

# FIELDS = [
#     "patient_name", "patient_dob", "patient_phone", "referral_name", "provider_name"]

# ===================== Config =====================
DATE_FORMATS = ["%Y-%m-%d", "%m/%d/%Y", "%Y/%m/%d", "%m-%d-%Y", "%d-%b-%Y", "%b %d, %Y"]
# Updated pattern to handle suffixes like _hmgs_intake
FNAME_PAT = re.compile(r"^(intake_T\d+_gen\d+)(?:_.*)?\.json$", re.IGNORECASE)

FIELDS = [
    "patient_name", "patient_dob", "patient_phone", "referral_name", "provider_name"]
# ===================== Helpers =====================
_ws = re.compile(r"\s+")
def norm_str(s):
    if s is None: return None
    s = str(s).strip().casefold()
    return _ws.sub(" ", s) or None

def norm_phone(s):
    if s is None: return None
    ds = re.sub(r"\D+", "", str(s))
    return ds or None

def parse_date(s):
    if s is None: return None
    s = str(s).strip()
    if not s: return None
    for fmt in DATE_FORMATS:
        try: return datetime.strptime(s, fmt).date()
        except: pass
    try: return datetime.fromisoformat(s).date()
    except: return None

def norm_date(s):
    d = parse_date(s)
    return d.isoformat() if d else None

# ===================== Record Normalization =====================
def normalize_record(j, forced_id=None):
    out = dict(j)
    for f in FIELDS:
        val = j.get(f)
        if "date" in f or "dob" in f:
            out[f] = norm_date(val)
        elif "phone" in f:
            out[f] = norm_phone(val)
        else:
            out[f] = norm_str(val)
    # force id consistency
    if forced_id:
        out["consent_id"] = forced_id
    return out

# ===================== Load =====================
def load_by_full_id(folder):
    data = {}
    for fn in os.listdir(folder):
        if not fn.endswith(".json"): continue
        m = FNAME_PAT.match(fn)
        if not m: continue
        key = m.group(1)
        with open(os.path.join(folder, fn), "r", encoding="utf-8") as f:
            j = json.load(f)
        data[key] = normalize_record(j, forced_id=key)
    return data

# ===================== Metrics =====================
def kappa(a,b):
    A = ["<MISSING>" if v is None else str(v) for v in a]
    B = ["<MISSING>" if v is None else str(v) for v in b]
    n = len(A)
    if n == 0: return None
    po = sum(1 for x,y in zip(A,B) if x==y) / n
    ca, cb = Counter(A), Counter(B)
    pe = sum((ca[k]/n)*(cb[k]/n) for k in set(ca)|set(cb))
    return 1.0 if pe == 1 else (po - pe) / (1 - pe)

def exact(a,b):
    n = len(a)
    return None if n==0 else sum(1 for x,y in zip(a,b) if x==y)/n

# ===================== Compare =====================
data_a = load_by_full_id(ANN_A_DIR)
data_b = load_by_full_id(ANN_B_DIR)

ids_a, ids_b = set(data_a), set(data_b)
overlap = sorted(ids_a & ids_b)
print(f"Comparing {len(overlap)} intake forms...")

summary = []
def collect(field):
    A = [data_a[i].get(field) for i in overlap]
    B = [data_b[i].get(field) for i in overlap]
    return A,B

for f in FIELDS:
    A,B = collect(f)
    summary += [
        {"field": f, "metric": "exact_match_rate", "value": exact(A,B)},
        {"field": f, "metric": "cohens_kappa",     "value": kappa(A,B)},
    ]

df_summary = pd.DataFrame(summary).sort_values(["field","metric"]).reset_index(drop=True)
out_path = os.path.join(OUTPUT_DIR, "qa_report_intake.csv")
df_summary.to_csv(out_path, index=False)

print("Saved:", out_path)
display(df_summary.head(30))

Comparing 4 intake forms...
Saved: bench/output\qa_report_intake.csv


Unnamed: 0,field,metric,value
0,patient_dob,cohens_kappa,1.0
1,patient_dob,exact_match_rate,1.0
2,patient_name,cohens_kappa,1.0
3,patient_name,exact_match_rate,1.0
4,patient_phone,cohens_kappa,0.428571
5,patient_phone,exact_match_rate,0.5
6,provider_name,cohens_kappa,0.692308
7,provider_name,exact_match_rate,0.75
8,referral_name,cohens_kappa,1.0
9,referral_name,exact_match_rate,1.0


In [6]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [9]:
# ===================== Hybrid QA Score =====================
mean_kappa = df_summary[df_summary['metric']=="cohens_kappa"]["value"].mean()
mean_exact = df_summary[df_summary['metric']=="exact_match_rate"]["value"].mean()

# Simpler weight since no numeric or line items
hybrid_score = 0.6 * mean_exact + 0.4 * mean_kappa

def interpret(score):
    if score >= 0.9: return "Excellent (≥ 90%)"
    elif score >= 0.8: return "Good (80–89%)"
    elif score >= 0.7: return "Moderate (70–79%)"
    else: return "Poor (< 70%)"

interpretation = interpret(hybrid_score)
print("\n==================== CONSENT QA SUMMARY ====================")
print(f"Hybrid QA Score: {hybrid_score:.3f} → {hybrid_score*100:.2f}%")
print(f"Interpretation: {interpretation}")
print("============================================================")

# ===================== Disagreement Report =====================
def disagreement_report(overlap_ids, data_a, data_b, out_csv):
    rows = []
    for cid in overlap_ids:
        na, nb = data_a[cid], data_b[cid]
        for f in FIELDS:
            va, vb = na.get(f), nb.get(f)
            if va != vb:
                rows.append({"intake_id": cid, "field": f, "A": va, "B": vb})
    df = pd.DataFrame(rows)
    if not df.empty:
        df.to_csv(out_csv, index=False)
    return df

diff_path = os.path.join(OUTPUT_DIR, "disagreements_consent.csv")
df_diff = disagreement_report(overlap, data_a, data_b, diff_path)
print("Disagreement rows:", len(df_diff))
display(df_diff.head(25))


Hybrid QA Score: 0.840 → 83.97%
Interpretation: Good (80–89%)
Disagreement rows: 3


Unnamed: 0,intake_id,field,A,B
0,intake_T1_gen1,patient_phone,8949753639,13525131229.0
1,intake_T2_gen2,patient_phone,18122428971,14528504965.0
2,intake_T2_gen2,provider_name,dr. holly porter,


In [10]:
import shutil
from google.colab import files

zip_path = os.path.join(OUTPUT_DIR, "consent_qa_results.zip")
shutil.make_archive(zip_path.replace(".zip", ""), "zip", OUTPUT_DIR)
files.download(zip_path)


ModuleNotFoundError: No module named 'google'