In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
REV Extractor — Proximity & Layout Aware (v2)
"""
from __future__ import annotations
import argparse, logging, re, math
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

# import pandas as pd
from tqdm import tqdm

import fitz
# from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
# from langchain_core.documents import Document

# Module-level logger; basicConfig sets INFO level and a timestamped message format.
LOG = logging.getLogger("rev_extractor_proximity_v2")
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

# A candidate REV value: one or two uppercase letters, or a hyphenated pair of
# 1-2 digit numbers (e.g. "1-0", "12-01").
REV_VALUE_RE = re.compile(r"^(?:[A-Z]{1,2}|\d{1,2}-\d{1,2})$")
# A REV label token: "rev" with an optional trailing period, case-insensitive.
REV_TOKEN_RE = re.compile(r"^rev\.?$", re.IGNORECASE)
# Words that add a score bonus when found near a candidate value (see score_candidates).
TITLE_ANCHORS = {"DWG", "DWG.", "DWGNO", "SHEET", "SCALE", "WEIGHT", "SIZE", "TITLE"}
# Words that, when found near a REV label, subtract from every candidate around that label.
REV_TABLE_HEADERS = {"REVISIONS", "DESCRIPTION", "EC", "DFT", "APPR", "APPD", "DATE"}

@dataclass
class Token:
    """One piece of text with its position, from the native text layer or an OCR engine."""
    text: str              # stripped text of the word / OCR fragment
    conf: Optional[float]  # OCR confidence; None for native-text tokens
    x: float               # x of the bounding-box center
    y: float               # y of the bounding-box center
    w: float               # bounding-box width
    h: float               # bounding-box height

@dataclass
class PageResult:
    """Tokens extracted from a single page by one engine."""
    tokens: List[Token]  # extracted tokens, in the order the engine returned them
    text: str            # all token texts joined with single spaces
    engine: str          # producer name: "native", "paddleocr" or "easyocr"

@dataclass
class RevHit:
    """The REV value chosen for one page of one PDF."""
    file: str             # PDF file name (process_pdf stores pdf_path.name)
    page: int             # 1-based page number
    value: str            # extracted REV value
    engine: str           # engine label returned by analyze_page (e.g. "native", "native_text", "paddleocr", "easyocr")
    score: float          # heuristic score of the winning candidate; higher is better
    context_snippet: str  # short text excerpt around the winning candidate

def _scalarize(v):
    """Coerce any non-scalar to a plain Python scalar or string."""
    try:
        import numpy as np
        if isinstance(v, np.ndarray):
            return ", ".join(map(str, v.flatten().tolist()))
        if isinstance(v, np.generic):
            try:
                return v.item()
            except Exception:
                return str(v)
    except Exception:
        pass
    if isinstance(v, (list, tuple, set)):
        return ", ".join(map(str, v))
    if isinstance(v, dict):
        return ", ".join(f"{k}={str(vv)}" for k,vv in v.items())
    if isinstance(v, (bytes, bytearray)):
        return v.decode("utf-8", errors="ignore")
    # convert numpy scalar types if present
    try:
        import numpy as np
        if isinstance(v, (np.integer, np.floating, np.bool_)):
            return v.item()
    except Exception:
        pass
    if isinstance(v, (str, int, float, bool)):
        return v
    return str(v)

def norm_val(v: Any) -> str:
    """
    Return ``v`` as a tidy string for comparisons.

    ``None`` becomes the empty string. Otherwise the value is converted with
    ``str``, non-breaking spaces become ordinary spaces, every run of whitespace
    is collapsed to one space, and leading/trailing whitespace is removed.
    """
    if v is None:
        return ""
    collapsed = re.sub(r"\s+", " ", str(v).replace("\u00A0", " "))
    return collapsed.strip()

def in_bottom_right(x: float, y: float, width: float, height: float) -> bool:
    """Tell whether (x, y) lies strictly right of 55% of the width and strictly below 60% of the height."""
    x_cut = width * 0.55
    y_cut = height * 0.60
    return x > x_cut and y > y_cut

def distance(a: Tuple[float, float], b: Tuple[float, float]) -> float:
    """Euclidean distance between the 2-D points ``a`` and ``b``."""
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return math.hypot(dx, dy)

def context_snippet_from_tokens(tokens: List[Token], center: Tuple[float, float], radius: float = 160, max_len: int = 50) -> str:
    """
    Build a short text excerpt from the tokens around ``center``.

    Args:
        tokens: tokens to search; each needs ``text``, ``x`` and ``y``.
        center: (x, y) point the excerpt is centered on.
        radius: tokens whose center is within this distance of ``center`` are included.
        max_len: maximum length of the returned string (default 50, the
            previously hard-coded cap).

    Returns:
        The texts of the nearby tokens, in input order, joined by single spaces,
        whitespace-collapsed and cut to ``max_len`` characters. Empty string if
        no token is in range.
    """
    close = [t.text for t in tokens if distance((t.x, t.y), center) <= radius]
    # Uses the module-level ``re``; the former function-local re-import was redundant.
    s = re.sub(r"\s+", " ", " ".join(close)).strip()
    return s[:max_len]

def get_native_tokens(pdf_path: Path, page_index0: int) -> PageResult:
    """
    Read the words of one page from the PDF's own text layer.

    Args:
        pdf_path: PDF file to open with PyMuPDF.
        page_index0: zero-based page index.

    Returns:
        A PageResult with engine "native". Each token carries the stripped word
        text, ``conf=None``, the center of the word box and its width/height;
        words that are empty after stripping are dropped.
    """
    collected: List[Token] = []
    with fitz.open(pdf_path) as doc:
        for word in doc[page_index0].get_text("words"):
            # Each entry starts with the box corners and the word text; any
            # further fields are ignored.
            left, top, right, bottom, raw, *_ = word
            cleaned = raw.strip()
            if cleaned:
                collected.append(
                    Token(
                        text=cleaned,
                        conf=None,
                        x=(left + right) / 2.0,
                        y=(top + bottom) / 2.0,
                        w=(right - left),
                        h=(bottom - top),
                    )
                )
    joined = " ".join(tok.text for tok in collected)
    return PageResult(tokens=collected, text=joined, engine="native")

class PaddleWrapper:
    """Thin adapter that runs PaddleOCR on an image and returns a PageResult."""

    def __init__(self):
        # Imported lazily so the module loads even when paddleocr is not installed.
        from paddleocr import PaddleOCR
        self.ocr = PaddleOCR(lang="en", use_angle_cls=True, show_log=False)

    def run(self, image_bgr):
        """
        OCR ``image_bgr`` and convert every detection into a Token.

        Returns a PageResult with engine "paddleocr". Detections whose text is
        empty after stripping are skipped. An image with no detected text
        yields an empty PageResult instead of raising.
        """
        result = self.ocr.ocr(image_bgr, cls=True)
        tokens: List[Token] = []
        lines: List[str] = []
        # PaddleOCR may return None, or a list holding None, when nothing is
        # detected; iterating that directly raised TypeError.
        for det in result or []:
            if not det:
                continue
            for (box, (txt, cf)) in det:
                txt_clean = txt.strip()
                if not txt_clean:
                    continue
                xs = [p[0] for p in box]
                ys = [p[1] for p in box]
                # box is a quadrilateral: the center is the mean of its 4 corners.
                cx, cy = sum(xs) / 4.0, sum(ys) / 4.0
                # Fall back to 1.0 for degenerate boxes so sizes are never zero.
                w = (max(xs) - min(xs)) or 1.0
                h = (max(ys) - min(ys)) or 1.0
                tokens.append(Token(text=txt_clean, conf=float(cf), x=cx, y=cy, w=w, h=h))
                lines.append(txt_clean)
        return PageResult(tokens=tokens, text=" ".join(lines), engine="paddleocr")

class EasyWrapper:
    """Thin adapter that runs EasyOCR on an image and returns a PageResult."""

    def __init__(self):
        # Imported lazily so the module loads even when easyocr is not installed.
        import easyocr
        self.reader = easyocr.Reader(["en"], gpu=False)

    def run(self, image_bgr):
        """
        OCR ``image_bgr`` and convert every detection into a Token.

        Returns a PageResult with engine "easyocr"; detections whose text is
        empty after stripping are skipped.
        """
        found: List[Token] = []
        for quad, raw_text, confidence in self.reader.readtext(image_bgr):
            cleaned = raw_text.strip()
            if cleaned:
                col = [corner[0] for corner in quad]
                row = [corner[1] for corner in quad]
                # Degenerate boxes get a size of 1.0 instead of 0.
                width = (max(col) - min(col)) or 1.0
                height = (max(row) - min(row)) or 1.0
                found.append(
                    Token(
                        text=cleaned,
                        conf=float(confidence),
                        x=sum(col) / 4.0,
                        y=sum(row) / 4.0,
                        w=width,
                        h=height,
                    )
                )
        return PageResult(tokens=found, text=" ".join(tok.text for tok in found), engine="easyocr")

def score_candidates(tokens: List[Token], page_w: float, page_h: float):
    """
    Pick the most plausible REV value near any REV label on the page.

    Every token matching REV_TOKEN_RE is a label. Tokens within 280 units of a
    label that match REV_VALUE_RE become candidates, scored as:

      * 1000 / distance to the label
      * +4 if on the label's line, +6 if to the right of the label
      * +5 if inside the bottom-right region (see in_bottom_right)
      * +1.5 per TITLE_ANCHORS token within 220 units of the candidate
      * +(conf - 0.5) * 2 when the token has an OCR confidence
      * -6 if any REV_TABLE_HEADERS word is within the label's neighborhood

    Candidates in the bottom-right region are preferred: if at least one
    exists, only those compete.

    Args:
        tokens: page tokens (native or OCR).
        page_w, page_h: page size in the same coordinate system as the tokens.

    Returns:
        ``(value, score, (x, y), context_snippet)`` for the best candidate, or
        ``None`` when there is no REV label or no candidate value.
    """
    rev_tokens = [t for t in tokens if REV_TOKEN_RE.match(norm_val(t.text))]
    if not rev_tokens:
        return None
    anchor_tokens = [t for t in tokens if norm_val(t.text).upper() in TITLE_ANCHORS]

    def nearby_anchor_bonus(center_xy, radius=220):
        return sum(1 for a in anchor_tokens if distance((a.x, a.y), center_xy) <= radius)

    cands = []
    for r in rev_tokens:
        neighborhood = [t for t in tokens if distance((t.x, t.y), (r.x, r.y)) <= 280]
        looks_like_revision_table = any(norm_val(n.text).upper() in REV_TABLE_HEADERS for n in neighborhood)

        for t in neighborhood:
            v = norm_val(t.text)
            if not REV_VALUE_RE.match(v):
                continue
            # Small epsilon avoids division by zero for coincident centers.
            d = distance((t.x, t.y), (r.x, r.y)) + 1e-3
            same_line = abs(t.y - r.y) <= max(r.h, t.h) * 0.8
            to_right = t.x > r.x
            base = 1000.0 / d
            if same_line:
                base += 4.0
            if to_right:
                base += 6.0
            if in_bottom_right(t.x, t.y, page_w, page_h):
                base += 5.0
            base += nearby_anchor_bonus((t.x, t.y)) * 1.5
            if t.conf is not None:
                base += (t.conf - 0.5) * 2.0
            # The former penalty for labels starting with "revision" was dead
            # code: labels are selected by REV_TOKEN_RE, which only accepts
            # "rev" / "rev.", so that test could never be true.
            if looks_like_revision_table:
                base -= 6.0

            cands.append((base, v, (t.x, t.y)))

    if not cands:
        return None

    br_cands = [c for c in cands if in_bottom_right(c[2][0], c[2][1], page_w, page_h)]
    pool = br_cands if br_cands else cands

    score, v, center = max(pool, key=lambda c: c[0])
    ctx = context_snippet_from_tokens(tokens, center, radius=160)
    return (v, score, center, ctx)

def rasterize_to_bgr(pdf_path: Path, page_index0: int, dpi: int):
    """
    Render one PDF page to a BGR image for OCR.

    Three conversion paths are tried in order; the first that succeeds wins:
      1. reshape the pixmap's raw samples with numpy (1, 3 or 4 components),
      2. encode the pixmap as PNG and decode it with cv2.imdecode,
      3. build a PIL image from the raw samples (only if PIL is importable).
    Errors inside each path are swallowed so the next path can be attempted.

    Args:
        pdf_path: PDF file to open with PyMuPDF.
        page_index0: zero-based page index.
        dpi: render resolution; the zoom factor is dpi / 72.

    Returns:
        ``(image_bgr, width, height)`` where width/height are the pixmap's pixel
        dimensions as floats.

    Raises:
        ValueError: if none of the paths produced an image.
    """
    import numpy as np, cv2, fitz
    try:
        from PIL import Image
    except Exception:
        # PIL is optional; without it fallback B is skipped.
        Image = None

    with fitz.open(pdf_path) as doc:
        page = doc[page_index0]
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        # try to request RGB directly (PyMuPDF may accept colorspace)
        try:
            pix = page.get_pixmap(matrix=mat, alpha=False, colorspace=fitz.csRGB)
        except TypeError:
            # older versions may not accept colorspace kw; try default
            pix = page.get_pixmap(matrix=mat, alpha=False)

        # fast path: raw samples -> numpy
        buf = getattr(pix, "samples", None)
        ncomps = getattr(pix, "n", None)
        try:
            if buf and ncomps:
                arr = np.frombuffer(buf, dtype=np.uint8)
                # validate expected size for common cases
                if ncomps == 3 and arr.size == int(pix.w) * int(pix.h) * 3:
                    img_rgb = arr.reshape((pix.h, pix.w, 3))
                    img_rgb = np.ascontiguousarray(img_rgb)
                    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
                if ncomps == 1 and arr.size == int(pix.w) * int(pix.h):
                    img_gray = arr.reshape((pix.h, pix.w))
                    img_gray = np.ascontiguousarray(img_gray)
                    img_bgr = cv2.cvtColor(img_gray, cv2.COLOR_GRAY2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
                if ncomps == 4 and arr.size == int(pix.w) * int(pix.h) * 4:
                    img_rgba = arr.reshape((pix.h, pix.w, 4))
                    img_rgba = np.ascontiguousarray(img_rgba)
                    img_bgr = cv2.cvtColor(img_rgba, cv2.COLOR_RGBA2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
        except Exception:
            # fall through to robust fallback
            pass

        # fallback A: ask for PNG bytes then decode (robust across colorspaces)
        try:
            png = None
            if hasattr(pix, "tobytes"):
                try:
                    png = pix.tobytes("png")
                except Exception:
                    png = None
            # Alternative PNG accessor, tried only when tobytes gave nothing.
            if not png and hasattr(pix, "getPNGData"):
                try:
                    png = pix.getPNGData()
                except Exception:
                    png = None
            if png:
                arr = np.frombuffer(png, dtype=np.uint8)
                img = cv2.imdecode(arr, cv2.IMREAD_COLOR)  # returns BGR or None
                if isinstance(img, np.ndarray):
                    return np.ascontiguousarray(img), float(pix.w), float(pix.h)
        except Exception:
            pass

        # fallback B: use PIL if available
        try:
            if Image is not None:
                # Map the component count to a PIL mode; unknown counts are treated as RGB.
                mode = None
                if ncomps == 1:
                    mode = "L"
                elif ncomps == 3:
                    mode = "RGB"
                elif ncomps == 4:
                    mode = "RGBA"
                else:
                    mode = "RGB"
                pil = Image.frombytes(mode, (pix.w, pix.h), pix.samples)
                arr = np.asarray(pil)
                if arr.ndim == 2:
                    img_bgr = cv2.cvtColor(arr, cv2.COLOR_GRAY2BGR)
                else:
                    if arr.shape[2] == 4:
                        img_bgr = cv2.cvtColor(arr, cv2.COLOR_RGBA2BGR)
                    else:
                        img_bgr = cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
                return np.ascontiguousarray(img_bgr), float(pix.w), float(pix.h)
        except Exception:
            pass

        # Every conversion path failed.
        raise ValueError(f"Unable to rasterize {pdf_path.name} p{page_index0+1} to a valid BGR ndarray")

# Shared OCR engine instances, keyed by wrapper class. Building an engine loads
# its models, so each one is created at most once per process, not per page.
_OCR_ENGINES = {}


def _get_ocr_engine(wrapper_cls):
    """Return the cached instance of ``wrapper_cls``, constructing it on first use."""
    engine = _OCR_ENGINES.get(wrapper_cls)
    if engine is None:
        engine = wrapper_cls()
        _OCR_ENGINES[wrapper_cls] = engine
    return engine


def analyze_page(pdf_path: Path, page_index0: int, dpi: int, use_paddle: bool, use_easy: bool):
    """
    Find the REV value on one page, trying progressively more expensive methods.

    Order of attempts:
      1. proximity scoring on native text tokens      -> engine "native"
      2. regex over the page's native text            -> engine "native_text" (fixed score 0.3)
      3. rasterize, then PaddleOCR + proximity scoring -> engine "paddleocr" (if ``use_paddle``)
      4. EasyOCR + proximity scoring                   -> engine "easyocr"   (if ``use_easy``)

    Rasterization or OCR failures are logged as warnings, never raised.

    Args:
        pdf_path: PDF file.
        page_index0: zero-based page index.
        dpi: render resolution for the OCR fallbacks.
        use_paddle: enable the PaddleOCR fallback.
        use_easy: enable the EasyOCR fallback.

    Returns:
        ``(engine, value, score, context)`` or ``None`` if nothing was found.
    """
    native = get_native_tokens(pdf_path, page_index0)
    with fitz.open(pdf_path) as doc:
        pw, ph = doc[page_index0].rect.width, doc[page_index0].rect.height
    if native.tokens:
        res = score_candidates(native.tokens, pw, ph)
        if res:
            v, score, _, ctx = res
            return ("native", v, score, ctx)
    if native.text:
        m = re.search(r"(?i)\brev(?:ision)?\b\s*[:#\-]?\s*([A-Za-z]{1,2}|\d{1,2}-\d{1,2})\b", native.text)
        if m:
            return ("native_text", norm_val(m.group(1)), 0.3, native.text[:50])

    # rasterize -> OCR fallbacks; guard rasterization errors
    try:
        image_bgr, iw, ih = rasterize_to_bgr(pdf_path, page_index0, dpi)
    except Exception as e:
        LOG.warning(f"Rasterization failed p{page_index0+1} {pdf_path.name}: {e}")
        return None

    # validate image before OCR
    import numpy as _np
    if not (isinstance(image_bgr, _np.ndarray) and image_bgr.ndim == 3 and image_bgr.shape[2] == 3 and image_bgr.dtype == _np.uint8):
        LOG.warning(f"Rasterized image invalid for OCR p{page_index0+1} {pdf_path.name}: shape={getattr(image_bgr,'shape',None)} dtype={getattr(image_bgr,'dtype',None)}")
        return None

    if use_paddle:
        try:
            # Reuse one engine across pages; construction errors are still
            # caught here and reported like any other OCR failure.
            padd = _get_ocr_engine(PaddleWrapper).run(image_bgr)
            res = score_candidates(padd.tokens, iw, ih)
            if res:
                v, score, _, ctx = res
                return ("paddleocr", v, score, ctx)
        except Exception as e:
            LOG.warning(f"PaddleOCR failed p{page_index0+1} {pdf_path.name}: {e}")
    if use_easy:
        try:
            easy = _get_ocr_engine(EasyWrapper).run(image_bgr)
            res = score_candidates(easy.tokens, iw, ih)
            if res:
                v, score, _, ctx = res
                return ("easyocr", v, score, ctx)
        except Exception as e:
            LOG.warning(f"EasyOCR failed p{page_index0+1} {pdf_path.name}: {e}")

    return None

def process_pdf(pdf_path: Path, dpi: int, use_paddle: bool, use_easy: bool) -> Dict[int, RevHit]:
    """
    Analyze every page of ``pdf_path`` and collect one RevHit per page that yields a value.

    Returns a dict keyed by 1-based page number; pages without a result are absent.
    """
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    found: Dict[int, RevHit] = {}
    for page_no in range(1, page_count + 1):
        outcome = analyze_page(pdf_path, page_no - 1, dpi, use_paddle, use_easy)
        if not outcome:
            continue
        engine, value, score, ctx = outcome
        previous = found.get(page_no)
        # Keep the higher-scoring hit if the page already has one.
        if previous is None or score > previous.score:
            found[page_no] = RevHit(
                file=pdf_path.name,
                page=page_no,
                value=value,
                engine=engine,
                score=score,
                context_snippet=ctx,
            )
    return found

def iter_pdfs(folder: Path) -> Iterable[Path]:
    """Yield every regular file directly inside ``folder`` whose extension is ".pdf" in any letter case."""
    # iterdir plus a lower-cased suffix check lists each file once, which
    # avoids duplicate matches on Windows.
    for entry in folder.iterdir():
        try:
            if not entry.is_file():
                continue
            if entry.suffix.lower() != ".pdf":
                continue
            yield entry
        except Exception:
            # Unreadable or otherwise odd filesystem entry: skip it.
            continue

def aggregate_rows(file_hits: Dict[int, RevHit]) -> List[Dict[str,Any]]:
    """
    Reduce a file's per-page hits to at most one output row.

    Returns an empty list when there are no hits; otherwise a one-element list
    holding the "file" and normalized "value" of the highest-scoring hit (the
    earliest one wins ties).
    """
    if not file_hits:
        return []

    remaining = iter(file_hits.values())
    top = next(remaining)
    top_score = getattr(top, 'score', 0)
    for hit in remaining:
        hit_score = getattr(hit, 'score', 0)
        # Strict comparison keeps the first of several equally good hits.
        if hit_score > top_score:
            top, top_score = hit, hit_score

    row = {"file": top.file, "value": norm_val(top.value)}
    return [row]

def run_pipeline(input_folder: Path, output_csv: Path, dpi: int, enable_paddle: bool, enable_easy: bool) -> List[Dict[str,Any]]:
    """
    Scan every PDF in ``input_folder`` and write one (file, value) row per PDF to ``output_csv``.

    Steps: list the PDFs, drop entries that resolve to the same path, extract
    the best REV value per file (empty value when nothing is found or the file
    fails), sanitize the rows to plain scalars, then write the CSV. A CSV write
    failure is logged as an error, not raised.

    Returns:
        The sanitized rows (dicts with "file" and "value").
    """
    def _as_text(obj):
        # str() first; fall back to repr() for objects whose __str__ raises.
        try:
            return str(obj)
        except Exception:
            return repr(obj)

    # defensive dedupe by resolved path (preserves order from iterdir)
    resolved_seen = set()
    pdf_list: List[Path] = []
    for candidate in list(iter_pdfs(input_folder)):
        try:
            key = candidate.resolve()
        except Exception:
            key = candidate
        if key not in resolved_seen:
            resolved_seen.add(key)
            pdf_list.append(candidate)

    if len(pdf_list) == 0:
        LOG.warning(f"No PDFs found in {input_folder}")

    collected: List[Dict[str,Any]] = []
    for pdf in tqdm(pdf_list, desc="Scanning PDFs"):
        try:
            rows = aggregate_rows(process_pdf(pdf, dpi, enable_paddle, enable_easy))
            if rows:
                collected.extend(rows)
            else:
                # ensure one row per file even with no hits
                collected.append({"file": pdf.name, "value": ""})
        except Exception as e:
            LOG.warning(f"Failed {pdf.name}: {e}")
            collected.append({"file": pdf.name, "value": ""})

    # --- sanitize all rows BEFORE writing CSV ---
    clean_rows = []
    for idx, raw_row in enumerate(collected):
        if not isinstance(raw_row, dict):
            LOG.warning(f"Row {idx} is not a dict: {raw_row}")
            raw_row = {"file": "", "value": str(raw_row)}
        clean: Dict[str, Any] = {}
        for column in ("file", "value"):
            try:
                clean[column] = _scalarize(raw_row.get(column, ""))
            except Exception:
                clean[column] = str(raw_row.get(column, ""))
        clean_rows.append(clean)

    # write CSV directly from sanitized rows (avoid pandas list-of-dicts issues)
    try:
        import csv
        output_csv.parent.mkdir(parents=True, exist_ok=True)
        written_names = set()
        with open(output_csv, 'w', newline='', encoding='utf-8-sig') as handle:
            out = csv.writer(handle)
            out.writerow(['file', 'value'])
            for entry in clean_rows:
                name_text = _as_text(entry.get('file', ''))
                value_text = _as_text(entry.get('value', ''))
                # Only the first row for a given file name is written.
                if name_text not in written_names:
                    written_names.add(name_text)
                    out.writerow([name_text, value_text])
        LOG.info(f"Wrote CSV directly to {output_csv.resolve()} with {len(written_names)} rows")
    except Exception as e:
        LOG.error(f"Failed to write CSV directly: {e}")

    return clean_rows

def parse_args(argv=None):
    """
    Parse command-line options.

    ``argv`` is the argument list to parse; ``None`` lets argparse read
    ``sys.argv``. Returns the argparse namespace with ``input_folder``,
    ``output``, ``dpi``, ``no_paddle`` and ``no_easy``.
    """
    parser = argparse.ArgumentParser(description="Extract REV values using proximity & OCR fallbacks (v2).")
    option_table = (
        (("input_folder",), {"type": Path}),
        (("-o", "--output"), {"type": Path, "default": Path("rev_results.csv")}),
        (("--dpi",), {"type": int, "default": 240}),
        (("--no-paddle",), {"action": "store_true"}),
        (("--no-easy",), {"action": "store_true"}),
    )
    for names, settings in option_table:
        parser.add_argument(*names, **settings)
    return parser.parse_args(argv)

def main(argv=None):
    """Command-line entry point: parse ``argv`` and run the pipeline, returning its rows."""
    opts = parse_args(argv)
    # The CLI exposes opt-out switches; the pipeline expects opt-in flags.
    engine_flags = {
        "enable_paddle": not opts.no_paddle,
        "enable_easy": not opts.no_easy,
    }
    return run_pipeline(opts.input_folder, opts.output, opts.dpi, **engine_flags)

def _in_notebook():
    try:
        from IPython import get_ipython
        return get_ipython() is not None
    except Exception:
        return False

# Run the CLI only when executed as a script; under an active IPython shell
# (e.g. a notebook) main() is not invoked automatically.
if __name__ == "__main__" and not _in_notebook():
    main()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
REV Extractor — Bottom-Right-First (Native → OCR fallback)
Outputs exactly one row per input PDF: file, value, engine
- Native vector text pass first; only files with no native hit fall back to OCR
- Strict bottom-right ROI prioritization (title block)
- 'OF' inside ROI mapped to 'EMPTY'
- Fix for 'L' beating '1-0': edge exclusion + neighborhood assembly + pattern-aware scoring
"""

from __future__ import annotations
import argparse, logging, re, math, csv
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import fitz  # PyMuPDF
from tqdm import tqdm

# Module-level logger; basicConfig sets INFO level and a timestamped message format.
LOG = logging.getLogger("rev_extractor_br_first")
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

# ----------------------------- Patterns & Constants -----------------------------

# Allowed REV value patterns:
# - one or two uppercase letters (e.g., A, AA)
# - hyphenated numeric pair with 1-2 digits on each side (e.g., 1-0, 12-01)
REV_VALUE_RE = re.compile(r"^(?:[A-Z]{1,2}|\d{1,2}-\d{1,2})$")

# A REV label token: "rev" with an optional trailing period, case-insensitive
REV_TOKEN_RE = re.compile(r"^rev\.?$", re.IGNORECASE)

# Weak anchors often present near title blocks; each one close to a candidate adds a score bonus
TITLE_ANCHORS = {"DWG", "DWG.", "DWGNO", "SHEET", "SCALE", "WEIGHT", "SIZE", "TITLE"}

# Headers around revision tables (down-weighted in global fallback)
REV_TABLE_HEADERS = {"REVISIONS", "DESCRIPTION", "EC", "DFT", "APPR", "APPD", "DATE", "CHKD", "DRAWN"}

# ROI defaults for bottom-right title block (can be adjusted via CLI),
# expressed as fractions of page width (X) and page height (Y)
DEFAULT_BR_X = 0.68
DEFAULT_BR_Y = 0.72

# Edge margin to exclude tokens too close to page borders (grid letters/numbers)
DEFAULT_EDGE_MARGIN = 0.018  # ~1.8% of page width/height (tune 0.015-0.025)

# Two-letter junk frequently seen in title blocks; 'OF' is deliberately not
# listed so it stays allowed (used to infer EMPTY)
DEFAULT_REV_2L_BLOCKLIST = {"EC", "DF", "DT", "AP", "ID", "NO", "IN", "ON", "BY"}

# ----------------------------- Data Structures ---------------------------------

@dataclass
class Token:
    """A text fragment and its bounding-box geometry, produced by native extraction or OCR."""
    text: str              # stripped text content
    conf: Optional[float]  # OCR confidence; native extraction stores None
    x: float               # bounding-box center, x
    y: float               # bounding-box center, y
    w: float               # bounding-box width
    h: float               # bounding-box height

@dataclass
class PageResult:
    """Everything one engine extracted from one page."""
    tokens: List[Token]  # tokens in the order the engine produced them
    text: str            # token texts joined by single spaces
    engine: str          # "native", "paddleocr" or "easyocr"

@dataclass
class RevHit:
    """A REV value found in a PDF, with where and how it was found."""
    file: str             # presumably the PDF file name - TODO confirm against the code that builds RevHit
    page: int             # page the value was found on (presumably 1-based - verify against caller)
    value: str            # extracted REV value
    engine: str           # name of the engine / pass that produced the value
    score: float          # heuristic score of the chosen candidate
    context_snippet: str  # short excerpt of text near the chosen candidate

# ----------------------------- Utilities ---------------------------------------

def _scalarize(v: Any):
    """Coerce any non-scalar to a plain Python scalar or string."""
    try:
        import numpy as np
        if isinstance(v, np.ndarray):
            return ", ".join(map(str, v.flatten().tolist()))
        if isinstance(v, np.generic):
            try:
                return v.item()
            except Exception:
                return str(v)
    except Exception:
        pass
    if isinstance(v, (list, tuple, set)):
        return ", ".join(map(str, v))
    if isinstance(v, dict):
        return ", ".join(f"{k}={str(vv)}" for k, vv in v.items())
    if isinstance(v, (bytes, bytearray)):
        return v.decode("utf-8", errors="ignore")
    try:
        import numpy as np
        if isinstance(v, (np.integer, np.floating, np.bool_)):
            return v.item()
    except Exception:
        pass
    if isinstance(v, (str, int, float, bool)):
        return v
    return str(v)

def norm_val(v: Any) -> str:
    """
    Canonical string form of a token for comparisons.

    ``None`` maps to "". Other values are stringified, non-breaking spaces are
    turned into spaces, whitespace runs collapse to a single space and the
    result is trimmed.
    """
    text = "" if v is None else str(v)
    text = text.replace("\u00A0", " ")
    return re.sub(r"\s+", " ", text).strip()

def in_bottom_right(x: float, y: float, width: float, height: float) -> bool:
    """Loose bottom-right test: x beyond 55% of the width and y beyond 60% of the height (both strict)."""
    if not (x > width * 0.55):
        return False
    return y > height * 0.60

def in_bottom_right_strict(x: float, y: float, width: float, height: float, brx: float, bry: float) -> bool:
    """
    Configurable bottom-right test.

    True when x is at or beyond ``brx`` of the page width and y is at or beyond
    ``bry`` of the page height (``brx`` / ``bry`` are fractions).
    """
    if not (x >= width * brx):
        return False
    return y >= height * bry

def is_far_from_edges(x: float, y: float, width: float, height: float, edge_margin: float) -> bool:
    """
    True when (x, y) lies strictly inside the page after trimming a border strip.

    The strip is ``edge_margin`` (a fraction) of the width on the left/right and
    of the height on the top/bottom; it is meant to drop border grid labels
    such as K/L/16.
    """
    margin_x = width * edge_margin
    margin_y = height * edge_margin
    inside_horizontally = margin_x < x < width - margin_x
    inside_vertically = margin_y < y < height - margin_y
    return inside_horizontally and inside_vertically

def distance(a: Tuple[float, float], b: Tuple[float, float]) -> float:
    """Straight-line (Euclidean) distance between two (x, y) points."""
    delta_x, delta_y = a[0] - b[0], a[1] - b[1]
    return math.hypot(delta_x, delta_y)

def context_snippet_from_tokens(tokens: List[Token], center: Tuple[float, float], radius: float = 160) -> str:
    """
    Return up to 80 characters of text taken from tokens near ``center``.

    Tokens whose center is within ``radius`` of ``center`` are kept in input
    order, joined by spaces, and whitespace runs are collapsed.
    """
    nearby: List[str] = []
    for tok in tokens:
        if distance((tok.x, tok.y), center) <= radius:
            nearby.append(tok.text)
    snippet = re.sub(r"\s+", " ", " ".join(nearby)).strip()
    return snippet[:80]

# ----------------------------- Native Tokenization ------------------------------

def get_native_tokens(pdf_path: Path, page_index0: int) -> PageResult:
    """
    Extract word tokens from the text layer of one PDF page.

    Args:
        pdf_path: PDF file to open with PyMuPDF.
        page_index0: zero-based page index.

    Returns:
        PageResult with engine "native"; every token has ``conf=None``, the
        center of its word box and the box width/height. Words that are blank
        after stripping are skipped.
    """
    words: List[Token] = []
    with fitz.open(pdf_path) as doc:
        for entry in doc[page_index0].get_text("words"):
            # Leading fields are the box corners and the text; the rest is unused.
            left, top, right, bottom, raw, *_ = entry
            stripped = raw.strip()
            if stripped:
                center_x = (left + right) / 2.0
                center_y = (top + bottom) / 2.0
                words.append(
                    Token(text=stripped, conf=None, x=center_x, y=center_y, w=(right - left), h=(bottom - top))
                )
    return PageResult(tokens=words, text=" ".join(tok.text for tok in words), engine="native")

# ----------------------------- OCR Wrappers ------------------------------------

class PaddleWrapper:
    """Adapter around PaddleOCR that yields results as a PageResult."""

    def __init__(self):
        # Lazy import: paddleocr is only needed when this fallback is used.
        from paddleocr import PaddleOCR
        self.ocr = PaddleOCR(lang="en", use_angle_cls=True, show_log=False)

    def run(self, image_bgr):
        """
        Run OCR on ``image_bgr`` and return a PageResult with engine "paddleocr".

        Each detection becomes a Token positioned at the mean of its four box
        corners. Blank detections are skipped. A page with no detected text
        produces an empty PageResult rather than an exception.
        """
        result = self.ocr.ocr(image_bgr, cls=True)
        tokens: List[Token] = []
        lines: List[str] = []
        # When nothing is detected PaddleOCR can return None or [None];
        # guard both so the caller gets an empty result instead of a TypeError.
        for det in result or []:
            if not det:
                continue
            for (box, (txt, cf)) in det:
                txt_clean = txt.strip()
                if not txt_clean:
                    continue
                xs = [p[0] for p in box]
                ys = [p[1] for p in box]
                cx, cy = sum(xs) / 4.0, sum(ys) / 4.0
                # Zero-size boxes are reported as 1.0 wide/high.
                w = (max(xs) - min(xs)) or 1.0
                h = (max(ys) - min(ys)) or 1.0
                tokens.append(Token(text=txt_clean, conf=float(cf), x=cx, y=cy, w=w, h=h))
                lines.append(txt_clean)
        return PageResult(tokens=tokens, text=" ".join(lines), engine="paddleocr")

class EasyWrapper:
    """Adapter around EasyOCR that yields results as a PageResult."""

    def __init__(self):
        # Lazy import: easyocr is only needed when this fallback is used.
        import easyocr
        self.reader = easyocr.Reader(["en"], gpu=False)

    def run(self, image_bgr):
        """
        Run OCR on ``image_bgr`` and return a PageResult with engine "easyocr".

        Each detection becomes a Token positioned at the mean of its four box
        corners; blank detections are skipped.
        """
        kept: List[Token] = []
        for quad, raw_text, confidence in self.reader.readtext(image_bgr):
            cleaned = raw_text.strip()
            if cleaned:
                col = [pt[0] for pt in quad]
                row = [pt[1] for pt in quad]
                # Zero-size boxes are reported as 1.0 wide/high.
                span_x = (max(col) - min(col)) or 1.0
                span_y = (max(row) - min(row)) or 1.0
                kept.append(
                    Token(text=cleaned, conf=float(confidence), x=sum(col) / 4.0, y=sum(row) / 4.0, w=span_x, h=span_y)
                )
        return PageResult(tokens=kept, text=" ".join(tok.text for tok in kept), engine="easyocr")

# ----------------------------- Candidate Assembly ------------------------------

def _sort_by_x(tokens: List[Token]) -> List[Token]:
    return sorted(tokens, key=lambda t: (t.y, t.x))

def assemble_inline_candidates(neighborhood: List[Token], line_tol: float = 0.85, gap_tol: float = 0.60) -> List[str]:
    """
    Build candidate strings by concatenating adjacent small tokens on the same line:
      "1" "-" "0" -> "1-0", "A" "A" -> "AA"

    Tokens are grouped into lines: a token joins the first existing line whose
    first token is within ``line_tol`` * max(height) vertically. Within a
    line (ordered by x), neighbouring tokens whose centers are at most
    ``gap_tol`` * average line height apart are glued into 2-grams and 3-grams.

    Args:
        neighborhood: tokens to combine; each needs ``text``, ``x``, ``y``, ``h``.
        line_tol: vertical tolerance as a fraction of token height.
        gap_tol: maximum center-to-center gap as a fraction of the line's mean height.

    Returns:
        De-duplicated strings in sorted order. The order used to come straight
        from a set, so it varied with string hashing between runs and could
        change which of two equally scored candidates won downstream.
    """
    if not neighborhood:
        return []

    by_lines: List[List[Token]] = []
    for t in _sort_by_x(neighborhood):
        for line in by_lines:
            anchor = line[0]
            if abs(t.y - anchor.y) <= max(anchor.h, t.h) * line_tol:
                line.append(t)
                break
        else:
            # No existing line is close enough vertically: start a new one.
            by_lines.append([t])

    cands = set()
    for line in by_lines:
        line = sorted(line, key=lambda t: t.x)
        avg_h = sum(t.h for t in line) / len(line)
        max_gap = avg_h * gap_tol
        texts = [norm_val(t.text) for t in line]
        xs = [t.x for t in line]
        # close[i] is True when tokens i and i+1 are near enough to be glued.
        close = [abs(xs[i + 1] - xs[i]) <= max_gap for i in range(len(line) - 1)]
        for i, glued in enumerate(close):
            if not glued:
                continue
            cands.add(texts[i] + texts[i + 1])  # 2-gram
            if i + 1 < len(close) and close[i + 1]:
                cands.add(texts[i] + texts[i + 1] + texts[i + 2])  # 3-gram
    return sorted(cands)

# ----------------------------- Scoring (ROI-first then global) -----------------

def _nearby_anchor_bonus(tokens_in_zone: List[Token], center_xy: Tuple[float, float], radius=220) -> int:
    """Count the tokens in ``tokens_in_zone`` that are TITLE_ANCHORS words and lie within ``radius`` of ``center_xy``."""
    hits = 0
    for tok in tokens_in_zone:
        if norm_val(tok.text).upper() not in TITLE_ANCHORS:
            continue
        if distance((tok.x, tok.y), center_xy) <= radius:
            hits += 1
    return hits

def score_candidates_bottom_right_first(
    tokens: List[Token], page_w: float, page_h: float,
    brx: float, bry: float, blocklist: Optional[set] = None,
    edge_margin: float = DEFAULT_EDGE_MARGIN
):
    """
    PASS A (strict, bottom-right only) with:
      - edge exclusion (filters page grid letters/numbers),
      - neighborhood assembly to recover '1-0' and 'AA',
      - pattern-aware scoring preferring N-N and double letters over single letters.

    Args:
        tokens: all tokens of the page.
        page_w, page_h: page size in the same coordinate system as the tokens.
        brx, bry: ROI thresholds as fractions of width/height (see in_bottom_right_strict).
        blocklist: values to reject (compared upper-cased); None means no blocklist.
        edge_margin: border strip, as a fraction of the page size, whose tokens are ignored.

    Returns (value, score, center, context) or None.
    None is returned when the ROI holds no tokens or no candidate value survives.
    """
    # Upper-case the blocklist once so membership tests below are case-insensitive.
    block = {t.upper() for t in (blocklist or set())}

    # ROI filter + edge exclusion
    br_tokens = [
        t for t in tokens
        if in_bottom_right_strict(t.x, t.y, page_w, page_h, brx, bry)
        and is_far_from_edges(t.x, t.y, page_w, page_h, edge_margin)
    ]
    if not br_tokens:
        return None

    # REV labels are searched only inside the ROI.
    br_rev_labels = [t for t in br_tokens if REV_TOKEN_RE.match(norm_val(t.text))]

    # Priority patterns
    def is_hyphen_code(s: str) -> bool:   # e.g., 1-0, 12-01
        return bool(re.fullmatch(r"\d{1,2}-\d{1,2}", s))
    def is_double_letter(s: str) -> bool: # AA, AB ...
        return bool(re.fullmatch(r"[A-Z]{2}", s))
    def is_single_letter(s: str) -> bool:
        return bool(re.fullmatch(r"[A-Z]", s))

    def base_score_for(v: str) -> float:
        # Pattern prior: hyphen codes >> double letters > single letters.
        if is_hyphen_code(v):   return 40.0
        if is_double_letter(v): return 14.0
        if is_single_letter(v): return 4.0
        # NOTE(review): both call sites below only pass strings that already
        # matched REV_VALUE_RE, so this fallback looks unreachable - confirm.
        return 8.0

    def neighborhood_around(cx: float, cy: float, radius: float = 300.0) -> List[Token]:
        # Only ROI tokens are considered neighbours.
        return [t for t in br_tokens if distance((t.x, t.y), (cx, cy)) <= radius]

    # Accumulates (score, value, center) across all labels / the fallback anchor.
    cands: List[Tuple[float, str, Tuple[float,float]]] = []

    def consider_token_or_assembled(ref_xy: Tuple[float,float], neigh: List[Token], label_token: Optional[Token]):
        # 1) Raw tokens
        for t in neigh:
            v = norm_val(t.text)
            if not REV_VALUE_RE.match(v):
                continue
            vu = v.upper()
            if vu in block:
                continue
            # Epsilon avoids division by zero when the token sits on the reference point.
            d = distance((t.x, t.y), ref_xy) + 1e-3
            score = base_score_for(v) + 1000.0 / d
            if label_token is not None:
                # Same-line and right-of-label bonuses apply only when a real label exists.
                if abs(t.y - label_token.y) <= max(label_token.h, t.h) * 0.8:
                    score += 6.0
                if t.x > label_token.x:
                    score += 8.0
            if in_bottom_right(t.x, t.y, page_w, page_h): score += 3.0
            score += _nearby_anchor_bonus(br_tokens, (t.x, t.y)) * 1.2
            cands.append((score, v, (t.x, t.y)))

        # 2) Assembled n-grams (recover 1-0, AA, etc.)
        assembled = assemble_inline_candidates(neigh, line_tol=0.85, gap_tol=0.60)
        for s in assembled:
            s_norm = norm_val(s)
            if not REV_VALUE_RE.match(s_norm):
                continue
            if s_norm.upper() in block:
                continue
            # Assembled strings carry no position of their own, so a fixed
            # distance of 30 stands in for proximity and ref_xy for the center.
            score = base_score_for(s_norm) + 1000.0 / 30.0  # proximity proxy
            if label_token is not None:
                score += 6.0
            cands.append((score, s_norm, ref_xy))

    if br_rev_labels:
        # Seed the search at every REV label found in the ROI.
        for r in br_rev_labels:
            neigh = neighborhood_around(r.x, r.y, radius=300.0)
            consider_token_or_assembled((r.x, r.y), neigh, r)
    else:
        # Approximate typical REV cell centroid
        anchor_xy = (page_w * 0.92, page_h * 0.90)
        neigh = neighborhood_around(anchor_xy[0], anchor_xy[1], radius=320.0)
        consider_token_or_assembled(anchor_xy, neigh, None)

    if not cands:
        return None

    # If any hyphen-code exists, demote lone single letters harshly
    any_hyphen = any(re.fullmatch(r"\d{1,2}-\d{1,2}", v) for _, v, _ in cands)
    if any_hyphen:
        cands = [(s - (6.0 if re.fullmatch(r"[A-Z]", v) else 0.0), v, xy) for (s, v, xy) in cands]

    # max() keeps the first of equally scored candidates, so insertion order breaks ties.
    best = max(cands, key=lambda c: c[0])
    score, v, center = best
    # The snippet is drawn from all page tokens, not only those in the ROI.
    ctx = context_snippet_from_tokens(tokens, center, radius=160)
    return (v, score, center, ctx)

def score_candidates_global(tokens: List[Token], page_w: float, page_h: float):
    """
    PASS B (fallback): Global, seeded by any REV label on the page.
    Retains down-weights for revision tables and bottom-right bonuses.

    For each token matching REV_TOKEN_RE, tokens within 280 units that match
    REV_VALUE_RE are scored as:

      * 1000 / distance to the label
      * +4 if on the label's line, +6 if to the right of the label
      * +5 if inside the loose bottom-right region (in_bottom_right)
      * +1.5 per TITLE_ANCHORS token within 220 units of the candidate
      * +(conf - 0.5) * 2 when the token has an OCR confidence
      * -6 if a REV_TABLE_HEADERS word is within the label's neighborhood

    If any candidate lies in the bottom-right region, only those compete.

    Returns:
        ``(value, score, (x, y), context_snippet)`` or ``None`` when the page
        has no REV label or no candidate value.
    """
    rev_tokens = [t for t in tokens if REV_TOKEN_RE.match(norm_val(t.text))]
    if not rev_tokens:
        return None
    anchor_tokens = [t for t in tokens if norm_val(t.text).upper() in TITLE_ANCHORS]

    def nearby_anchor_bonus(center_xy, radius=220):
        return sum(1 for a in anchor_tokens if distance((a.x, a.y), center_xy) <= radius)

    cands = []
    for r in rev_tokens:
        neighborhood = [t for t in tokens if distance((t.x, t.y), (r.x, r.y)) <= 280]
        looks_like_revision_table = any(norm_val(n.text).upper() in REV_TABLE_HEADERS for n in neighborhood)
        for t in neighborhood:
            v = norm_val(t.text)
            if not REV_VALUE_RE.match(v):
                continue
            # Epsilon avoids division by zero for coincident centers.
            d = distance((t.x, t.y), (r.x, r.y)) + 1e-3
            same_line = abs(t.y - r.y) <= max(r.h, t.h) * 0.8
            to_right = t.x > r.x
            base = 1000.0 / d
            if same_line:
                base += 4.0
            if to_right:
                base += 6.0
            if in_bottom_right(t.x, t.y, page_w, page_h):
                base += 5.0
            base += nearby_anchor_bonus((t.x, t.y)) * 1.5
            if t.conf is not None:
                base += (t.conf - 0.5) * 2.0
            # The earlier "-2 when the label starts with 'revision'" rule was
            # removed: labels come from REV_TOKEN_RE, which matches only
            # "rev" / "rev.", so the rule never applied.
            if looks_like_revision_table:
                base -= 6.0
            cands.append((base, v, (t.x, t.y)))

    if not cands:
        return None

    br_cands = [c for c in cands if in_bottom_right(c[2][0], c[2][1], page_w, page_h)]
    pool = br_cands if br_cands else cands
    score, v, center = max(pool, key=lambda c: c[0])
    ctx = context_snippet_from_tokens(tokens, center, radius=160)
    return (v, score, center, ctx)

# ----------------------------- Rasterization -----------------------------------

def rasterize_to_bgr(pdf_path: Path, page_index0: int, dpi: int):
    """
    Render one PDF page with PyMuPDF and return it as a BGR image array.

    Args:
        pdf_path: PDF file to open.
        page_index0: Zero-based page index.
        dpi: Target resolution; the page is scaled by dpi / 72.

    Returns:
        (img_bgr, width, height) where width/height are the pixmap's pixel
        dimensions converted to float.

    Raises:
        ValueError: if none of the three conversion strategies below produced an image.
    """
    # Imported at call time; a missing numpy/cv2 raises ImportError from this line.
    import numpy as np, cv2
    try:
        from PIL import Image
    except Exception:
        # Pillow is optional: without it the last-resort strategy is simply skipped.
        Image = None

    with fitz.open(pdf_path) as doc:
        page = doc[page_index0]
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        try:
            pix = page.get_pixmap(matrix=mat, alpha=False, colorspace=fitz.csRGB)
        except TypeError:
            # Retry without the colorspace keyword if get_pixmap rejects it.
            pix = page.get_pixmap(matrix=mat, alpha=False)

        buf = getattr(pix, "samples", None)
        ncomps = getattr(pix, "n", None)
        # Strategy 1: interpret the raw sample buffer directly. Each branch checks
        # that the buffer length matches width * height * components before reshaping.
        try:
            if buf and ncomps:
                arr = np.frombuffer(buf, dtype=np.uint8)
                if ncomps == 3 and arr.size == int(pix.w) * int(pix.h) * 3:
                    img_rgb = arr.reshape((pix.h, pix.w, 3))
                    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
                if ncomps == 1 and arr.size == int(pix.w) * int(pix.h):
                    img_gray = arr.reshape((pix.h, pix.w))
                    img_bgr = cv2.cvtColor(img_gray, cv2.COLOR_GRAY2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
                # NOTE(review): 4 components are treated as RGBA here — confirm a
                # 4-channel pixmap cannot be CMYK on this code path.
                if ncomps == 4 and arr.size == int(pix.w) * int(pix.h) * 4:
                    img_rgba = arr.reshape((pix.h, pix.w, 4))
                    img_bgr = cv2.cvtColor(img_rgba, cv2.COLOR_RGBA2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
        except Exception:
            # Any failure here falls through to the next strategy.
            pass

        # Strategy 2: encode the pixmap as PNG and let OpenCV decode it.
        try:
            png = None
            if hasattr(pix, "tobytes"):
                try:
                    png = pix.tobytes("png")
                except Exception:
                    png = None
            # Alternative accessor, tried only when tobytes is missing or failed.
            if not png and hasattr(pix, "getPNGData"):
                try:
                    png = pix.getPNGData()
                except Exception:
                    png = None
            if png:
                arr = np.frombuffer(png, dtype=np.uint8)
                img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
                if isinstance(img, np.ndarray):
                    return img, float(pix.w), float(pix.h)
        except Exception:
            pass

        # Strategy 3 (last resort): let Pillow interpret the sample buffer.
        try:
            if Image is not None:
                # Unknown component count is assumed to be RGB.
                mode = "RGB" if ncomps in (3, None) else ("L" if ncomps == 1 else "RGBA")
                pil = Image.frombytes(mode, (pix.w, pix.h), pix.samples)
                arr = np.asarray(pil)
                import cv2 as _cv2
                if arr.ndim == 2:
                    img_bgr = _cv2.cvtColor(arr, _cv2.COLOR_GRAY2BGR)
                else:
                    img_bgr = _cv2.cvtColor(arr, _cv2.COLOR_RGBA2BGR) if arr.shape[2] == 4 else _cv2.cvtColor(arr, _cv2.COLOR_RGB2BGR)
                return img_bgr, float(pix.w), float(pix.h)
        except Exception:
            pass

        # Every strategy failed (or was skipped): report which page could not be rendered.
        raise ValueError(f"Unable to rasterize {pdf_path.name} p{page_index0+1} to a valid BGR ndarray")

# ----------------------------- Page Analyzers ----------------------------------

def analyze_page_native(
    pdf_path: Path, page_index0: int, brx: float, bry: float, blocklist: set, edge_margin: float
) -> Optional[Tuple[str, str, float, str]]:
    """
    Look for a REV value in the native (vector) text of one page.

    Three strategies are tried in order and the first hit wins:
      1. 'native_br'   – strict bottom-right ROI scoring,
      2. 'native'      – page-wide scoring seeded by REV labels,
      3. 'native_text' – regex over the joined page text, fixed score 0.3.

    Returns (engine, value, score, context) or None when nothing matched.
    """
    page_data = get_native_tokens(pdf_path, page_index0)
    with fitz.open(pdf_path) as doc:
        rect = doc[page_index0].rect
        pw, ph = rect.width, rect.height

    words = page_data.tokens
    if words:
        # Pass A: strict bottom-right ROI only.
        roi_hit = score_candidates_bottom_right_first(words, pw, ph, brx, bry, blocklist, edge_margin=edge_margin)
        if roi_hit:
            value, score, _center, ctx = roi_hit
            return ("native_br", value, score, ctx)

        # Pass B: global fallback, reached only when Pass A had no hit.
        page_hit = score_candidates_global(words, pw, ph)
        if page_hit:
            value, score, _center, ctx = page_hit
            return ("native", value, score, ctx)

    # Pass C: plain-text pattern such as "REV: B" or "Revision 1-0".
    page_text = page_data.text
    if page_text:
        found = re.search(r"(?i)\brev(?:ision)?\b\s*[:#\-]?\s*([A-Za-z]{1,2}|\d{1,2}-\d{1,2})\b", page_text)
        if found:
            return ("native_text", norm_val(found.group(1)), 0.3, page_text[:80])

    return None

# Shared OCR engine instances, keyed by wrapper class. Building a wrapper is
# expensive, so each one is built once and reused for every page and file.
_OCR_ENGINE_CACHE = {}


def _get_cached_ocr_engine(factory):
    """Return the shared instance for `factory`, constructing it on first use only.

    Construction failures are not cached: the exception propagates and the next
    call tries again.
    """
    engine = _OCR_ENGINE_CACHE.get(factory)
    if engine is None:
        engine = factory()
        _OCR_ENGINE_CACHE[factory] = engine
    return engine


def analyze_page_ocr(
    pdf_path: Path, page_index0: int, dpi: int,
    use_paddle: bool, use_easy: bool,
    brx: float, bry: float, blocklist: set, edge_margin: float
) -> Optional[Tuple[str, str, float, str]]:
    """
    Rasterize one page and look for a REV value with the enabled OCR engines.

    PaddleOCR is tried first, then EasyOCR. For each engine the strict
    bottom-right pass runs before the global pass; the first hit is returned.
    A failure inside one engine is logged and the next engine is tried.

    Args:
        pdf_path: PDF file to analyse.
        page_index0: Zero-based page index.
        dpi: Rasterization resolution.
        use_paddle / use_easy: Enable the respective engine.
        brx, bry, blocklist, edge_margin: Forwarded to the bottom-right scorer.

    Returns:
        (engine, value, score, context) or None.
        Engines: 'paddleocr_br'/'paddleocr', 'easyocr_br'/'easyocr'.
    """
    try:
        image_bgr, iw, ih = rasterize_to_bgr(pdf_path, page_index0, dpi)
    except Exception as e:
        LOG.warning(f"Rasterization failed p{page_index0+1} {pdf_path.name}: {e}")
        return None

    import numpy as _np
    # OCR engines are only fed an HxWx3 uint8 array; anything else is rejected up front.
    if not (isinstance(image_bgr, _np.ndarray) and image_bgr.ndim == 3 and image_bgr.shape[2] == 3 and image_bgr.dtype == _np.uint8):
        LOG.warning(f"Rasterized image invalid for OCR p{page_index0+1} {pdf_path.name}: shape={getattr(image_bgr,'shape',None)} dtype={getattr(image_bgr,'dtype',None)}")
        return None

    # (enabled, engine tag, name used in log messages, lazy wrapper-class lookup).
    # The lambdas defer the class lookup into the try block below, so a missing
    # wrapper is handled like any other engine failure.
    engine_plan = (
        (use_paddle, "paddleocr", "PaddleOCR", lambda: PaddleWrapper),
        (use_easy, "easyocr", "EasyOCR", lambda: EasyWrapper),
    )
    for enabled, engine_tag, log_name, wrapper_class in engine_plan:
        if not enabled:
            continue
        try:
            # Reuse one engine per wrapper class instead of rebuilding it for every page.
            page = _get_cached_ocr_engine(wrapper_class()).run(image_bgr)
            res = score_candidates_bottom_right_first(page.tokens, iw, ih, brx, bry, blocklist, edge_margin=edge_margin)
            if res:
                v, score, _, ctx = res
                return (f"{engine_tag}_br", v, score, ctx)
            res = score_candidates_global(page.tokens, iw, ih)
            if res:
                v, score, _, ctx = res
                return (engine_tag, v, score, ctx)
        except Exception as e:
            LOG.warning(f"{log_name} failed p{page_index0+1} {pdf_path.name}: {e}")

    return None

# ----------------------------- File-Level Processing ---------------------------

def _normalize_output_value(v: str) -> str:
    """
    Prepare a detected REV value for output.

    The value is whitespace-normalized; if the result is exactly 'OF'
    (compared case-insensitively) the placeholder 'EMPTY' is returned instead.
    """
    cleaned = norm_val(v)
    return "EMPTY" if cleaned.upper() == "OF" else cleaned

def process_pdf_native(pdf_path: Path, brx: float, bry: float, blocklist: set, edge_margin: float) -> Optional[RevHit]:
    """
    Run the native-text analysis on every page of a PDF and keep the best hit.

    Returns the RevHit with the highest score (the earliest page wins ties),
    or None when no page produced a value.
    """
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    page_hits: List[RevHit] = []
    for index0 in range(page_count):
        outcome = analyze_page_native(pdf_path, index0, brx, bry, blocklist, edge_margin)
        if outcome:
            engine, value, score, ctx = outcome
            page_hits.append(
                RevHit(file=pdf_path.name, page=index0 + 1, value=value,
                       engine=engine, score=score, context_snippet=ctx)
            )

    return max(page_hits, key=lambda hit: hit.score) if page_hits else None

def process_pdf_ocr(pdf_path: Path, dpi: int, use_paddle: bool, use_easy: bool,
                    brx: float, bry: float, blocklist: set, edge_margin: float) -> Optional[RevHit]:
    """
    Run the OCR analysis on every page of a PDF and keep the best hit.

    Returns the RevHit with the highest score (the earliest page wins ties),
    or None when no page produced a value.
    """
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    page_hits: List[RevHit] = []
    for index0 in range(page_count):
        outcome = analyze_page_ocr(pdf_path, index0, dpi, use_paddle, use_easy, brx, bry, blocklist, edge_margin)
        if outcome:
            engine, value, score, ctx = outcome
            page_hits.append(
                RevHit(file=pdf_path.name, page=index0 + 1, value=value,
                       engine=engine, score=score, context_snippet=ctx)
            )

    return max(page_hits, key=lambda hit: hit.score) if page_hits else None

def iter_pdfs(folder: Path) -> Iterable[Path]:
    """
    Yield the PDF files directly inside `folder` in a stable, sorted order.

    Only regular files whose suffix is '.pdf' (case-insensitive) are yielded;
    sub-directories are not descended into. Entries that resolve to the same
    real path (e.g. symlinks) are yielded once. Entries that cannot be
    inspected are skipped and logged at debug level.

    Raises:
        OSError: if `folder` itself cannot be listed (raised on first iteration).
    """
    seen = set()
    # Path.iterdir() yields in arbitrary order; sorting makes the processing
    # order, and therefore the output row order, reproducible.
    for p in sorted(folder.iterdir()):
        try:
            if not (p.is_file() and p.suffix.lower() == ".pdf"):
                continue
            rp = p.resolve()
        except Exception as exc:
            # Best effort: one unreadable entry must not abort the whole scan.
            LOG.debug(f"Skipping {p}: {exc}")
            continue
        if rp in seen:
            continue
        seen.add(rp)
        yield p

# ----------------------------- Pipeline (Native → OCR) -------------------------

def run_pipeline(input_folder: Path, output_csv: Path, dpi: int,
                 enable_paddle: bool, enable_easy: bool,
                 brx: float, bry: float, rev_2l_blocklist: set,
                 edge_margin: float) -> List[Dict[str, Any]]:
    """
    Extract one REV value per PDF in `input_folder` and write the results to CSV.

    For each file the native-text pass runs first; OCR is attempted only when
    the native pass found nothing. Exactly one row is produced per file, with
    blank value/engine when nothing was found or the file failed.

    Args:
        input_folder: Folder scanned (non-recursively) for PDFs.
        output_csv: Destination CSV; parent folders are created as needed.
        dpi: Rasterization DPI for the OCR fallback.
        enable_paddle / enable_easy: Toggle the respective OCR engine.
        brx, bry: Bottom-right ROI ratios.
        rev_2l_blocklist: Two-letter tokens never accepted as a REV value in the ROI.
        edge_margin: Fraction of the page ignored near its borders.

    Returns:
        The list of row dicts (keys 'file', 'value', 'engine'). A CSV write
        failure is logged and does not prevent the rows from being returned.
    """
    # Imported locally: the import line at the top of this cell does not include
    # csv, so without this the writer below raised NameError, which the
    # surrounding except logged and swallowed — the CSV file was never written.
    import csv

    rows: List[Dict[str, Any]] = []

    pdfs = list(iter_pdfs(input_folder))
    if not pdfs:
        LOG.warning(f"No PDFs found in {input_folder}")

    for p in tqdm(pdfs, desc="Scanning PDFs"):
        try:
            # 1) Native pass (strict ROI, then global); a hit means the file is never OCR'd.
            best = process_pdf_native(p, brx, bry, rev_2l_blocklist, edge_margin)

            # 2) OCR fallback, only if native produced nothing.
            if not best:
                best = process_pdf_ocr(p, dpi, enable_paddle, enable_easy, brx, bry, rev_2l_blocklist, edge_margin)

            if best:
                rows.append({"file": p.name, "value": _normalize_output_value(best.value), "engine": best.engine})
            else:
                # 3) Neither pipeline produced a value → blank row.
                rows.append({"file": p.name, "value": "", "engine": ""})

        except Exception as e:
            # Keep the one-row-per-file guarantee even when a file blows up.
            LOG.warning(f"Failed {p.name}: {e}")
            rows.append({"file": p.name, "value": "", "engine": ""})

    # --- Write CSV (exactly one row per file, 3 columns) ---
    try:
        output_csv.parent.mkdir(parents=True, exist_ok=True)
        # utf-8-sig writes a BOM at the start of the file.
        with open(output_csv, 'w', newline='', encoding='utf-8-sig') as outf:
            writer = csv.writer(outf)
            writer.writerow(['file', 'value', 'engine'])
            for r in rows:
                writer.writerow([_scalarize(r.get(key, '')) for key in ('file', 'value', 'engine')])
        LOG.info(f"Wrote CSV to {output_csv.resolve()} with {len(rows)} rows")
    except Exception as e:
        LOG.error(f"Failed to write CSV: {e}")

    return rows

# ----------------------------- CLI --------------------------------------------

def parse_args(argv=None):
    """
    Parse the command-line options of the extractor.

    `argv` is the argument list to parse; None makes argparse read sys.argv.
    Returns the populated argparse.Namespace.
    """
    parser = argparse.ArgumentParser(description="Extract REV values (bottom-right-first).")

    # Input / output locations
    parser.add_argument("input_folder", type=Path)
    parser.add_argument("-o", "--output", type=Path, default=Path("rev_results.csv"))

    # OCR behaviour
    parser.add_argument("--dpi", type=int, default=240, help="OCR rasterization DPI")
    parser.add_argument("--no-paddle", action="store_true", help="Disable PaddleOCR")
    parser.add_argument("--no-easy", action="store_true", help="Disable EasyOCR")

    # Title-block geometry
    parser.add_argument(
        "--br-x", type=float, default=DEFAULT_BR_X,
        help="Bottom-right ROI X ratio (default 0.68)",
    )
    parser.add_argument(
        "--br-y", type=float, default=DEFAULT_BR_Y,
        help="Bottom-right ROI Y ratio (default 0.72)",
    )
    parser.add_argument(
        "--edge-margin", type=float, default=DEFAULT_EDGE_MARGIN,
        help="Fraction of page dims to ignore near edges (default 0.018)",
    )

    # Candidate filtering
    default_blocklist = ",".join(sorted(DEFAULT_REV_2L_BLOCKLIST))
    parser.add_argument(
        "--rev-2l-blocklist", type=str, default=default_blocklist,
        help="Comma-separated two-letter tokens to ignore as REV in ROI (OF is intentionally allowed).",
    )

    return parser.parse_args(argv)

def main(argv=None):
    """
    CLI entry point: parse the options and run the extraction pipeline.

    Returns the rows produced by run_pipeline.
    """
    opts = parse_args(argv)

    # Split the comma-separated blocklist, dropping blanks and upper-casing entries.
    blocklist = set()
    for piece in opts.rev_2l_blocklist.split(","):
        piece = piece.strip()
        if piece:
            blocklist.add(piece.upper())

    pipeline_kwargs = dict(
        input_folder=opts.input_folder,
        output_csv=opts.output,
        dpi=opts.dpi,
        enable_paddle=not opts.no_paddle,
        enable_easy=not opts.no_easy,
        brx=opts.br_x,
        bry=opts.br_y,
        rev_2l_blocklist=blocklist,
        edge_margin=opts.edge_margin,
    )
    return run_pipeline(**pipeline_kwargs)

def _in_notebook():
    try:
        from IPython import get_ipython
        return get_ipython() is not None
    except Exception:
        return False

# Script entry point: run the CLI only when this code executes as __main__ and no
# IPython shell is active, so running the cell in a notebook only defines the
# functions instead of trying to parse the kernel's command line.
if __name__ == "__main__" and not _in_notebook():
    main()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
REV Extractor — Bottom-Right-First (Native → OCR fallback)
Outputs exactly one row per input PDF: file, value, engine
- Native vector text pass first; only files with no native hit fall back to OCR
- Strict bottom-right ROI prioritization (title block)
- 'OF' inside ROI mapped to 'EMPTY'
- Fix for top-right leakage (e.g., 'DF'): ROI pass now returns 'OF' from bottom-right
  when REV cell is empty, preventing global fallback from selecting top-right tokens.
"""

from __future__ import annotations
import argparse, logging, re, math, csv
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import fitz  # PyMuPDF
from tqdm import tqdm

# Module-wide logger; basicConfig sets INFO level and a timestamped line format.
LOG = logging.getLogger("rev_extractor_br_first")
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

# ----------------------------- Patterns & Constants -----------------------------

# Allowed REV value patterns (the whole token must match):
# - 1–2 uppercase letters (e.g., A, AA, AB)
# - hyphenated numeric pair, 1–2 digits on each side (e.g., 1-0, 12-01)
REV_VALUE_RE = re.compile(r"^(?:[A-Z]{1,2}|\d{1,2}-\d{1,2})$")

# A REV label token: exactly "rev" or "rev." in any letter case
REV_TOKEN_RE = re.compile(r"^rev\.?$", re.IGNORECASE)

# Title-block words; each one found near a candidate raises that candidate's score
TITLE_ANCHORS = {"DWG", "DWG.", "DWGNO", "SHEET", "SCALE", "WEIGHT", "SIZE", "TITLE"}

# Revision-table column headers; a REV label with one of these nearby is
# down-weighted in the global fallback
REV_TABLE_HEADERS = {"REVISIONS", "DESCRIPTION", "EC", "DFT", "APPR", "APPD", "DATE", "CHKD", "DRAWN"}

# Strict bottom-right ROI, as fractions of page width/height: a token is inside
# when x >= width * DEFAULT_BR_X and y >= height * DEFAULT_BR_Y (adjustable via CLI)
DEFAULT_BR_X = 0.68
DEFAULT_BR_Y = 0.72

# Edge margin to exclude tokens too close to page borders (grid letters/numbers)
DEFAULT_EDGE_MARGIN = 0.018  # ~1.8% of page width/height (tune 0.015–0.025)

# Two-letter tokens never accepted as a REV value inside the ROI.
# 'OF' is deliberately not listed: it is kept as a candidate and later mapped to EMPTY.
DEFAULT_REV_2L_BLOCKLIST = {"EC", "DF", "DT", "AP", "ID", "NO", "IN", "ON", "BY"}

# ----------------------------- Data Structures ---------------------------------

@dataclass
class Token:
    """One recognized word together with its bounding-box geometry."""
    text: str              # stripped token text
    conf: Optional[float]  # OCR confidence; None for native PDF text
    x: float               # horizontal centre of the bounding box
    y: float               # vertical centre of the bounding box
    w: float               # bounding-box width
    h: float               # bounding-box height

@dataclass
class PageResult:
    """Everything one text source extracted from a single page."""
    tokens: List[Token]  # tokens in extraction order
    text: str            # token texts joined with single spaces
    engine: str          # producing source: "native", "paddleocr" or "easyocr"

@dataclass
class RevHit:
    """A REV value detected on one page of one file."""
    file: str             # PDF file name
    page: int             # page number of the hit
    value: str            # detected REV value
    engine: str           # engine/pass tag that produced the hit (e.g. "native_br")
    score: float          # heuristic score; higher is better
    context_snippet: str  # short text excerpt from around the hit

# ----------------------------- Utilities ---------------------------------------

def _scalarize(v: Any):
    """Coerce any non-scalar to a plain Python scalar or string."""
    try:
        import numpy as np
        if isinstance(v, np.ndarray):
            return ", ".join(map(str, v.flatten().tolist()))
        if isinstance(v, np.generic):
            try:
                return v.item()
            except Exception:
                return str(v)
    except Exception:
        pass
    if isinstance(v, (list, tuple, set)):
        return ", ".join(map(str, v))
    if isinstance(v, dict):
        return ", ".join(f"{k}={str(vv)}" for k, vv in v.items())
    if isinstance(v, (bytes, bytearray)):
        return v.decode("utf-8", errors="ignore")
    try:
        import numpy as np
        if isinstance(v, (np.integer, np.floating, np.bool_)):
            return v.item()
    except Exception:
        pass
    if isinstance(v, (str, int, float, bool)):
        return v
    return str(v)

def norm_val(v: Any) -> str:
    """
    Canonicalize token text for comparisons.

    None becomes ""; anything else is converted with str(), non-breaking spaces
    are turned into plain spaces, whitespace runs collapse to one space and the
    ends are trimmed.
    """
    if v is None:
        return ""
    spaced = str(v).replace("\u00A0", " ")
    return re.sub(r"\s+", " ", spaced).strip()

def in_bottom_right(x: float, y: float, width: float, height: float) -> bool:
    """Loose bottom-right test: True when (x, y) lies strictly beyond 55% of the width and 60% of the height."""
    min_x = width * 0.55
    min_y = height * 0.60
    return (x > min_x) and (y > min_y)

def in_bottom_right_strict(x: float, y: float, width: float, height: float, brx: float, bry: float) -> bool:
    """Configurable ROI test: True when (x, y) lies at or beyond the brx/bry fractions of the page size."""
    roi_left = width * brx
    roi_top = height * bry
    return (x >= roi_left) and (y >= roi_top)

def is_far_from_edges(x: float, y: float, width: float, height: float, edge_margin: float) -> bool:
    """
    True when (x, y) lies strictly inside the page after trimming a border of
    `edge_margin` (a fraction of each dimension) from all four sides. Used to
    drop border grid labels such as K/L/16.
    """
    margin_x = width * edge_margin
    margin_y = height * edge_margin
    inside_horizontally = margin_x < x < width - margin_x
    inside_vertically = margin_y < y < height - margin_y
    return inside_horizontally and inside_vertically

def distance(a: Tuple[float, float], b: Tuple[float, float]) -> float:
    """Euclidean distance between two (x, y) points."""
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return math.hypot(dx, dy)

def context_snippet_from_tokens(tokens: List[Token], center: Tuple[float, float], radius: float = 160) -> str:
    """
    Build a short context string from the tokens around a point.

    Texts of all tokens whose centre is within `radius` of `center` are joined
    in list order, whitespace is collapsed, and the result is cut to 80 characters.
    """
    nearby_texts = []
    for tok in tokens:
        if distance((tok.x, tok.y), center) <= radius:
            nearby_texts.append(tok.text)
    joined = re.sub(r"\s+", " ", " ".join(nearby_texts))
    return joined.strip()[:80]

# ----------------------------- Native Tokenization ------------------------------

def get_native_tokens(pdf_path: Path, page_index0: int) -> PageResult:
    """
    Read the embedded (vector) words of one page via PyMuPDF.

    Every non-blank word becomes a Token positioned at the centre of its
    bounding box, with conf=None. The returned PageResult carries the tokens,
    their texts joined by spaces, and engine="native".
    """
    tokens: List[Token] = []
    with fitz.open(pdf_path) as doc:
        for word in doc[page_index0].get_text("words"):
            # Each entry starts with the box corners and the word text; the rest is unused.
            x0, y0, x1, y1, raw_text = word[:5]
            text = raw_text.strip()
            if text:
                tokens.append(
                    Token(
                        text=text,
                        conf=None,
                        x=(x0 + x1) / 2.0,
                        y=(y0 + y1) / 2.0,
                        w=(x1 - x0),
                        h=(y1 - y0),
                    )
                )
    page_text = " ".join(tok.text for tok in tokens)
    return PageResult(tokens=tokens, text=page_text, engine="native")

# ----------------------------- OCR Wrappers ------------------------------------

class PaddleWrapper:
    """Adapter that runs PaddleOCR on an image and returns a PageResult."""

    def __init__(self):
        # Imported here so paddleocr is only required when this engine is used.
        from paddleocr import PaddleOCR
        self.ocr = PaddleOCR(lang="en", use_angle_cls=True, show_log=False)

    def run(self, image_bgr):
        """
        OCR `image_bgr` and convert every detection into a Token.

        Returns a PageResult with engine="paddleocr". An image without any
        detected text yields an empty PageResult instead of raising.
        """
        result = self.ocr.ocr(image_bgr, cls=True)
        tokens: List[Token] = []
        lines: List[str] = []
        # The result holds one entry per image; the whole result or an entry can
        # be None/empty when nothing was detected, so both are skipped here.
        for det in result or []:
            if not det:
                continue
            for (box, (txt, cf)) in det:
                txt_clean = txt.strip()
                if not txt_clean:
                    continue
                xs = [p[0] for p in box]; ys = [p[1] for p in box]
                # Box is a quadrilateral: centre = mean of its four corners.
                cx, cy = sum(xs)/4.0, sum(ys)/4.0
                # `or 1.0` keeps degenerate boxes from having zero size.
                w = (max(xs)-min(xs)) or 1.0; h = (max(ys)-min(ys)) or 1.0
                tokens.append(Token(text=txt_clean, conf=float(cf), x=cx, y=cy, w=w, h=h))
                lines.append(txt_clean)
        return PageResult(tokens=tokens, text=" ".join(lines), engine="paddleocr")

class EasyWrapper:
    """Adapter that runs EasyOCR (English, CPU) on an image and returns a PageResult."""

    def __init__(self):
        # Imported here so easyocr is only required when this engine is used.
        import easyocr
        self.reader = easyocr.Reader(["en"], gpu=False)

    def run(self, image_bgr):
        """OCR `image_bgr`; every non-blank detection becomes a Token centred on its box."""
        tokens: List[Token] = []
        for box, raw_text, confidence in self.reader.readtext(image_bgr):
            text = raw_text.strip()
            if not text:
                continue
            xs = [corner[0] for corner in box]
            ys = [corner[1] for corner in box]
            # Degenerate boxes get a nominal size of 1.0 instead of 0.
            width = (max(xs) - min(xs)) or 1.0
            height = (max(ys) - min(ys)) or 1.0
            tokens.append(
                Token(
                    text=text,
                    conf=float(confidence),
                    x=sum(xs) / 4.0,
                    y=sum(ys) / 4.0,
                    w=width,
                    h=height,
                )
            )
        page_text = " ".join(tok.text for tok in tokens)
        return PageResult(tokens=tokens, text=page_text, engine="easyocr")

# ----------------------------- Candidate Assembly ------------------------------

def _sort_by_x(tokens: List[Token]) -> List[Token]:
    return sorted(tokens, key=lambda t: (t.y, t.x))

def assemble_inline_candidates(neighborhood: List[Token], line_tol: float = 0.85, gap_tol: float = 0.60) -> List[str]:
    """
    Build candidate strings by concatenating adjacent small tokens on the same line:
      "1" "-" "0" -> "1-0", "A" "A" -> "AA"

    Args:
        neighborhood: Tokens to combine.
        line_tol: Two tokens share a line when their y difference is at most
            line_tol * max(height of the line's first token, height of the token).
        gap_tol: Neighbours are joined when their centre-to-centre x distance is
            at most gap_tol * average token height of the line.

    Returns:
        De-duplicated 2- and 3-token concatenations in a deterministic order
        (line by line, 2-grams before 3-grams, left to right).
    """
    if not neighborhood:
        return []

    # Group tokens into text lines; each token joins the first line it fits.
    text_lines: List[List[Token]] = []
    for tok in _sort_by_x(neighborhood):
        for line in text_lines:
            first = line[0]
            if abs(tok.y - first.y) <= max(first.h, tok.h) * line_tol:
                line.append(tok)
                break
        else:
            text_lines.append([tok])

    # A dict serves as an insertion-ordered set. A plain set would hand the
    # candidates back in hash order, which changes between interpreter runs and
    # would make downstream max() tie-breaks unpredictable.
    ordered: Dict[str, None] = {}
    for line in text_lines:
        row = sorted(line, key=lambda t: t.x)
        avg_h = sum(t.h for t in row) / len(row)
        max_gap = avg_h * gap_tol
        texts = [norm_val(t.text) for t in row]
        # close[i] is True when token i and token i+1 are near enough to join.
        close = [abs(right.x - left.x) <= max_gap for left, right in zip(row, row[1:])]
        # 2-grams
        for i, joinable in enumerate(close):
            if joinable:
                ordered.setdefault(texts[i] + texts[i + 1])
        # 3-grams
        for i in range(len(close) - 1):
            if close[i] and close[i + 1]:
                ordered.setdefault(texts[i] + texts[i + 1] + texts[i + 2])
    return list(ordered)

# ----------------------------- Scoring (ROI-first then global) -----------------

def _nearby_anchor_bonus(tokens_in_zone: List[Token], center_xy: Tuple[float, float], radius=220) -> int:
    """Count the TITLE_ANCHORS words among `tokens_in_zone` that lie within `radius` of `center_xy`."""
    count = 0
    for tok in tokens_in_zone:
        if norm_val(tok.text).upper() not in TITLE_ANCHORS:
            continue
        if distance((tok.x, tok.y), center_xy) <= radius:
            count += 1
    return count

def _return_of_if_present(br_tokens: List[Token], all_tokens: List[Token]) -> Optional[Tuple[str, float, Tuple[float,float], str]]:
    """
    Sentinel for an empty REV cell.

    Looks for the first token in `br_tokens` whose text is exactly 'OF'
    (case-insensitive). If found, returns ("OF", 0.05, center, context) with the
    context taken from `all_tokens`; the tiny score marks it as a last resort.
    Returns None when no such token exists.
    """
    of_token = next((tok for tok in br_tokens if norm_val(tok.text).upper() == "OF"), None)
    if of_token is None:
        return None
    center = (of_token.x, of_token.y)
    return ("OF", 0.05, center, context_snippet_from_tokens(all_tokens, center, radius=160))

def score_candidates_bottom_right_first(
    tokens: List[Token], page_w: float, page_h: float,
    brx: float, bry: float, blocklist: Optional[set] = None,
    edge_margin: float = DEFAULT_EDGE_MARGIN
):
    """
    PASS A (strict, bottom-right only) with:
      - edge exclusion (filters page grid letters/numbers),
      - neighborhood assembly to recover '1-0' and 'AA',
      - pattern-aware scoring preferring N-N and double letters over single letters,
      - an 'OF' sentinel: if no candidate was found but 'OF' exists in the ROI,
        'OF' is returned so the caller does not fall back to the global pass.

    Args:
        tokens: All tokens of the page.
        page_w, page_h: Page size in the same units as the token coordinates.
        brx, bry: ROI ratios; a token is in the ROI when x >= page_w * brx and
            y >= page_h * bry.
        blocklist: Values never accepted as a REV value (compared upper-cased).
        edge_margin: Fraction of the page dimensions ignored near each border.

    Returns (value, score, center, context) or None.
    """
    block = {t.upper() for t in (blocklist or set())}

    # ROI filter + edge exclusion
    br_tokens = [
        t for t in tokens
        if in_bottom_right_strict(t.x, t.y, page_w, page_h, brx, bry)
        and is_far_from_edges(t.x, t.y, page_w, page_h, edge_margin)
    ]
    if not br_tokens:
        return None

    # REV labels that themselves sit inside the ROI
    br_rev_labels = [t for t in br_tokens if REV_TOKEN_RE.match(norm_val(t.text))]

    # Priority patterns
    def is_hyphen_code(s: str) -> bool:   # e.g., 1-0, 12-01
        return bool(re.fullmatch(r"\d{1,2}-\d{1,2}", s))
    def is_double_letter(s: str) -> bool: # AA, AB ...
        return bool(re.fullmatch(r"[A-Z]{2}", s))
    def is_single_letter(s: str) -> bool:
        return bool(re.fullmatch(r"[A-Z]", s))

    def base_score_for(v: str) -> float:
        # Pattern prior: hyphen codes >> double letters > single letters.
        if is_hyphen_code(v):   return 40.0
        if is_double_letter(v): return 14.0
        if is_single_letter(v): return 4.0
        return 8.0

    def neighborhood_around(cx: float, cy: float, radius: float = 300.0) -> List[Token]:
        # Only ROI tokens are considered, never tokens from the rest of the page.
        return [t for t in br_tokens if distance((t.x, t.y), (cx, cy)) <= radius]

    # Accumulates (score, value, center) over all reference points.
    cands: List[Tuple[float, str, Tuple[float,float]]] = []

    def consider_token_or_assembled(ref_xy: Tuple[float,float], neigh: List[Token], label_token: Optional[Token]):
        # Appends to `cands`. `ref_xy` is the point distances are measured from;
        # `label_token` is the REV label being served, or None when no label exists.
        # 1) Raw tokens
        for t in neigh:
            v = norm_val(t.text)
            if not REV_VALUE_RE.match(v):
                continue
            vu = v.upper()
            if vu in block:
                continue
            # Epsilon avoids division by zero when a token sits exactly on ref_xy.
            d = distance((t.x, t.y), ref_xy) + 1e-3
            score = base_score_for(v) + 1000.0 / d
            if label_token is not None:
                # Same text line as the label
                if abs(t.y - label_token.y) <= max(label_token.h, t.h) * 0.8:
                    score += 6.0
                # To the right of the label
                if t.x > label_token.x:
                    score += 8.0
            if in_bottom_right(t.x, t.y, page_w, page_h): score += 3.0
            score += _nearby_anchor_bonus(br_tokens, (t.x, t.y)) * 1.2
            cands.append((score, v, (t.x, t.y)))

        # 2) Assembled n-grams (recover 1-0, AA, etc.)
        assembled = assemble_inline_candidates(neigh, line_tol=0.85, gap_tol=0.60)
        for s in assembled:
            s_norm = norm_val(s)
            if not REV_VALUE_RE.match(s_norm):
                continue
            if s_norm.upper() in block:
                continue
            # Assembled strings carry no position of their own, so a fixed
            # distance of 30 stands in for proximity and ref_xy for the centre.
            score = base_score_for(s_norm) + 1000.0 / 30.0  # proximity proxy
            if label_token is not None:
                score += 6.0
            cands.append((score, s_norm, ref_xy))

    if br_rev_labels:
        # Score the surroundings of every REV label found in the ROI.
        for r in br_rev_labels:
            neigh = neighborhood_around(r.x, r.y, radius=300.0)
            consider_token_or_assembled((r.x, r.y), neigh, r)
    else:
        # No label in the ROI: approximate the typical REV cell centroid
        anchor_xy = (page_w * 0.92, page_h * 0.90)
        neigh = neighborhood_around(anchor_xy[0], anchor_xy[1], radius=320.0)
        consider_token_or_assembled(anchor_xy, neigh, None)

    if not cands:
        # Last-resort ROI sentinel – if 'OF' is present in the ROI, return it to prevent global leakage
        of_hit = _return_of_if_present(br_tokens, tokens)
        if of_hit is not None:
            v, score, center, ctx = of_hit
            return (v, score, center, ctx)
        return None

    # If any hyphen-code exists, demote lone single letters harshly
    any_hyphen = any(re.fullmatch(r"\d{1,2}-\d{1,2}", v) for _, v, _ in cands)
    if any_hyphen:
        cands = [(s - (6.0 if re.fullmatch(r"[A-Z]", v) else 0.0), v, xy) for (s, v, xy) in cands]

    # Highest score wins; the context is taken from the whole page, not just the ROI.
    best = max(cands, key=lambda c: c[0])
    score, v, center = best
    ctx = context_snippet_from_tokens(tokens, center, radius=160)
    return (v, score, center, ctx)

def score_candidates_global(tokens: List[Token], page_w: float, page_h: float):
    """
    PASS B (fallback): score REV-value candidates around every REV label on the page.

    Each token within 280 units of a REV label that matches REV_VALUE_RE is scored by
    inverse distance to the label, plus bonuses (same line, right of the label,
    bottom-right quadrant, nearby title-block anchors, OCR confidence) and penalties
    (revision-table headers in the label's neighbourhood).

    Returns (value, score, center_xy, context_snippet) or None when the page has no
    REV label or no matching candidate. Bottom-right candidates, if any, are
    preferred over all others regardless of score.
    """
    anchors = [tok for tok in tokens if norm_val(tok.text).upper() in TITLE_ANCHORS]
    labels = [tok for tok in tokens if REV_TOKEN_RE.match(norm_val(tok.text))]
    if not labels:
        return None

    def count_anchors_near(point, radius=220):
        # Number of title-block anchor words within `radius` of `point`.
        hits = 0
        for anchor in anchors:
            if distance((anchor.x, anchor.y), point) <= radius:
                hits += 1
        return hits

    scored = []
    for label in labels:
        label_xy = (label.x, label.y)
        # NOTE(review): REV_TOKEN_RE only accepts "rev"/"rev.", so this flag looks
        # like it can never be True with the current pattern — confirm intent.
        spelled_out = norm_val(label.text).lower().startswith("revision")
        nearby = [tok for tok in tokens if distance((tok.x, tok.y), label_xy) <= 280]
        in_table = any(norm_val(tok.text).upper() in REV_TABLE_HEADERS for tok in nearby)

        for cand in nearby:
            value = norm_val(cand.text)
            if REV_VALUE_RE.match(value) is None:
                continue
            cand_xy = (cand.x, cand.y)
            # The small epsilon keeps the label itself (distance 0) from dividing by zero.
            score = 1000.0 / (distance(cand_xy, label_xy) + 1e-3)
            if abs(cand.y - label.y) <= max(label.h, cand.h) * 0.8:
                score += 4.0
            if cand.x > label.x:
                score += 6.0
            if in_bottom_right(cand.x, cand.y, page_w, page_h):
                score += 5.0
            score += count_anchors_near(cand_xy) * 1.5
            if cand.conf is not None:
                score += (cand.conf - 0.5) * 2.0
            if spelled_out:
                score -= 2.0
            if in_table:
                score -= 6.0
            scored.append((score, value, cand_xy))

    if not scored:
        return None

    preferred = [c for c in scored if in_bottom_right(c[2][0], c[2][1], page_w, page_h)]
    best_score, best_value, best_xy = max(preferred or scored, key=lambda c: c[0])
    snippet = context_snippet_from_tokens(tokens, best_xy, radius=160)
    return (best_value, best_score, best_xy, snippet)

# ----------------------------- Rasterization -----------------------------------

def rasterize_to_bgr(pdf_path: Path, page_index0: int, dpi: int):
    """
    Render one PDF page with PyMuPDF and return it as a BGR image array.

    Args:
        pdf_path: PDF file to open.
        page_index0: Zero-based page index.
        dpi: Target resolution; the page is scaled by dpi / 72.

    Returns:
        (img_bgr, width, height) where width/height are the pixmap's pixel
        dimensions converted to float.

    Raises:
        ValueError: if none of the three conversion strategies below produced an image.
    """
    # Imported at call time; a missing numpy/cv2 raises ImportError from this line.
    import numpy as np, cv2
    try:
        from PIL import Image
    except Exception:
        # Pillow is optional: without it the last-resort strategy is simply skipped.
        Image = None

    with fitz.open(pdf_path) as doc:
        page = doc[page_index0]
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        try:
            pix = page.get_pixmap(matrix=mat, alpha=False, colorspace=fitz.csRGB)
        except TypeError:
            # Retry without the colorspace keyword if get_pixmap rejects it.
            pix = page.get_pixmap(matrix=mat, alpha=False)

        buf = getattr(pix, "samples", None)
        ncomps = getattr(pix, "n", None)
        # Strategy 1: interpret the raw sample buffer directly. Each branch checks
        # that the buffer length matches width * height * components before reshaping.
        try:
            if buf and ncomps:
                arr = np.frombuffer(buf, dtype=np.uint8)
                if ncomps == 3 and arr.size == int(pix.w) * int(pix.h) * 3:
                    img_rgb = arr.reshape((pix.h, pix.w, 3))
                    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
                if ncomps == 1 and arr.size == int(pix.w) * int(pix.h):
                    img_gray = arr.reshape((pix.h, pix.w))
                    img_bgr = cv2.cvtColor(img_gray, cv2.COLOR_GRAY2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
                # NOTE(review): 4 components are treated as RGBA here — confirm a
                # 4-channel pixmap cannot be CMYK on this code path.
                if ncomps == 4 and arr.size == int(pix.w) * int(pix.h) * 4:
                    img_rgba = arr.reshape((pix.h, pix.w, 4))
                    img_bgr = cv2.cvtColor(img_rgba, cv2.COLOR_RGBA2BGR)
                    return img_bgr, float(pix.w), float(pix.h)
        except Exception:
            # Any failure here falls through to the next strategy.
            pass

        # Strategy 2: encode the pixmap as PNG and let OpenCV decode it.
        try:
            png = None
            if hasattr(pix, "tobytes"):
                try:
                    png = pix.tobytes("png")
                except Exception:
                    png = None
            # Alternative accessor, tried only when tobytes is missing or failed.
            if not png and hasattr(pix, "getPNGData"):
                try:
                    png = pix.getPNGData()
                except Exception:
                    png = None
            if png:
                arr = np.frombuffer(png, dtype=np.uint8)
                img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
                if isinstance(img, np.ndarray):
                    return img, float(pix.w), float(pix.h)
        except Exception:
            pass

        # Strategy 3 (last resort): let Pillow interpret the sample buffer.
        try:
            if Image is not None:
                # Unknown component count is assumed to be RGB.
                mode = "RGB" if ncomps in (3, None) else ("L" if ncomps == 1 else "RGBA")
                pil = Image.frombytes(mode, (pix.w, pix.h), pix.samples)
                arr = np.asarray(pil)
                import cv2 as _cv2
                if arr.ndim == 2:
                    img_bgr = _cv2.cvtColor(arr, _cv2.COLOR_GRAY2BGR)
                else:
                    img_bgr = _cv2.cvtColor(arr, _cv2.COLOR_RGBA2BGR) if arr.shape[2] == 4 else _cv2.cvtColor(arr, _cv2.COLOR_RGB2BGR)
                return img_bgr, float(pix.w), float(pix.h)
        except Exception:
            pass

        # Every strategy failed (or was skipped): report which page could not be rendered.
        raise ValueError(f"Unable to rasterize {pdf_path.name} p{page_index0+1} to a valid BGR ndarray")

# ----------------------------- Page Analyzers ----------------------------------

def analyze_page_native(
    pdf_path: Path, page_index0: int, brx: float, bry: float, blocklist: set, edge_margin: float
) -> Optional[Tuple[str, str, float, str]]:
    """
    Look for a REV value in the natively extracted content of one page.

    Three strategies run in order and the first hit wins:
      1. 'native_br'   - bottom-right ROI scoring over the native tokens
      2. 'native'      - global scoring over the native tokens
      3. 'native_text' - regex over the page text (fixed score 0.3, context is
                         the first 80 characters of the page text)

    Args:
        pdf_path: PDF file to read.
        page_index0: Zero-based page index.
        brx, bry, blocklist, edge_margin: Forwarded unchanged to
            score_candidates_bottom_right_first.

    Returns:
        (engine, value, score, context) or None
    """
    page_result = get_native_tokens(pdf_path, page_index0)

    # Page dimensions are read from the page rectangle and handed to the scorers.
    with fitz.open(pdf_path) as pdf:
        page_rect = pdf[page_index0].rect
        page_w, page_h = page_rect.width, page_rect.height

    # Token-based passes, strictest first; the global pass only runs when the
    # ROI pass produced nothing.
    token_passes = (
        ("native_br", lambda toks: score_candidates_bottom_right_first(
            toks, page_w, page_h, brx, bry, blocklist, edge_margin=edge_margin)),
        ("native", lambda toks: score_candidates_global(toks, page_w, page_h)),
    )
    for engine_name, scorer in token_passes:
        if not page_result.tokens:
            continue
        hit = scorer(page_result.tokens)
        if hit:
            value, hit_score, _, context = hit
            return (engine_name, value, hit_score, context)

    # Lightweight textual fallback if page text was extracted
    page_text = page_result.text
    if page_text:
        match = re.search(r"(?i)\brev(?:ision)?\b\s*[:#\-]?\s*([A-Za-z]{1,2}|\d{1,2}-\d{1,2})\b", page_text)
        if match:
            return ("native_text", norm_val(match.group(1)), 0.3, page_text[:80])

    return None

def analyze_page_ocr(
    pdf_path: Path, page_index0: int, dpi: int,
    use_paddle: bool, use_easy: bool,
    brx: float, bry: float, blocklist: set, edge_margin: float
) -> Optional[Tuple[str, str, float, str]]:
    """
    Rasterize one page and look for a REV value with the enabled OCR engines.

    PaddleOCR is tried before EasyOCR. For each engine the bottom-right ROI
    scorer runs first and the global scorer only if the ROI produced nothing.
    An engine that raises is logged and skipped so the next one can still run.
    When neither engine is enabled the page is not rasterized at all.

    Args:
        pdf_path: PDF file to read.
        page_index0: Zero-based page index.
        dpi: Resolution passed to rasterize_to_bgr.
        use_paddle: Enable the PaddleOCR engine.
        use_easy: Enable the EasyOCR engine.
        brx, bry, blocklist, edge_margin: Forwarded unchanged to
            score_candidates_bottom_right_first.

    Returns:
        (engine, value, score, context) or None
        Engines: 'paddleocr_br'/'paddleocr', 'easyocr_br'/'easyocr'
    """
    # With both engines off no branch below can produce a hit, so skip the
    # (expensive) rasterization instead of rendering the page for nothing.
    if not (use_paddle or use_easy):
        return None

    try:
        image_bgr, iw, ih = rasterize_to_bgr(pdf_path, page_index0, dpi)
    except Exception as e:
        LOG.warning(f"Rasterization failed p{page_index0+1} {pdf_path.name}: {e}")
        return None

    # Both OCR wrappers are only handed an HxWx3 uint8 array; anything else is rejected here.
    import numpy as _np
    if not (isinstance(image_bgr, _np.ndarray) and image_bgr.ndim == 3 and image_bgr.shape[2] == 3 and image_bgr.dtype == _np.uint8):
        LOG.warning(f"Rasterized image invalid for OCR p{page_index0+1} {pdf_path.name}: shape={getattr(image_bgr,'shape',None)} dtype={getattr(image_bgr,'dtype',None)}")
        return None

    def _first_hit(tokens, roi_engine, global_engine):
        # Shared ROI-then-global scoring used by both engines.
        res = score_candidates_bottom_right_first(tokens, iw, ih, brx, bry, blocklist, edge_margin=edge_margin)
        if res:
            v, score, _, ctx = res
            return (roi_engine, v, score, ctx)
        res = score_candidates_global(tokens, iw, ih)
        if res:
            v, score, _, ctx = res
            return (global_engine, v, score, ctx)
        return None

    if use_paddle:
        try:
            padd = PaddleWrapper().run(image_bgr)
            hit = _first_hit(padd.tokens, "paddleocr_br", "paddleocr")
            if hit:
                return hit
        except Exception as e:
            LOG.warning(f"PaddleOCR failed p{page_index0+1} {pdf_path.name}: {e}")

    if use_easy:
        try:
            easy = EasyWrapper().run(image_bgr)
            hit = _first_hit(easy.tokens, "easyocr_br", "easyocr")
            if hit:
                return hit
        except Exception as e:
            LOG.warning(f"EasyOCR failed p{page_index0+1} {pdf_path.name}: {e}")

    return None

# ----------------------------- File-Level Processing ---------------------------

def _normalize_output_value(v: str) -> str:
    """
    Map special cases:
    - 'OF' (exact, case-insensitive) -> 'EMPTY'
    - otherwise return normalized v
    """
    vu = norm_val(v).upper()
    if vu == "OF":
        return "EMPTY"
    return norm_val(v)

def process_pdf_native(pdf_path: Path, brx: float, bry: float, blocklist: set, edge_margin: float) -> Optional[RevHit]:
    """
    Run the native analyzer on every page of a PDF and keep the best hit.

    Each page that yields a result becomes a RevHit (1-based page number).
    The hit with the highest score is returned; on equal scores the earliest
    page wins. Returns None when no page produced a result.
    """
    with fitz.open(pdf_path) as pdf:
        page_count = len(pdf)

    best_hit: Optional[RevHit] = None
    for page_index0 in range(page_count):
        outcome = analyze_page_native(pdf_path, page_index0, brx, bry, blocklist, edge_margin)
        if not outcome:
            continue
        engine, value, score, ctx = outcome
        candidate = RevHit(file=pdf_path.name, page=page_index0 + 1, value=value,
                           engine=engine, score=score, context_snippet=ctx)
        # Strict '>' keeps the first page among equally scored hits.
        if best_hit is None or candidate.score > best_hit.score:
            best_hit = candidate
    return best_hit

def process_pdf_ocr(pdf_path: Path, dpi: int, use_paddle: bool, use_easy: bool,
                    brx: float, bry: float, blocklist: set, edge_margin: float) -> Optional[RevHit]:
    """
    Run the OCR analyzer on every page of a PDF and keep the best hit.

    Each page that yields a result becomes a RevHit (1-based page number).
    The hit with the highest score is returned; on equal scores the earliest
    page wins. Returns None when no page produced a result.
    """
    with fitz.open(pdf_path) as pdf:
        page_count = len(pdf)

    best_hit: Optional[RevHit] = None
    for page_index0 in range(page_count):
        outcome = analyze_page_ocr(pdf_path, page_index0, dpi, use_paddle, use_easy,
                                   brx, bry, blocklist, edge_margin)
        if not outcome:
            continue
        engine, value, score, ctx = outcome
        candidate = RevHit(file=pdf_path.name, page=page_index0 + 1, value=value,
                           engine=engine, score=score, context_snippet=ctx)
        # Strict '>' keeps the first page among equally scored hits.
        if best_hit is None or candidate.score > best_hit.score:
            best_hit = candidate
    return best_hit

def iter_pdfs(folder: Path) -> Iterable[Path]:
    """
    Yield the PDF files directly inside ``folder`` in sorted order.

    Only regular files whose suffix is ``.pdf`` (case-insensitive) are
    yielded; sub-directories are not descended into. Entries that resolve to
    the same real path (e.g. symlink aliases) are yielded once. Entries whose
    filesystem checks raise are skipped.

    Directory listing order is arbitrary, so entries are sorted first to make
    the resulting row order (and the alias kept on de-duplication) repeatable.

    Raises:
        OSError: propagated from ``folder.iterdir()`` on first iteration,
            e.g. when ``folder`` does not exist or is not a directory.
    """
    seen = set()
    for entry in sorted(folder.iterdir()):
        try:
            if not (entry.is_file() and entry.suffix.lower() == ".pdf"):
                continue
            resolved = entry.resolve()
        except Exception:
            # Best effort: an unreadable entry must not abort the whole scan.
            continue
        if resolved in seen:
            continue
        seen.add(resolved)
        # Yield outside the try so errors raised by the consumer are never swallowed here.
        yield entry

# ----------------------------- Pipeline (Native → OCR) -------------------------

def run_pipeline(input_folder: Path, output_csv: Path, dpi: int,
                 enable_paddle: bool, enable_easy: bool,
                 brx: float, bry: float, rev_2l_blocklist: set,
                 edge_margin: float) -> List[Dict[str, Any]]:
    """
    Scan every PDF in ``input_folder`` and write one CSV row per file.

    For each file the native pass runs first; OCR is attempted only when the
    native pass found nothing. A file with no hit, or one whose processing
    raised, gets a row with empty ``value`` and ``engine``.

    The CSV (columns: file, value, engine) is written to ``output_csv`` with
    the ``utf-8-sig`` encoding; parent directories are created as needed. A
    failure while writing is logged and does not prevent the rows from being
    returned.

    Returns:
        The list of row dicts, one per scanned PDF.
    """
    pdf_files = list(iter_pdfs(input_folder))
    if not pdf_files:
        LOG.warning(f"No PDFs found in {input_folder}")

    rows: List[Dict[str, Any]] = []
    for pdf in tqdm(pdf_files, desc="Scanning PDFs"):
        blank_row = {"file": pdf.name, "value": "", "engine": ""}
        try:
            # Native first; `or` only evaluates the OCR fallback when native found nothing.
            best = (
                process_pdf_native(pdf, brx, bry, rev_2l_blocklist, edge_margin)
                or process_pdf_ocr(pdf, dpi, enable_paddle, enable_easy,
                                   brx, bry, rev_2l_blocklist, edge_margin)
            )
            if best:
                row = {"file": pdf.name,
                       "value": _normalize_output_value(best.value),
                       "engine": best.engine}
            else:
                row = blank_row
        except Exception as e:
            LOG.warning(f"Failed {pdf.name}: {e}")
            row = blank_row
        rows.append(row)

    # --- Write CSV (exactly one row per file, 3 columns) ---
    columns = ['file', 'value', 'engine']
    try:
        output_csv.parent.mkdir(parents=True, exist_ok=True)
        with open(output_csv, 'w', newline='', encoding='utf-8-sig') as outf:
            writer = csv.writer(outf)
            writer.writerow(columns)
            for record in rows:
                writer.writerow([_scalarize(record.get(col, '')) for col in columns])
        LOG.info(f"Wrote CSV to {output_csv.resolve()} with {len(rows)} rows")
    except Exception as e:
        LOG.error(f"Failed to write CSV: {e}")

    return rows

# ----------------------------- CLI --------------------------------------------

def parse_args(argv=None):
    """
    Build the command-line parser and parse ``argv``.

    Args:
        argv: Argument list to parse; None lets argparse read sys.argv.

    Returns:
        argparse.Namespace with input_folder, output, dpi, no_paddle, no_easy,
        br_x, br_y, edge_margin and rev_2l_blocklist (a comma-separated string).
    """
    parser = argparse.ArgumentParser(description="Extract REV values (bottom-right-first).")
    parser.add_argument("input_folder", type=Path)
    parser.add_argument("-o", "--output", type=Path, default=Path("rev_results.csv"))
    parser.add_argument("--dpi", type=int, default=240, help="OCR rasterization DPI")

    # Switches that turn an OCR engine off.
    engine_switches = (
        ("--no-paddle", "Disable PaddleOCR"),
        ("--no-easy", "Disable EasyOCR"),
    )
    for flag, help_text in engine_switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    # Float-valued layout options, registered in the order they appear in --help.
    layout_options = (
        ("--br-x", DEFAULT_BR_X, "Bottom-right ROI X ratio (default 0.68)"),
        ("--br-y", DEFAULT_BR_Y, "Bottom-right ROI Y ratio (default 0.72)"),
        ("--edge-margin", DEFAULT_EDGE_MARGIN,
         "Fraction of page dims to ignore near edges (default 0.018)"),
    )
    for flag, default_value, help_text in layout_options:
        parser.add_argument(flag, type=float, default=default_value, help=help_text)

    default_blocklist = ",".join(sorted(DEFAULT_REV_2L_BLOCKLIST))
    parser.add_argument(
        "--rev-2l-blocklist",
        type=str,
        default=default_blocklist,
        help="Comma-separated two-letter tokens to ignore as REV in ROI (OF is intentionally allowed).",
    )
    return parser.parse_args(argv)

def main(argv=None):
    """
    Parse the command line and run the extraction pipeline.

    The ``--rev-2l-blocklist`` string is split on commas; each entry is
    stripped and upper-cased, and empty entries are dropped.

    Args:
        argv: Argument list forwarded to parse_args (None means sys.argv).

    Returns:
        Whatever run_pipeline returns.
    """
    opts = parse_args(argv)

    stripped_tokens = (token.strip() for token in opts.rev_2l_blocklist.split(","))
    blocklist = {token.upper() for token in stripped_tokens if token}

    pipeline_kwargs = dict(
        input_folder=opts.input_folder,
        output_csv=opts.output,
        dpi=opts.dpi,
        # CLI flags are negative ("--no-..."), pipeline switches are positive.
        enable_paddle=not opts.no_paddle,
        enable_easy=not opts.no_easy,
        brx=opts.br_x,
        bry=opts.br_y,
        rev_2l_blocklist=blocklist,
        edge_margin=opts.edge_margin,
    )
    return run_pipeline(**pipeline_kwargs)

def _in_notebook():
    try:
        from IPython import get_ipython
        return get_ipython() is not None
    except Exception:
        return False

# Auto-run the CLI only for plain script execution; when an IPython shell is
# active, main() is expected to be called explicitly with an argument list.
if __name__ == "__main__":
    if not _in_notebook():
        main()

In [None]:
# ----------------------------- Default Paths for Notebook ----------------------
# Placeholder: replace with the folder holding the PDFs to scan. Only files
# directly inside this folder are picked up (sub-folders are not searched).
INPUT_FOLDER = Path(r"INPUT FOLDER PATH")  # e.g., Path("./pdfs")
# Destination CSV (columns: file, value, engine); overwritten on every run.
OUTPUT_CSV   = Path("rev_results.csv")

In [None]:
# Native-only run: --no-paddle and --no-easy disable both OCR engines, so only
# natively extracted PDF content can yield a value; files without a native hit
# get a blank row. --dpi only affects OCR rasterization.
args = [str(INPUT_FOLDER), "-o", str(OUTPUT_CSV), "--dpi", "240", "--no-paddle", "--no-easy"]
main(args)

In [None]:
# Full run: native pass first, then OCR fallback (PaddleOCR, then EasyOCR) for
# files where the native pass found nothing. Writes to the same OUTPUT_CSV as
# the native-only cell, replacing that file's contents.
args = [str(INPUT_FOLDER), "-o", str(OUTPUT_CSV), "--dpi", "240"]
main(args)