# In [None]:

import sys
import os
import subprocess
import tempfile
import shutil
import fitz
from PIL import Image
import pytesseract
import math



def is_image_file(path):
    """Return True if *path* ends in a supported raster-image extension.

    The comparison is case-insensitive and looks only at the file
    extension; the file itself is never opened.
    """
    _, suffix = os.path.splitext(path)
    return suffix.lower() in ('.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp')

def is_pdf(path):
    """Return True if *path* has a ``.pdf`` extension (case-insensitive)."""
    _, suffix = os.path.splitext(path)
    return suffix.lower() == '.pdf'

def is_docx(path):
    """Return True if *path* has a ``.docx`` extension (case-insensitive)."""
    _, suffix = os.path.splitext(path)
    return suffix.lower() == '.docx'

def is_xlsx(path):
    """Return True if *path* has a ``.xlsx`` extension (case-insensitive)."""
    _, suffix = os.path.splitext(path)
    return suffix.lower() == '.xlsx'

def convert_to_pdf_with_libreoffice(src_path, out_dir):
    """
    Try to convert docx/xlsx to pdf using libreoffice (soffice) headless.

    Args:
        src_path: path of the document to convert.
        out_dir: directory passed to ``--outdir``; the PDF is expected there
            under the source's base name with a ``.pdf`` extension.

    Returns:
        Path to the converted pdf, or None when ``soffice`` cannot be
        started, exits with a non-zero status, or produced no output file.
    """
    cmd = [
        'soffice', '--headless', '--convert-to', 'pdf', '--outdir',
        out_dir, src_path
    ]
    try:
        # The tool's output is never inspected, so discard it instead of
        # buffering it in memory through pipes.
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: soffice missing / not executable; ValueError: malformed
        # path string; SubprocessError: non-zero exit status (check=True).
        return None

    pdf_name = os.path.splitext(os.path.basename(src_path))[0] + '.pdf'
    out_pdf = os.path.join(out_dir, pdf_name)
    # A zero exit status does not guarantee that a file was written.
    return out_pdf if os.path.exists(out_pdf) else None

def render_page_to_image(pdf_path, page_number, zoom=2):
    """
    Render a PDF page to a PIL image using PyMuPDF.

    Args:
        pdf_path: path of the PDF to open.
        page_number: 0-based page index handed to ``load_page``.
        zoom: scale factor applied on both axes; larger values give a
            bigger raster, which is intended to improve OCR accuracy.

    Returns:
        A PIL image in "RGB" mode, rendered without an alpha channel.
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(page_number)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        # The image is built before the document is closed (in ``finally``),
        # so the document is released even when rendering raises.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        doc.close()

def _ocr_int(value, default=-1):
    """Coerce an OCR numeric field (int, float or numeric string) to int."""
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return default


def image_to_ocr_data(pil_image):
    """
    Run pytesseract ``image_to_data`` on an image and return one dict per row.

    Each dict contains: text (stripped), left, top, width, height, conf,
    page_num, block_num, par_num, line_num, word_num.

    Rows with empty text are kept. ``conf`` is truncated to an int and is -1
    when the reported value is -1 or cannot be parsed as a number; it may
    arrive as an int, a float, or a numeric string such as '96.18', so it is
    parsed through float first.
    """
    raw = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
    data = []
    for i, raw_text in enumerate(raw['text']):
        data.append({
            'text': str(raw_text).strip(),
            'left': int(raw['left'][i]),
            'top': int(raw['top'][i]),
            'width': int(raw['width'][i]),
            'height': int(raw['height'][i]),
            'conf': _ocr_int(raw['conf'][i]),
            # line_num alone is not unique on a page: keep the enclosing
            # page/block/paragraph numbers so lines can be told apart.
            'page_num': int(raw['page_num'][i]),
            'block_num': int(raw['block_num'][i]),
            'par_num': int(raw['par_num'][i]),
            'line_num': int(raw['line_num'][i]),
            'word_num': int(raw['word_num'][i]),
        })
    return data

def find_matches_in_ocr_data(ocr_data, search_text):
    """
    Given OCR token list for a page (in reading order), find occurrences of
    search_text (case-insensitive).

    Strategy:
      - Drop tokens whose text is empty; they would only inject stray spaces.
      - Group the remaining tokens by line. The key is
        (page_num, block_num, par_num, line_num); the first three default
        to 0 when a token does not carry them, so plain line_num-only
        tokens still work.
      - For each line, join the lowercased token texts with single spaces
        and remember which token every character came from.
      - Find every (possibly overlapping) substring match and map it back
        to the tokens it touches; the box is the union of those tokens.

    Runs of whitespace in search_text are collapsed to a single space, which
    is how neighbouring tokens are joined. A match that starts or ends in
    the middle of a token still yields the whole token's box.

    Returns list of bounding boxes [ (left, top, right, bottom) ... ] in
    image pixel coordinates. Empty or whitespace-only search_text gives [].
    """
    needle = " ".join(search_text.lower().split())
    if not needle:
        return []

    # Plain dicts preserve insertion order, so lines stay in reading order.
    lines = {}
    for tok in ocr_data:
        text = tok['text'].strip()
        if not text:
            continue
        key = (
            tok.get('page_num', 0),
            tok.get('block_num', 0),
            tok.get('par_num', 0),
            tok['line_num'],
        )
        lines.setdefault(key, []).append((text.lower(), tok))

    matches = []
    for entries in lines.values():
        # Build the searchable line and a parallel char -> token index map.
        # The map is built from the lowercased text so both stay aligned
        # even if lowercasing changes a token's length.
        char_to_token = []
        for ti, (lowered, _) in enumerate(entries):
            if ti > 0:
                char_to_token.append(None)  # the joining space
            char_to_token.extend([ti] * len(lowered))
        line_str = " ".join(lowered for lowered, _ in entries)

        start_idx = 0
        while True:
            found = line_str.find(needle, start_idx)
            if found == -1:
                break
            start_idx = found + 1  # allow overlapping occurrences
            touched = {
                ti for ti in char_to_token[found:found + len(needle)]
                if ti is not None
            }
            hit = [entries[ti][1] for ti in sorted(touched)]
            matches.append((
                min(t['left'] for t in hit),
                min(t['top'] for t in hit),
                max(t['left'] + t['width'] for t in hit),
                max(t['top'] + t['height'] for t in hit),
            ))
    return matches

def add_rectangles_to_pdf(input_pdf_path, matches_per_page, out_pdf_path, zoom=2):
    """
    Add unfilled red rectangles to the PDF pages using PyMuPDF.

    Args:
        input_pdf_path: PDF to annotate.
        matches_per_page: dict page_idx -> list of (left, top, right, bottom)
            boxes in image pixel coords, measured on a page image rendered
            with the same ``zoom``.
        out_pdf_path: where the annotated PDF is written. It may be the same
            file as ``input_pdf_path``.
        zoom: render scale used for the boxes; image px = pdf points * zoom,
            so boxes are divided by it to get back to PDF points.

    When input and output are the same file, the result is written to a
    temporary file next to it and then moved into place, because PyMuPDF
    only allows incremental saves onto the file it has open and those
    cannot be combined with ``garbage``.
    """
    scale = 1.0 / zoom

    out_abs = os.path.abspath(out_pdf_path)
    in_place = os.path.abspath(input_pdf_path) == out_abs or (
        os.path.exists(input_pdf_path)
        and os.path.exists(out_abs)
        and os.path.samefile(input_pdf_path, out_abs)
    )
    if in_place:
        fd, save_path = tempfile.mkstemp(suffix=".pdf", dir=os.path.dirname(out_abs))
        os.close(fd)
    else:
        save_path = out_pdf_path

    try:
        doc = fitz.open(input_pdf_path)
        try:
            for page_idx, page in enumerate(doc):
                page_matches = matches_per_page.get(page_idx)
                if not page_matches:
                    continue
                # Boxes come from the rendered (rotated) view of the page;
                # annotation rectangles are given in unrotated page
                # coordinates. The matrix is the identity when the page is
                # not rotated.
                derotate = page.derotation_matrix
                for (l, t, r, b) in page_matches:
                    rect = fitz.Rect(l * scale, t * scale, r * scale, b * scale) * derotate
                    annot = page.add_rect_annot(rect)
                    annot.set_colors(stroke=(1, 0, 0))  # red border, no fill
                    annot.set_border(width=1)  # 1pt border
                    annot.update()  # apply
            doc.save(save_path, garbage=4, deflate=True)
        finally:
            # Close before replacing the original so the file is not held open.
            doc.close()
        if in_place:
            # mkstemp creates the file with restrictive permissions; carry
            # over the mode of the file being replaced.
            shutil.copymode(out_abs, save_path)
            os.replace(save_path, out_abs)
    finally:
        # Only still present here when something failed before the replace.
        if in_place and os.path.exists(save_path):
            os.remove(save_path)


def _write_text_search_report(input_path, search_text, out_dir, basename):
    """Search a docx/xlsx for search_text directly and write a plain-text report."""
    needle = search_text.lower()
    hits = []
    if is_docx(input_path):
        try:
            from docx import Document
            document = Document(input_path)
            for p_idx, para in enumerate(document.paragraphs, start=1):
                if needle in para.text.lower():
                    hits.append((p_idx, para.text.strip()))
        except Exception as exc:
            # Best effort: python-docx may be missing or the file unreadable.
            print(f"Warning: could not search docx text: {exc}")
    if is_xlsx(input_path):
        try:
            import openpyxl
            wb = openpyxl.load_workbook(input_path, read_only=True, data_only=True)
            try:
                for sheet in wb.worksheets:
                    for row in sheet.iter_rows(values_only=True):
                        for cell in row:
                            if cell and isinstance(cell, str) and needle in cell.lower():
                                hits.append((sheet.title, cell.strip()))
            finally:
                wb.close()
        except Exception as exc:
            # Best effort: openpyxl may be missing or the file unreadable.
            print(f"Warning: could not search xlsx text: {exc}")

    report_path = os.path.join(out_dir, f"{basename}_text_search_report.txt")
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(f"Search report for '{search_text}' in {input_path}\n\n")
        if hits:
            f.writelines(str(h) + "\n" for h in hits)
        else:
            f.write("No hits found or conversion unavailable.\n")
    return report_path


def process_file(input_path, search_text, out_dir=None, zoom=3):
    """
    Find search_text in a document via OCR and draw boxes around the hits.

    PDFs are used as-is, images are wrapped into a one-page PDF, and
    docx/xlsx files are first converted with LibreOffice. Each page is then
    rendered at ``zoom``, OCR'd and searched.

    Args:
        input_path: file to process (.pdf, .docx, .xlsx or an image).
        search_text: text to look for (case-insensitive).
        out_dir: output directory; defaults to the input file's directory.
            It is created if it does not exist.
        zoom: render scale used for OCR and for mapping boxes back.

    Returns:
        Path of ``<basename>_boxed.pdf`` (an unannotated copy when nothing
        matched), or, when a docx/xlsx could not be converted, the path of
        ``<basename>_text_search_report.txt``.

    Raises:
        ValueError: if the file extension is not supported.
    """
    if out_dir is None:
        out_dir = os.path.dirname(os.path.abspath(input_path)) or os.getcwd()
    os.makedirs(out_dir, exist_ok=True)
    basename = os.path.splitext(os.path.basename(input_path))[0]
    tmpdir = tempfile.mkdtemp(prefix="auditram_")
    try:
        if is_docx(input_path) or is_xlsx(input_path):
            working_pdf = convert_to_pdf_with_libreoffice(input_path, tmpdir)
            if not working_pdf:
                print("Warning: LibreOffice conversion failed or not available. Will attempt text-only search.")
                report_path = _write_text_search_report(
                    input_path, search_text, out_dir, basename
                )
                print(f"Text report written to: {report_path}")
                return report_path
        elif is_pdf(input_path):
            working_pdf = input_path
        elif is_image_file(input_path):
            working_pdf = os.path.join(tmpdir, basename + "_from_image.pdf")
            # Context manager releases the source file handle after conversion.
            with Image.open(input_path) as src:
                src.convert("RGB").save(working_pdf, "PDF", resolution=100.0)
        else:
            raise ValueError("Unsupported file type: " + input_path)

        doc = fitz.open(working_pdf)
        try:
            page_count = len(doc)
        finally:
            doc.close()

        matches_per_page = {}
        for page_idx in range(page_count):
            img = render_page_to_image(working_pdf, page_idx, zoom=zoom)
            ocr_data = image_to_ocr_data(img)
            page_matches = find_matches_in_ocr_data(ocr_data, search_text)
            if page_matches:
                matches_per_page[page_idx] = page_matches
            print(f"Page {page_idx+1}/{page_count} - found {len(page_matches)} matches")

        out_pdf = os.path.join(out_dir, f"{basename}_boxed.pdf")
        if matches_per_page:
            # Annotate from the working PDF straight into the output file.
            # Copying first and annotating the copy in place would ask
            # PyMuPDF to do a full save onto the file it has open.
            add_rectangles_to_pdf(working_pdf, matches_per_page, out_pdf, zoom=zoom)
            print(f"Annotated PDF written to: {out_pdf}")
        else:
            shutil.copyfile(working_pdf, out_pdf)
            print("No matches found. No annotations added. A copy of the PDF was still created: " + out_pdf)
        return out_pdf

    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

if __name__ == "__main__":
    # Command-line entry point: <input_file> <search text>.
    # Arguments beyond the second are ignored.
    if len(sys.argv) >= 3:
        input_path, search_text = sys.argv[1:3]
        try:
            out = process_file(input_path, search_text, zoom=3)
        except Exception as exc:
            # Print a short message, then re-raise so the traceback and a
            # non-zero exit status are preserved.
            print("Error:", exc)
            raise
        else:
            print("Done. Output:", out)
    else:
        print("Usage: python auditram_highlighter.py input_file \"search text\"")
        sys.exit(1)