In [1]:
import os
import shutil
import sys
from pathlib import Path

from IPython.display import display, Image, Markdown
# If your project is not on sys.path, add it.
# The location can be overridden with the DOCSIGNER_PROJECT_ROOT environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original default directory is used.
project_root = Path(
    os.environ.get("DOCSIGNER_PROJECT_ROOT", "/home/ansary/work/apsis/DocSignerNML")
).expanduser()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Cell A: simple, notebook-friendly PDF reader
import logging
import io
import shutil
from pathlib import Path
from typing import List, Dict, Any

try:
    from pdf2image import convert_from_path
    from pypdf import PdfReader, PdfWriter
    from markitdown import MarkItDown
except ImportError as e:
    raise ImportError("Missing dependency; run: pip install -r requirements.txt") from e

logger = logging.getLogger(__name__)

def _markitdown_result_to_text(result: Any) -> str:
    """Pull the text out of a MarkItDown conversion result.

    The first non-empty value among the ``text_content`` and ``text`` attributes wins.
    If one of those attributes exists but is empty (e.g. a blank page), an empty
    string is returned. ``str(result)`` is used only when neither attribute is
    present, so an object repr is never mistaken for page text.
    """
    has_text_attr = False
    for attr in ("text_content", "text"):
        value = getattr(result, attr, None)
        if value:
            return str(value)
        if value is not None:
            has_text_attr = True
    return "" if has_text_attr else str(result)


def _convert_pdf_stream_to_text(md_converter: Any, stream: "io.BytesIO") -> str:
    """Run MarkItDown on an in-memory single-page PDF and return its text.

    Tries the converter's stream interfaces first (``convert_stream``, then
    ``convert_fp``) and finally falls back to handing the raw bytes to ``convert``.
    """
    if hasattr(md_converter, "convert_stream"):
        result = md_converter.convert_stream(stream)
    elif hasattr(md_converter, "convert_fp"):
        result = md_converter.convert_fp(stream)
    else:
        # fallback: give raw bytes (some MarkItDown versions accept bytes)
        result = md_converter.convert(stream.getvalue())
    return _markitdown_result_to_text(result)


def extract_content_per_page(
    pdf_path: Path,
    output_dir: Path,
    dpi: int = 300,
    thread_count: int = 4,
) -> List[Dict[str, Any]]:
    """
    Convert PDF pages to PNG images (saved under output_dir) and extract per-page Markdown text.
    Notebook-friendly: does not require FastAPI UploadFile or context manager.

    Args:
        pdf_path: Path (or path string) of the PDF to read.
        output_dir: Directory that receives the page images; created if missing.
        dpi: Rendering resolution passed to ``convert_from_path``.
        thread_count: Worker count passed to ``convert_from_path``.

    Returns:
        A list of dicts: { "page_num": int, "markdown_text": str, "image_path": Path }.
        ``page_num`` is 1-based. If the number of rendered images and the number of
        extracted texts differ, the shorter side is padded, so ``markdown_text`` may
        be "" and ``image_path`` may be None for the padded entries.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist (checked before anything
            is written to ``output_dir``).
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(pdf_path)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Convert PDF pages to images (returns PIL Image objects)
    logger.info("Converting %s -> images (dpi=%s) into %s", pdf_path, dpi, output_dir)
    pil_images = convert_from_path(str(pdf_path), dpi=dpi, fmt='png', thread_count=thread_count)

    image_paths: List[Any] = []
    stem = pdf_path.stem
    for page_num, img in enumerate(pil_images, start=1):
        img_path = output_dir / f"{stem}_page_{page_num:03d}.png"
        img.save(img_path, "PNG")
        image_paths.append(img_path)

    # 2) Extract Markdown/text per page using MarkItDown (in-memory)
    logger.info("Extracting markdown/text per page (in-memory) ...")
    markdown_texts: List[str] = []
    md_converter = MarkItDown()
    reader = PdfReader(str(pdf_path))

    for page_num, page in enumerate(reader.pages, start=1):
        # Re-wrap each page as its own one-page PDF so it can be converted alone.
        writer = PdfWriter()
        writer.add_page(page)
        with io.BytesIO() as buffer:
            writer.write(buffer)
            buffer.seek(0)
            try:
                text = _convert_pdf_stream_to_text(md_converter, buffer)
            except Exception as e:
                # Best effort: a page that fails to convert yields empty text
                # instead of aborting the whole document.
                logger.warning("MarkItDown conversion failed for page %d: %s", page_num, e)
                text = ""
        markdown_texts.append(text or "")

    # safety: if mismatch, pad the shorter list (won't crash notebook)
    if len(image_paths) != len(markdown_texts):
        logger.warning("Image/markdown count mismatch — padding markdown to match images.")
        if len(markdown_texts) < len(image_paths):
            markdown_texts += [""] * (len(image_paths) - len(markdown_texts))
        else:
            image_paths += [None] * (len(markdown_texts) - len(image_paths))

    # Both lists have the same length here, so zip pairs every page.
    return [
        {
            "page_num": page_num,
            "markdown_text": markdown_text,
            "image_path": image_path,
        }
        for page_num, (image_path, markdown_text) in enumerate(
            zip(image_paths, markdown_texts), start=1
        )
    ]


In [7]:
import logging
from pathlib import Path
import json
from typing import Dict, Any, List
from document_ai_verification.utils.config_loader import load_settings
from document_ai_verification.ai.llm.client import LLMService
from document_ai_verification.ai.llm.prompts import get_ns_document_analysis_prompt_holistic
from document_ai_verification.ai.llm.schemas import PageHolisticAnalysis
# --- Setup ---
# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

# Load the application settings once and split them into secrets and config.
APP_SETTINGS = load_settings()
SECRETS, CONFIG = APP_SETTINGS['secrets'], APP_SETTINGS['config']

# Collect the constructor arguments first, then build the LLM client from them.
# max_context_tokens falls back to 64000 when the config does not set it.
_llm_kwargs = dict(
    api_key=SECRETS['llm_api_key'],
    model=SECRETS['llm_model_name'],
    base_url=SECRETS['llm_api_url'],
    max_context_tokens=CONFIG['ai_services']['llm'].get('max_context_tokens', 64000),
)
LLM_CLIENT = LLMService(**_llm_kwargs)


# Cell C: smoke-test the reader against a sample PDF
pdf_path = Path("doctests/test.pdf")
# Page images go into a per-document folder named after the PDF's stem
out_dir = Path("./temp_files_jupyter", pdf_path.stem)
# A low DPI keeps this test run fast
bundles = extract_content_per_page(pdf_path=pdf_path, output_dir=out_dir, dpi=96)
print(f"Pages processed: {len(bundles)}")



2025-09-11 14:23:41,073 - INFO - Converting doctests/test.pdf -> images (dpi=96) into temp_files_jupyter/test


✅ LLMService (Sync) initialized for model 'RedHatAI/gemma-3-27b-it-FP8-dynamic' with max_tokens=64000.


2025-09-11 14:23:41,698 - INFO - Extracting markdown/text per page (in-memory) ...


Pages processed: 1


In [8]:
# Text (p1t) and image path (p1i) of the first page bundle
p1t, p1i = (bundles[0][field] for field in ("markdown_text", "image_path"))

In [9]:
# Build the holistic-analysis prompt from the first page's text, then run the
# structured vision call on that page's image.
prompt = get_ns_document_analysis_prompt_holistic(p1t)
result = LLM_CLIENT.invoke_vision_structured(
    prompt=prompt,
    image_path=p1i,
    response_model=PageHolisticAnalysis,
)

2025-09-11 14:23:49,028 - INFO - Performing vision call for image: test_page_001.png
2025-09-11 14:24:02,028 - INFO - HTTP Request: POST http://114.130.116.79/gemma3/v1/chat/completions "HTTP/1.1 200 OK"


In [10]:
# Show the Markdown text extracted for the first page (bundles[0]["markdown_text"]).
p1t

'AI Policy and Direction from Faiz Tayeb Sir\n\nI …………. Declare that I have received the summary frim tayed sir\n\nSummary:\nThe Government of Bangladesh is currently developing an interoperability platform to streamline\ncitizen services. While the platform has reached a functional stage, the integration of numerous\nAPIs is expected to make the data exchange policy increasingly complex. Therefore, a strategic\napproach is required to address the following key areas:\n\n1.  Role of AI: We must identify how Artificial Intelligence can enhance data exchange,\nregulatory compliance, and enforcement mechanisms within this interoperability\necosystem.\n\n2.  Ministerial AI Adoption Challenges: It is essential to assess the specific challenges\nfaced by different ministries in adopting AI and determine how AI can support their\noperations. Although UNESCO has contributed work in this area, the quality has not met\nexpectations and requires improvement.\n\n3.  Cloud Infrastructure for Emergi

In [11]:
# Show the structured LLM response as a plain dict.
# NOTE(review): presumably `result` is a PageHolisticAnalysis instance — confirm against LLMService.
result.model_dump()

{'required_inputs': [{'input_type': 'checkbox',
   'marker_text': '[YOUR OPINION IN YES/NO]',
   'description': 'User must indicate their opinion with a yes or no checkbox.'},
  {'input_type': 'checkbox',
   'marker_text': '[YOUR OPINION IN YES/NO]',
   'description': 'User must indicate their opinion with a yes or no checkbox.'},
  {'input_type': 'checkbox',
   'marker_text': '[YOUR OPINION IN YES/NO]',
   'description': 'User must indicate their opinion with a yes or no checkbox.'},
  {'input_type': 'full_name',
   'marker_text': 'Name:',
   'description': 'User must provide their full name.'},
  {'input_type': 'other',
   'marker_text': 'Company:',
   'description': 'User must provide the company name.'},
  {'input_type': 'other',
   'marker_text': 'Position:',
   'description': 'User must provide their position.'},
  {'input_type': 'date',
   'marker_text': 'Date:',
   'description': 'User must provide the date.'},
  {'input_type': 'signature',
   'marker_text': 'Sign',
   'descrip

In [12]:
# Scratch check: an empty list is falsy, so this prints "D".
a = []
print("C" if a else "D")

D
