# 0. Install if needed

In [None]:
%pip install "markitdown[all]" openai pdf2image pydantic

# 1. Convert PDF -> PNGs

Renders each page of a PDF to an image and saves the images into numbered `grouping_N` subfolders (`group_size` pages per folder).

In [76]:
from pathlib import Path
from typing import Dict, List, Optional, Union
from pdf2image import convert_from_path
import os

# Default Poppler location used when POPPLER_PATH is not already set.
# setdefault keeps a value that is already present in the environment, so the
# notebook can run on another machine without editing this cell.
os.environ.setdefault("POPPLER_PATH", r"C:\Users\RAC62971\Downloads\poppler-25.07.0\Library\bin")

def pdf_to_grouped_pngs(
    pdf_path: Union[str, Path],
    out_dir: Union[str, Path],
    dpi: int = 200,
    group_size: int = 1,
    grouping_prefix: str = "grouping",
    fmt: str = "PNG",
    poppler_path: Optional[Union[str, Path]] = None,
    first_page: Optional[int] = None,
    last_page: Optional[int] = None,
) -> Dict[str, List[str]]:
    """
    Convert a PDF into per-page images and group them into subfolders.

    Parameters
    ----------
    pdf_path : str | Path
        Path to the input PDF, e.g. r"C:\\...\\my.pdf".
    out_dir : str | Path
        Output directory root where grouped folders will be created.
    dpi : int, default 200
        Render DPI for rasterization.
    group_size : int, default 1
        Number of pages per group folder. For example, 2 will place pages
        1 and 2 into 'grouping_1', pages 3 and 4 into 'grouping_2', etc.
    grouping_prefix : str, default "grouping"
        Folder name prefix for each group.
    fmt : str, default "PNG"
        Image format to write. Common options are "PNG" and "JPEG".
        The lower-cased format is also used as the file extension.
    poppler_path : str | Path | None
        Path to Poppler bin directory on Windows if not on PATH.
        Example: r"C:\\tools\\poppler-24.08.0\\Library\\bin"
    first_page : int | None
        Optional first page to convert (1-indexed).
    last_page : int | None
        Optional last page to convert (inclusive).

    Returns
    -------
    Dict[str, List[str]]
        Mapping of group folder name to list of saved image paths (strings).

    Raises
    ------
    ValueError
        If group_size is less than 1.
    FileNotFoundError
        If pdf_path does not point to an existing file.
    """
    # Validate everything before touching the filesystem so that a rejected
    # call does not leave an empty output directory behind.
    if group_size < 1:
        raise ValueError("group_size must be >= 1")

    pdf_path = Path(pdf_path)
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    images = convert_from_path(
        str(pdf_path),
        dpi=dpi,
        first_page=first_page,
        last_page=last_page,
        poppler_path=str(poppler_path) if poppler_path else None,
    )

    # File names carry the real PDF page number, while group numbering always
    # starts at 1 for the first converted page.
    start_page = first_page or 1

    saved: Dict[str, List[str]] = {}
    for page_no, img in enumerate(images, start=start_page):
        # Compute 1-indexed group index
        group_idx = (page_no - start_page) // group_size + 1
        group_name = f"{grouping_prefix}_{group_idx}"
        group_dir = out_dir / group_name
        group_dir.mkdir(parents=True, exist_ok=True)

        out_path = group_dir / f"page_{page_no:03}.{fmt.lower()}"
        img.save(out_path, fmt)
        saved.setdefault(group_name, []).append(str(out_path))

    return saved


In [51]:
# Paths are built with pathlib rather than ".\\dir\\file" literals: a backslash
# literal is a single odd file name on POSIX, and the Markdown step reads from
# Path("./output_pngs"), so both cells must agree on the output folder.
pdf_to_grouped_pngs(
    pdf_path=Path("input_data") / "V2 - one-pagers with summaries- HRI-OH-99P Project Descriptions (1).pdf",
    out_dir=Path("output_pngs"),
    dpi=200,
    group_size=2,
    grouping_prefix="grouping",
    poppler_path=os.environ["POPPLER_PATH"],
)

{'grouping_1': ['output_pngs\\grouping_1\\page_001.png',
  'output_pngs\\grouping_1\\page_002.png'],
 'grouping_2': ['output_pngs\\grouping_2\\page_003.png',
  'output_pngs\\grouping_2\\page_004.png'],
 'grouping_3': ['output_pngs\\grouping_3\\page_005.png',
  'output_pngs\\grouping_3\\page_006.png'],
 'grouping_4': ['output_pngs\\grouping_4\\page_007.png',
  'output_pngs\\grouping_4\\page_008.png'],
 'grouping_5': ['output_pngs\\grouping_5\\page_009.png',
  'output_pngs\\grouping_5\\page_010.png']}

# 2. Convert PNGs -> Markdown

In [75]:
import os
from pathlib import Path
from dotenv import load_dotenv

# Load the API key from the .env file one directory above the current working directory
load_dotenv(Path.cwd().parent / ".env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Define the model we want to use to convert the PNGs into Markdown
LLM_MODEL = "gpt-5-mini"

# Read in the system prompts
sys_prompt_detailed = (Path.cwd() / "system_prompts" / "img_llm_prompt_detailed.md").read_text(encoding="utf-8")
sys_prompt_summarized = (Path.cwd() / "system_prompts" / "img_llm_prompt_summarized.md").read_text(encoding="utf-8")

# Choose the system prompt you'd like to use
sys_prompt_mode = "detailed"  # can be 'detailed' or 'summarized'

# Look the mode up explicitly so that a typo (e.g. "Detailed") raises instead of
# silently falling back to the summarized prompt.
_SYS_PROMPTS_BY_MODE = {
    "detailed": sys_prompt_detailed,
    "summarized": sys_prompt_summarized,
}
if sys_prompt_mode not in _SYS_PROMPTS_BY_MODE:
    raise ValueError(
        f"sys_prompt_mode must be one of {sorted(_SYS_PROMPTS_BY_MODE)}, got {sys_prompt_mode!r}"
    )
IMG_LLM_PROMPT = _SYS_PROMPTS_BY_MODE[sys_prompt_mode]

In [None]:
import re
from markitdown import MarkItDown
from openai import OpenAI

# Input (grouped PNGs) and output (Markdown) roots, relative to the working directory.
PNG_DIR = Path("./output_pngs")
OUT_DIR = Path("./output_markdown")

# Converter configured with the OpenAI client, model name, and system prompt
# chosen in the configuration cell.
client = OpenAI(api_key=OPENAI_API_KEY)
image_converter = MarkItDown(llm_client=client, llm_model=LLM_MODEL, llm_prompt=IMG_LLM_PROMPT)

def _grouping_index(p: Path) -> int:
    # Extract the integer from "grouping_{number}"
    m = re.search(r"grouping_(\d+)$", p.name)
    return int(m.group(1)) if m else 0

def _sorted_imgs(imgs):
    # Sort by any integer in filename, else lexicographic
    def key_fn(p: Path):
        m = re.findall(r"\d+", p.stem)
        return (int(m[-1]) if m else 0, p.name.lower())
    return sorted(imgs, key=key_fn)

# Ensure OUT_DIR exists
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Find all grouping_* directories at first level under PNG_DIR, in numeric order.
# A missing PNG_DIR is treated like an empty one so the message below is printed
# instead of a FileNotFoundError traceback from iterdir().
candidate_dirs = PNG_DIR.iterdir() if PNG_DIR.is_dir() else []
grouping_dirs = sorted(
    [d for d in candidate_dirs if d.is_dir() and d.name.startswith("grouping_")],
    key=_grouping_index,
)

if not grouping_dirs:
    print(f"No grouping_* directories found in {PNG_DIR.resolve()}")

for tdir in grouping_dirs:
    grouping_num = _grouping_index(tdir)
    imgs = _sorted_imgs(list(tdir.glob("*.png")))
    if not imgs:
        print(f"Skipping {tdir.name} because it has no PNGs")
        continue

    per_image_md = []
    # The slide number is the image's position within this grouping, not the PDF page number.
    for i, img_path in enumerate(imgs, start=1):
        try:
            res = image_converter.convert(str(img_path))
            per_image_md.append(f"\n\n# Slide {i}\n\n" + res.text_content.strip())
        except Exception as e:
            # Best effort: report the failing image and keep converting the rest of the grouping.
            print(f"Error processing {img_path}: {e}")
            continue

    if not per_image_md:
        print(f"No markdown produced for {tdir.name}")
        continue

    combined_image_md = "".join(per_image_md).strip()

    # Write to OUT_DIR/grouping_{number}/grouping_{number}.md
    out_grouping_dir = OUT_DIR / f"grouping_{grouping_num}"
    out_grouping_dir.mkdir(parents=True, exist_ok=True)
    combined_path = out_grouping_dir / f"grouping_{grouping_num}.md"
    combined_path.write_text(combined_image_md, encoding="utf-8")
    print("Wrote:", combined_path.resolve())
