# 0. Install if needed

In [None]:
%pip install "markitdown[all]" openai pdf2image pydantic

### Imports, basic definitions and option selections

In [1]:
import os
import re
import shutil
import subprocess
import tempfile
import warnings
from pathlib import Path
from openai import OpenAI
from markitdown import MarkItDown
from dotenv import load_dotenv
from pdf2image import convert_from_path
from typing import Dict, Iterable, List, Optional, Tuple, Union




In [None]:
# Load in the API key from the top-level .env file (one directory above the working directory)
load_dotenv(Path.cwd().parent / ".env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Define the model we want to use to convert the PNGs into Markdown
LLM_MODEL = "gpt-5-mini"

# Read in the system prompts
sys_prompt_detailed = (Path.cwd() / "system_prompts" / "system_prompt_detailed.md").read_text(encoding="utf-8")
sys_prompt_summarized = (Path.cwd() / "system_prompts" / "system_prompt_summarized.md").read_text(encoding="utf-8")

# Choose the system prompt you'd like to use. Detailed will attempt to keep as much of the original content
# as possible, at the cost of speed and cost. Summarized will summarize much more of the content but it will
# be fast and cost less to process the data
sys_prompt_mode = "detailed" # can be 'detailed' or 'summarized'

# Fail loudly on an unknown mode instead of silently falling back to the summarized prompt
# (previously any value other than "detailed", including a typo, selected "summarized").
if sys_prompt_mode not in ("detailed", "summarized"):
    raise ValueError(f"sys_prompt_mode must be 'detailed' or 'summarized', got {sys_prompt_mode!r}")
system_prompt = sys_prompt_detailed if sys_prompt_mode == "detailed" else sys_prompt_summarized

## Markdown conversion function
All file types except PowerPoint and PDF are converted directly into Markdown. PDFs are rendered into PNGs, and those PNGs are converted into Markdown. PowerPoints are first converted into a PDF and then follow the same PDF -> PNG -> Markdown path.

Allowed file types: 
* PowerPoint
    - pptx
* Word
    - docx
* Excel
    - xlsx
    - xls
    - csv
* Text
    - txt
    - pdf
    - html
    - htm
    - html_string
    - log

### Unified converter for all supported file types

In [None]:
def _normalize_kind(kind: str) -> str:
    """Return ``kind`` stripped of surrounding whitespace and lower-cased.

    Raises
    ------
    ValueError
        If ``kind`` is empty, ``None``, or consists only of whitespace.
    """
    if not kind:
        raise ValueError("kind is required")
    normalized = kind.strip().lower()
    # A whitespace-only value is truthy but carries no kind; reject it here
    # rather than letting an empty string travel further down the pipeline.
    if not normalized:
        raise ValueError("kind is required")
    return normalized


# Default destination folder per input kind. Kinds that share a folder are
# listed together; the resulting mapping is keyed by the individual kind.
DEFAULT_OUTPUT_DIRECTORIES = {
    kind: directory
    for directory, kinds in (
        ("./output_docx", ("docx",)),
        ("./output_xlsx", ("xlsx", "xls")),
        ("./output_html", ("html", "htm")),
        ("./output_txt", ("txt", "log")),
        ("./output_csv", ("csv",)),
        ("./output_markdown", ("pdf", "pptx", "png", "jpg", "jpeg")),
        ("./output_html", ("html_string",)),
    )
    for kind in kinds
}

# Kinds handed straight to MarkItDown.convert() without any intermediate rendering.
DIRECT_MARKITDOWN_KINDS = {
    "docx",
    "xlsx",
    "xls",
    "html",
    "htm",
    "txt",
    "log",
    "csv",
}

# Kinds treated as a single image input.
IMAGE_KINDS = {
    "png",
    "jpg",
    "jpeg",
}

# Defaults used when rasterising PDF pages to images.
DEFAULT_PDF_DPI = 200
DEFAULT_IMAGE_FORMAT = "PNG"


def _resolve_output_dir(kind: str, override: Optional[Union[str, Path]]) -> Path:
    """Pick the output directory for ``kind``.

    An explicit ``override`` always wins. Otherwise the directory configured in
    ``DEFAULT_OUTPUT_DIRECTORIES`` is used, and a ``ValueError`` is raised when
    ``kind`` has no entry there.
    """
    if override is None:
        try:
            configured = DEFAULT_OUTPUT_DIRECTORIES[kind]
        except KeyError as missing:
            raise ValueError(f"No default output directory configured for kind '{kind}'") from missing
        return Path(configured)
    return Path(override)


def _build_markitdown_converter(
    use_llm: bool,
    llm_client,
    llm_model: Optional[str],
    llm_prompt: Optional[str],
):
    """Create the MarkItDown instance used for a conversion.

    Without ``use_llm`` a default ``MarkItDown()`` is returned. With it, both
    ``llm_client`` and ``llm_model`` must be supplied (otherwise ``ValueError``),
    and they are forwarded together with ``llm_prompt``.
    """
    if not use_llm:
        return MarkItDown()

    llm_incomplete = llm_client is None or llm_model is None
    if llm_incomplete:
        raise ValueError("use_llm=True requires llm_client and llm_model")

    return MarkItDown(
        llm_client=llm_client,
        llm_model=llm_model,
        llm_prompt=llm_prompt,
    )


def _write_markdown(text: str, out_dir: Path, base_name: str) -> Path:
    """Write ``text`` to ``<out_dir>/<base_name>.md`` as UTF-8 and return that path.

    The directory is created if it does not exist yet, so the helper also works
    when it is called without the caller having prepared ``out_dir`` first.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{base_name}.md"
    out_path.write_text(text, encoding="utf-8")
    return out_path


def _convert_html_string(converter: MarkItDown, html_value: str, out_dir: Path, base_name: str) -> Path:
    """Convert a raw HTML string to Markdown and write it to ``<out_dir>/<base_name>.md``.

    Returns the path produced by ``_write_markdown``. A missing ``text_content``
    on the conversion result is written as an empty file.
    """
    import io  # local import: only this helper needs an in-memory byte stream

    # NOTE(review): published MarkItDown releases appear to expose convert(),
    # convert_local(), convert_stream() etc. but no convert_html() - confirm
    # against the installed version. Keep using convert_html when the converter
    # provides it; otherwise feed the HTML through convert_stream as an .html stream.
    convert_html = getattr(converter, "convert_html", None)
    if callable(convert_html):
        result = convert_html(html_value)
    else:
        html_stream = io.BytesIO(html_value.encode("utf-8"))
        result = converter.convert_stream(html_stream, file_extension=".html")
    return _write_markdown(result.text_content or "", out_dir, base_name)


def _convert_standard_file(
    path: Path,
    converter: MarkItDown,
    out_dir: Path,
    base_name: Optional[str],
) -> Path:
    """Run ``converter.convert`` on an existing file and save the Markdown.

    The output file is named after ``base_name`` when given, otherwise after
    the input file's stem. Raises ``FileNotFoundError`` if ``path`` is missing.
    """
    if path.exists():
        converted = converter.convert(str(path))
        markdown_stem = base_name if base_name else path.stem
        markdown_text = converted.text_content or ""
        return _write_markdown(markdown_text, out_dir, markdown_stem)
    raise FileNotFoundError(f"Input file does not exist: {path}")


def _render_pdf_to_pngs(
    pdf_path: Path,
    image_dir: Path,
    *,
    dpi: int = DEFAULT_PDF_DPI,
    fmt: str = DEFAULT_IMAGE_FORMAT,
    poppler_path: Optional[Union[str, Path]] = None,
    first_page: Optional[int] = None,
    last_page: Optional[int] = None,
) -> List[Path]:
    """Rasterise PDF pages into ``image_dir`` and return the written image paths.

    Files are named ``page_<NNN>.<fmt lower-cased>`` where the number starts at
    ``first_page`` (or 1). When ``poppler_path`` is not given, the
    ``POPPLER_PATH`` environment variable is passed to pdf2image instead.
    """
    image_dir.mkdir(parents=True, exist_ok=True)

    if poppler_path:
        poppler_location = str(poppler_path)
    else:
        poppler_location = os.getenv("POPPLER_PATH")

    rendered_pages = convert_from_path(
        str(pdf_path),
        dpi=dpi,
        first_page=first_page,
        last_page=last_page,
        poppler_path=poppler_location,
    )

    save_format = fmt.upper()
    extension = fmt.lower()
    written: List[Path] = []
    # Number files by their page number in the PDF, not by their position in the batch.
    for page_number, page_image in enumerate(rendered_pages, start=first_page or 1):
        destination = image_dir / f"page_{page_number:03}.{extension}"
        page_image.save(destination, save_format)
        written.append(destination)
    return written


def _convert_images_to_markdown(
    image_paths: Iterable[Path],
    converter: MarkItDown,
    out_dir: Path,
    base_name: str,
    *,
    section_prefix: str = "Image",
) -> Path:
    """Convert each image with ``converter`` and write one combined Markdown file.

    Every successfully converted image becomes a section headed
    ``# <section_prefix> <position>`` (1-based position in ``image_paths``),
    followed by the converted text when there is any. Sections are separated
    by a blank line.

    Returns
    -------
    Path
        The Markdown file written via ``_write_markdown``.

    Raises
    ------
    RuntimeError
        If ``image_paths`` is empty, or if no image could be converted.

    Warns
    -----
    RuntimeWarning
        If some (but not all) images failed; the message lists each failure.
    """
    ordered_images = [Path(p) for p in image_paths]
    if not ordered_images:
        raise RuntimeError("No images were produced for Markdown conversion")

    fragments: List[str] = []
    failures: List[Tuple[Path, Exception]] = []
    for idx, image_path in enumerate(ordered_images, start=1):
        try:
            result = converter.convert(str(image_path))
        except Exception as exc:  # pragma: no cover - passthrough for runtime issues
            # Keep going: one bad page should not discard the rest of the document.
            failures.append((image_path, exc))
            continue
        content = (result.text_content or "").strip()
        header = f"# {section_prefix} {idx}"
        # The separators must be "\n\n" escapes; raw line breaks inside these
        # single-quoted strings are a syntax error.
        fragments.append(f"{header}\n\n{content}" if content else header)

    if not fragments:
        raise RuntimeError("Failed to generate Markdown from the rendered images")

    markdown_body = "\n\n".join(fragment.strip() for fragment in fragments if fragment).strip()
    output_path = _write_markdown(markdown_body, out_dir, base_name)

    if failures:
        issues = ", ".join(f"{path.name}: {exc}" for path, exc in failures)
        warnings.warn(
            f"Encountered {len(failures)} image conversion error(s) while building {output_path.name}: {issues}",
            RuntimeWarning,
            stacklevel=2,
        )

    return output_path


def _convert_pdf_pipeline(
    pdf_path: Path,
    converter: MarkItDown,
    out_dir: Path,
    base_name: Optional[str],
    *,
    dpi: int = DEFAULT_PDF_DPI,
    fmt: str = DEFAULT_IMAGE_FORMAT,
    poppler_path: Optional[Union[str, Path]] = None,
    first_page: Optional[int] = None,
    last_page: Optional[int] = None,
) -> Path:
    """Render a PDF to page images, then convert those images to one Markdown file.

    Page images are written to ``./output_pngs/<name>`` (relative to the current
    working directory), where ``<name>`` is ``base_name`` or the PDF's stem. Any
    existing folder of that name is removed first so stale pages from an earlier
    run cannot leak into the result. Sections in the Markdown are headed
    ``# Page <n>``.

    Raises
    ------
    FileNotFoundError
        If ``pdf_path`` does not exist. Checked up front, consistent with
        ``_convert_standard_file``, and before any existing PNG folder is deleted.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"Input file does not exist: {pdf_path}")

    final_name = base_name or pdf_path.stem
    png_root = Path("./output_pngs") / final_name
    if png_root.exists():
        shutil.rmtree(png_root)
    image_paths = _render_pdf_to_pngs(
        pdf_path,
        png_root,
        dpi=dpi,
        fmt=fmt,
        poppler_path=poppler_path,
        first_page=first_page,
        last_page=last_page,
    )
    return _convert_images_to_markdown(image_paths, converter, out_dir, final_name, section_prefix="Page")


def _pptx_to_pdf(pptx_path: Path, output_dir: Path) -> Path:
    """Convert a PPTX file to ``<output_dir>/<stem>.pdf`` using the first backend that works.

    Backends are tried in order: the optional ``pptx2pdf`` package, then the
    ``soffice`` / ``libreoffice`` executables, then ``unoconv``. A backend that
    raises or fails no longer aborts the chain; its error is recorded and the
    next backend is tried.

    Raises
    ------
    FileNotFoundError
        If ``pptx_path`` does not exist (checked before ``output_dir`` is created).
    RuntimeError
        If no backend produced the PDF. The message includes the errors
        reported by the backends that were attempted.
    """
    pptx_path = Path(pptx_path)
    if not pptx_path.exists():
        raise FileNotFoundError(f"Input file does not exist: {pptx_path}")

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    target_pdf = output_dir / f"{pptx_path.stem}.pdf"

    backend_errors = []

    try:
        from pptx2pdf import convert as pptx2pdf_convert  # type: ignore
    except ImportError:
        pptx2pdf_convert = None

    if pptx2pdf_convert is not None:
        try:
            try:
                pptx2pdf_convert(str(pptx_path), output_path=str(target_pdf))
            except TypeError:
                # Retry with the alternative keyword when the first call signature is rejected.
                pptx2pdf_convert(str(pptx_path), output_dir=str(output_dir))
        except Exception as exc:
            # Boundary to an optional third-party backend: record and fall through.
            backend_errors.append(f"pptx2pdf: {exc}")
        if target_pdf.exists():
            return target_pdf

    for cmd in ("soffice", "libreoffice"):
        executable = shutil.which(cmd)
        if not executable:
            continue
        try:
            subprocess.run(
                [executable, "--headless", "--convert-to", "pdf", str(pptx_path), "--outdir", str(output_dir)],
                check=True,
            )
        except (subprocess.CalledProcessError, OSError) as exc:
            backend_errors.append(f"{cmd}: {exc}")
            continue
        if target_pdf.exists():
            return target_pdf

    unoconv = shutil.which("unoconv")
    if unoconv:
        try:
            subprocess.run([unoconv, "-f", "pdf", "-o", str(target_pdf), str(pptx_path)], check=True)
        except (subprocess.CalledProcessError, OSError) as exc:
            backend_errors.append(f"unoconv: {exc}")
        if target_pdf.exists():
            return target_pdf

    message = "Unable to convert PPTX to PDF. Install `pptx2pdf` or LibreOffice/Unoconv and ensure it is on your PATH."
    if backend_errors:
        message += " Backend errors: " + "; ".join(backend_errors)
    raise RuntimeError(message)


def _convert_pptx_pipeline(
    pptx_path: Path,
    converter: MarkItDown,
    out_dir: Path,
    base_name: Optional[str],
    **pdf_kwargs,
) -> Path:
    """Convert a PPTX to Markdown by way of an intermediate PDF.

    The PDF is produced inside a temporary directory that is removed once the
    PDF pipeline has finished. ``pdf_kwargs`` are forwarded unchanged to
    ``_convert_pdf_pipeline``. The output is named after ``base_name`` or,
    failing that, the PPTX file's stem.
    """
    source = Path(pptx_path)
    markdown_stem = base_name if base_name else source.stem
    with tempfile.TemporaryDirectory() as scratch_dir:
        intermediate_pdf = _pptx_to_pdf(source, Path(scratch_dir))
        markdown_path = _convert_pdf_pipeline(
            intermediate_pdf,
            converter,
            out_dir,
            markdown_stem,
            **pdf_kwargs,
        )
    return markdown_path


def _convert_image_pipeline(
    image_path: Path,
    converter: MarkItDown,
    out_dir: Path,
    base_name: Optional[str],
) -> Path:
    """Convert a single image to Markdown under an ``# Image 1`` heading.

    The Markdown file is named after ``base_name`` or, when that is not given,
    the image file's stem.
    """
    source = Path(image_path)
    markdown_stem = base_name if base_name else source.stem
    single_image = [source]
    return _convert_images_to_markdown(
        single_image,
        converter,
        out_dir,
        markdown_stem,
        section_prefix="Image",
    )


def convert_to_markdown_file(
    input_value: Union[str, Path],
    kind: str,
    use_llm: bool = False,
    llm_client=None,
    llm_model: Optional[str] = None,
    llm_prompt: Optional[str] = None,
    out_dir: Optional[Union[str, Path]] = None,
    out_name: Optional[str] = None,
) -> Path:
    """Convert one supported input to a Markdown file and return its path.

    Parameters
    ----------
    input_value:
        A filesystem path for file-based kinds; the raw HTML text when
        ``kind`` is ``html_string``.
    kind:
        One of {'docx', 'xlsx', 'xls', 'html', 'htm', 'txt', 'log', 'csv',
        'pdf', 'pptx', 'png', 'jpg', 'jpeg', 'html_string'}. Case and
        surrounding whitespace are ignored.
    use_llm, llm_client, llm_model, llm_prompt:
        When ``use_llm`` is true, the client, model and prompt are handed to
        :class:`MarkItDown`; client and model are then mandatory.
    out_dir:
        Destination directory. When omitted, the per-kind entry of
        ``DEFAULT_OUTPUT_DIRECTORIES`` is used. The directory is created if
        necessary.
    out_name:
        Base filename (no extension) of the Markdown file. Required for
        ``html_string``; otherwise defaults to the input file's stem.

    Returns
    -------
    Path
        The Markdown file that was written.

    Raises
    ------
    ValueError
        For a missing or unsupported ``kind``, or for ``html_string`` without
        ``out_name``.
    """
    kind_key = _normalize_kind(kind)
    if kind_key not in DEFAULT_OUTPUT_DIRECTORIES:
        raise ValueError(f"kind must be one of {sorted(DEFAULT_OUTPUT_DIRECTORIES)}")

    target_dir = _resolve_output_dir(kind_key, out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    md_converter = _build_markitdown_converter(use_llm, llm_client, llm_model, llm_prompt)

    # Raw HTML has no source filename, so the caller must name the output.
    if kind_key == "html_string":
        if out_name:
            return _convert_html_string(md_converter, str(input_value), target_dir, out_name)
        raise ValueError("out_name is required when kind='html_string'")

    # Formats MarkItDown converts directly from the file.
    if kind_key in DIRECT_MARKITDOWN_KINDS:
        return _convert_standard_file(Path(input_value), md_converter, target_dir, out_name)

    # PDFs are rendered to page images first.
    if kind_key == "pdf":
        return _convert_pdf_pipeline(
            Path(input_value),
            md_converter,
            target_dir,
            out_name,
            poppler_path=os.getenv("POPPLER_PATH"),
        )

    # Slide decks go PPTX -> PDF -> page images.
    if kind_key == "pptx":
        return _convert_pptx_pipeline(
            Path(input_value),
            md_converter,
            target_dir,
            out_name,
            dpi=DEFAULT_PDF_DPI,
            fmt=DEFAULT_IMAGE_FORMAT,
            poppler_path=os.getenv("POPPLER_PATH"),
        )

    if kind_key in IMAGE_KINDS:
        return _convert_image_pipeline(Path(input_value), md_converter, target_dir, out_name)

    raise ValueError(f"Unsupported kind '{kind_key}'")


# 1. Convert PDF -> PNGs

Goes through a PDF and converts each page into an image.

In [None]:
# Set the POPPLER_PATH environment variable to this machine's Poppler install,
# but only when it is not already configured (e.g. in the shell or via .env),
# so an existing setting is not overwritten by a machine-specific path.
os.environ.setdefault("POPPLER_PATH", r"C:\Users\RAC62971\Downloads\poppler-25.07.0\Library\bin")

def pdf_to_grouped_pngs(
    pdf_path: Union[str, Path],
    out_dir: Union[str, Path],
    dpi: int = 200,
    group_size: int = 1,
    grouping_prefix: str = "grouping",
    fmt: str = "PNG",
    poppler_path: Optional[Union[str, Path]] = None,
    first_page: Optional[int] = None,
    last_page: Optional[int] = None,
) -> Dict[str, List[str]]:
    """
    Convert a PDF into per-page images and group them into subfolders.

    Parameters
    ----------
    pdf_path : str | Path
        Path to the input PDF, e.g. r"C:\\...\\my.pdf".
    out_dir : str | Path
        Output directory root where grouped folders will be created.
    dpi : int, default 200
        Render DPI for rasterization.
    group_size : int, default 1
        Number of pages per group folder. For example, 2 will place pages
        1 and 2 into 'grouping_1', pages 3 and 4 into 'grouping_2', etc.
    grouping_prefix : str, default "grouping"
        Folder name prefix for each group.
    fmt : str, default "PNG"
        Image format to write. Common options are "PNG" and "JPEG".
    poppler_path : str | Path | None
        Path to Poppler bin directory on Windows if not on PATH.
        Example: r"C:\\tools\\poppler-24.08.0\\Library\\bin"
        When omitted, the ``POPPLER_PATH`` environment variable is used if set.
    first_page : int | None
        Optional first page to convert (1-indexed).
    last_page : int | None
        Optional last page to convert (inclusive).

    Returns
    -------
    Dict[str, List[str]]
        Mapping of group folder name to list of saved image paths (strings).

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    FileNotFoundError
        If ``pdf_path`` does not exist.
    """
    # Validate before touching the filesystem so a bad call leaves no empty folders behind.
    if group_size < 1:
        raise ValueError("group_size must be >= 1")

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"Input file does not exist: {pdf_path}")

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Same fallback as _render_pdf_to_pngs: explicit argument first, then the environment.
    poppler_dir = str(poppler_path) if poppler_path else os.getenv("POPPLER_PATH")

    images = convert_from_path(
        str(pdf_path),
        dpi=dpi,
        first_page=first_page,
        last_page=last_page,
        poppler_path=poppler_dir,
    )

    start_page = first_page or 1
    suffix = fmt.lower()
    saved: Dict[str, List[str]] = {}
    for page_number, img in enumerate(images, start=start_page):
        # 1-indexed group, counted from the first converted page.
        group_idx = (page_number - start_page) // group_size + 1
        group_name = f"{grouping_prefix}_{group_idx}"
        group_dir = out_dir / group_name
        group_dir.mkdir(parents=True, exist_ok=True)

        out_path = group_dir / f"page_{page_number:03}.{suffix}"
        img.save(out_path, fmt)
        saved.setdefault(group_name, []).append(str(out_path))

    return saved


In [51]:
# Render the sample PDF at 200 DPI into .\output_pngs, two pages per grouping_<n> folder.
# Being the last expression in the cell, the returned {folder: [image paths]} mapping is
# displayed as the cell output.
pdf_to_grouped_pngs(
    pdf_path=r".\input_data\V2 - one-pagers with summaries- HRI-OH-99P Project Descriptions (1).pdf",
    out_dir=r".\output_pngs",
    dpi=200,
    group_size=2,
    grouping_prefix="grouping",
    poppler_path=os.environ["POPPLER_PATH"]
)

{'grouping_1': ['output_pngs\\grouping_1\\page_001.png',
  'output_pngs\\grouping_1\\page_002.png'],
 'grouping_2': ['output_pngs\\grouping_2\\page_003.png',
  'output_pngs\\grouping_2\\page_004.png'],
 'grouping_3': ['output_pngs\\grouping_3\\page_005.png',
  'output_pngs\\grouping_3\\page_006.png'],
 'grouping_4': ['output_pngs\\grouping_4\\page_007.png',
  'output_pngs\\grouping_4\\page_008.png'],
 'grouping_5': ['output_pngs\\grouping_5\\page_009.png',
  'output_pngs\\grouping_5\\page_010.png']}

# 2. Convert PNGs -> Markdown

### PNGs

In [None]:
# Root folder holding the grouping_* image folders, and the root for the Markdown output.
PNG_DIR, OUT_DIR = Path("./output_pngs"), Path("./output_markdown")

# LLM-backed converter: each image is sent to LLM_MODEL together with the chosen system prompt.
client = OpenAI(api_key=OPENAI_API_KEY)
image_converter = MarkItDown(llm_client=client, llm_model=LLM_MODEL, llm_prompt=system_prompt)

def _grouping_index(p: Path) -> int:
    """Return N for a path whose final component ends in ``grouping_N``; 0 if it does not."""
    found = re.search(r"grouping_(\d+)$", p.name)
    if found is None:
        return 0
    return int(found.group(1))

def _sorted_imgs(imgs):
    """Return the paths ordered by the last integer in each stem, then by lower-cased name.

    Stems without any digits sort as if their number were 0.
    """
    def _order(candidate: Path):
        numbers = re.findall(r"\d+", candidate.stem)
        trailing = int(numbers[-1]) if numbers else 0
        return trailing, candidate.name.lower()

    ordered = list(imgs)
    ordered.sort(key=_order)
    return ordered

# Ensure OUT_DIR exists
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Find all grouping_* directories at first level under PNG_DIR, ordered by their number.
# A missing PNG_DIR is treated like an empty one, so the message below is shown
# instead of an unhandled error from iterdir().
grouping_dirs = sorted(
    (
        d
        for d in (PNG_DIR.iterdir() if PNG_DIR.is_dir() else ())
        if d.is_dir() and d.name.startswith("grouping_")
    ),
    key=_grouping_index,
)

if not grouping_dirs:
    print(f"No grouping_* directories found in {PNG_DIR.resolve()}")

for tdir in grouping_dirs:
    grouping_num = _grouping_index(tdir)
    imgs = _sorted_imgs(list(tdir.glob("*.png")))
    if not imgs:
        print(f"Skipping {tdir.name} because it has no PNGs")
        continue

    per_image_md = []
    for i, img_path in enumerate(imgs, start=1):
        # Only the conversion call is guarded; a failed image is reported and skipped.
        try:
            res = image_converter.convert(str(img_path))
        except Exception as e:
            print(f"Error processing {img_path}: {e}")
            continue
        # text_content may be None; keep the slide heading rather than failing on .strip().
        per_image_md.append(f"\n\n# Slide {i}\n\n" + (res.text_content or "").strip())

    if not per_image_md:
        print(f"No markdown produced for {tdir.name}")
        continue

    combined_image_md = "".join(per_image_md).strip()

    # Write to OUT_DIR/grouping_{number}/grouping_{number}.md
    out_grouping_dir = OUT_DIR / f"grouping_{grouping_num}"
    out_grouping_dir.mkdir(parents=True, exist_ok=True)
    suffix = "processed"  # NOTE(review): not used in this cell - confirm whether a later cell relies on it
    combined_path = out_grouping_dir / f"grouping_{grouping_num}.md"
    combined_path.write_text(combined_image_md, encoding="utf-8")
    print("Wrote:", combined_path.resolve())
