# 0. Install if needed

In [None]:
%pip install "markitdown[all]" openai pdf2image pydantic Send2Trash PyPDF2

# 1. Imports, basic definitions and option selections

In [1]:
# All needed imports
import io
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict, List, Optional, Union, Tuple, Any

from dotenv import load_dotenv
from markitdown import MarkItDown
from openai import OpenAI
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from send2trash import send2trash



In [2]:
# Set up the logging function
def setup_logger(level=logging.INFO) -> logging.Logger:
    """
    Return the shared "md_pipeline" logger, writing timestamped lines to stdout.

    The stdout handler is attached only when the logger has no handlers yet, so
    calling this again (for example when the cell is re-run) does not duplicate
    output. The level is applied on every call and propagation to the root
    logger is switched off.
    """
    pipeline_logger = logging.getLogger("md_pipeline")

    needs_handler = len(pipeline_logger.handlers) == 0
    if needs_handler:
        stdout_handler = logging.StreamHandler(stream=sys.stdout)
        stdout_handler.setFormatter(
            logging.Formatter(
                "[%(asctime)s] %(levelname)s %(message)s",
                datefmt="%H:%M:%S",
            )
        )
        pipeline_logger.addHandler(stdout_handler)

    pipeline_logger.setLevel(level)
    pipeline_logger.propagate = False
    return pipeline_logger


# Module-wide logger used by every pipeline function in this notebook.
LOGGER = setup_logger()

In [3]:
# Load the API key from the .env file one directory above the notebook's working directory
load_dotenv(Path.cwd().parent / ".env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Model used to convert the page PNGs into Markdown
LLM_MODEL = "gpt-5-mini"

LLM_CLIENT = OpenAI(api_key=OPENAI_API_KEY)

# Read in the system prompts (looked up relative to the current working directory)
sys_prompt_detailed = (Path.cwd() / "system_prompts" / "system_prompt_detailed.md").read_text(encoding="utf-8")
sys_prompt_summarized = (Path.cwd() / "system_prompts" / "system_prompt_summarized.md").read_text(encoding="utf-8")

# Choose the system prompt you'd like to use. "detailed" tries to keep as much of the
# original content as possible, but is slower and more expensive. "summarized" condenses
# much more of the content, but is faster and cheaper to run.
sys_prompt_mode = "detailed"  # can be 'detailed' or 'summarized'
if sys_prompt_mode not in ("detailed", "summarized"):
    # Fail loudly instead of silently falling back to the summarized prompt on a typo.
    raise ValueError(f"sys_prompt_mode must be 'detailed' or 'summarized', got {sys_prompt_mode!r}")
SYSTEM_PROMPT = sys_prompt_detailed if sys_prompt_mode == "detailed" else sys_prompt_summarized

# Define input and output directories
INPUT_BASE_DIR = Path("./input_data/not_processed")
OUTPUT_BASE_DIR = Path("./input_data/processed")

# Poppler location used by pdf2image. An existing POPPLER_PATH environment variable
# (for example one loaded from .env) takes precedence. The machine-specific default
# below is applied only when that folder actually exists; otherwise POPPLER_PATH is
# left unset and pdf_pages_to_pngs passes poppler_path=None to pdf2image.
_DEFAULT_POPPLER_DIR = Path(r"C:\Users\RAC62971\Downloads\poppler-25.07.0\Library\bin")
if "POPPLER_PATH" not in os.environ and _DEFAULT_POPPLER_DIR.is_dir():
    os.environ["POPPLER_PATH"] = str(_DEFAULT_POPPLER_DIR)

# 2. Temporary Conversions

## PPT -> PDF

Converts a PowerPoint file to a temporary PDF, which then goes through the regular PDF processing pipeline (see the "PDF -> PNG" section below). The function tries the following converters, in order:

1. Microsoft PowerPoint COM on Windows
2. LibreOffice soffice
3. unoconv

In [4]:
def _have(cmd: str) -> bool:
    return shutil.which(cmd) is not None

In [5]:
def _ascii_safe_copy(src: Path) -> Path:
    """
    Copy the PPTX to a short ASCII-only filename in a temp folder to avoid
    COM issues with unicode, spaces, quotes, stars, or very long paths.
    Returns the path to the copied file.
    """
    tmp_root = Path(tempfile.mkdtemp(prefix="pptx2pdf_"))
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", src.stem)[:40] + src.suffix
    safe_path = tmp_root / safe_name
    shutil.copy2(src, safe_path)
    return safe_path

In [6]:
def _pptx_to_pdf_via_powerpoint(pptx_path: Path, target_pdf: Path) -> Path:
    """
    Export pptx_path to target_pdf by driving desktop PowerPoint over COM.

    Private helper for pptx_to_pdf. Tries ExportAsFixedFormat first and falls
    back to SaveAs with the PDF file type.

    Returns target_pdf on success. Raises on any failure (pywin32 missing,
    PowerPoint unavailable, no PDF produced) so the caller can fall back to
    another converter.
    """
    # Numeric values of the PowerPoint enums. Literals are used on purpose:
    # win32com.client.constants is only populated when early-bound (makepy)
    # support has been generated, which DispatchEx does not guarantee.
    pp_alerts_none = 1                # ppAlertsNone
    pp_fixed_format_type_pdf = 2      # ppFixedFormatTypePDF
    pp_fixed_format_intent_print = 2  # ppFixedFormatIntentPrint
    pp_print_all = 1                  # ppPrintAll
    pp_save_as_pdf = 32               # ppSaveAsPDF

    # Copy to safe temp path to avoid unicode and long path issues
    safe_src = _ascii_safe_copy(pptx_path)
    LOGGER.info(f"Copied to safe temp path for COM: {safe_src}")
    try:
        import win32com.client  # type: ignore

        LOGGER.info("Starting PowerPoint COM via DispatchEx")
        app = win32com.client.DispatchEx("PowerPoint.Application")
        pres = None
        try:
            # Avoid setting .Visible. It can throw.
            # Reduce popups if possible (best effort).
            try:
                app.DisplayAlerts = pp_alerts_none
            except Exception as e:
                LOGGER.debug(f"Could not set PowerPoint DisplayAlerts: {e}")

            # Open with window suppressed but without toggling .Visible
            # Signature: Open(FileName, ReadOnly, Untitled, WithWindow)
            pres = app.Presentations.Open(
                FileName=str(safe_src),
                ReadOnly=1,
                Untitled=0,
                WithWindow=False,
            )

            # Prefer ExportAsFixedFormat with minimal, supported kwargs
            exported = False
            try:
                pres.ExportAsFixedFormat(
                    Path=str(target_pdf),
                    FixedFormatType=pp_fixed_format_type_pdf,
                    Intent=pp_fixed_format_intent_print,
                    FrameSlides=True,
                    RangeType=pp_print_all,
                    PrintHiddenSlides=False,
                    IncludeDocProperties=True,
                    KeepIRMSettings=True,
                    DocStructureTags=True,
                    BitmapMissingFonts=True,
                    UseISO19005_1=False,
                )
                exported = target_pdf.exists()
                if exported:
                    LOGGER.info(f"Wrote PDF via ExportAsFixedFormat: {target_pdf}")
            except Exception as e:
                LOGGER.warning(f"ExportAsFixedFormat failed, will try SaveAs(32): {e}")

            if not exported:
                # Fallback: SaveAs with ppSaveAsPDF = 32
                pres.SaveAs(str(target_pdf), pp_save_as_pdf)
                exported = target_pdf.exists()
                if exported:
                    LOGGER.info(f"Wrote PDF via SaveAs: {target_pdf}")

            if not exported:
                raise RuntimeError("PowerPoint did not produce the PDF file")

            return target_pdf
        finally:
            # Close presentation and quit app; failures here must not mask the result.
            try:
                if pres is not None:
                    pres.Close()
            except Exception as e:
                LOGGER.debug(f"Closing the presentation failed: {e}")
            try:
                app.Quit()
            except Exception as e:
                LOGGER.debug(f"Quitting PowerPoint failed: {e}")
    finally:
        # The ASCII-safe copy lives in its own temp folder; never leave it behind.
        shutil.rmtree(safe_src.parent, ignore_errors=True)


def pptx_to_pdf(
    pptx_path: Path,
    out_dir: Path,
    method: Optional[str] = None,
    timeout: int = 600,
) -> Path:
    """
    Convert PPTX to PDF and return the PDF path in out_dir.

    Order tried:
      1) PowerPoint COM on Windows using ExportAsFixedFormat, then SaveAs(32)
      2) soffice
      3) unoconv

    Parameters
    - pptx_path: presentation to convert.
    - out_dir: folder that receives "<pptx stem>.pdf"; created if missing.
    - method: None tries every available converter in the order above;
      "powerpoint", "soffice" or "unoconv" restricts the attempt to that one.
    - timeout: seconds allowed for each soffice/unoconv subprocess
      (not applied to the COM path).

    Raises
    - RuntimeError when no attempted converter produced the PDF.

    Key stability choices:
      - Copy to an ASCII-only short temp file before opening via COM, and
        delete that copy afterwards
      - Use DispatchEx to isolate a clean PowerPoint instance
      - Avoid touching Application.Visible
      - Pass enum values as numeric literals so the COM path does not depend
        on generated early-binding support
    """
    pptx_path = Path(pptx_path).resolve()
    out_dir = Path(out_dir).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)
    target_pdf = (out_dir / f"{pptx_path.stem}.pdf").resolve()

    LOGGER.info(f"Converting PPTX to PDF: {pptx_path.name}")

    # 1) PowerPoint COM path on Windows
    if method == "powerpoint" or (method is None and os.name == "nt"):
        if os.name == "nt":
            try:
                return _pptx_to_pdf_via_powerpoint(pptx_path, target_pdf)
            except Exception as e:
                LOGGER.warning(f"PowerPoint COM path failed: {e}")
        else:
            LOGGER.warning("PowerPoint COM path requested but this is not Windows; skipping")

    # 2) LibreOffice soffice, then 3) unoconv: both write "<stem>.pdf" into out_dir
    cli_converters = (
        (
            "soffice",
            "Using LibreOffice soffice",
            [
                "soffice",
                "--headless",
                "--invisible",
                "--norestore",
                "--convert-to",
                "pdf",
                "--outdir",
                str(out_dir),
                str(pptx_path),
            ],
        ),
        (
            "unoconv",
            "Using unoconv",
            [
                "unoconv",
                "-f", "pdf",
                "-o", str(out_dir),
                str(pptx_path),
            ],
        ),
    )
    for tool, banner, cmd in cli_converters:
        if not (method == tool or (method is None and _have(tool))):
            continue
        try:
            LOGGER.info(banner)
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
            if target_pdf.exists():
                LOGGER.info(f"Wrote PDF via {tool}: {target_pdf}")
                return target_pdf
            LOGGER.warning(f"{tool} exited successfully but did not create {target_pdf}")
        except Exception as e:
            LOGGER.warning(f"{tool} path failed: {e}")

    raise RuntimeError(
        "Could not convert PPTX to PDF. Install LibreOffice or unoconv, or ensure desktop PowerPoint is installed."
    )

## PDF -> PNG

Goes through a PDF and converts each page into an image.

In [7]:
def pdf_pages_to_pngs(
    pdf_path: Union[str, Path],
    dpi: int = 200,
    poppler_path: Optional[str] = None,
) -> Tuple[Path, List[Path]]:
    """
    Convert every page of a PDF to PNGs stored in a temporary directory.

    Parameters
    - pdf_path: PDF to render.
    - dpi: rendering resolution passed to pdf2image.
    - poppler_path: folder with the Poppler binaries. When None, the
      POPPLER_PATH environment variable is used (which may itself be unset).

    Returns (temp_dir, [png_paths]). Caller removes temp_dir.

    If rendering fails, the temporary directory is deleted before the
    exception propagates, since the caller never receives its path.
    """
    pdf_path = Path(pdf_path)

    # Temp folder and image names embed the PDF stem. Use a shortened,
    # filesystem-safe form so long document titles cannot push the generated
    # paths past platform path-length limits.
    short_stem = re.sub(r"[^A-Za-z0-9._-]+", "_", pdf_path.stem)[:40] or "pdf"
    temp_dir = Path(tempfile.mkdtemp(prefix=f"{short_stem}_pdfpages_"))

    # Page count estimate for better logs; failure here is not fatal
    try:
        total_pages = len(PdfReader(str(pdf_path)).pages)
    except Exception:
        total_pages = None

    if poppler_path is None:
        poppler_path = os.environ.get("POPPLER_PATH")

    start = time.time()
    if total_pages is not None:
        LOGGER.info(f"Rendering {total_pages} page(s) from {pdf_path.name} at {dpi} dpi via pdf2image")
    else:
        LOGGER.info(f"Rendering pages from {pdf_path.name} at {dpi} dpi via pdf2image")

    # Write directly to temp_dir so files appear incrementally on disk
    try:
        png_paths = convert_from_path(
            str(pdf_path),
            dpi=dpi,
            poppler_path=poppler_path,
            output_folder=str(temp_dir),
            paths_only=True,
            fmt="png",
            output_file=f"{short_stem}_page",
        )
    except BaseException:
        # Includes KeyboardInterrupt: an interrupted render must not leave images behind.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    elapsed = time.time() - start
    LOGGER.info(f"Rendering complete in {elapsed:.1f}s. Generated {len(png_paths)} image(s) in {temp_dir}")
    return temp_dir, [Path(p) for p in png_paths]

# 3. Markdown Conversions

In [8]:
def _mirror_markdown_to_backend(md_path: Path) -> None:
    """Ensure backend/merged_files has a symlink to the generated markdown."""
    repo_root = None
    for candidate in [Path.cwd()] + list(Path.cwd().parents):
        backend_dir = candidate / "backend"
        preprocessing_dir = candidate / "preprocessing"
        if backend_dir.exists() and preprocessing_dir.exists():
            repo_root = candidate
            break
    if repo_root is None:
        LOGGER.warning("Skipping backend symlink; repository root not found.")
        return
    merged_dir = repo_root / "backend" / "merged_files"
    merged_dir.mkdir(parents=True, exist_ok=True)
    link_path = merged_dir / md_path.name
    try:
        if link_path.exists() or link_path.is_symlink():
            if link_path.is_dir() and not link_path.is_symlink():
                LOGGER.warning(f"Cannot replace directory at {link_path}; skipping symlink.")
                return
            link_path.unlink()
        link_path.symlink_to(md_path.resolve())
        LOGGER.info(f"Symlinked {md_path} to {link_path}")
    except OSError as err:
        LOGGER.warning(f"Failed to symlink {md_path} into backend/merged_files: {err}")
        
def convert_to_markdown_file(
    input_value: Union[str, Path],
    kind: str,
    use_llm: bool = False,
    llm_client=None,
    llm_model: Optional[str] = None,
    llm_prompt: Optional[str] = None,
    out_dir: Optional[Union[str, Path]] = None,
    out_name: Optional[str] = None,
    project_context: Optional[Dict] = None
) -> Path:
    """
    Convert an input file or HTML string to Markdown using MarkItDown
    and write it to {out_dir}/{base_name}.md.

    Parameters
    - input_value
      - For file-based kinds, pass a filesystem path.
      - For 'html_string', pass a raw HTML string.
    - kind: one of {'docx','xlsx','xls','html','htm','txt','log','csv','html_string', "png", "pptx", "pdf"}
      (case and surrounding whitespace are ignored).
    - use_llm: default False. If True, provide llm_client and llm_model.
    - llm_client, llm_model, llm_prompt: forwarded to MarkItDown when use_llm=True
    - out_dir: optional override of the output directory. Defaults to ./input_data/processed
    - out_name: optional base filename without extension. Defaults to input file stem when
      applicable; required for 'html_string'.
    - project_context: dict of project metadata. It is rendered as a
      "PROJECT CONTEXT" bullet list and prepended to llm_prompt, so it only has
      an effect when use_llm=True. It is not written into the markdown file here.

    Returns
    - Path to the written Markdown file.

    Raises
    - ValueError for an unsupported kind, a missing out_name with 'html_string',
      or use_llm=True without llm_client/llm_model.
    """
    # 1) Normalize kind and validate arguments before touching the filesystem
    file_type = kind.lower().strip()
    allowed = {"docx", "xlsx", "xls", "html", "htm", "txt", "log", "csv", "html_string", "png", "pptx", "pdf"}
    if file_type not in allowed:
        raise ValueError(f"kind must be one of {allowed}")
    if file_type == "html_string" and not out_name:
        raise ValueError("out_name is required when kind='html_string'")
    if use_llm and (llm_client is None or llm_model is None):
        raise ValueError("use_llm=True requires llm_client and llm_model")

    out_dir = Path("./input_data/processed" if out_dir is None else out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 2) Add project context to the system prompt
    if project_context:
        context_lines = "".join(f"- **{key}:** {value}\n" for key, value in project_context.items())
        llm_prompt = f"#### PROJECT CONTEXT\n{context_lines}\n{llm_prompt or ''}"

    # 3) Build MarkItDown converter
    if use_llm:
        converter = MarkItDown(llm_client=llm_client, llm_model=llm_model, llm_prompt=llm_prompt)
    else:
        converter = MarkItDown()

    # 4) Convert depending on kind
    if file_type == "html_string":
        # MarkItDown converts from paths, URLs or binary streams; feed the HTML
        # text as an in-memory stream and tell it to treat the bytes as HTML.
        html_stream = io.BytesIO(str(input_value).encode("utf-8"))
        res = converter.convert_stream(html_stream, file_extension=".html")
        base_name = out_name
    else:
        source_path = Path(input_value)
        res = converter.convert(str(source_path))
        base_name = out_name or source_path.stem

    # 5) Write markdown and mirror it into backend/merged_files
    out_path = out_dir / f"{base_name}.md"
    out_path.write_text(res.text_content, encoding="utf-8")
    LOGGER.info(f"Wrote {base_name}.md to {out_path}")
    _mirror_markdown_to_backend(out_path)
    return out_path

Converts a list of images into a single Markdown file.

In [9]:
def images_to_single_markdown(
    image_paths: List[Path],
    out_md_path: Path,
    project_context: Optional[dict],
    llm_client,
    llm_model: str,
    llm_prompt: str,
    per_page_heading: bool = True,
    log_every: int = 1,
) -> Path:
    """
    Run each image through MarkItDown and combine the results into one file.

    Pages are processed in the order given. A page whose conversion raises is
    represented by an italic "Extraction failed" note instead of aborting the
    run. When project_context is non-empty it is written as a bullet list at
    the top of the document. With per_page_heading, every page is preceded by
    a "## Page N" heading. Progress is logged every `log_every` pages and for
    the last page; a falsy `log_every` disables per-page logging.

    The combined markdown is written once to out_md_path (parent folders are
    created), mirrored into the backend folder, and out_md_path is returned.
    """
    converter = MarkItDown(
        llm_client=llm_client,
        llm_model=llm_model,
        llm_prompt=llm_prompt,
    )

    page_count = len(image_paths)
    sections: List[str] = []

    if project_context:
        bullets = "\n".join(f"- **{k}**: {v}" for k, v in project_context.items())
        sections.append("### Project context\n" + bullets + "\n")

    LOGGER.info(f"Starting LLM extraction for {page_count} page image(s)")
    run_started = time.time()

    page_number = 0
    for image in image_paths:
        page_number += 1
        page_started = time.time()

        succeeded = True
        try:
            result = converter.convert(str(image))
            if hasattr(result, "text_content"):
                page_text = result.text_content
            else:
                page_text = str(result)
        except Exception as err:
            succeeded = False
            page_text = f"_Extraction failed for {image.name}: {err}_"

        if per_page_heading:
            sections.append(f"\n## Page {page_number}\n")
        sections.append(page_text.strip())

        is_last_page = page_number == page_count
        if log_every and (page_number % log_every == 0 or is_last_page):
            outcome = "ok" if succeeded else "error"
            percent = int((page_number / page_count) * 100) if page_count else 100
            LOGGER.info(
                f"Extracted page {page_number}/{page_count} [{percent}%] "
                f"in {time.time() - page_started:.1f}s ({outcome})"
            )

    out_md_path.parent.mkdir(parents=True, exist_ok=True)
    combined = "\n\n".join(sections)
    out_md_path.write_text(combined, encoding="utf-8")

    LOGGER.info(f"Wrote combined markdown to {out_md_path} in {time.time() - run_started:.1f}s")
    _mirror_markdown_to_backend(out_md_path)
    return out_md_path

# 4. Process a Project

In [10]:
def _unique_destination(dest_dir: Path, name: str) -> Path:
    """Return a unique path inside dest_dir for name without overwriting existing files."""
    candidate = dest_dir / name
    if not candidate.exists():
        return candidate
    stem, suffix = Path(name).stem, Path(name).suffix
    i = 1
    while True:
        nxt = dest_dir / f"{stem}__copy_{i}{suffix}"
        if not nxt.exists():
            return nxt
        i += 1

In [11]:
def _render_pdf_to_markdown(pdf_path: Path, out_md_path: Path, project_metadata: dict) -> Path:
    """
    Render pdf_path to page images and convert them into one markdown file.

    Shared by the PDF and PPTX pipelines of process_project. The temporary
    image directory is removed whether or not the conversion succeeds.
    Returns out_md_path.
    """
    tmp_img_dir, png_paths = pdf_pages_to_pngs(
        pdf_path=pdf_path,
        dpi=200,
        poppler_path=os.environ.get("POPPLER_PATH"),
    )
    try:
        images_to_single_markdown(
            image_paths=sorted(png_paths, key=lambda p: p.name),
            out_md_path=out_md_path,
            project_context=project_metadata,
            llm_client=LLM_CLIENT,
            llm_model=LLM_MODEL,
            llm_prompt=SYSTEM_PROMPT,
            per_page_heading=True,
            log_every=1,  # increase to 2 or 5 if you want fewer log lines
        )
    finally:
        shutil.rmtree(tmp_img_dir, ignore_errors=True)
        LOGGER.info(f"Removed temporary image directory {tmp_img_dir}")
    return out_md_path


def process_project(project_name: str):
    """
    Process an entire project directory.

    Every file in INPUT_BASE_DIR/<project_name> except metadata.json is
    converted to markdown under OUTPUT_BASE_DIR/<project_name>/markdown.
    PPTX files are converted to a temporary PDF first; PDFs are rendered to
    page images and sent through the LLM; PNGs go through MarkItDown with the
    LLM; everything else goes through MarkItDown without the LLM. After a
    successful conversion the original is copied to
    OUTPUT_BASE_DIR/<project_name>/original_files and the source is sent to
    the recycle bin. A file that fails is logged and skipped.

    Args:
        project_name (str): Name of the project directory

    Returns:
        Path of the most recently written markdown file, or None when no file
        was converted.

    Raises:
        FileNotFoundError: if the project input directory does not exist.
    """
    print(f"Processing project: {project_name}")

    # Get input and output paths
    project_input_dir = INPUT_BASE_DIR / project_name
    project_output_dir = OUTPUT_BASE_DIR / project_name
    if not project_input_dir.is_dir():
        # Check before creating any output folders for a mistyped project name.
        raise FileNotFoundError(f"Project input directory not found: {project_input_dir}")

    # Create output directory structure
    markdown_dir = project_output_dir / "markdown"
    originals_dir = project_output_dir / "original_files"
    markdown_dir.mkdir(parents=True, exist_ok=True)
    originals_dir.mkdir(parents=True, exist_ok=True)

    # Read project metadata (explicit UTF-8 rather than the platform default encoding)
    metadata_path = project_input_dir / "metadata.json"
    project_metadata = {}
    if metadata_path.exists():
        project_metadata = json.loads(metadata_path.read_text(encoding="utf-8"))

    # Stays None when nothing is converted, so the final return never hits an unbound name.
    output_path = None

    # Snapshot and sort the listing: files are removed while we iterate, and a
    # stable order makes "the last processed file" deterministic.
    for file_path in sorted(project_input_dir.iterdir()):
        if not file_path.is_file() or file_path.name == "metadata.json":
            continue
        try:
            # Determine file type
            file_ext = file_path.suffix.lower()[1:]  # Remove the dot
            combined_out_md = markdown_dir / f"{file_path.stem}.md"

            if file_ext == "pptx":
                LOGGER.info(f"PPTX pipeline started for {file_path.name}")

                # The transient PDF lives in its own temp folder and is never kept
                tmp_pdf_root = Path(tempfile.mkdtemp(prefix=f"{file_path.stem}_tmp_pdf_"))
                try:
                    pdf_path = pptx_to_pdf(
                        pptx_path=file_path,
                        out_dir=tmp_pdf_root,
                        method=None,  # or "powerpoint" | "soffice" | "unoconv"
                    )
                    output_path = _render_pdf_to_markdown(pdf_path, combined_out_md, project_metadata)
                finally:
                    shutil.rmtree(tmp_pdf_root, ignore_errors=True)
                    LOGGER.info(f"Removed temporary PDF directory {tmp_pdf_root}")

                LOGGER.info(f"PPTX pipeline complete for {file_path.name}")

            elif file_ext == "pdf":
                LOGGER.info(f"PDF pipeline start for {file_path.name}")
                output_path = _render_pdf_to_markdown(file_path, combined_out_md, project_metadata)
                LOGGER.info(f"PDF pipeline complete for {file_path.name}")

            elif file_ext == "png":
                # Single images go straight through MarkItDown with the LLM
                output_path = convert_to_markdown_file(
                    input_value=str(file_path),
                    kind=file_ext,
                    use_llm=True,
                    llm_client=LLM_CLIENT,
                    llm_model=LLM_MODEL,
                    llm_prompt=SYSTEM_PROMPT,
                    out_dir=markdown_dir,
                    project_context=project_metadata,
                )

            else:
                # Process other formats normally; unsupported extensions raise
                # ValueError inside convert_to_markdown_file and are skipped below
                output_path = convert_to_markdown_file(
                    input_value=str(file_path),
                    kind=file_ext,
                    use_llm=False,
                    out_dir=markdown_dir,
                )

            # Copy the original into /original_files before deletion
            dest = _unique_destination(originals_dir, file_path.name)
            shutil.copy2(file_path, dest)

            # Now remove the source from the input folder by sending to recycle bin
            send2trash(str(file_path))

        except Exception as e:
            LOGGER.error(f"Error processing {file_path.name}: {e}")
            continue

    print(f"Completed processing project: {project_name}")

    return output_path

In [12]:
# Run the pipeline on the "moadchat" project folder (looked up under INPUT_BASE_DIR)
process_project("moadchat")

Processing project: moadchat
[11:16:00] INFO PDF pipeline start for Componentization- Decomposing Monolithic LLM Responses into Manipulable Semantic Units V1.pdf
[11:16:00] INFO Rendering 12 page(s) from Componentization- Decomposing Monolithic LLM Responses into Manipulable Semantic Units V1.pdf at 200 dpi via pdf2image
[11:16:09] INFO Rendering complete in 9.5s. Generated 12 image(s) in C:\Users\RAC62971\AppData\Local\Temp\Componentization- Decomposing Monolithic LLM Responses into Manipulable Semantic Units V1_pdfpages_tiz5479b
[11:16:09] INFO Starting LLM extraction for 12 page image(s)
[11:16:39] INFO Extracted page 1/12 [8%] in 30.0s (ok)
[11:17:10] INFO Extracted page 2/12 [16%] in 31.0s (ok)
[11:17:45] INFO Extracted page 3/12 [25%] in 34.3s (ok)
[11:18:14] INFO Extracted page 4/12 [33%] in 29.5s (ok)
[11:18:47] INFO Extracted page 5/12 [41%] in 33.1s (ok)
[11:19:25] INFO Extracted page 6/12 [50%] in 37.8s (ok)
[11:20:00] INFO Extracted page 7/12 [58%] in 35.3s (ok)
[11:20:28] 

WindowsPath('input_data/processed/moadchat/markdown/Componentization- Decomposing Monolithic LLM Responses into Manipulable Semantic Units V1.md')