# 0. Install if needed

In [None]:
%pip install "markitdown[all]" openai pdf2image pydantic

### Imports, basic definitions and option selections

In [1]:
import os
import re
import json
from pathlib import Path
from openai import OpenAI
from markitdown import MarkItDown
from dotenv import load_dotenv
from pdf2image import convert_from_path
from typing import Dict, List, Optional, Union, Any



In [2]:
# Load in the API key from the top-level .env file (one directory above the current working directory)
load_dotenv(Path.cwd().parent / ".env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Define the model we want to use to convert the PNGs into Markdown
LLM_MODEL = "gpt-5-mini"

# Read in the system prompts
sys_prompt_detailed = (Path.cwd() / "system_prompts" / "system_prompt_detailed.md").read_text(encoding="utf-8")
sys_prompt_summarized = (Path.cwd() / "system_prompts" / "system_prompt_summarized.md").read_text(encoding="utf-8")

# Choose the system prompt you'd like to use. Detailed will attempt to keep as much of the original content
# as possible, at the cost of speed and cost. Summarized will summarize much more of the content but it will
# be fast and cost less to process the data
sys_prompt_mode = "detailed" # can be 'detailed' or 'summarized'
# Fail loudly on a typo instead of silently falling back to the summarized prompt.
if sys_prompt_mode not in ("detailed", "summarized"):
    raise ValueError(f"sys_prompt_mode must be 'detailed' or 'summarized', got {sys_prompt_mode!r}")
system_prompt = sys_prompt_detailed if sys_prompt_mode == "detailed" else sys_prompt_summarized

# Define input and output directories
INPUT_BASE_DIR = Path("./input_data/not_processed")
OUTPUT_BASE_DIR = Path("./input_data/processed")

In [None]:
def convert_to_markdown_file(
    input_value: Union[str, Path],
    kind: str,
    use_llm: bool = False,
    llm_client=None,
    llm_model: Optional[str] = None,
    llm_prompt: Optional[str] = None,
    out_dir: Optional[Union[str, Path]] = None,
    out_name: Optional[str] = None,
    project_context: Optional[Dict] = None
) -> Path:
    """
    Convert an input file or HTML string to Markdown using MarkItDown and write it to disk.

    Parameters
    - input_value
      - For file-based kinds, pass a filesystem path.
      - For 'html_string', pass a raw HTML string.
    - kind: one of {'docx','xlsx','xls','html','htm','txt','log','csv','html_string','png','pptx','pdf'}.
      Case and surrounding whitespace are ignored.
    - use_llm: default False. If True, provide llm_client and llm_model.
    - llm_client, llm_model, llm_prompt: forwarded to MarkItDown when use_llm=True
    - out_dir: optional override of the output directory. Defaults to ./output_(group), where the
      group is xlsx, html, txt, docx, csv or images depending on kind.
    - out_name: optional base filename without extension. Defaults to input file stem when applicable.
      Required when kind='html_string'.
    - project_context: dict containing project metadata that gets added to markdown output

    Returns
    - Path to the written Markdown file.

    Raises
    - ValueError: if kind is not supported, if use_llm=True without llm_client/llm_model,
      or if kind='html_string' without out_name.
    """
    import io

    # 1) Normalize kind; the mapping doubles as the set of supported kinds
    default_dirs = {
        "xlsx": "./output_xlsx",
        "xls": "./output_xlsx",
        "html": "./output_html",
        "htm": "./output_html",
        "html_string": "./output_html",
        "txt": "./output_txt",
        "log": "./output_txt",
        "docx": "./output_docx",
        "csv": "./output_csv",
        "png": "./output_images",
        "pptx": "./output_images",
        "pdf": "./output_images",
    }
    k = kind.lower().strip()
    if k not in default_dirs:
        raise ValueError(f"kind must be one of {sorted(default_dirs)}")

    # 2) Validate every argument before touching the filesystem or building a converter
    if use_llm and (llm_client is None or llm_model is None):
        raise ValueError("use_llm=True requires llm_client and llm_model")
    if k == "html_string" and not out_name:
        raise ValueError("out_name is required when kind='html_string'")

    out_dir = Path(default_dirs[k] if out_dir is None else out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 3) Build MarkItDown converter
    from markitdown import MarkItDown
    if use_llm:
        converter = MarkItDown(llm_client=llm_client, llm_model=llm_model, llm_prompt=llm_prompt)
    else:
        converter = MarkItDown()

    # 4) Convert depending on kind
    if k == "html_string":
        # MarkItDown has no string entry point; feed the HTML through convert_stream
        # as UTF-8 bytes with an explicit extension so the HTML converter is selected.
        html_stream = io.BytesIO(str(input_value).encode("utf-8"))
        res = converter.convert_stream(html_stream, file_extension=".html")
        base_name = out_name
    else:
        p = Path(input_value)
        res = converter.convert(str(p))
        base_name = out_name or p.stem

    # 5) Add project context to beginning of Markdown output if provided
    markdown = res.text_content
    if project_context:
        header_parts = ["#### PROJECT CONTEXT\n"]
        header_parts.extend(f"- **{key}:** {value}\n" for key, value in project_context.items())
        markdown = "".join(header_parts) + "\n" + markdown

    # 6) Write markdown
    out_path = out_dir / f"{base_name}.md"
    out_path.write_text(markdown, encoding="utf-8")
    return out_path

## Markdown conversion function
All file types except for PowerPoint are converted directly into Markdown. PowerPoints are first converted into a PDF, which is then converted into PNGs, and finally those PNGs are converted into Markdown.

Allowed file types: 
* PowerPoint
    - pptx
* Word
    - docx
* Excel
    - xlsx
    - xls
    - csv
* Text
    - txt
    - pdf
    - html
    - htm
    - html_string
    - log

# 1. Convert PDF -> PNGs

Goes through a PDF and converts each page into an image.

In [None]:
# Set the POPPLER_PATH environment variable (machine-specific Windows path; adjust for your install).
# This is a raw string, so backslashes are written once - doubling them would put literal "\\" pairs in the path.
os.environ["POPPLER_PATH"] = r"C:\Users\RAC62971\Downloads\poppler-25.07.0\Library\bin"

def pdf_to_grouped_pngs(
    pdf_path: Union[str, Path],
    out_dir: Union[str, Path],
    dpi: int = 200,
    group_size: int = 1,
    grouping_prefix: str = "grouping",
    fmt: str = "PNG",
    poppler_path: Optional[Union[str, Path]] = None,
    first_page: Optional[int] = None,
    last_page: Optional[int] = None,
) -> Dict[str, List[str]]:
    """
    Rasterize a PDF page by page and store the images in numbered group folders.

    Parameters
    ----------
    pdf_path : str | Path
        Location of the PDF to render.
    out_dir : str | Path
        Root directory that receives the group folders. It is created
        (including parents) if it does not exist.
    dpi : int, default 200
        Resolution passed to the renderer.
    group_size : int, default 1
        How many consecutive pages share one group folder. With 2, the
        first two rendered pages land in '<prefix>_1', the next two in
        '<prefix>_2', and so on.
    grouping_prefix : str, default "grouping"
        Prefix of every group folder name.
    fmt : str, default "PNG"
        Image format used when saving; its lowercase form is the file extension.
    poppler_path : str | Path | None
        Directory containing the Poppler binaries, forwarded to
        ``convert_from_path``. ``None`` (or any falsy value) forwards ``None``.
    first_page : int | None
        First page to render (1-indexed). Page numbers in file names start here.
    last_page : int | None
        Last page to render (inclusive).

    Returns
    -------
    Dict[str, List[str]]
        Group folder name -> list of image paths (as strings) saved in it,
        in page order.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    source_pdf = Path(pdf_path)
    root = Path(out_dir)
    root.mkdir(parents=True, exist_ok=True)

    if group_size < 1:
        raise ValueError("group_size must be >= 1")

    poppler_dir = str(poppler_path) if poppler_path else None
    pages = convert_from_path(
        str(source_pdf),
        dpi=dpi,
        first_page=first_page,
        last_page=last_page,
        poppler_path=poppler_dir,
    )

    # Page numbers in file names continue from the requested first page (or 1).
    page_offset = first_page or 1
    extension = fmt.lower()

    grouped: Dict[str, List[str]] = {}
    for position, page_image in enumerate(pages):
        page_number = page_offset + position
        # Groups are numbered from 1 and count rendered pages, not absolute page numbers.
        group_name = f"{grouping_prefix}_{position // group_size + 1}"

        folder = root / group_name
        folder.mkdir(parents=True, exist_ok=True)

        target = folder / f"page_{page_number:03}.{extension}"
        page_image.save(target, fmt)
        grouped.setdefault(group_name, []).append(str(target))

    return grouped

In [None]:
def process_project(project_name: str):
    """
    Process an entire project directory.

    Every file in INPUT_BASE_DIR/<project_name> except metadata.json is converted to
    Markdown and written to OUTPUT_BASE_DIR/<project_name>. If metadata.json exists,
    its key/value pairs are prepended to every Markdown file as a
    "PROJECT CONTEXT" block.

    - pdf: each page is rendered to a PNG in a temporary directory, every page image is
      converted with the LLM-backed MarkItDown converter, and the pages are combined
      into <stem>_pages.md.
    - png: converted directly with the LLM-backed converter into <stem>.md.
    - anything else (including pptx): delegated to convert_to_markdown_file without an LLM.

    A failure on one file is printed and does not stop the remaining files.

    Args:
        project_name (str): Name of the project directory
    """
    import tempfile

    print(f"Processing project: {project_name}")

    # Get input and output paths
    project_input_dir = INPUT_BASE_DIR / project_name
    project_output_dir = OUTPUT_BASE_DIR / project_name

    # Create output directory
    project_output_dir.mkdir(parents=True, exist_ok=True)

    # Read project metadata
    metadata_path = project_input_dir / "metadata.json"
    project_metadata = {}
    if metadata_path.exists():
        with open(metadata_path, "r", encoding="utf-8") as f:
            project_metadata = json.load(f)

    # Same header layout that convert_to_markdown_file emits, so PDF/PNG output
    # carries the project context as well.
    context_header = ""
    if project_metadata:
        context_header = (
            "#### PROJECT CONTEXT\n"
            + "".join(f"- **{key}:** {value}\n" for key, value in project_metadata.items())
            + "\n"
        )

    # Initialize client for LLM-based conversions
    client = OpenAI(api_key=OPENAI_API_KEY)
    image_converter = MarkItDown(
        llm_client=client,
        llm_model=LLM_MODEL,
        llm_prompt=system_prompt
    )

    # Process files in project directory (sorted for a reproducible order)
    for file_path in sorted(project_input_dir.iterdir()):
        if not file_path.is_file() or file_path.name == "metadata.json":
            continue

        try:
            print(f"Processing file: {file_path.name}")

            # Determine file type
            file_ext = file_path.suffix.lower()[1:]  # Remove the dot

            if file_ext == "pdf":
                # Convert PDF to PNGs first, then to markdown. The temporary directory
                # is removed even when a page conversion fails.
                page_sections = []
                with tempfile.TemporaryDirectory(prefix="temp_pngs_") as tmp_dir:
                    grouped = pdf_to_grouped_pngs(
                        pdf_path=str(file_path),
                        out_dir=tmp_dir,
                        dpi=200,
                        group_size=1,
                        grouping_prefix="page",
                        poppler_path=os.environ.get("POPPLER_PATH")
                    )
                    # The images live in per-group subfolders, so take the paths from the
                    # returned mapping (already in page order) rather than globbing the root.
                    png_files = [png for group_paths in grouped.values() for png in group_paths]
                    for i, png_path in enumerate(png_files, 1):
                        res = image_converter.convert(str(png_path))
                        page_sections.append(f"# Page {i}\n\n" + res.text_content.strip())

                if not page_sections:
                    print(f"No pages rendered for {file_path.name}; nothing written")
                    continue

                # Write combined markdown with project context
                md_output_path = project_output_dir / (file_path.stem + "_pages.md")
                md_output_path.write_text(
                    context_header + "\n\n".join(page_sections).strip(),
                    encoding="utf-8",
                )
                print(f"Wrote: {md_output_path}")

            elif file_ext == "png":
                # Process PNG directly
                res = image_converter.convert(str(file_path))
                output_path = project_output_dir / (file_path.stem + ".md")
                output_path.write_text(context_header + res.text_content, encoding="utf-8")
                print(f"    Wrote: {output_path}")

            else:
                # Process other formats normally. PowerPoint is handled here too: the
                # PPTX -> PDF -> PNG route is not implemented, so it is converted directly.
                output_path = convert_to_markdown_file(
                    input_value=str(file_path),
                    kind=file_ext,
                    use_llm=False,
                    out_dir=project_output_dir,
                    project_context=project_metadata
                )
                print(f"    Wrote: {output_path}")

            # The original file is left in place; the written markdown marks it as processed.
            print(f"Successfully processed: {file_path.name}")

        except Exception as e:
            # Per-file boundary: report and move on to the next file.
            print(f"Error processing {file_path.name}: {e}")
            continue

    print(f"Completed processing project: {project_name}")

# 2. Main Processing Pipeline

### Process all projects with context from metadata.json

In [None]:
def convert_to_markdown_file(
    input_value: Union[str, Path],
    kind: str,
    use_llm: bool = False,
    llm_client=None,
    llm_model: Optional[str] = None,
    llm_prompt: Optional[str] = None,
    out_dir: Optional[Union[str, Path]] = None,
    out_name: Optional[str] = None,
    project_context: Optional[Dict] = None
) -> Path:
    """
    Convert an input file or HTML string to Markdown using MarkItDown and write it to disk.

    Parameters
    - input_value
      - For file-based kinds, pass a filesystem path.
      - For 'html_string', pass a raw HTML string.
    - kind: one of {'docx','xlsx','xls','html','htm','txt','log','csv','html_string','png','pptx','pdf'}.
      Case and surrounding whitespace are ignored.
    - use_llm: default False. If True, provide llm_client and llm_model.
    - llm_client, llm_model, llm_prompt: forwarded to MarkItDown when use_llm=True
    - out_dir: optional override of the output directory. Defaults to ./output_(group), where the
      group is xlsx, html, txt, docx, csv or images depending on kind.
    - out_name: optional base filename without extension. Defaults to input file stem when applicable.
      Required when kind='html_string'.
    - project_context: dict containing project metadata that gets added to markdown output

    Returns
    - Path to the written Markdown file.

    Raises
    - ValueError: if kind is not supported, if use_llm=True without llm_client/llm_model,
      or if kind='html_string' without out_name.
    """
    import io

    # 1) Normalize kind; the mapping doubles as the set of supported kinds
    default_dirs = {
        "xlsx": "./output_xlsx",
        "xls": "./output_xlsx",
        "html": "./output_html",
        "htm": "./output_html",
        "html_string": "./output_html",
        "txt": "./output_txt",
        "log": "./output_txt",
        "docx": "./output_docx",
        "csv": "./output_csv",
        "png": "./output_images",
        "pptx": "./output_images",
        "pdf": "./output_images",
    }
    k = kind.lower().strip()
    if k not in default_dirs:
        raise ValueError(f"kind must be one of {sorted(default_dirs)}")

    # 2) Validate every argument before touching the filesystem or building a converter
    if use_llm and (llm_client is None or llm_model is None):
        raise ValueError("use_llm=True requires llm_client and llm_model")
    if k == "html_string" and not out_name:
        raise ValueError("out_name is required when kind='html_string'")

    out_dir = Path(default_dirs[k] if out_dir is None else out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 3) Build MarkItDown converter
    from markitdown import MarkItDown
    if use_llm:
        converter = MarkItDown(llm_client=llm_client, llm_model=llm_model, llm_prompt=llm_prompt)
    else:
        converter = MarkItDown()

    # 4) Convert depending on kind
    if k == "html_string":
        # MarkItDown has no string entry point; feed the HTML through convert_stream
        # as UTF-8 bytes with an explicit extension so the HTML converter is selected.
        html_stream = io.BytesIO(str(input_value).encode("utf-8"))
        res = converter.convert_stream(html_stream, file_extension=".html")
        base_name = out_name
    else:
        p = Path(input_value)
        res = converter.convert(str(p))
        base_name = out_name or p.stem

    # 5) Add project context to beginning of Markdown output if provided
    markdown = res.text_content
    if project_context:
        header_parts = ["#### PROJECT CONTEXT\n"]
        header_parts.extend(f"- **{key}:** {value}\n" for key, value in project_context.items())
        markdown = "".join(header_parts) + "\n" + markdown

    # 6) Write markdown
    out_path = out_dir / f"{base_name}.md"
    out_path.write_text(markdown, encoding="utf-8")
    return out_path


def process_project(project_name: str):
    """
    Process an entire project directory.

    Every file in INPUT_BASE_DIR/<project_name> except metadata.json is converted to
    Markdown and written to OUTPUT_BASE_DIR/<project_name>. If metadata.json exists,
    its key/value pairs are prepended to every Markdown file as a
    "PROJECT CONTEXT" block.

    - pdf: each page is rendered to a PNG in a temporary directory, every page image is
      converted with the LLM-backed MarkItDown converter, and the pages are combined
      into <stem>_pages.md.
    - png: converted directly with the LLM-backed converter into <stem>.md.
    - anything else (including pptx): delegated to convert_to_markdown_file without an LLM.

    A failure on one file is printed and does not stop the remaining files.

    Args:
        project_name (str): Name of the project directory
    """
    import tempfile

    print(f"Processing project: {project_name}")

    # Get input and output paths
    project_input_dir = INPUT_BASE_DIR / project_name
    project_output_dir = OUTPUT_BASE_DIR / project_name

    # Create output directory
    project_output_dir.mkdir(parents=True, exist_ok=True)

    # Read project metadata
    metadata_path = project_input_dir / "metadata.json"
    project_metadata = {}
    if metadata_path.exists():
        with open(metadata_path, "r", encoding="utf-8") as f:
            project_metadata = json.load(f)

    # Same header layout that convert_to_markdown_file emits, so PDF/PNG output
    # carries the project context as well.
    context_header = ""
    if project_metadata:
        context_header = (
            "#### PROJECT CONTEXT\n"
            + "".join(f"- **{key}:** {value}\n" for key, value in project_metadata.items())
            + "\n"
        )

    # Initialize client for LLM-based conversions
    client = OpenAI(api_key=OPENAI_API_KEY)
    image_converter = MarkItDown(
        llm_client=client,
        llm_model=LLM_MODEL,
        llm_prompt=system_prompt
    )

    # Process files in project directory (sorted for a reproducible order)
    for file_path in sorted(project_input_dir.iterdir()):
        if not file_path.is_file() or file_path.name == "metadata.json":
            continue

        try:
            print(f"Processing file: {file_path.name}")

            # Determine file type
            file_ext = file_path.suffix.lower()[1:]  # Remove the dot

            if file_ext == "pdf":
                # Convert PDF to PNGs first, then to markdown. The temporary directory
                # is removed even when a page conversion fails.
                page_sections = []
                with tempfile.TemporaryDirectory(prefix="temp_pngs_") as tmp_dir:
                    grouped = pdf_to_grouped_pngs(
                        pdf_path=str(file_path),
                        out_dir=tmp_dir,
                        dpi=200,
                        group_size=1,
                        grouping_prefix="page",
                        poppler_path=os.environ.get("POPPLER_PATH")
                    )
                    # The images live in per-group subfolders, so take the paths from the
                    # returned mapping (already in page order) rather than globbing the root.
                    png_files = [png for group_paths in grouped.values() for png in group_paths]
                    for i, png_path in enumerate(png_files, 1):
                        res = image_converter.convert(str(png_path))
                        page_sections.append(f"# Page {i}\n\n" + res.text_content.strip())

                if not page_sections:
                    print(f"No pages rendered for {file_path.name}; nothing written")
                    continue

                # Write combined markdown with project context
                md_output_path = project_output_dir / (file_path.stem + "_pages.md")
                md_output_path.write_text(
                    context_header + "\n\n".join(page_sections).strip(),
                    encoding="utf-8",
                )
                print(f"Wrote: {md_output_path}")

            elif file_ext == "png":
                # Process PNG directly
                res = image_converter.convert(str(file_path))
                output_path = project_output_dir / (file_path.stem + ".md")
                output_path.write_text(context_header + res.text_content, encoding="utf-8")
                print(f"    Wrote: {output_path}")

            else:
                # Process other formats normally. PowerPoint is handled here too: the
                # PPTX -> PDF -> PNG route is not implemented, so it is converted directly.
                output_path = convert_to_markdown_file(
                    input_value=str(file_path),
                    kind=file_ext,
                    use_llm=False,
                    out_dir=project_output_dir,
                    project_context=project_metadata
                )
                print(f"    Wrote: {output_path}")

            # The original file is left in place; the written markdown marks it as processed.
            print(f"Successfully processed: {file_path.name}")

        except Exception as e:
            # Per-file boundary: report and move on to the next file.
            print(f"Error processing {file_path.name}: {e}")
            continue

    print(f"Completed processing project: {project_name}")


def process_all_projects():
    """
    Run process_project for each subdirectory of INPUT_BASE_DIR.

    Entries that are not directories are ignored. Progress messages are
    printed before the first project and after the last one.
    """
    print("Starting project processing...")

    # Lazily pick out the project folders; each one is checked right before it is processed.
    project_names = (entry.name for entry in INPUT_BASE_DIR.iterdir() if entry.is_dir())
    for name in project_names:
        process_project(name)

    print("All projects processed!")

# Run the processing pipeline
#process_all_projects()