# Convert PDF to Markdown

### Helper Functions

In [8]:
# -----------------------------

from PyPDF2 import PdfReader, PdfWriter

def extract_pdf_pages(input_pdf, output_pdf, page_start=0, page_end=None):
    """
    Extract specific pages from a PDF and save them to a new file.
    
    Parameters:
        input_pdf (str): Path to the input PDF.
        output_pdf (str): Path where the new PDF will be saved.
        pages (list): List of page numbers to extract (1-based index).
        
    Example:
        extract_pdf_pages("input.pdf", "output.pdf", [1, 2, 4])
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Derive page indexes
    if page_end is None:
        page_end = len(reader.pages)
    pages = list(range(page_start, page_end))
    
    # Convert to 0-based indexing for PyPDF2
    for page_num in pages:
        idx = page_num - 1
        if 0 <= idx < len(reader.pages):
            writer.add_page(reader.pages[idx])
        else:
            print(f"Warning: Page {page_num} is out of range and will be skipped.")

    # Save the new PDF
    with open(output_pdf, "wb") as f:
        writer.write(f)

    print(f"Extracted {len(pages)} pages and saved to {output_pdf}")

# -----------------------------

from PyPDF2 import PdfReader

def pdf_to_text_pypdf2(input_pdf, output_txt):
    """
    Extracts all text from a PDF and saves it to a text file.

    Parameters:
        input_pdf (str): Path to the input PDF file.
        output_txt (str): Path to save the extracted text.
    
    Example:
        pdf_to_text("example.pdf", "output.txt")
    """
    reader = PdfReader(input_pdf)
    all_text = []

    # Extract text from each page
    for i, page in enumerate(reader.pages):
        text = page.extract_text() or ""  # Handle blank pages gracefully
        all_text.append(f"--- Page {i+1} ---\n{text}\n")

    # Write to output file
    with open(output_txt, "w", encoding="utf-8") as f:
        f.writelines(all_text)

    print(f"Extracted text saved to {output_txt}")

# -----------------------------

# !pip install PyMuPDF markdownify
import os
import fitz  # PyMuPDF
from markdownify import markdownify as md

def pdf_to_markdown_pymupdf(pdf_path, markdown_path):
    doc = fitz.open(pdf_path)
    full_html_content = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        # Extract as HTML. 'layout=True' helps with structure.
        # You can experiment with other flags for get_text().
        html_content = page.get_text("text") # or get_text("xhtml")
        full_html_content += html_content
        # Add a page break marker (optional, but can be useful for LLM context)
        # full_html_content += "\n<hr />\n" # HTML horizontal rule
        full_html_content += f"\n --- END OF PAGE ({page_num + 1}) ---\n\n" # HTML horizontal rule

    # Clean up common PDF artifacts from HTML before converting to Markdown
    # This is a very basic example; you'll likely need more sophisticated cleaning
    # full_html_content = full_html_content.replace("­", "") # Remove soft hyphens

    # markdown_content = md(full_html_content)
    markdown_content = full_html_content

    # Further cleanup of Markdown (example: remove excessive newlines)
    # markdown_content = "\n".join([line for line in markdown_content.splitlines() if line.strip()])

    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    print(f"Converted '{pdf_path}' to '{markdown_path}'")

# -----------------------------

# !pip install pdfplumber tqdm
import pdfplumber
from tqdm import tqdm

def pdf_to_text_pdfplumber(input_file_path, output_file_path):
    with pdfplumber.open(input_file_path) as pdf, open(output_file_path, "w", encoding="utf-8") as f:
        for page in tqdm(pdf.pages, desc="Extracting text", unit="page"):
            t = page.extract_text()
            if t:
                f.write(t + '\n')

# -----------------------------

### List pdf files

In [5]:
rules_pdf_folder = '/home/luisresende/work/projects/monopoly/assets/rules/pdf'
rules_markdown_folder = '/home/luisresende/work/projects/monopoly/assets/rules/markdown'

files = os.listdir(rules_pdf_folder)
files

['rules_monopolypedia.pdf',
 'rules_fgbradleys.pdf',
 'rules_official-game-rules.pdf',
 'rules_netsuite.pdf',
 'rules_wikibooks.pdf']

### Convert PDF to text with `PyPDF2`

In [10]:
for i in range(len(files)):

    input_path = f'{rules_pdf_folder}/{files[i]}'
    output_path = f'{rules_markdown_folder}/pypdf2/{files[i].replace(".pdf", ".md")}'
    
    pdf_to_text_pypdf2(input_path, output_path)

Extracted text saved to /home/luisresende/work/projects/monopoly/assets/rules/markdown/pypdf2/rules_monopolypedia.md
Extracted text saved to /home/luisresende/work/projects/monopoly/assets/rules/markdown/pypdf2/rules_fgbradleys.md
Extracted text saved to /home/luisresende/work/projects/monopoly/assets/rules/markdown/pypdf2/rules_official-game-rules.md
Extracted text saved to /home/luisresende/work/projects/monopoly/assets/rules/markdown/pypdf2/rules_netsuite.md
Extracted text saved to /home/luisresende/work/projects/monopoly/assets/rules/markdown/pypdf2/rules_wikibooks.md


### Convert PDF to markdown with `PyMuPDF`

In [4]:
for i in range(len(files)):

    input_path = f'{rules_pdf_folder}/{files[i]}'
    output_path = f'{rules_markdown_folder}/pymupdf/{files[i].replace(".pdf", ".md")}'
    
    pdf_to_markdown_pymupdf(input_path, output_path)

Converted '/home/luisresende/work/projects/fg-ai/checklist-generation/data/spec-docs/Specifications_small_selected_pages.pdf' to '/home/luisresende/work/projects/fg-ai/checklist-generation/data/spec-docs/Specifications_small_selected_pages.md'


### Convert PDF to text with `pdfplumber`

In [24]:
for i in range(len(files)):

    input_path = f'{rules_pdf_folder}/{files[i]}'
    output_path = f'{rules_markdown_folder}/pdfplumber/{files[i].replace(".pdf", ".md")}'
    
    pdf_to_text_pdfplumber(input_path, output_path)

Extracting text: 100%|████████████████████████| 72/72 [00:47<00:00,  1.50page/s]


### Convert PDF to text with `markitdown`

In [11]:
# Source: https://github.com/microsoft/markitdown
# !pip install 'markitdown[pdf, docx, pptx]'
# !pip install 'markitdown[pdf]' # pdf plugin only

for i in range(len(files)):

    input_path = f'{rules_pdf_folder}/{files[i]}'
    output_path = f'{rules_markdown_folder}/markitdown/{files[i].replace(".pdf", ".md")}'
    
    !markitdown {input_path} > {output_path}

### Convert PDF to text using `MinerU`

In [3]:
# !pip install -U "mineru[core]"
for i in range(len(files)):

    input_path = f'{rules_pdf_folder}/{files[i]}'
    output_path = f'{rules_markdown_folder}/markitdown/{files[i].replace(".pdf", ".md")}'
    
    !mineru -p '{input_path}' -o '{output_path}'

2025-07-25 09:39:55.568753: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753447195.740332    9773 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753447195.791705    9773 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753447196.101012    9773 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753447196.101112    9773 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753447196.101122    9773 computation_placer.cc:177] computation placer alr

### Remove output

In [48]:
!/usr/bin/rm -r '{output_folder_path}'