In [1]:
import fitz  # PyMuPDF
import os
import re

def _extract_page_images(pdf_document, page, page_number, image_folder):
    """Save every image listed for ``page`` and return positioned placeholders.

    Args:
        pdf_document: The open ``fitz`` document that owns ``page``.
        page: The page whose images are extracted.
        page_number: Zero-based page index; file names use ``page_number + 1``.
        image_folder: Directory the image files are written into.

    Returns:
        A list of ``(image_rect, placeholder)`` tuples sorted by the rect's
        second coordinate (``image_rect[1]``, the top edge), where
        ``placeholder`` is the string ``"[Image: <file name>]"``.
    """
    image_list = []
    for img_index, img in enumerate(page.get_images(full=True), start=1):
        xref = img[0]
        base_image = pdf_document.extract_image(xref)
        image_rect = page.get_image_bbox(img)

        image_ext = base_image["ext"]
        image_name = f"page_{page_number + 1}_img_{img_index}.{image_ext}"
        image_path = os.path.join(image_folder, image_name)

        with open(image_path, 'wb') as image_file:
            image_file.write(base_image["image"])

        image_list.append((image_rect, f"[Image: {image_name}]"))

    # Top-most image first, so the merge below can consume them in order.
    image_list.sort(key=lambda entry: entry[0][1])
    return image_list


def _merge_words_and_images(words, image_list):
    """Interleave image placeholders with words in top-to-bottom order.

    An image placeholder is emitted before the first word whose bottom edge
    (``word[3]``) lies below the image's top edge (``rect[1]``). Images that
    start below every word are appended at the end.

    Args:
        words: Word tuples already sorted in reading order; index 3 is the
            bottom edge and index 4 the text.
        image_list: ``(rect, placeholder)`` tuples sorted by ``rect[1]``.

    Returns:
        A flat list of word strings and placeholder strings.
    """
    merged = []
    next_image = 0
    for word in words:
        while next_image < len(image_list) and image_list[next_image][0][1] < word[3]:
            merged.append(image_list[next_image][1])
            next_image += 1
        merged.append(word[4])

    # Whatever is left starts below the last word on the page.
    merged.extend(placeholder for _, placeholder in image_list[next_image:])
    return merged


def pdf_to_text_with_images(pdf_path, output_text_path, image_folder='images'):
    """Extract a PDF's text to a file, with placeholders where images occur.

    Every image is saved to ``image_folder`` as
    ``page_<page>_img_<index>.<ext>`` and represented in the text output by
    ``[Image: <file name>]``. Pages are separated by a
    ``--- Page Break ---`` marker, and all tokens are joined with single
    spaces before being written to ``output_text_path`` as UTF-8.

    Args:
        pdf_path: Path of the PDF to read.
        output_text_path: Path of the text file to write.
        image_folder: Directory for extracted images; created if missing.
    """
    os.makedirs(image_folder, exist_ok=True)
    all_text = []

    # The context manager closes the document (and its file handle) even if
    # extraction fails part-way; the original left it open.
    with fitz.open(pdf_path) as pdf_document:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)

            # "words" mode yields one tuple per word; indices 0-3 are the
            # bounding box and index 4 is the text.
            text_instances = page.get_text("words")

            # Order by bottom edge (index 3), then by left edge (index 0).
            text_instances.sort(key=lambda w: (w[3], w[0]))

            image_list = _extract_page_images(
                pdf_document, page, page_number, image_folder
            )

            all_text.extend(_merge_words_and_images(text_instances, image_list))
            all_text.append("\n--- Page Break ---\n")

    # Write all extracted text and image placeholders to a text file
    with open(output_text_path, 'w', encoding='utf-8') as text_file:
        text_file.write(' '.join(all_text))

if __name__ == "__main__":
    # Input PDF and the text file the extraction is written to.
    pdf_path, output_text_path = "pdfs/1706.03762v7.pdf", "output.txt"

    # Images go to the function's default folder.
    pdf_to_text_with_images(
        pdf_path=pdf_path,
        output_text_path=output_text_path,
    )


In [11]:
def replace_figure_references(text):
    """Prefix every bare "Figure N" mention with that figure's image placeholders.

    The function works in two passes:

    1. Learn which placeholders belong to which figure from the first place
       where one or more ``[Image: ...] `` placeholders directly precede
       ``Figure N``.
    2. Rewrite every other ``Figure N`` occurrence so that it is preceded by
       the learned placeholders. Occurrences that already have an image
       placeholder directly before them are left untouched, which makes the
       function idempotent.

    Only real ``[Image: ...]`` placeholders count as "already present". Other
    bracketed text before a reference (for example a citation such as
    ``[3]``) no longer suppresses the insertion.

    Args:
        text: The extracted document text.

    Returns:
        The text with placeholders inserted. References to figures for which
        no placeholder was learned are returned unchanged.
    """
    # Pass 1: map "Figure N" -> the placeholder run that first precedes it.
    caption_pattern = re.compile(r'((?:\[Image: [^\]]+\] )+)(Figure \d+)')
    image_map = {}
    for match in caption_pattern.finditer(text):
        placeholders, figure_reference = match.groups()
        image_map.setdefault(figure_reference, placeholders)

    # Pass 2: capture any placeholders already sitting in front of the
    # reference (separated by arbitrary whitespace) as part of the match, so
    # the decision does not depend on a fixed-size look-back window.
    reference_pattern = re.compile(r'((?:\[Image: [^\]]+\]\s*)*)(Figure \d+)')

    def replacement(match):
        existing_placeholders, figure_reference = match.groups()
        if existing_placeholders:
            # Already annotated: keep the matched text exactly as it is.
            return match.group(0)
        return image_map.get(figure_reference, '') + figure_reference

    return reference_pattern.sub(replacement, text)


In [13]:
# Load the extracted text produced by pdf_to_text_with_images.
with open("output.txt", 'r', encoding='utf-8') as text_file:
    all_text = text_file.readlines()


# Perform post-processing
# readlines() keeps each line's trailing newline, so the lines are joined
# with an empty separator; joining with ' ' would add a spurious space at the
# start of every line after the first.
processed_text = replace_figure_references(''.join(all_text))


In [14]:
# Persist the post-processed text next to the raw extraction output.
output_processed_text_path = "output_processed.txt"
with open(
    output_processed_text_path,
    mode='w',
    encoding='utf-8',
) as text_file:
    text_file.write(processed_text)