In [15]:
import fitz  # PyMuPDF for PDF handling
import pandas as pd  # For DataFrame handling
import re  # Regular Expressions for text processing

In [16]:
def extract_text_from_paragraph(page, bbox):
    """Extract text from a specific bounding box on a page."""
    text = page.get_text("text", clip=bbox)
    return text.strip()

In [17]:
def extract_images_from_paragraph(page, bbox):
    """Extract images from a specific bounding box on a page."""
    # You'll need to implement logic to extract images from the specified bounding box
    images = []  # Placeholder for images
    return images

In [28]:
def extract_headings_by_font(page):
    headings = []
    blocks = page.get_text("dict")["blocks"]
    for block in blocks:
        if "text" in block:  # Ensure 'text' key exists in the block
            # Assuming headings have larger font sizes (adjust threshold as needed)
            if block.get("size", 0) > 12:  # Example threshold for font size
                headings.append(block["text"])
    return headings

In [19]:
def extract_headings_by_position(page):
    headings = []
    for block in page.get_text("dict")["blocks"]:
        # Assuming headings are centered (adjust based on document layout)
        if block.get("x0", 0) == block.get("x1", 0) / 2:
            headings.append(block["text"])
    return headings

In [39]:
def extract_headings_from_toc(doc):
    toc = doc.get_toc()
    headings = [entry[1] for entry in toc if len(entry) > 1]  # Assuming TOC structure: (page_num, heading)
    return headings

In [33]:
def extract_headings_by_regex(page):
    headings = []
    blocks = page.get_text("dict")["blocks"]
    for block in blocks:
        if "text" in block:  # Ensure 'text' key exists in the block
            # Example: Looking for text starting with "Chapter" followed by a number
            if re.match(r"Chapter\s\d+", block["text"]):
                headings.append(block["text"])
    return headings

In [22]:
def extract_headings_by_structure(page):
    # Custom logic based on specific document structure, e.g., looking for <h1>, <h2> tags
    # Implement your logic here
    pass

In [23]:
def extract_headings(page, method="font"):
    if method == "font":
        return extract_headings_by_font(page)
    elif method == "position":
        return extract_headings_by_position(page)
    elif method == "toc":
        return extract_headings_from_toc(page.get_toc())
    elif method == "regex":
        return extract_headings_by_regex(page)
    elif method == "structure":
        return extract_headings_by_structure(page)
    else:
        return []

In [40]:
def parse_pdf(file_path, title_extraction_method="font"):
    doc = fitz.open(file_path)
    pages = len(doc)

    parsed_data = []
    toc_headings = extract_headings_from_toc(doc)

    for page_num in range(pages):
        page = doc.load_page(page_num)
        if page_num < len(toc_headings):
            title = toc_headings[page_num]
        else:
            title = "Untitled"
        #headings = extract_headings(page, method=title_extraction_method)
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if b["type"] == 0:  # Text block
                bbox = b["bbox"]
                text = extract_text_from_paragraph(page, bbox)
                images = extract_images_from_paragraph(page, bbox)
                #title = headings[page_num] if page_num < len(headings) else "Untitled"
                #title = 

                parsed_content = {
                    "page_numbers": [page_num + 1],
                    "title": title,
                    "text": text,
                    "images": images,
                    "tables": []  # Placeholder for tables
                }
                parsed_data.append(parsed_content)

    doc.close()
    return parsed_data

In [41]:
# Rest of the code remains unchanged

# Usage
file_path = "a.pdf"
parsed_data = parse_pdf(file_path, title_extraction_method="regex")
df = pd.DataFrame(parsed_data)
print(df)

     page_numbers                                              title  \
0             [1]  Abbildung 1: Verwendete Methoden der von Kaggl...   
1             [1]  Abbildung 1: Verwendete Methoden der von Kaggl...   
2             [1]  Abbildung 1: Verwendete Methoden der von Kaggl...   
3             [3]  Abbildung 3: Schematische Darstellung eines En...   
4             [3]  Abbildung 3: Schematische Darstellung eines En...   
...           ...                                                ...   
2452         [55]                                           Untitled   
2453         [55]                                           Untitled   
2454         [55]                                           Untitled   
2455         [55]                                           Untitled   
2456         [55]                                           Untitled   

                                                   text images tables  
0                                                           [] 

In [42]:
df.head(50)

Unnamed: 0,page_numbers,title,text,images,tables
0,[1],Abbildung 1: Verwendete Methoden der von Kaggl...,,[],[]
1,[1],Abbildung 1: Verwendete Methoden der von Kaggl...,MASCHINELLES LERNEN,[],[]
2,[1],Abbildung 1: Verwendete Methoden der von Kaggl...,"EINE ANALYSE ZU KOMPETENZEN, FORSCHUNG UND ANW...",[],[]
3,[3],Abbildung 3: Schematische Darstellung eines En...,,[],[]
4,[3],Abbildung 3: Schematische Darstellung eines En...,MASCHINELLES LERNEN,[],[]
5,[3],Abbildung 3: Schematische Darstellung eines En...,"EINE ANALYSE ZU KOMPETENZEN, \nFORSCHUNG UND ...",[],[]
6,[3],Abbildung 3: Schematische Darstellung eines En...,F R A U N H O F E R - G E S E L L S C H A F T,[],[]
7,[3],Abbildung 3: Schematische Darstellung eines En...,Autoren,[],[]
8,[3],Abbildung 3: Schematische Darstellung eines En...,Inga Döbel | Fraunhofer IMW,[],[]
9,[3],Abbildung 3: Schematische Darstellung eines En...,Dr. Miriam Leis | Fraunhofer-Zentrale,[],[]
