In [1]:
!pip install PyMuPDF




In [42]:
import os
import fitz  # PyMuPDF
import re
from multi_coulumn import column_boxes

In [43]:
# Path of the PDF to process -- replace with the path to your own PDF
PDF_PATH = './nexon-owner-manual-2022.pdf'
doc = fitz.open(PDF_PATH)

In [44]:

def detect_tables(page):
    """Return the result of the page's table finder.

    Both the horizontal and the vertical strategy are set to "lines".

    Args:
        page: object exposing a ``find_tables`` method (a PyMuPDF page in
            this notebook).

    Returns:
        Whatever ``page.find_tables`` returns, unchanged.
    """
    return page.find_tables(vertical_strategy="lines", horizontal_strategy="lines")
def find_min(group):
    """Return the smallest first component (x0) among the boxes in ``group``.

    Args:
        group: iterable of indexable boxes; element 0 of each box is compared.

    Returns:
        Element 0 of the box that sorts first by element 0.

    Raises:
        IndexError: if ``group`` is empty.
    """
    ordered = list(group)
    ordered.sort(key=lambda box: box[0])
    leftmost = ordered[0]
    return leftmost[0]

In [45]:
def group_coordinates(coordinates, threshold, tables):
    """Cluster bounding boxes into groups of similar x-position.

    The boxes are sorted by their first component (x0) and walked from left
    to right; a new group starts whenever the x0 gap to the previous box
    exceeds ``threshold``.

    If ``tables.tables`` is non-empty, a group is kept only when its smallest
    x0 is strictly less than the integer-truncated left edge (``bbox[0]``) of
    at least one table. A kept group is added exactly once, no matter how
    many tables it passes the check for.

    Args:
        coordinates: iterable of indexable boxes; element 0 is used as x0.
        threshold: largest x0 gap between consecutive boxes of one group.
        tables: object with a ``tables`` sequence whose items expose ``bbox``
            (the value returned by ``detect_tables`` in this notebook).

    Returns:
        A list of groups (each a list of boxes) ordered left to right.
        An empty ``coordinates`` yields an empty list.
    """
    ordered = sorted(coordinates, key=lambda box: box[0])
    if not ordered:
        return []

    # Left edges are computed once; int() mirrors the truncation the
    # comparison has always used.
    table_left_edges = [int(table.bbox[0]) for table in tables.tables]

    def keep(group):
        # Without tables every group is kept.
        if not table_left_edges:
            return True
        # Groups are built from x-sorted boxes, so min() is the leftmost x0.
        min_x = min(box[0] for box in group)
        return any(min_x < edge for edge in table_left_edges)

    groups = []
    current_group = [ordered[0]]
    for previous, box in zip(ordered, ordered[1:]):
        if box[0] - previous[0] <= threshold:
            current_group.append(box)
            continue
        # Gap too large: close the running group and start a new one.
        if keep(current_group):
            groups.append(current_group)
        current_group = [box]

    # Close the last group.
    if keep(current_group):
        groups.append(current_group)
    return groups

In [46]:
def normalize_text(text):
    """Normalize a piece of extracted text.

    Newlines become spaces, leading/trailing whitespace is stripped, the text
    is lower-cased, and a space directly before ``,`` ``.`` ``?`` or ``!`` is
    removed. Whitespace runs inside the text are not collapsed.

    Args:
        text: the string to normalize, or ``None``. This function is applied
            cell-wise to table DataFrames, where a cell may hold ``None``
            instead of a string.

    Returns:
        The normalized string; ``''`` when ``text`` is ``None``.
    """
    if text is None:
        return ''

    normalized = text.replace('\n', ' ').strip().lower()
    # Applied in this fixed order, one punctuation mark at a time.
    for spaced, tight in ((' ,', ','), (' .', '.'), (' ?', '?'), (' !', '!')):
        normalized = normalized.replace(spaced, tight)
    return normalized

In [47]:
# Demo on a single page: print its normalized text, then each table as CSV.
page = doc.load_page(9)
tables = detect_tables(page)

bboxes = column_boxes(page, header_margin=40, no_image_text=True) ## Finds all the bounding boxes in the page

# .rect holds the coordinates of each bounding box
rect_list = [box.rect for box in bboxes]

# Group the boxes by their x-coordinate
groups = group_coordinates(rect_list, 70, tables)

# Within a group, boxes are read top to bottom (sorted by element 1, y0).
text_parts = []
for group in groups:
    for rect in sorted(group, key=lambda r: r[1]):
        box_string = page.get_text(clip=rect, sort=True)
        text_parts.append(normalize_text(box_string))

# Join with a space so the last word of one box does not fuse with the first
# word of the next; print() ends the line so the table CSV starts on its own.
print(" ".join(part for part in text_parts if part))

for table in tables:
    df = table.to_pandas()
    # DataFrame.applymap is deprecated in favour of DataFrame.map; look the
    # method up on the class so a column named "map" cannot shadow it.
    cellwise = df.map if hasattr(type(df), "map") else df.applymap
    df_normalized = cellwise(normalize_text)
    df_normalized.columns = [normalize_text(col) for col in df.columns]
    print(df_normalized.to_csv(index=False))




In [48]:
## Extracting entire PDF
def page_extractor(page, header_margin=40, x_threshold=70):
    """Extract one page's text and tables as a single normalized string.

    Text boxes reported by ``column_boxes`` are grouped by x-position with
    ``group_coordinates``, read top to bottom inside each group, normalized
    with ``normalize_text`` and joined with single spaces. Every table found
    by ``detect_tables`` is converted to a DataFrame, normalized cell by cell
    (headers included) and rendered as CSV.

    Args:
        page: the PDF page to process.
        header_margin: forwarded to ``column_boxes``.
        x_threshold: forwarded to ``group_coordinates`` as its threshold.

    Returns:
        ``"<text>\\n<tables>"`` where ``<tables>`` is the concatenation of
        ``"\\n" + csv`` for every table (empty when the page has no tables).
    """
    tables = detect_tables(page)

    ## Finds all the bounding boxes in the page
    bboxes = column_boxes(page, header_margin=header_margin, no_image_text=True)
    # .rect holds the coordinates of each bounding box
    rect_list = [box.rect for box in bboxes]

    groups = group_coordinates(rect_list, x_threshold, tables)

    text_parts = []
    for group in groups:
        # Element 1 of a rect is its y0, so this reads a group top to bottom.
        for rect in sorted(group, key=lambda r: r[1]):
            box_string = page.get_text(clip=rect, sort=True)
            text_parts.append(normalize_text(box_string))
    # Join with a space so the last word of one box does not fuse with the
    # first word of the next; empty boxes are skipped to avoid double spaces.
    normalized_string = " ".join(part for part in text_parts if part)

    table_parts = []
    for table in tables:
        df = table.to_pandas()
        # DataFrame.applymap is deprecated in favour of DataFrame.map; look
        # the method up on the class so a column named "map" cannot shadow it.
        cellwise = df.map if hasattr(type(df), "map") else df.applymap
        df_normalized = cellwise(normalize_text)
        df_normalized.columns = [normalize_text(col) for col in df.columns]
        table_parts.append("\n" + df_normalized.to_csv(index=False))
    normalized_table = "".join(table_parts)

    return normalized_string + '\n' + normalized_table

In [52]:
# Show what the extractor produces for one sample page.
page = doc.load_page(16)
page_string = page_extractor(page)
print('Example of page extracted:\n', page_string, sep='\n')

Example of page extracted:


group,mass group,age group,fr,rear out­ board lh,rear out­ board rh,rear center
0,up to 10 kg,up to 9 months,x,u,u,x
0+,up to 13 kg,up to 24 months,x,u,u,x
i,9 to 18 kg,9 months to 48 months,x,u,u,x
ii,15 to 25 kg,approx. 3 to 7 years,x,u,u,x
iii,22 to 36 kg,approx. 6 to 12 years,x,u,u,x



In [50]:
page_count = doc.page_count

# NOTE: the file is opened in append mode, so re-running this cell adds a
# second copy of the document to output.txt; delete the file first for a
# clean export. UTF-8 is set explicitly so the result does not depend on the
# platform's default encoding.
with open("output.txt", "a", encoding="utf-8") as file:
    # Iterate through each page
    for i in range(page_count):
        try:
            page = doc.load_page(i)
            # Extract page_string for the current page
            page_string = page_extractor(page)
        except Exception as exc:
            # Best effort: one bad page must not abort the export, but report
            # which page was skipped and why instead of dropping it silently.
            print(f"Skipping page {i}: {exc!r}")
        else:
            # Append page_string to the text file
            file.write(page_string + "\n")