### Libraries

In [299]:
import os
import cv2
import fitz
import easyocr
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
from skimage.morphology import skeletonize

### Extract images from PDF

In [300]:
def pdf_to_images(pdf_file, zoom=6):
    """Render every page of a PDF to a PNG file and return the output folder.

    Images are written to a folder named ``<pdf basename>_images`` (relative
    to the current working directory), one file per page, named
    ``image_<page>.png`` with 1-based page numbers.

    If that folder already contains at least one ``.png`` file, the conversion
    is skipped and the PDF is not opened at all.

    Args:
        pdf_file: Path to the PDF document.
        zoom: Scale factor applied to both axes when rendering each page.

    Returns:
        The path of the output folder.
    """
    pdf_name = os.path.splitext(os.path.basename(pdf_file))[0]
    output_dir = f"{pdf_name}_images"

    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Any PNG in the folder counts as "already converted".
    if any(name.endswith(".png") for name in os.listdir(output_dir)):
        print(f"Images already exist in '{output_dir}'. Skipping conversion.")
        return output_dir

    mat = fitz.Matrix(zoom, zoom)
    page_count = 0

    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_file) as doc:
        print(f"Converting PDF '{pdf_file}' with {doc.page_count} pages into images...")

        for i in range(doc.page_count):
            output_path = os.path.join(output_dir, f"image_{i+1}.png")
            pix = doc.load_page(i).get_pixmap(matrix=mat)
            pix.save(output_path)
            page_count += 1
            print(f"Saved {output_path}")

    print(f"Converted {page_count} pages to images in '{output_dir}'.")
    return output_dir


### Image Preprocessing

In [301]:
def preprocess_images(folder_path, *, first_page_top_crop=900, top_crop=200,
                      bottom_crop=300, left_crop=600, right_crop=200):
    """Crop, grayscale, blur and binarise every page image in a folder.

    Every file named ``image_*.png`` in ``folder_path`` is read, cropped by
    the given margins, converted to grayscale, smoothed with a 5x5 Gaussian
    blur and thresholded. The result is written next to the source file as
    ``processed_<original name>``.

    Args:
        folder_path: Folder containing the ``image_<page>.png`` files.
        first_page_top_crop: Rows removed from the top of ``image_1.png``.
        top_crop: Rows removed from the top of every other page.
        bottom_crop: Rows removed from the bottom of every page.
        left_crop: Columns removed from the left of every page.
        right_crop: Columns removed from the right of every page.

    Returns:
        None. Files that cannot be read, or that are smaller than the crop
        margins, are reported and skipped.
    """
    for image_file in sorted(os.listdir(folder_path)):
        if not (image_file.startswith("image_") and image_file.endswith(".png")):
            continue

        image_path = os.path.join(folder_path, image_file)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print(f"Could not read '{image_path}'. Skipping.")
            continue

        height, width = image.shape[:2]

        # Compare the exact stem: a substring test such as
        # `"image_1" in image_file` would also match image_10.png, image_11.png, ...
        is_first_page = os.path.splitext(image_file)[0] == "image_1"
        top = first_page_top_crop if is_first_page else top_crop

        # Clamp the end indices so an image smaller than the margins produces
        # an empty crop instead of a negative index that wraps around.
        bottom = max(height - bottom_crop, top)
        right = max(width - right_crop, left_crop)
        image = image[top:bottom, left_crop:right]
        if image.size == 0:
            print(f"'{image_path}' is smaller than the crop margins. Skipping.")
            continue

        # Convert to grayscale and denoise
        gray_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        denoised_img = cv2.GaussianBlur(gray_img, (5, 5), 1)

        # With THRESH_OTSU the threshold is computed from the image, so the
        # 250 argument is ignored; pixels above the threshold are set to 150.
        _, im_bw = cv2.threshold(denoised_img, 250, 150, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # The thresholded image is saved as-is in the same folder.
        processed_image_path = os.path.join(folder_path, f"processed_{image_file}")
        cv2.imwrite(processed_image_path, im_bw)
        print(f"Processed and saved {processed_image_path}")

    print(f"All images in '{folder_path}' have been processed.")

### Get bounding boxes

In [302]:
def _bounding_rect(boxes):
    """Return the (x, y, w, h) rectangle enclosing all (x1, y1, x2, y2) boxes."""
    x1 = min(b[0] for b in boxes)
    y1 = min(b[1] for b in boxes)
    x2 = max(b[2] for b in boxes)
    y2 = max(b[3] for b in boxes)
    return (x1, y1, x2 - x1, y2 - y1)


def _merge_boxes_by_line(boxes, y_threshold):
    """Merge (x1, y1, x2, y2) boxes into one (x, y, w, h) rectangle per text line."""
    merged = []
    line = []
    # Walk the boxes from top to bottom; each box is compared with the
    # previous one only.
    for box in sorted(boxes, key=lambda b: b[1]):
        if line:
            _, prev_y1, _, prev_y2 = line[-1]
            same_line = (abs(box[1] - prev_y1) <= y_threshold
                         or abs(box[3] - prev_y2) <= y_threshold)
            if not same_line:
                merged.append(_bounding_rect(line))
                line = []
        line.append(box)
    if line:
        merged.append(_bounding_rect(line))
    return merged


def process_image_with_easyocr(image_path, reader, output_folder,
                               min_confidence=0.2, y_threshold=10):
    """Detect text boxes in an image and merge them into one box per line.

    The image is read with OpenCV and passed to ``reader.readtext``. Boxes
    whose confidence is above ``min_confidence`` and that are wider than tall
    are kept, sorted by their top edge and merged with the previous box when
    their top or bottom edges differ by at most ``y_threshold`` pixels.
    A copy of the image with the merged boxes drawn in green is written to
    ``output_folder`` as ``bounding_boxes_<image name>``.

    Args:
        image_path: Path of the image to analyse.
        reader: Object exposing ``readtext(image)`` that yields
            ``(bbox, text, confidence)`` tuples, where ``bbox`` is four corner
            points starting at the top-left and going clockwise.
        output_folder: Existing folder that receives the annotated image.
        min_confidence: Detections at or below this confidence are ignored.
        y_threshold: Maximum vertical distance, in pixels, for two boxes to be
            treated as the same line.

    Returns:
        A list of ``(x, y, w, h)`` tuples, one per merged line, ordered from
        top to bottom. An empty list is returned when the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable/corrupt file by returning None.
        print(f"Could not read '{image_path}'. Skipping text detection.")
        return []

    img_height, img_width = img.shape[:2]

    horizontal_boxes = []
    for bbox, _text, confidence in reader.readtext(img):
        if confidence <= min_confidence:
            continue

        top_left, _, bottom_right, _ = bbox
        # Clamp to the image: a negative coordinate would make a later
        # `img[y:y + h, x:x + w]` crop wrap around to the other side.
        x1 = max(int(top_left[0]), 0)
        y1 = max(int(top_left[1]), 0)
        x2 = min(int(bottom_right[0]), img_width)
        y2 = min(int(bottom_right[1]), img_height)

        width = x2 - x1
        height = y2 - y1

        # Keep only horizontal, non-degenerate boxes
        if width > height > 0:
            horizontal_boxes.append((x1, y1, x2, y2))

    merged_boxes = _merge_boxes_by_line(horizontal_boxes, y_threshold)

    # Draw merged bounding boxes on the image (green, 2 px)
    for (x, y, w, h) in merged_boxes:
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

    # Save the image with merged bounding boxes
    output_image_path = os.path.join(output_folder, f"bounding_boxes_{os.path.basename(image_path)}")
    cv2.imwrite(output_image_path, img)
    print(f"Image with merged bounding boxes saved as {output_image_path}")

    return merged_boxes

### Get the content

In [303]:
def extract_text_from_merged_boxes(merged_boxes, img, reader, min_confidence=0.2):
    """Run OCR on each bounding box of an image and return one string per box.

    Args:
        merged_boxes: Iterable of ``(x, y, w, h)`` rectangles.
        img: Image array the rectangles refer to, indexed as ``img[row, col]``.
        reader: Object exposing ``readtext(image, decoder=..., detail=...)``
            that returns ``(bbox, text, confidence)`` tuples.
        min_confidence: Fragments at or below this confidence are dropped.

    Returns:
        A list with one entry per box, in the same order as ``merged_boxes``.
        Each entry is the accepted fragments joined by a single space; it is
        an empty string when nothing was accepted or the crop is empty.
    """
    document = []

    for (x, y, w, h) in merged_boxes:
        # Crop the region of interest
        roi = img[y:y + h, x:x + w]

        if roi.size == 0:
            # A box outside the image (or with zero area) gives an empty crop;
            # record an empty string so the output stays aligned with the boxes.
            document.append("")
            continue

        # Perform OCR on the cropped region using the beam-search decoder
        result = reader.readtext(roi, decoder='beamsearch', detail=1)

        box_text = " ".join(
            text for (_, text, confidence) in result if confidence > min_confidence
        )
        document.append(box_text)

    return document

In [304]:
def process_and_extract_text_from_images(folder_path, languages=("fr",)):
    """OCR every ``processed_*.png`` image in a folder and save the text.

    For each image, ``process_image_with_easyocr`` detects one bounding box
    per text line (and writes an annotated copy of the image into the
    ``processed_with_boxes`` sub-folder); each box is then cropped and read
    again with the default decoder. All texts are written to
    ``extracted_texts.txt`` inside ``folder_path``.

    Args:
        folder_path: Folder containing the ``processed_*.png`` images.
        languages: Language codes passed to ``easyocr.Reader``.

    Returns:
        A dict mapping each image file name to the list of strings read from
        its boxes, in page order. Images that cannot be read are reported and
        left out.
    """
    reader = easyocr.Reader(list(languages))

    # Folder that receives the images annotated with bounding boxes
    output_folder = os.path.join(folder_path, "processed_with_boxes")
    os.makedirs(output_folder, exist_ok=True)

    image_files = [
        name for name in os.listdir(folder_path)
        if name.startswith("processed_") and name.endswith(".png")
    ]
    # os.listdir returns names in arbitrary order. The names differ only in
    # their page number, so sorting by (length, name) gives numeric page
    # order (page 2 before page 10) for the result and the text file.
    image_files.sort(key=lambda name: (len(name), name))

    extracted_texts = {}

    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print(f"Could not read '{image_path}'. Skipping.")
            continue

        # Step 1: detect and merge bounding boxes. This also saves an
        # annotated copy of the image into output_folder.
        merged_boxes = process_image_with_easyocr(image_path, reader, output_folder)

        # Step 2: read the text inside each bounding box
        document = []
        for (x, y, w, h) in merged_boxes:
            roi = img[y:y + h, x:x + w]
            if roi.size == 0:
                # Keep one entry per box even when the crop is empty.
                document.append("")
                continue

            result = reader.readtext(roi, detail=1)
            box_text = " ".join(
                text for (_, text, confidence) in result if confidence > 0.2
            )
            document.append(box_text)

        extracted_texts[image_file] = document

    # Save the extracted text from all images into a single .txt file
    output_text_file = os.path.join(folder_path, "extracted_texts.txt")
    with open(output_text_file, "w", encoding="utf-8") as text_file:
        for image_name, texts in extracted_texts.items():
            text_file.write(f"### {image_name} ###\n")
            text_file.write("\n".join(texts) + "\n\n")

    return extracted_texts

In [305]:
# Run the whole pipeline (PDF -> page images -> binarised images -> OCR text)
# for each numbered document; the range currently selects document 5 only.
doc_numbers = range(5, 6)
for i in doc_numbers:
    pdf_file = f"extract/240000{i:02}.pdf"

    # Render the pages, then write the processed_* images next to them.
    output_dir = pdf_to_images(pdf_file)
    preprocess_images(output_dir)

    # OCR every processed page and show the texts keyed by image name.
    extracted_texts = process_and_extract_text_from_images(output_dir)
    print(extracted_texts)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Images already exist in '24000005_images'. Skipping conversion.
Processed and saved 24000005_images\processed_image_1.png
All images in '24000005_images' have been processed.
Image with merged bounding boxes saved as 24000005_images\processed_with_boxes\bounding_boxes_processed_image_1.png
{'processed_image_1.png': ['24000005* du tribuffâl dë lentreprise greffe au', 'au greffe', '', 'Nofn', '(en entler) : GRANDR', '(en abrégé)', 'Forme légale 2 SRL', 'Adresse complèle du siège 209 Boulevard Lambermont; Bolte 12, 1030 Schaerbeok', 'MODIFICATION SIEGE SOCIAL', "résulle dune décision émise par /organe d'adminietratlon; reçu par Tresor Madoda Tuyinama; gérant de", 'la soclété GRANDR SRL le 13 décombre 2023 , contenant procos-verbal do [assemblée générale extraordinaire', 'W de la soc8té à responsablllte llmitée GRANDR ayant son slège soclal à 1030 Schaerbeek; Boulevard', 'Lambermont 209/b012 que:', 'Premlèra réxoluton Modifcation do ladresse du sluge social de la sociéte_', "Lorgane dadmln