# AnonED: Complex Region Anonymisation in Electrical Diagrams using Hybrid Density-Based Spatial Clustering

### **Note**: Replace the file paths, the Tesseract OCR language settings and the OpenAI key in the notebook before running it

In [1]:
# Mount Google Drive at /content/drive; the paths used in later cells live under it.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Define the target directory (root of the mounted Drive)
target_directory = '/content/drive/MyDrive'

# Create the directory if it doesn't exist (exist_ok=True makes re-running this cell safe)
os.makedirs(target_directory, exist_ok=True)

In [3]:
# Installation of dependencies: the Tesseract binary via apt, the Python packages via %pip.
# NOTE(review): scipy, scikit-learn and matplotlib are imported in the next cell but are not
# installed here - presumably they are preinstalled in the Colab runtime; confirm for other environments.
# NOTE(review): the `regex` package is installed, but the code in this notebook imports the stdlib `re` module.

!apt-get install -y tesseract-ocr
%pip install opencv-python-headless
%pip install pytesseract
%pip install pandas
%pip install regex

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [4]:
# import libraries

import cv2
import pytesseract
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pytesseract import Output
import re
from scipy.spatial.distance import euclidean
from sklearn.cluster import DBSCAN

## Download the Tesseract OCR language data for the language(s) you require

In [5]:
# Step 1: Verify and download the language data file (if necessary)
# List the language data already present, then download the extra models into the same tessdata directory.
# The OCR calls later in the notebook request lang='spa+cat', so both files below are needed as written.
!ls /usr/share/tesseract-ocr/4.00/tessdata
!wget -P /usr/share/tesseract-ocr/4.00/tessdata https://github.com/tesseract-ocr/tessdata/raw/main/spa.traineddata # replace with dataset main language
!wget -P /usr/share/tesseract-ocr/4.00/tessdata https://github.com/tesseract-ocr/tessdata/raw/main/cat.traineddata # replace with dataset secondary language(s)


# Step 2: Set the TESSDATA_PREFIX environment variable
# (`os` is imported in an earlier cell; the value is the directory the models were downloaded into)
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'

# Step 3: Verify the installation
# The downloaded languages should now appear in the listing.
!tesseract --list-langs

configs  eng.traineddata  osd.traineddata  pdf.ttf  tessconfigs
--2025-08-13 15:49:45--  https://github.com/tesseract-ocr/tessdata/raw/main/spa.traineddata
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/spa.traineddata [following]
--2025-08-13 15:49:45--  https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/spa.traineddata
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18256019 (17M) [application/octet-stream]
Saving to: ‘/usr/share/tesseract-ocr/4.00/tessdata/spa.traineddata’


2025-08-13 15:49:45 (113 MB/s) - ‘/usr/share/tesseract-ocr/4.00/tessdata/spa.tr

#### Clean Detected Text
This section contains the `clean_text` function, which is used to clean the detected text by removing unwanted characters while keeping letters with accents.

##### The `create_keyword_patterns` function generates a regular expression pattern for each keyword in a list. These patterns are used to match keywords in the detected text during OCR processing.

In [6]:
# Function to clean detected text, keeping letters with accents
def clean_text(text):
    """Normalise a piece of text for keyword matching.

    Every character that is not an ASCII letter, a digit, one of the accented
    letters listed in the pattern below, or whitespace is replaced by a space.
    Runs of whitespace are then collapsed to a single space and leading and
    trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    allowed_only = re.sub(r'[^a-zA-Z0-9áéíóòúÁÉÍÓÒÚñÑ\s]', ' ', text)
    # split() with no argument drops empty pieces, so re-joining with one space
    # both collapses whitespace runs and trims the ends.
    return ' '.join(allowed_only.split())

# Function to create regex patterns for keywords
def create_keyword_patterns(keywords):
    """Build one whole-word regular expression pattern per keyword.

    Each keyword is escaped with ``re.escape`` and wrapped in ``\\b`` word
    boundaries so that it only matches as a complete word or phrase.

    Empty or whitespace-only keywords are skipped: an empty keyword would
    produce the pattern ``\\b\\b``, which matches at the start of any word and
    would therefore flag every non-empty OCR token as a keyword hit.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of pattern strings, in the order of the non-blank keywords.
    """
    patterns = []
    for keyword in keywords:
        if not keyword or not keyword.strip():
            continue  # a blank line in the keyword file must not match everything
        patterns.append(r'\b' + re.escape(keyword) + r'\b')
    return patterns

### Approximate segments
This section contains the `appoximate_segments` function, which finds contours in the image using traditional image processing techniques. These contours are used to confirm the OCR-based detected regions, which helps to cover parts of the title block that contain no text.

#### step 1: preprocess image (grayscale, Gaussian blur, adaptive threshold)

#### step 2: morphological operations for horizontal and vertical lines

#### step 3: extend lines by 1%

#### step 4: detect contours

#### step 5: separate large contours

Criteria for large contours: the contour exceeds 15% of the image area (for closed contours; derived from the maximum ratio of title blocks) or 50% of the image perimeter (for full-length title blocks on the longest side). Large contours are discarded.



In [7]:
def appoximate_segments(image_path):
    """Find candidate layout segments (table cells, frames) from ruling lines.

    The image is read as grayscale, blurred, adaptively thresholded (inverted),
    and opened with thin horizontal and vertical kernels to keep only line-like
    structures. The lines are dilated by 1% of the image size, combined, and
    their contours are extracted. Contours that are too large are discarded.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list with one entry per kept contour, each of the form
        ``[[x, y, w, h]]`` (the contour's bounding rectangle), or ``None`` when
        the image cannot be read (an error message is printed in that case).
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    if image is None:
        print(f"Error: Unable to open image file {image_path}")
        return None

    height, width = image.shape[:2]

    # Reduce noise, then binarise with ink as foreground (THRESH_BINARY_INV)
    blurred_image = cv2.GaussianBlur(image, (5, 5), 0)
    binary_image = cv2.adaptiveThreshold(blurred_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

    # Line-detection kernels are 1% of the shorter side. Clamp to at least one
    # pixel: on images under 100 px the original int(...) gave 0, i.e. an empty kernel.
    line_kernel_length = max(1, int(min(height, width) * 0.01))
    kernel_horizontal = np.ones((1, line_kernel_length), np.uint8)
    kernel_vertical = np.ones((line_kernel_length, 1), np.uint8)

    # Opening keeps only runs at least as long as the kernel in that direction
    horizontal_lines = cv2.morphologyEx(binary_image, cv2.MORPH_OPEN, kernel_horizontal)
    vertical_lines = cv2.morphologyEx(binary_image, cv2.MORPH_OPEN, kernel_vertical)

    # Extend the detected lines by 1% of the image size so that nearly-touching
    # lines join into closed cells (same clamp as above)
    horizontal_extension = max(1, int(width * 0.01))
    vertical_extension = max(1, int(height * 0.01))
    horizontal_lines_extended = cv2.dilate(horizontal_lines, np.ones((1, horizontal_extension), np.uint8), iterations=1)
    vertical_lines_extended = cv2.dilate(vertical_lines, np.ones((vertical_extension, 1), np.uint8), iterations=1)

    document_structure = cv2.add(horizontal_lines_extended, vertical_lines_extended)

    contours, _ = cv2.findContours(document_structure, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)

    max_contour_area = 0.15 * height * width
    # NOTE(review): the accompanying text states "50% of image perimeter", which
    # would be (height + width); the threshold value is left as it was.
    max_perimeter = 0.5 * (height + width)

    remaining_contours = []
    for contour in contours:
        contour_area = cv2.contourArea(contour)
        x, y, w, h = cv2.boundingRect(contour)
        # Perimeter of the bounding rectangle. The original `2 * (w) + (h)`
        # counted the height only once because of operator precedence.
        perimeter = 2 * (w + h)
        if contour_area > max_contour_area or perimeter > max_perimeter:
            continue  # too large to be a title-block cell
        remaining_contours.append([[x, y, w, h]])

    return remaining_contours

##### Mask most likely drawing sections
###### This section contains the `mask_images` function, which processes the images in the input folder by masking the region where drawings are most likely to be, in order to reduce the amount of OCR text to extract. The processed images are saved to a separate output directory.

In [8]:
# Function to mask a rectangular region in the image with white pixels and save the processed images
def mask_images(input_directory, output_directory):
    """White out the central drawing area of every image in a directory.

    For each ``.png``/``.jpg``/``.jpeg`` file in ``input_directory`` a rectangle
    around the image centre is filled with white pixels and the result is
    written to ``output_directory`` under the same file name. Files that cannot
    be read as images are reported and skipped.

    Args:
        input_directory: Directory containing the source images.
        output_directory: Directory for the masked copies (created if missing).

    Returns:
        None. The masked images are written to disk.
    """
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(input_directory):
        if not filename.endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(input_directory, filename)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files
            print(f"Error: Unable to open image file {image_path}")
            continue

        height, width = image.shape[:2]
        center_x, center_y = width // 2, height // 2

        # Base rectangle: 50% of the image width by 40% of the image height
        rect_width = int(width * 0.5)
        rect_height = int(height * 0.4)

        # The rectangle is centred, then its left edge is pushed a further 20%
        # of its width to the left and its top edge 30% of its height upwards.
        top_left_x = center_x - rect_width // 2 - int(rect_width * 0.2)
        top_left_y = center_y - rect_height // 2 - int(rect_height * 0.3)
        bottom_right_x = center_x + rect_width // 2
        bottom_right_y = center_y + rect_height // 2

        # Mask the rectangular region with white pixels
        image[top_left_y:bottom_right_y, top_left_x:bottom_right_x] = (255, 255, 255)

        output_path = os.path.join(output_directory, filename)
        cv2.imwrite(output_path, image)

##### Extract Bounding Boxes
This section includes the `extract_bounding_boxes` function, which extracts bounding box coordinates by running Tesseract in PSM 3, 5 and 12 on the (conditionally sharpened) grayscale image and in PSM 12 on an adaptively thresholded copy, since combining these passes proved to detect more bounding boxes than any single method. Words are used as tokens to match keywords in a provided dictionary.

## Conditional sharpening based on Laplacian variance

The threshold was obtained from the dataset: below 4000, OCR improves for 72% of images, is unchanged for 22% and worsens for 6%; above the threshold, OCR improves for 18%, is unchanged for 36% and worsens for 46%.

picked the threshold that gave the highest number of detected bounding boxes from the representative samples

In [9]:
def variance_of_laplacian(image):
    """Return the variance of the image's Laplacian, computed in 64-bit float.

    This notebook uses the value as a sharpness score.
    """
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian.var()

def calculate_sharpness_scores(directory):
    """Compute a sharpness score for every readable image in a directory.

    Files are selected by extension (.png, .jpg, .jpeg, .bmp, .tiff), read as
    grayscale and scored with ``variance_of_laplacian``. Files that OpenCV
    cannot read are silently left out.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        Dict mapping file name to its sharpness score.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    scores_by_name = {}
    for entry in os.listdir(directory):
        if not entry.endswith(image_suffixes):
            continue
        grayscale = cv2.imread(os.path.join(directory, entry), cv2.IMREAD_GRAYSCALE)
        if grayscale is None:
            continue
        scores_by_name[entry] = variance_of_laplacian(grayscale)
    return scores_by_name

# Specify the directory containing the images
image_directory = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/sample_EDs' # (REPLACE FILE PATH)

# Calculate sharpness scores
# NOTE: `sharpness_scores` is a notebook-level dict (file name -> score) that
# extract_bounding_boxes reads later, so this cell must run before any OCR cell.
sharpness_scores = calculate_sharpness_scores(image_directory)
# Same scores ordered from lowest to highest, for display only
sorted_by_values = dict(sorted(sharpness_scores.items(), key=lambda item: item[1]))

# Print the sharpness scores
# NOTE: this loop also leaves a notebook-level `filename` variable behind.
for filename, score in sorted_by_values.items():
    print(f'{filename}: {score}')

a_9_page_1.png: 1766.4493936732333
a_14_page_1.png: 1820.3171495969002
a_2_page_1.png: 2096.288656056248
a_41_page_1.png: 3255.463095929233
a_16_page_1.png: 4311.51835523299


In [10]:
def sharpen_image(image):
    """Return a sharpened copy of ``image``.

    The image is convolved with a 3x3 kernel whose centre weight is 5 and whose
    four direct neighbours are -1; the output has the same depth as the input
    (``ddepth=-1``).
    """
    sharpening_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    return cv2.filter2D(image, -1, sharpening_kernel)

In [11]:
# Function to extract bounding box coordinates for each matched keyword with combined PSM modes & languages
def extract_bounding_boxes(image, keywords, lang='spa+cat', sharpness_score=None, sharpness_threshold=4000):
    """Run OCR in several page-segmentation modes and return keyword hit boxes.

    Two OCR inputs are used:
      * the grayscale image, sharpened first when its sharpness score is below
        ``sharpness_threshold``, read with PSM 3, 5 and 12;
      * an adaptively thresholded copy of the (unsharpened) grayscale image,
        read with PSM 12.
    Every OCR entry whose cleaned, lower-cased text matches one of the keyword
    patterns contributes its ``(left, top, width, height)`` box.

    Args:
        image: BGR (3-D) or grayscale (2-D) image array.
        keywords: Iterable of keyword strings (see ``create_keyword_patterns``).
        lang: Tesseract language string passed to pytesseract.
        sharpness_score: Sharpness score of the image. When ``None`` the score
            is looked up, as before, in the notebook-level ``sharpness_scores``
            dict under the notebook-level ``filename``; 0 is used when either
            is missing, which means the image gets sharpened.
        sharpness_threshold: Scores below this value trigger sharpening.

    Returns:
        Sorted list of unique ``(left, top, width, height)`` tuples.
    """
    keyword_patterns = create_keyword_patterns(keywords)
    if not keyword_patterns:
        return []  # nothing can match, so skip the expensive OCR passes

    # Convert to grayscale only when the image still has colour channels
    if len(image.shape) == 3:
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray_image = image

    if sharpness_score is None:
        # Backward-compatible lookup of the notebook-level state the original
        # code relied on; .get() avoids a NameError when it is not defined.
        notebook_globals = globals()
        known_scores = notebook_globals.get('sharpness_scores', {})
        sharpness_score = known_scores.get(notebook_globals.get('filename'), 0)

    if sharpness_score < sharpness_threshold:
        ocr_input = sharpen_image(gray_image)
    else:
        ocr_input = gray_image

    bounding_boxes = set()
    for psm in (3, 5, 12):
        bounding_boxes |= _ocr_keyword_boxes(ocr_input, keyword_patterns, psm, lang)

    # Second input: adaptive threshold of the unsharpened grayscale image. The
    # original re-ran cvtColor unconditionally here, which raised on images
    # that were already grayscale.
    adapt_thresh_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    bounding_boxes |= _ocr_keyword_boxes(adapt_thresh_image, keyword_patterns, 12, lang)

    # Sorted so that downstream clustering sees a reproducible order
    return sorted(bounding_boxes)


def _ocr_keyword_boxes(ocr_image, keyword_patterns, psm, lang):
    """Return the set of (left, top, width, height) boxes whose OCR text matches a keyword pattern."""
    ocr_data = pytesseract.image_to_data(ocr_image, config=f'--psm {psm}', lang=lang, output_type=pytesseract.Output.DICT)

    matched_boxes = set()
    for index in range(len(ocr_data['level'])):
        text = clean_text(ocr_data['text'][index].lower())
        if any(re.search(pattern, text) for pattern in keyword_patterns):
            matched_boxes.add((
                ocr_data['left'][index],
                ocr_data['top'][index],
                ocr_data['width'][index],
                ocr_data['height'][index],
            ))
    return matched_boxes

##### Adapt DBSCAN Eps
##### This section contains the `adapt_dbscan_eps` function, which adapts the DBSCAN epsilon parameter based on the dimensions of the image. This ensures that the clustering algorithm works effectively for images of different sizes.

In [12]:
# Function to adapt the DBSCAN eps based on the dimensions of each image
def adapt_dbscan_eps(image):
    """Return the base DBSCAN eps for an image: the length of its longest side.

    Args:
        image: Array-like with a ``shape`` whose first two entries are
            (height, width).

    Returns:
        The larger of the image height and width, in pixels.
    """
    rows, cols = image.shape[:2]
    return rows if rows >= cols else cols

# extract bbox from feature-based

In [13]:
def extract_bounding_boxes_from_contours(contours, image):
    """Convert contour entries to corner-format boxes, dropping oversized ones.

    Args:
        contours: Iterable of entries whose first element is ``[x, y, w, h]``;
            empty entries are skipped.
        image: Array whose ``shape[:2]`` is (height, width).

    Returns:
        List of ``[x_min, y_min, x_max, y_max]`` boxes for the entries whose
        ``w * h`` is at most 50% of the image area.
    """
    rows, cols = image.shape[:2]
    area_limit = 0.5 * (cols * rows)

    corner_boxes = []
    for entry in contours:
        if len(entry) == 0:
            continue
        left, top, box_w, box_h = entry[0]
        if box_w * box_h <= area_limit:
            corner_boxes.append([left, top, left + box_w, top + box_h])
    return corner_boxes

# extract bbox from OCR clusters

In [14]:
# Function to create bounding boxes from clusters
def create_bounding_boxes_from_clusters(image, final_clusters):
    """Turn clusters of OCR boxes into corner-format boxes.

    Each cluster is a list of ``(x, y, w, h)`` boxes. A cluster is replaced by
    the single rectangle enclosing all of its boxes when that rectangle covers
    less than 15% of the image area; otherwise its boxes are emitted one by one.

    Fixes relative to the previous implementation:
      * the right/bottom edge is the true maximum of ``x + w`` / ``y + h``
        (it used ``max(x) + min(w)``, which can cut off part of the cluster);
      * individually emitted boxes are converted to corner format like the
        enclosing rectangles (they were emitted as ``(x, y, w, h)``);
      * empty clusters are skipped instead of raising on ``min([])``.

    Args:
        image: Array whose ``shape[:2]`` is (height, width).
        final_clusters: Iterable of clusters, each a list of (x, y, w, h) boxes.

    Returns:
        List of ``[x_min, y_min, x_max, y_max]`` boxes.
    """
    height, width = image.shape[:2]
    max_box_area = 0.15 * (height * width)

    cluster_boxes = []
    for cluster in final_clusters:
        if not cluster:
            continue

        corner_boxes = [[bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]] for bbox in cluster]

        x_min = min(box[0] for box in corner_boxes)
        y_min = min(box[1] for box in corner_boxes)
        x_max = max(box[2] for box in corner_boxes)
        y_max = max(box[3] for box in corner_boxes)

        box_area = (x_max - x_min) * (y_max - y_min)

        if box_area < max_box_area:
            cluster_boxes.append([x_min, y_min, x_max, y_max])
        else:
            # Enclosing rectangle too large: fall back to the individual boxes
            cluster_boxes.extend(corner_boxes)

    return cluster_boxes

# remove feature-based contours that do not overlap with OCR-based areas

In [15]:
# Filter by overlap function
def filter_contours_by_overlap(remaining_contours, cluster_boxes, image, threshold=0):
    """Keep the contour boxes that overlap at least one OCR cluster box.

    Args:
        remaining_contours: Contour entries as returned by
            ``appoximate_segments`` (``[[x, y, w, h]]`` each); ``None`` or an
            empty list yields an empty result.
        cluster_boxes: Corner-format ``[x_min, y_min, x_max, y_max]`` boxes.
        image: Image array, used for the size filter applied by
            ``extract_bounding_boxes_from_contours``.
        threshold: Minimum Intersection over Union required to keep a contour
            box. With the default (0, or any non-positive value) any overlap of
            positive area is enough, as before. A positive threshold is now
            honoured; previously it had no effect because any overlap was
            accepted regardless of the IoU.

    Returns:
        List of corner-format contour boxes that passed the overlap test, each
        at most once, in their original order.
    """
    if remaining_contours is None or len(remaining_contours) == 0:
        return []

    remaining_boxes = extract_bounding_boxes_from_contours(remaining_contours, image)

    def boxes_intersect(box1, box2):
        # True when the rectangles share interior area (touching edges do not count)
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2
        return not (x1_max <= x2_min or x2_max <= x1_min or y1_max <= y2_min or y2_max <= y1_min)

    def intersection_over_union(box1, box2):
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        overlap_w = max(0, min(x1_max, x2_max) - max(x1_min, x2_min))
        overlap_h = max(0, min(y1_max, y2_max) - max(y1_min, y2_min))
        intersection_area = overlap_w * overlap_h

        area1 = (x1_max - x1_min) * (y1_max - y1_min)
        area2 = (x2_max - x2_min) * (y2_max - y2_min)
        union_area = area1 + area2 - intersection_area

        return 0 if union_area == 0 else intersection_area / union_area

    def is_match(rbox, fbox):
        if intersection_over_union(rbox, fbox) > threshold:
            return True
        # Non-positive threshold: preserve the original "any overlap" rule
        return threshold <= 0 and boxes_intersect(rbox, fbox)

    return [
        rbox for rbox in remaining_boxes
        if any(is_match(rbox, fbox) for fbox in cluster_boxes)
    ]

### Union of feature-based and OCR-based boxes

In [16]:
def combine_and_merge(image_path, cluster_boxes):
    """Combine OCR cluster boxes with the layout contours that overlap them.

    Contours are detected with ``appoximate_segments`` and kept only when they
    overlap one of ``cluster_boxes`` (``filter_contours_by_overlap``).

    Args:
        image_path: Path of the image the boxes belong to.
        cluster_boxes: Corner-format ``[x_min, y_min, x_max, y_max]`` boxes
            derived from OCR clusters.

    Returns:
        A new list containing the kept contour boxes followed by the cluster
        boxes. When the image cannot be read, only the cluster boxes are
        returned (previously this raised on ``None``).
    """
    ocr_boxes = list(cluster_boxes)

    # The image itself is only needed for its dimensions in the overlap filter
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to open image file {image_path}")
        return ocr_boxes

    remaining_contours = appoximate_segments(image_path)
    if remaining_contours is None:
        # appoximate_segments already reported the read failure
        return ocr_boxes

    filtered_contours = filter_contours_by_overlap(remaining_contours, ocr_boxes, image)

    return filtered_contours + ocr_boxes

### Group keywords by orientation
This section defines the `group_keywords_and_check_size` function. Clusters that are larger than the threshold in the previous step are split into two categories by box orientation (horizontal and vertical) and regrouped using DBSCAN within a smaller ROI. Outliers are removed by Euclidean distance from the cluster centre until the title block ROI is within the threshold.

In [17]:
# Function to group keywords by their corrected bounding box orientation and check size requirements
def group_keywords_and_check_size(bounding_boxes, img_height, img_width):
    """Cluster keyword boxes per orientation and shrink oversized clusters.

    Boxes are split into "horizontal" (width >= height) and "vertical" groups.
    Each group is clustered with DBSCAN on the box centres, using an eps of 35%
    of the image's longest side and ``min_samples=1``. A cluster whose
    enclosing rectangle exceeds 8.5% of the image area loses the box farthest
    from the median centre, repeatedly, until it fits or a single box is left.

    Fixes relative to the previous implementation:
      * eps is derived from ``img_height``/``img_width`` instead of a
        notebook-level ``image`` variable;
      * outlier removal stops at one box (it used to empty the cluster and then
        raise on ``min([])``);
      * the enclosing rectangle uses the true maximum of ``x + w`` / ``y + h``
        everywhere (it mixed ``max(x) + min(w)`` with ``min(h)``/``max(h)``).
    The previous lower area bound (1% of the image) did not influence the
    result - smaller clusters were kept anyway - so it is not reproduced.

    Args:
        bounding_boxes: Iterable of ``(x, y, w, h)`` boxes.
        img_height: Image height in pixels.
        img_width: Image width in pixels.

    Returns:
        List of clusters, each a non-empty list of ``(x, y, w, h)`` boxes;
        horizontal clusters come first, then vertical ones.
    """
    def box_centres(boxes):
        return np.array([(bbox[0] + bbox[2] / 2, bbox[1] + bbox[3] / 2) for bbox in boxes])

    def enclosing_area(boxes):
        x_min = min(bbox[0] for bbox in boxes)
        y_min = min(bbox[1] for bbox in boxes)
        x_max = max(bbox[0] + bbox[2] for bbox in boxes)
        y_max = max(bbox[1] + bbox[3] for bbox in boxes)
        return (x_max - x_min) * (y_max - y_min)

    # Group keywords by their bounding box orientation
    horizontal_boxes, vertical_boxes = [], []
    for bbox in bounding_boxes:
        width, height = bbox[2], bbox[3]
        if width >= height:
            horizontal_boxes.append(bbox)
        else:
            vertical_boxes.append(bbox)

    max_cluster_area = 0.085 * img_height * img_width
    eps = 0.35 * max(img_height, img_width)

    final_clusters_by_orientation = []
    for group in (horizontal_boxes, vertical_boxes):
        if not group:
            continue

        labels = DBSCAN(eps=eps, min_samples=1).fit(box_centres(group)).labels_

        clusters = {}
        for label, bbox in zip(labels, group):
            clusters.setdefault(label, []).append(bbox)

        for cluster in clusters.values():
            # Successively remove the farthest outlier until the cluster fits
            while len(cluster) > 1 and enclosing_area(cluster) > max_cluster_area:
                centres = box_centres(cluster)
                cluster_center = np.median(centres, axis=0)
                distances = [euclidean(point, cluster_center) for point in centres]
                cluster.pop(int(np.argmax(distances)))

            final_clusters_by_orientation.append(cluster)

    return final_clusters_by_orientation

### Draw Bounding Boxes
This section contains the `draw_bounding_boxes` function, which draws bounding boxes around clusters and isolated keywords on the image. Boxes that overlap or touch are first merged into a single enclosing box, so that each region is outlined once; merged boxes that would exceed 15% of the image area are replaced by the boxes of the OCR clusters.

In [18]:
def draw_bounding_boxes(image, all_boxes_list, final_clusters_by_orientation):
    """Merge overlapping boxes and draw the resulting rectangles on ``image``.

    Boxes that overlap or touch, directly or through a chain of other boxes,
    are merged into their enclosing rectangle. Each resulting rectangle that
    covers less than 15% of the image is padded (0.5% of its width to the left,
    1% to the right; 1% of its height above, 4% below) and drawn. If any
    rectangle is too large, the boxes produced by
    ``create_bounding_boxes_from_clusters`` for
    ``final_clusters_by_orientation`` are drawn in its place.

    The set of rectangles drawn is the same as before; the overlap grouping is
    now iterative (the recursive version could exceed Python's recursion limit
    on long chains of boxes) and the fallback boxes are computed once instead
    of once per oversized rectangle.

    Args:
        image: BGR image array; modified in place.
        all_boxes_list: Corner-format ``[x_min, y_min, x_max, y_max]`` boxes.
        final_clusters_by_orientation: Clusters of ``(x, y, w, h)`` boxes used
            for the fallback.

    Returns:
        None.
    """
    def is_overlapping(box1, box2):
        # Overlapping or touching (shared edges count)
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2
        return not (x1_max < x2_min or x2_max < x1_min or y1_max < y2_min or y2_max < y1_min)

    def enclosing_box(boxes):
        return [
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        ]

    def find_clusters(boxes):
        # Connected components of the overlap graph, explored with an explicit stack
        unvisited = set(range(len(boxes)))
        clusters = []
        for start in range(len(boxes)):
            if start not in unvisited:
                continue
            unvisited.discard(start)
            stack = [start]
            members = []
            while stack:
                current = stack.pop()
                members.append(boxes[current])
                neighbours = [j for j in unvisited if is_overlapping(boxes[current], boxes[j])]
                unvisited.difference_update(neighbours)
                stack.extend(neighbours)
            clusters.append(members)
        return clusters

    image_area = image.shape[0] * image.shape[1]
    max_box_area = 0.15 * image_area

    boxes_to_draw = []
    needs_cluster_fallback = False
    for members in find_clusters(list(all_boxes_list)):
        # A single-box cluster is its own enclosing box
        x_min, y_min, x_max, y_max = enclosing_box(members)

        box_area = (x_max - x_min) * (y_max - y_min)
        if box_area < max_box_area:
            width_increase = int((x_max - x_min) * 0.005)
            height_increase = int((y_max - y_min) * 0.01)
            boxes_to_draw.append((
                x_min - width_increase,
                y_min - height_increase,
                x_max + (width_increase * 2),
                y_max + (height_increase * 4),
            ))
        else:
            needs_cluster_fallback = True

    if needs_cluster_fallback:
        boxes_to_draw.extend(create_bounding_boxes_from_clusters(image, final_clusters_by_orientation))

    # Draw the final bounding boxes on the image
    for (x_min, y_min, x_max, y_max) in boxes_to_draw:
        cv2.rectangle(image, (int(x_min), int(y_min)), (int(x_max), int(y_max)), (255, 0, 255), 3)

# process image to get final_clusters

In [19]:
# Function to process image and return final_clusters
def process_image(image, masked_image, keywords, img_height, img_width):
    """Run OCR on the masked image and cluster the keyword boxes.

    Keyword boxes are clustered with DBSCAN on their centres (eps from
    ``adapt_dbscan_eps(image)``, ``min_samples=1``). A cluster whose enclosing
    rectangle covers less than 15% of the image is kept as is; a larger one is
    regrouped with ``group_keywords_and_check_size``.

    Fixes relative to the previous implementation:
      * an image without keyword hits returns ``[]`` instead of failing inside
        DBSCAN on an empty array;
      * the enclosing rectangle uses the true maximum of ``x + w`` / ``y + h``
        (it used ``max(x) + min(w)`` / ``max(y) + min(h)``).

    Args:
        image: Original image; only used to derive the DBSCAN eps.
        masked_image: Image passed to OCR.
        keywords: Iterable of keyword strings.
        img_height: Image height in pixels.
        img_width: Image width in pixels.

    Returns:
        List of clusters, each a list of ``(x, y, w, h)`` boxes.
    """
    bounding_boxes = extract_bounding_boxes(masked_image, keywords)
    if not bounding_boxes:
        return []

    # Box centres are the points clustered by DBSCAN
    points = np.array([(bbox[0] + bbox[2] / 2, bbox[1] + bbox[3] / 2) for bbox in bounding_boxes])

    eps = adapt_dbscan_eps(image)
    labels = DBSCAN(eps=eps, min_samples=1).fit(points).labels_

    clusters = {}
    for label, bbox in zip(labels, bounding_boxes):
        clusters.setdefault(label, []).append(bbox)

    max_cluster_area = 0.15 * img_height * img_width

    final_clusters = []
    for cluster in clusters.values():
        x_min = min(bbox[0] for bbox in cluster)
        y_min = min(bbox[1] for bbox in cluster)
        x_max = max(bbox[0] + bbox[2] for bbox in cluster)
        y_max = max(bbox[1] + bbox[3] for bbox in cluster)

        cluster_area = (x_max - x_min) * (y_max - y_min)

        if cluster_area < max_cluster_area:
            final_clusters.append(cluster)
        else:
            # Too large: regroup by orientation and trim outliers
            final_clusters.extend(group_keywords_and_check_size(cluster, img_height, img_width))

    return final_clusters

# Combined script

In [20]:
# Main script # (REPLACE FILE PATHS)
input_dir = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/sample_EDs'
output_dir_final_images = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/predictions'
output_dir_masked_images = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/mask_applied'
keywords_file_path = '/content/drive/MyDrive/ICDAR_workshop/Github_AnonED/4o_mini_dict.txt'

# Create output directories if they don't exist
os.makedirs(output_dir_final_images, exist_ok=True)
os.makedirs(output_dir_masked_images, exist_ok=True)

# Load keywords from text file, one per line.
# They are normalised exactly like the OCR text (lower-cased, then clean_text)
# so that both sides of the match agree. The previous inline regex used `\\s`
# inside a raw string, which is a literal backslash plus "s" rather than the
# whitespace class, and it did not collapse repeated spaces.
# Blank lines and duplicates are dropped; an empty keyword would otherwise
# turn into a pattern that matches every word.
with open(keywords_file_path, 'r', encoding='utf-8') as file:
    cleaned_lines = [clean_text(line.lower()) for line in file]
keywords = list(dict.fromkeys(keyword for keyword in cleaned_lines if keyword))

# Mask images and save to the masked images directory
mask_images(input_dir, output_dir_masked_images)

# Process images in the input directory.
# NOTE: the loop variable must stay named `filename`: extract_bounding_boxes
# looks up the sharpness score of the current image through that
# notebook-level name (together with `sharpness_scores`).
for filename in os.listdir(input_dir):
    # Same extensions as mask_images, so every masked image is also processed
    if not filename.endswith(('.png', '.jpg', '.jpeg')):
        continue

    image_path = os.path.join(input_dir, filename)
    masked_image_path = os.path.join(output_dir_masked_images, filename)

    image = cv2.imread(image_path)
    masked_image = cv2.imread(masked_image_path)
    if image is None or masked_image is None:
        # cv2.imread returns None instead of raising on unreadable files
        print(f"Error: Unable to open image file {image_path} or its masked copy {masked_image_path}")
        continue
    img_height, img_width = image.shape[:2]

    # Extract bounding boxes with combined PSM modes & languages
    bounding_boxes = extract_bounding_boxes(masked_image, keywords)
    print(f"Image: {filename}, Number of bounding boxes: {len(bounding_boxes)}")

    # The earlier process_image(...) call was removed here: its result was
    # never used, and it repeated every OCR pass on the same image.

    # Get clusters separated by orientation
    final_clusters_by_orientation = group_keywords_and_check_size(bounding_boxes, img_height, img_width)

    # Create bounding boxes from clusters
    cluster_boxes = create_bounding_boxes_from_clusters(image, final_clusters_by_orientation)

    # Combine and merge bounding boxes
    all_boxes_list = combine_and_merge(image_path, cluster_boxes)

    # Draw final bounding boxes (modifies `image` in place)
    draw_bounding_boxes(image, all_boxes_list, final_clusters_by_orientation)

    # Save the final image with drawn bounding boxes
    final_image_path = os.path.join(output_dir_final_images, filename)
    cv2.imwrite(final_image_path, image)

print("Processing complete.")

Image: a_16_page_1.png, Number of bounding boxes: 43
Image: a_2_page_1.png, Number of bounding boxes: 41
Image: a_14_page_1.png, Number of bounding boxes: 12
Image: a_41_page_1.png, Number of bounding boxes: 11
Image: a_9_page_1.png, Number of bounding boxes: 6
Processing complete.
