# Összefüggő képrészek klaszterezése

In [None]:
import cv2
import numpy as np
from pdf2image import convert_from_path

def load_images_from_pdf(pdf_path, size=None):
    """
    Load the pages of a PDF file as grayscale OpenCV images.

    :param pdf_path: path of the PDF file
    :param size: the preferred size in pixels as a (width, height) tuple;
        it is passed unchanged to pdf2image's convert_from_path
    :return: list of single-channel intensity images, one per rendered page
    """
    pil_images = convert_from_path(pdf_path, size=size)
    # The arrays come from PIL images, whose channel order is RGB (not
    # OpenCV's BGR), so the RGB conversion code has to be used; otherwise
    # the red and blue luminance weights are swapped.
    return [
        cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2GRAY)
        for pil_image in pil_images
    ]

In [None]:
# Render the sample PDF with a requested width of 2500 pixels; the height is
# left as None and forwarded as-is to the PDF renderer.
images = load_images_from_pdf('samples/test12.pdf', size=(2500, None))
# Every later cell works on the second page (index 1) of the document.
image = images[1]
# Cell output: the (rows, columns) shape of the selected page.
image.shape

In [None]:
# Write the selected grayscale page to a PNG file.
cv2.imwrite('/tmp/test.png', image)

In [None]:
def find_blob(image, start_point, threshold, is_visited):
    """
    Collect the coordinates of the blob which contains the start point.

    The blob is grown from the start point through the 4-connected
    neighbours (up, left, right, down) whose intensity is below the
    threshold and which have not been visited yet.

    :param image: NumPy intensity image
    :param start_point: (row, column) tuple of the start point
    :param threshold: value of intensity threshold
    :param is_visited: binary matrix which signs the visited pixels;
        it is updated in place for every collected pixel
    :return: list of the (row, column) coordinates of the blob points,
        including the start point and without duplicates; an empty list
        when the start point is not below the threshold
    """
    blob = []
    if image[start_point] >= threshold:
        return blob
    n_rows, n_columns = image.shape[:2]
    # A pixel is marked as visited at the moment it is queued (not when it is
    # processed), so it cannot be queued again by another neighbour and
    # therefore appears only once in the result.
    is_visited[start_point] = True
    blob.append(start_point)
    unchecked = [start_point]
    while unchecked:
        row, column = unchecked.pop()
        neighbours = (
            (row - 1, column),
            (row, column - 1),
            (row, column + 1),
            (row + 1, column),
        )
        for neighbour in neighbours:
            n_row, n_column = neighbour
            if not (0 <= n_row < n_rows and 0 <= n_column < n_columns):
                continue
            if image[neighbour] < threshold and not is_visited[neighbour]:
                is_visited[neighbour] = True
                unchecked.append(neighbour)
                blob.append(neighbour)
    return blob

In [None]:
# Notebook-level state: intensity threshold, an all-zero visited mask with the
# shape of the page, and the page dimensions.
# NOTE(review): among the visible cells only is_visited is read afterwards
# (the find_blob call passes 128 literally) - confirm before removing anything.
threshold = 128
is_visited = np.zeros(image.shape, dtype=int)
n_rows, n_columns = image.shape

In [None]:
# Grow a single blob from the pixel at row 341, column 304. The returned
# coordinate list is the cell output; the pixels reached are recorded in the
# notebook-level is_visited mask.
find_blob(image, (341, 304), 128, is_visited)

In [None]:
# The mask stores 0/1 flags in NumPy's default integer dtype. Convert it to an
# 8-bit image with 0/255 values so the encoder accepts it and the visited
# pixels are actually visible in the written PNG.
cv2.imwrite('/tmp/is_visited.png', (is_visited != 0).astype(np.uint8) * 255)

In [None]:
def find_blobs(image, threshold=128):
    """
    Find the blobs on the image.

    The pixels below the threshold are scanned in row-major order; every
    pixel which has not been reached yet starts a new blob search.

    :param image: a two dimensional NumPy array
    :param threshold: value of intensity threshold; pixels below it belong
        to blobs (defaults to 128)
    :return: list of the collected blobs, where a blob is a list of
        (row, column) coordinates
    """
    blobs = []
    is_visited = np.zeros(image.shape, dtype=bool)
    # Only the pixels below the threshold can start a blob, so the scan is
    # restricted to them. tolist() yields plain Python integers in the same
    # row-major order as a nested loop over rows and columns.
    for i, j in np.argwhere(image < threshold).tolist():
        if is_visited[i, j]:
            continue
        blob = find_blob(image, (i, j), threshold, is_visited)
        is_visited[i, j] = True
        # An empty result carries no coordinates and would break any later
        # bounding box computation, so it is not stored.
        if blob:
            blobs.append(blob)
    return blobs

In [None]:
# Collect every blob of the selected page; the cell output is their count.
blobs = find_blobs(image)
len(blobs)

## Analysis of the blobs

Display of the bounding boxes

In [None]:
# Draw the bounding box of every blob onto a three-channel copy of the page
# and write the result to a PNG file.
bounding_boxes = cv2.merge((image, image, image))
for blob in blobs:
    if not blob:
        # An empty blob has no extent; min()/max() would raise ValueError.
        continue
    xs = [j for _, j in blob]
    ys = [i for i, _ in blob]
    x_1 = min(xs)
    y_1 = min(ys)
    x_2 = max(xs)
    y_2 = max(ys)
    # OpenCV points are (x, y), i.e. (column, row); the colour is in BGR order.
    bounding_boxes = cv2.rectangle(bounding_boxes, (x_1, y_1), (x_2, y_2), (0, 0, 255))
cv2.imwrite('/tmp/bounded_boxes.png', bounding_boxes)

Width and height distribution

In [None]:
# Bounding box width, bounding box height and pixel count of every blob.
widths = []
heights = []
counts = []
for blob in blobs:
    if not blob:
        # An empty blob has no extent; min()/max() would raise ValueError.
        continue
    xs = [j for _, j in blob]
    ys = [i for i, _ in blob]
    x_1 = min(xs)
    y_1 = min(ys)
    x_2 = max(xs)
    y_2 = max(ys)
    # Both ends of the bounding box are inclusive, hence the +1: a blob of a
    # single pixel is 1 pixel wide and 1 pixel high, not 0.
    width = x_2 - x_1 + 1
    height = y_2 - y_1 + 1
    widths.append(width)
    heights.append(height)
    counts.append(len(blob))

In [None]:
from matplotlib import pyplot as plt

# Histogram of the blob bounding box widths (default binning).
plt.figure()
plt.hist(widths)
plt.show()

In [None]:
from matplotlib import pyplot as plt

# Histogram of the blob bounding box heights (default binning).
plt.figure()
plt.hist(heights)
plt.show()

In [None]:
from matplotlib import pyplot as plt

# Histogram of the number of collected coordinates per blob (default binning).
plt.figure()
plt.hist(counts)
plt.show()

## Find character *a*

In [None]:
# Select the first blob whose coordinate list contains the (row, column)
# point (440, 450); a_blob stays None when no blob contains it.
# NOTE(review): presumably this pixel lies on an 'a' glyph of the sample page
# - verify against the rendered image.
point = (440, 450)
a_blob = None
for blob in blobs:
    if point in blob:
        a_blob = blob
        break

In [None]:
def render_blob(blob, image_size):
    """
    Render the blob to a fixed size image.

    The blob is shifted so that its smallest row and smallest column land
    on index 0. Pixels which do not fit into the image are left out instead
    of raising an IndexError, so blobs larger than the image are cropped.

    :param blob: list of (row, column) pixel coordinates
    :param image_size: size of the rendered image as a (rows, columns) tuple
    :return: a NumPy array with 0 and 1 values; all zeros for an empty blob
    """
    image = np.zeros(image_size, dtype=int)
    if len(blob) == 0:
        return image
    n_rows, n_columns = image.shape
    coordinates = np.asarray(blob, dtype=int).reshape(-1, 2)
    # Offsets are non-negative by construction, only the upper bounds can be
    # exceeded.
    offsets = coordinates - coordinates.min(axis=0)
    fits = (offsets[:, 0] < n_rows) & (offsets[:, 1] < n_columns)
    image[offsets[fits, 0], offsets[fits, 1]] = 1
    return image

In [None]:
def compare_blob(blob_1, blob_2, image_size=(50, 50)):
    """
    Calculate the distance of the blobs.

    Both blobs are rendered to binary images of the same size; the distance
    is the number of pixel positions where the two images differ.

    :param blob_1: list of blob pixels
    :param blob_2: list of blob pixels
    :param image_size: size of the images used for the comparison
        (defaults to (50, 50))
    :return: distance of the blobs
    """
    image_1 = render_blob(blob_1, image_size)
    image_2 = render_blob(blob_2, image_size)
    # np.sum without an axis already reduces over the whole array.
    distance = np.sum(np.abs(image_1 - image_2))
    return distance

In [None]:
# Distance of every blob from the reference blob, in the order of blobs.
distances = []
for blob in blobs:
    distance = compare_blob(blob, a_blob)
    distances.append(distance)

In [None]:
from matplotlib import pyplot as plt

# Histogram of the distances from the reference blob, using 20 bins.
plt.figure()
plt.hist(distances, bins=20)
plt.show()

In [None]:
# Mark the bounding box of every blob which is close to the reference blob
# by clearing the first channel of a three-channel copy of the page.
result = cv2.merge((image, image, image))

for blob in blobs:
    if not blob:
        # An empty blob has no bounding box; min()/max() would raise ValueError.
        continue
    distance = compare_blob(blob, a_blob)
    if distance < 60:
        rows = [i for i, _ in blob]
        columns = [j for _, j in blob]
        min_row, max_row = min(rows), max(rows)
        min_column, max_column = min(columns), max(columns)
        # Slice ends are exclusive, so +1 is needed to include the last row
        # and the last column of the bounding box.
        result[min_row:max_row + 1, min_column:max_column + 1, 0] = 0

cv2.imwrite('/tmp/a_characters.png', result)