# Page Structure

## Read image from PDF file

https://github.com/Belval/pdf2image/blob/master/docs/reference.md

In [None]:
import cv2
import numpy as np
from pdf2image import convert_from_path

# Render every page of the sample PDF at a width of 2500 pixels.
pil_images = convert_from_path('samples/test2pdf.pdf', size=(2500, None))
# np.array() of a PIL image keeps PIL's RGB channel order, so the matching
# conversion code is RGB2GRAY (BGR2GRAY would swap the red/blue weights).
images = [
    cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2GRAY)
    for pil_image in pil_images
]

In [None]:
from matplotlib import pyplot as plt

# Preview every loaded page.
for image in images:
    plt.figure(figsize=(3, 6))
    # The pages are single-channel intensity arrays; request the gray
    # colormap explicitly, otherwise imshow applies its default colormap.
    plt.imshow(image, cmap='gray')
    plt.show()

In [None]:
# Shape of the first page's grayscale array, expected to be (rows, columns)
images[0].shape

In [None]:
def load_images_from_pdf(pdf_path, size=None):
    """
    Load the pages of a PDF file as grayscale OpenCV images.
    :param pdf_path: path of the PDF file
    :param size: the preferred size in pixels as a (width, height) tuple,
        passed on unchanged to convert_from_path
    :return: list of OpenCV images (one grayscale NumPy array per page)
    """
    pil_images = convert_from_path(pdf_path, size=size)
    # np.array() of a PIL image keeps PIL's RGB channel order, so the matching
    # conversion code is RGB2GRAY (BGR2GRAY would swap the red/blue weights).
    images = [
        cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2GRAY)
        for pil_image in pil_images
    ]
    return images

## Margin estimation

## Row and column profiles

In [None]:
# Work with the first page of the sample document
images = load_images_from_pdf('samples/test2pdf.pdf', size=(2500, None))
image = images[0]
# Mean intensity of every pixel row (averaged along axis 1, the columns)
row_profile = np.mean(image, axis=1)
# Mean intensity of every pixel column (averaged along axis 0, the rows)
column_profile = np.mean(image, axis=0)

In [None]:
# Plot the row profile on a wide, flat figure
plt.figure(figsize=(16, 1))
plt.plot(row_profile)
plt.show()

In [None]:
# Plot the column profile with the default figure size
plt.figure()
plt.plot(column_profile)
plt.show()

In [None]:
def find_first_change(values):
    """
    Locate the first position where a value differs from its predecessor.
    :param values: an indexable sequence of comparable objects
    :return: the smallest index i for which values[i - 1] != values[i]
    :raises ValueError: when no two neighbouring values differ
    """
    for index in range(1, len(values)):
        if values[index - 1] != values[index]:
            return index
    raise ValueError('All values are the same!')

In [None]:
def find_last_change(values):
    """
    Locate the last position where a value differs from its successor.
    :param values: an indexable sequence of comparable objects
    :return: the largest index i for which values[i] != values[i + 1]
    :raises ValueError: when no two neighbouring values differ
    """
    # Walk backwards from the second to last element down to index 0.
    for index in range(len(values) - 2, -1, -1):
        if values[index] != values[index + 1]:
            return index
    raise ValueError('All values are the same!')

In [None]:
def calc_margins(image):
    """
    Estimate the page margins from the mean intensity profiles of the image.
    :param image: the NumPy array of page intensity image
    :return: dictionary with the 'left', 'right', 'top' and 'bottom' indices
        where the column and row profiles first and last change
    """
    column_profile = np.mean(image, axis=0)
    row_profile = np.mean(image, axis=1)
    # Horizontal margins come from the column profile, vertical margins
    # from the row profile.
    left = find_first_change(column_profile)
    right = find_last_change(column_profile)
    top = find_first_change(row_profile)
    bottom = find_last_change(row_profile)
    return {'left': left, 'right': right, 'top': top, 'bottom': bottom}

In [None]:
# Show the estimated margins of the current page image
calc_margins(image)

Display the margins

In [None]:
from matplotlib import pyplot as plt

# Shade the area enclosed by the estimated margins on top of the page.
margins = calc_margins(image)
width = margins['right'] - margins['left']
height = margins['bottom'] - margins['top']
fig, ax = plt.subplots(figsize=(10, 20))
ax.imshow(image, cmap='gray')
rectangle = plt.Rectangle(
    (margins['left'], margins['top']),
    width,
    height,
    facecolor='black',
    alpha=0.1,
)
ax.add_patch(rectangle)
plt.show()

In [None]:
# Zoom into the part of the row profile between indices 200 and 500
plt.figure(figsize=(16, 2))
plt.plot(row_profile)
plt.xlim(200, 500)
plt.show()

In [None]:
# Collect the [start, end) index ranges of the row profile whose mean
# intensity differs from the background.
segments = []
start = None
background_color = 255
i = -1  # keeps the trailing-segment check valid for an empty profile
for i, value in enumerate(row_profile):
    if value != background_color:
        if start is None:
            start = i
    elif start is not None:
        segments.append((start, i))
        start = None
# A segment that is still open at the last row would otherwise be lost.
if start is not None:
    segments.append((start, i + 1))
segments

In [None]:
def find_segments(values, background_color):
    """
    Find the segments with non-background colors in the iterable.
    :param values: intensity values
    :param background_color: the background color which should be skipped
    :return: list of segments as [start, end) tuples of indices
    """
    segments = []
    start = None
    i = -1  # keeps the trailing-segment check valid for empty input
    for i, value in enumerate(values):
        if value != background_color:
            # Open a new segment on the first non-background value.
            if start is None:
                start = i
        elif start is not None:
            # A background value closes the currently open segment.
            segments.append((start, i))
            start = None
    # Close a segment that runs up to the very end of the values.
    if start is not None:
        segments.append((start, i + 1))
    return segments

Display the segments

In [None]:
# Highlight every detected row segment between the left and right margins.
margins = calc_margins(image)
width = margins['right'] - margins['left']
row_profile = np.mean(image, axis=1)
background_color = 255
segments = find_segments(row_profile, background_color)
fig, ax = plt.subplots(figsize=(10, 20))
ax.imshow(image, cmap='gray')
for segment in segments:
    height = segment[1] - segment[0]
    rectangle = plt.Rectangle(
        (margins['left'], segment[0]),
        width,
        height,
        facecolor='blue',
        alpha=0.1,
    )
    ax.add_patch(rectangle)
plt.show()

Calculate the spacing between segments and display the histogram

In [None]:
def calc_spacing_between_segments(segments):
    """
    Measure the gaps between each pair of neighbouring segments.
    :param segments: list of segments as [start, end) intervals
    :return: list with the distance from the end of each segment to the
        start of the segment that follows it
    """
    return [
        following[0] - current[1]
        for current, following in zip(segments, segments[1:])
    ]

In [None]:
# Gaps between the consecutive segments found in the row profile
spacing = calc_spacing_between_segments(segments)
print(spacing)

In [None]:
# Histogram of the spacing values between the segments
plt.figure()
plt.hist(spacing)
plt.show()

Join the segments that are closer to each other than a minimal spacing

In [None]:
# Merge neighbouring segments whose gap is smaller than min_spacing.
min_spacing = 30
# Start from an empty result so that an empty segment list is handled too.
joined_segments = []
for segment in segments:
    if joined_segments and (segment[0] - joined_segments[-1][1]) < min_spacing:
        # Extend the previous joined segment up to the end of this one.
        joined_segments[-1] = (joined_segments[-1][0], segment[1])
    else:
        joined_segments.append(segment)
print(joined_segments)

Display the paragraphs

In [None]:
# Highlight the joined segments (paragraphs) between the left and right margins.
margins = calc_margins(image)
width = margins['right'] - margins['left']
row_profile = np.mean(image, axis=1)
background_color = 255
segments = find_segments(row_profile, background_color)
fig, ax = plt.subplots(figsize=(10, 20))
ax.imshow(image, cmap='gray')
for segment in joined_segments:
    height = segment[1] - segment[0]
    rectangle = plt.Rectangle(
        (margins['left'], segment[0]),
        width,
        height,
        facecolor='red',
        alpha=0.1,
    )
    ax.add_patch(rectangle)
plt.show()

In [None]:
def join_segments(segments, min_spacing):
    """
    Join the segments which are closer to each other than the minimal spacing.
    :param segments: list of segments as [start, end) intervals
    :param min_spacing: the minimal spacing between the joined segments
    :return: list of segments in the same format as the input
        (an empty list when there are no segments)
    """
    # Start from an empty result so that an empty input does not raise.
    joined_segments = []
    for segment in segments:
        if joined_segments and (segment[0] - joined_segments[-1][1]) < min_spacing:
            # Extend the previous joined segment up to the end of this one.
            joined_segments[-1] = (joined_segments[-1][0], segment[1])
        else:
            joined_segments.append(segment)
    return joined_segments

TODO: Estimate the optimal spacing from the histogram!