In [1]:
import os
import itertools
from tqdm.notebook import tqdm
import numpy as np

import cv2
from matplotlib import pyplot as plt

# Number of pixels cropped from every edge of an image in hide_infobox()
# before any further processing.
MARGIN_SIZE = 120

In [2]:
def hide_infobox(img_path, output_path):
    """Cover the detected infobox region of an image with a white rectangle.

    The image is read with OpenCV and cropped by ``MARGIN_SIZE`` pixels on
    every side. Contours that simplify to exactly two vertices are treated as
    line segments; the bounding box of all such vertices lying below the top
    quarter of the cropped image is widened slightly, extended down to the
    bottom edge, and filled with white. The (possibly modified) cropped image
    is then written to ``output_path``.

    Args:
        img_path: Path of the image to read.
        output_path: Path the processed image is written to.

    Returns:
        A 2x2 NumPy array ``[[x_min, y_min], [x_max, y_max]]`` holding the
        corners of the filled rectangle, or ``None`` when the image could not
        be read or no line segment was detected (in the latter case the
        cropped image is still written, unmodified).
    """
    source = cv2.imread(img_path)
    if source is None:
        # cv2.imread reports an unreadable or missing file by returning None
        # rather than raising; slicing it would fail with an opaque TypeError.
        tqdm.write(f"ERROR: Failed to read image {img_path}.")
        return None

    img = source[MARGIN_SIZE:-MARGIN_SIZE, MARGIN_SIZE:-MARGIN_SIZE]
    height, width = img.shape[:2]

    # Grayscale processing: scale contrast, then zero every pixel that is not
    # above the threshold (THRESH_TOZERO keeps the remaining values as-is).
    gray = cv2.cvtColor(img, code=cv2.COLOR_BGR2GRAY)
    contrast = cv2.convertScaleAbs(gray, alpha=1.5, beta=-50)
    _, bright = cv2.threshold(contrast, thresh=180, maxval=255, type=cv2.THRESH_TOZERO)

    # Detect lines
    edges = cv2.Canny(bright, threshold1=10, threshold2=200)
    blur_edges = cv2.blur(edges, (2, 2))
    contours, _ = cv2.findContours(blur_edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    line_coords = []
    for cnt in contours:
        arclen = cv2.arcLength(cnt, True)
        # Simplify with a tolerance of 1% of the contour's perimeter.
        approx = cv2.approxPolyDP(cnt, arclen * 0.01, True)
        # A contour reduced to exactly two vertices is treated as a line.
        # NOTE(review): a stricter filter (short extent < 10 px and long
        # extent > 40 px) used to sit here but was disabled with `True or`;
        # every two-vertex contour is accepted, as before.
        if len(approx) == 2:
            line_coords.extend(approx)

    # Flatten to an (N, 2) array of (x, y) points; N may be 0.
    line_coords = np.reshape(line_coords, (-1, 2))

    # Remove lines in upper quarter of image (underline for running header)
    line_coords = line_coords[line_coords[:, 1] > height // 4]

    if line_coords.size == 0:
        tqdm.write(f"ERROR: Failed to detect infobox in {output_path}.")
        corners = None
    else:
        corners = np.array([np.min(line_coords, axis=0), np.max(line_coords, axis=0)])

        # Adjust covering rectangle: reach the bottom edge and pad the right
        # side by a small fraction of the image width.
        corners[1][1] = height
        corners[1][0] += width // 500

        # Pass plain Python ints so the call does not depend on how a given
        # OpenCV build converts NumPy scalars/rows into points.
        top_left = tuple(int(v) for v in corners[0])
        bottom_right = tuple(int(v) for v in corners[1])
        cv2.rectangle(img, top_left, bottom_right, (255, 255, 255), -1)

    # TODO: optionally warn when the covered area is implausibly large
    # (e.g. more than 40% of the image), which may indicate a misdetection.

    # cv2.imwrite returns False (without raising) when the file cannot be
    # written, so surface that instead of failing silently.
    if not cv2.imwrite(output_path, img):
        tqdm.write(f"ERROR: Failed to write {output_path}.")

    return corners

In [3]:
def rename(img_dirs, reverse=True):
    """Rename every entry of each directory to ``raw_NNNN.jp2``, in place.

    Within each directory the entries are numbered from 0 in name-sorted
    order, or in descending name order when ``reverse`` is true.

    The renaming is done in two passes (first to unique temporary names, then
    to the final names) so that a target name can never clash with a file that
    has not been renamed yet. If an error interrupts the first or second pass,
    some entries may be left under their temporary ``.<token>_NNNN.tmp`` names.

    Args:
        img_dirs: Iterable of directory paths whose entries are renamed.
        reverse: If true, number the entries in reverse name order.

    Returns:
        Total number of directory entries found across all ``img_dirs``.
    """
    import uuid  # local import: only needed for collision-free temp names

    page_count = 0

    for img_dir in tqdm(img_dirs):
        # os.listdir returns names in arbitrary order; sort explicitly so the
        # numbering (and its reversal) is deterministic.
        files = sorted(os.listdir(img_dir))
        page_count += len(files)

        if reverse:
            files.reverse()

        # Pass 1: move everything to unique temporary names. Renaming straight
        # to raw_NNNN.jp2 could hit a name still held by a not-yet-renamed
        # file (e.g. when this runs again on an already renamed directory),
        # which silently replaces that file on POSIX and raises on Windows.
        token = uuid.uuid4().hex
        staged = []
        for index, file in enumerate(files):
            staged_path = os.path.join(img_dir, f".{token}_{index:04}.tmp")
            os.rename(os.path.join(img_dir, file), staged_path)
            staged.append(staged_path)

        # Pass 2: assign the final sequential names. Wrapping the list (not
        # the enumerate object) lets tqdm know the total.
        for index, staged_path in enumerate(tqdm(staged)):
            os.rename(staged_path, os.path.join(img_dir, f"raw_{index:04}.jp2"))

    return page_count

In [6]:
def batch_preprocess(img_dirs, out_dir, batch_size=100):
    """Run ``hide_infobox`` over every file and store results in batch folders.

    Files are collected from ``img_dirs`` in the given directory order and in
    name-sorted order within each directory, then split into consecutive
    groups of ``batch_size``. Group ``i`` is written to
    ``<out_dir>/batch_<i:04>/`` and each output is named ``<page:04>.jpg``,
    where ``page`` is a counter running across all batches.

    Args:
        img_dirs: Iterable of directories containing the input files.
        out_dir: Directory under which the batch folders are created.
        batch_size: Maximum number of files per batch folder.

    Returns:
        Number of input files passed to ``hide_infobox``.
    """
    # os.listdir returns names in arbitrary order; sort so that output page
    # numbers follow the file names deterministically.
    files = [
        os.path.join(img_dir, filename)
        for img_dir in img_dirs
        for filename in sorted(os.listdir(img_dir))
    ]

    # Materialise the batches so tqdm can show a total.
    batches = list(itertools.batched(files, batch_size))

    page_count = 0
    for index, batch in enumerate(tqdm(batches)):
        batch_dir = os.path.join(out_dir, f"batch_{index:04}")
        # makedirs(exist_ok=True) also creates out_dir when missing and lets
        # the cell be re-run without failing on existing batch folders.
        os.makedirs(batch_dir, exist_ok=True)

        for file in tqdm(batch):
            hide_infobox(file, os.path.join(batch_dir, f"{page_count:04}.jpg"))
            page_count += 1

    return page_count

In [5]:
# Directories whose files are renamed in place below, and the directory that
# later receives the processed batches.
input_dirs = ["kokuyaku1", "kokuyaku2"]
output_dir = "collated"
# NOTE: rename() changes file names on disk and uses reverse=True by default;
# re-running this cell renames the already renamed files again.
# The displayed value is the total number of entries found.
rename(input_dirs)

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

1032

In [7]:
# Process every file from input_dirs into numbered batch folders under
# output_dir; the displayed value is the number of files processed.
batch_preprocess(input_dirs, output_dir)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

ERROR: Failed to detect infobox in collated\batch_0005\0548.jpg.


  0%|          | 0/100 [00:00<?, ?it/s]

ERROR: Failed to detect infobox in collated\batch_0006\0613.jpg.
ERROR: Failed to detect infobox in collated\batch_0006\0657.jpg.
ERROR: Failed to detect infobox in collated\batch_0006\0680.jpg.


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

1032