In [19]:
#import libraries

import xml.etree.ElementTree as ET
from lxml import etree
import os, os.path
import json
import numpy as np
from tqdm import tqdm

In [None]:
import cv2
import numpy as np
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm

def get_text_blocks(root, h, w):
    """
    Returns the list of TextBlock elements with coordinates

    Args:
        root: root element of a parsed ALTO v4 document.
        h: height of the page image in pixels.
        w: width of the page image in pixels.

    Returns:
        A list of dicts with integer 'x', 'y', 'width' and 'height' keys, one per
        usable TextBlock, in document order. Every rectangle is clipped to the
        image bounds and has a strictly positive width and height.

    If the document declares an OtherTag labelled 'MainZone', only TextBlocks
    whose TAGREFS reference that tag are returned; otherwise every TextBlock of
    the PrintSpace is considered. Blocks with missing, empty or non-numeric
    coordinates, and blocks lying entirely outside the image, are skipped.
    """
    ns = "{http://www.loc.gov/standards/alto/ns-v4#}"

    # check if MainZone ontology exists (the first matching tag wins)
    has_main_zone = False
    other_tag_id = None
    for tag in root.findall(f"{ns}Tags/{ns}OtherTag"):
        if tag.get('LABEL') == 'MainZone':
            has_main_zone = True
            other_tag_id = tag.get('ID')
            break

    text_blocks = root.findall(f"{ns}Layout/{ns}Page/{ns}PrintSpace/{ns}TextBlock")
    if has_main_zone:
        # TAGREFS is a whitespace-separated list of tag ids, so compare token by
        # token instead of requiring the attribute to equal the id exactly.
        text_blocks = [
            text_block for text_block in text_blocks
            if other_tag_id in text_block.get('TAGREFS', '').split()
        ]

    text_blocks_with_coords = []
    for text_block in text_blocks:
        try:
            x, y, width, height = (
                round(float(text_block.attrib[key]))
                for key in ('HPOS', 'VPOS', 'WIDTH', 'HEIGHT')
            )
        except (KeyError, ValueError, OverflowError):
            # Missing, empty, non-numeric or non-finite coordinate: unusable block.
            continue

        # Clip the rectangle to the image on all four sides.
        left, top = max(x, 0), max(y, 0)
        right, bottom = min(x + width, w), min(y + height, h)
        if right <= left or bottom <= top:
            # Nothing of the block lies inside the image; cropping it would
            # produce an empty zone.
            continue

        text_blocks_with_coords.append(
            {'x': left, 'y': top, 'width': right - left, 'height': bottom - top}
        )

    return text_blocks_with_coords


def create_zone_image(img, zone_coords):
    """
    Returns the zone image cropped from the original image

    Args:
        img: image array indexed as [row, column, ...] (as returned by cv2.imread).
        zone_coords: dict with integer 'x', 'y', 'width' and 'height' keys
            describing the zone rectangle in pixel coordinates.

    Returns:
        A new array holding the pixels of the zone, clipped to the image bounds.
        The result does not share memory with ``img``.

    Raises:
        ValueError: if the zone does not overlap the image at all.
    """
    img_h, img_w = img.shape[:2]

    # Clamp to the image so that negative origins cannot wrap around through
    # negative slice indices and oversized zones stop at the border.
    left = max(zone_coords['x'], 0)
    top = max(zone_coords['y'], 0)
    right = min(zone_coords['x'] + zone_coords['width'], img_w)
    bottom = min(zone_coords['y'] + zone_coords['height'], img_h)

    if right <= left or bottom <= top:
        raise ValueError(
            f"Zone {zone_coords} does not overlap an image of size {img_w}x{img_h}"
        )

    # A rectangular crop is a plain slice; no full-size mask is required. The
    # end indices are exclusive, so zones touching the right/bottom border keep
    # their last column/row.
    return img[top:bottom, left:right].copy()


def save_zone_image(zone_img, img_file, i, img_save_path):
    """
    Saves the zone image

    Args:
        zone_img: image array of the cropped zone.
        img_file: file name of the source page image; its extension is replaced.
        i: index of the zone within the page, appended to the file stem.
        img_save_path: directory in which the PNG is written (created if absent).

    Returns:
        The file name (not the full path) of the written PNG.

    Raises:
        OSError: if OpenCV reports that the file could not be written.
    """
    # splitext removes only the final extension, so stems containing dots
    # ("ms.lat.12_f3.jpg") are kept whole instead of being cut at the first dot.
    stem = os.path.splitext(img_file)[0]
    # NOTE(review): stem and index are joined without a separator, so e.g.
    # ("f1", 12) and ("f11", 2) map to the same name. Kept as-is so existing
    # output names do not change - confirm whether a separator is wanted.
    zone_img_name = stem + str(i) + '.png'

    if img_save_path:
        os.makedirs(img_save_path, exist_ok=True)
    zone_img_path = os.path.join(img_save_path, zone_img_name)

    # cv2.imwrite signals failure through its boolean return value.
    if not cv2.imwrite(zone_img_path, zone_img, [cv2.IMWRITE_PNG_COMPRESSION, 9]):
        raise OSError(f"Could not write zone image to {zone_img_path}")
    return zone_img_name


def process_image_file(directory, img_file):
    """
    Loads a page image and the root of its ALTO annotation

    Args:
        directory: corpus directory containing the "fort_abbr" and/or
            "faible_abbr" image folders and an "annotations" folder.
        img_file: file name of the page image.

    Returns:
        A tuple (img, root): the image as loaded by cv2.imread and the root
        element of "<directory>/annotations/<stem>.xml". (None, None) is
        returned when the file is not a ".jpg", is found in neither image
        folder, or has no annotation file. ``img`` may be None if OpenCV
        cannot decode the image.

    When the same file name exists in both image folders, the copy in
    "fort_abbr" is the one that is loaded.
    """
    import warnings

    if not img_file.endswith(".jpg"):
        return None, None

    # Determine the directory for the image file
    for candidate in ("fort_abbr", "faible_abbr"):
        if os.path.exists(os.path.join(directory, candidate, img_file)):
            img_dir = candidate
            break
    else:
        return None, None

    # Swap only the final extension; str.replace would rewrite every ".jpg"
    # occurring in the name.
    xml_file = os.path.join(directory, "annotations", os.path.splitext(img_file)[0] + ".xml")

    # Check for the annotation before decoding the image: a missing file should
    # skip this page rather than abort the whole batch, and the check is far
    # cheaper than loading the image.
    if not os.path.isfile(xml_file):
        warnings.warn(f"No annotation file {xml_file} for image {img_file}; skipping")
        return None, None

    # Load image and corresponding XML ALTO file
    img_path = os.path.join(directory, img_dir, img_file)
    img = cv2.imread(img_path)

    tree = ET.parse(xml_file)
    root = tree.getroot()

    return img, root

In [None]:
# Define the source directories
dirs = ["CREMMA_fro", "CREMMA_lat", "ECMEN", "Gallicorpora_15"]

# Define the output directories
faible_abbr_output_dir = 'final_faible_abbr'
fort_abbr_output_dir = 'final_fort_abbr'

# File names of images for which no text block could be extracted
missing_zones = []

# Image sub-folder of each source directory -> output directory for its zones.
# Insertion order is kept, so fort_abbr is processed before faible_abbr.
output_dir_by_subdir = {
    "fort_abbr": fort_abbr_output_dir,
    "faible_abbr": faible_abbr_output_dir,
}

# Make sure the output directories exist before anything is written to them.
for output_dir in output_dir_by_subdir.values():
    os.makedirs(output_dir, exist_ok=True)

# Loop over the source directories
for directory in dirs:
    for subdir, output_dir in output_dir_by_subdir.items():
        # Process the sub-folder only if it exists in this source directory
        src_dir = os.path.join(directory, subdir)
        if not os.path.isdir(src_dir):
            continue

        # Process each image file of the sub-folder
        for img_file in tqdm(os.listdir(src_dir), desc=f"{directory}/{subdir}"):
            img, root = process_image_file(directory, img_file)
            if img is None or root is None:
                continue

            # Extract text block elements and their coordinates
            text_blocks = get_text_blocks(root, img.shape[0], img.shape[1])
            if not text_blocks:
                missing_zones.append(img_file)
                continue

            # Crop and save every text block of the page
            for j, text_block_coords in enumerate(text_blocks):
                zone_img = create_zone_image(img, text_block_coords)
                save_zone_image(zone_img, img_file, j, output_dir)


100%|███████████████████████████████████████████| 46/46 [04:13<00:00,  5.51s/it]
100%|███████████████████████████████████████████| 64/64 [09:06<00:00,  8.54s/it]
100%|███████████████████████████████████████████| 12/12 [00:16<00:00,  1.41s/it]
100%|█████████████████████████████████████████████| 2/2 [00:32<00:00, 16.17s/it]
 73%|███████████████████████████████▎           | 62/85 [05:49<00:14,  1.57it/s]