# OCR XML Processing
This notebook shows the process of extracting relevant OCR information from the XML files downloaded in `load_data.ipynb` and explored in `explore_xml.ipynb`. The idea is to get the information we want into a more convenient format. For each XML file, we produce two auxiliary files: a text file containing the OCR text (one line per `TextLine`, with a blank line after each `TextBlock`) and a JSON file containing information about bounding boxes.

## Imports

In [14]:
from xml.etree import ElementTree as ET # for parsing XML
from matplotlib import pyplot as plt    # for data visualization
import json

## Globals

In [15]:
# Data root and the names of its per-format subdirectories
# (presumably combined as DATA_PATH + <X>_DIR -- confirm against the callers).
DATA_PATH: str = './data/'
XML_DIR: str = 'xml/'
TXT_DIR: str = 'txt/'
JSON_DIR: str = 'json/'

# Location of the single example page that ships with the notebook.
EX_PATH: str = './example_data/'
EX_XML_PATH: str = f'{EX_PATH}lccnsn830302141906-04-20ed-1seq-1.xml'

# XML namespace of the OCR files, in the `{uri}` form that ElementTree
# expects as a prefix on every tag name passed to find()/findall().
SCHEMA: str = '{http://schema.ccs-gmbh.com/ALTO}'

## XML Processing
This cell defines a function that processes an XML file into two files: a `.txt` file containing the OCR text and a `.json` file containing the block-, line-, and string-level bounding box data.

In [24]:
def _bounding_box(element):
    """Return the bounding box of *element* as a left/upper/right/lower dict.

    The box is derived from the element's ``HPOS``, ``VPOS``, ``WIDTH`` and
    ``HEIGHT`` attributes, each converted with ``int``.

    Raises:
        KeyError: if one of the four attributes is missing.
        ValueError: if an attribute is not an integer literal.
    """
    left, upper = int(element.attrib['HPOS']), int(element.attrib['VPOS'])
    return {
        'left': left,
        'upper': upper,
        'right': left + int(element.attrib['WIDTH']),
        'lower': upper + int(element.attrib['HEIGHT']),
    }


def process_xml(filename, txt_dir, json_dir, xml_path=None, schema=None):
    """Convert one OCR XML file into a text file and a bounding-box JSON file.

    The text file receives the ``CONTENT`` of every ``String`` followed by a
    space, a newline after every ``TextLine``, and an extra newline after
    every ``TextBlock``.  The JSON file maps each block ID to its bounding
    box plus its lines (keyed by line ID); each line holds its own bounding
    box plus its strings (keyed by string ID, with ``content`` and box).

    Args:
        filename: Base name (no extension) of the two output files.
        txt_dir: Prefix for the ``.txt`` output; it is concatenated directly
            with ``filename``, so it must end with a path separator.
        json_dir: Prefix for the ``.json`` output; same rule as ``txt_dir``.
        xml_path: Path of the XML file to read.  Defaults to ``EX_XML_PATH``,
            which is what this function always read before the parameter
            existed.
        schema: Namespace prefix in ``{uri}`` form.  Defaults to ``SCHEMA``.

    Raises:
        KeyError: if an element lacks ``ID``, ``CONTENT`` or a box attribute.
        ValueError: if a box attribute is not an integer literal.
    """
    if xml_path is None:
        xml_path = EX_XML_PATH
    if schema is None:
        schema = SCHEMA

    # Walk root -> Layout -> Page -> PrintSpace.  find() returns None when a
    # level is missing; test for that explicitly, because the truth value of
    # an Element is deprecated and is False for a childless element.
    print_space = ET.parse(xml_path).getroot()
    for subtag in ('Layout', 'Page', 'PrintSpace'):
        print_space = print_space.find(f'{schema}{subtag}')
        if print_space is None:
            break

    # A document without a PrintSpace still produces (empty) output files.
    if print_space is None:
        blocks = []
    else:
        blocks = print_space.findall(f'{schema}TextBlock')

    page_dict = {}

    # OCR text is frequently non-ASCII, so do not rely on the locale encoding.
    with open(f'{txt_dir}{filename}.txt', 'w', encoding='utf-8') as tfp:
        for block in blocks:
            block_dict = _bounding_box(block)
            for line in block.findall(f'{schema}TextLine'):
                line_dict = _bounding_box(line)
                for string in line.findall(f'{schema}String'):
                    content = string.attrib['CONTENT']
                    tfp.write(content + ' ')
                    line_dict[string.attrib['ID']] = {
                        'content': content,
                        **_bounding_box(string),
                    }
                tfp.write('\n')
                block_dict[line.attrib['ID']] = line_dict
            tfp.write('\n')
            page_dict[block.attrib['ID']] = block_dict

    with open(f'{json_dir}{filename}.json', 'w', encoding='utf-8') as jfp:
        json.dump(page_dict, jfp)

# Run the conversion on the example page; both output files are written
# next to it in ./example_data/ as example.txt and example.json.
process_xml(
    filename='example',
    txt_dir='./example_data/',
    json_dir='./example_data/',
)