In [82]:
from copy import deepcopy

import pandas as pd
import json
from pathlib import Path

In [21]:
# Load the page-level JSON document that the rest of the notebook works on.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where this file exists — consider a configurable data directory.
# The encoding is pinned to UTF-8 so that non-ASCII text in the file is decoded
# the same way regardless of the platform's locale default.
data = json.loads(
    Path(
        "/home/egordm/Projects/workshops/hackalod-gouda23-na-td/data/suppleties/nl_zh_ha_na-2_10_50-505_13_page_data.json"
    ).read_text(encoding="utf-8")
)

In [22]:
# Echo the parsed JSON document as the cell output to inspect its structure.
data

{'creator': 'READ-COOP',
 'model_name': 'TrHtr',
 'model_version': '2.3.0',
 'model_identifier': '51170',
 'model_date': '04_10_2023_12',
 'created': '2023-10-03T17:32:38.967+02:00',
 'last_change': '2023-11-04T00:06:39.070+01:00',
 'measurement_unit': 'pixel',
 'primary_language': None,
 'producer': 'Transkribus',
 'image': {'image_filename': 'nl_zh_ha_na-2_10_50-505_13.jpg',
  'image_width': 8374,
  'image_height': 5897,
  'image_size': 8384,
  'bbox_page_coordinates': [118, 132, 8123, 5787],
  'page_area': 43564793.0,
  'page_contour_shape': 'rectangular'},
 'regions': {'0': {'region_orientation': '0',
   'region_type': 'table_row',
   'identifier': 'region_1696352088230_1364',
   'reading_order': '0',
   'structure': 'index',
   'shape_class': 'Rectangle',
   'coordinates_list': [['267', '1395'],
    ['267', '1463'],
    ['456', '1463'],
    ['456', '1395']],
   'bbox_height': 68,
   'bbox_width': 189,
   'bbox_area': 12852,
   'bbox_area_relative': 0.00029500886185778504,
   'cent

In [37]:
# Structure labels of interest, in a fixed order.
# NOTE(review): this list is not referenced by any other cell visible here.
allowed_structures = [
    'index_military', 'name', 'family_data',
    'muster_data', 'campaign_data', 'comments',
]

In [40]:


def preprocess(element, prefix=''):
    """Print an indented outline of selected fields found in a nested structure.

    For every dict encountered, the values of the keys ``structure``, ``text``,
    ``region_text``, ``row_surname`` and ``row_name`` are printed (in that
    order, when present), each preceded by ``prefix``. All keys except ``text``
    are also preceded by a ``"<key>: "`` label. Every dict value and every list
    item is then visited recursively with a longer prefix.

    Args:
        element: Any nested combination of dicts, lists and scalars; scalars
            are ignored.
        prefix: Text printed in front of every line emitted at this level.

    Returns:
        None. The function only prints.
    """
    indent_step = '  '
    # (key, label) pairs in print order; 'text' is printed without a label.
    labelled_fields = (
        ('structure', 'structure: '),
        ('text', ''),
        ('region_text', 'region_text: '),
        ('row_surname', 'row_surname: '),
        ('row_name', 'row_name: '),
    )

    if isinstance(element, dict):
        for field, label in labelled_fields:
            if field in element:
                print(prefix + label + element[field])

        # Dict values are nested one step deeper and marked with a dash.
        child_prefix = prefix + indent_step + '- '
        for value in element.values():
            preprocess(value, child_prefix)
    elif isinstance(element, list):
        # List items are only indented; no dash marker is added.
        child_prefix = prefix + indent_step
        for item in element:
            preprocess(item, child_prefix)



In [41]:
# Print the outline of the whole document. `preprocess` has no return
# statement, so `hello` is always None; only the printed output is of interest.
hello = preprocess(data)

  -   - structure: index
  -   - region_text: 5118 
  -   - row_surname: 5118
  -   -   -   - 5118
  -   - structure: page_number
  -   - region_text: 1707 
  -   -   -   - 1707
  -   - structure: index
  -   - region_text: J.J. Montagne 5120 Montagne Johannes, Jacobus V. P. 
  -   - row_surname: Montagne
  -   - row_name: Johannes, Jacobus
  -   -   -   - J.J. Montagne
  -   -   -   - 5120
  -   -   -   - Montagne
  -   -   -   - Johannes, Jacobus
  -   -   -   - V. P.
  -   - structure: index
  -   - region_text: 63635 5119 56220 Toole Geert 
  -   - row_surname: Toole
  -   - row_name: 56220
  -   -   -   - 63635
  -   -   -   - 5119
  -   -   -   - 56220
  -   -   -   - Toole
  -   -   -   - Geert
  -   - structure: index
  -   -   -   - Door¬
  -   -   -   - loopende
  -   -   -   - nummers.
  -   -   -   - 1
  -   - structure: index
  -   - region_text: V.6-2-40 N:o 24 Afwijzing steunverzoek. 5118 56064 't Hart Jan 
  -   - row_surname: 't Hart
  -   - row_name: 56064
  -   -   -

In [83]:
def extract_entities(items: list):
    """Group a flat list of items into entities delimited by structure markers.

    A marker is a dict of the form ``{'key': 'structure', 'value': <type>}``.
    Each marker opens a new entity; every following item (dict or not) is
    collected into that entity's ``content`` until the next marker or the end
    of the list. Items that appear before the first marker are passed through
    to the output unchanged.

    Every entity is emitted as ``{'entity_type': <type>, 'content': [...]}``.
    The same key is used for entities closed by a following marker and for the
    last entity, so consumers that look for ``'entity_type'`` see all of them.

    Args:
        items: Flat list of already-preprocessed items.

    Returns:
        A new list containing pass-through items and entity dicts, in input
        order.
    """
    output = []
    cur_entity_type = None
    cur_entity_content = None

    for item in items:
        is_marker = isinstance(item, dict) and item.get('key') == 'structure'
        if is_marker:
            # Close the entity that was open before this marker, if any.
            if cur_entity_type:
                output.append({
                    'entity_type': cur_entity_type,
                    'content': cur_entity_content
                })
            cur_entity_type = item['value']
            cur_entity_content = []
        elif cur_entity_type:
            cur_entity_content.append(item)
        else:
            # No entity is open: the item passes through as-is.
            output.append(item)

    # Close the entity that is still open at the end of the list.
    if cur_entity_type:
        output.append({
            'entity_type': cur_entity_type,
            'content': cur_entity_content
        })

    return output
    

def preprocess_v2(element):
    """Recursively convert a nested structure into key/value records and entities.

    * dict: a ``{'key': ..., 'value': ...}`` record is created for each of the
      keys ``structure``, ``text``, ``region_text``, ``row_surname`` and
      ``row_name`` that is present (in that order). All dict values are then
      processed recursively; falsy results are dropped. If every remaining
      child result is a string, they are joined with spaces into one
      ``child_text`` record; otherwise they are grouped with
      ``extract_entities`` and stored in a ``children`` record. A dict that
      yields only a single ``text`` record collapses to that text string;
      anything else is passed through ``extract_entities``.
    * list: items are processed recursively, falsy results are dropped, and
      the remainder is passed through ``extract_entities``.
    * anything else: ``None``.

    Args:
        element: Nested combination of dicts, lists and scalars.

    Returns:
        A string, a list, or ``None`` as described above.
    """
    if isinstance(element, dict):
        tracked_fields = ('structure', 'text', 'region_text', 'row_surname', 'row_name')
        records = [
            {'key': field, 'value': element[field]}
            for field in tracked_fields
            if field in element
        ]

        # Recurse into every value; empty/None results carry no information.
        child_results = [
            result for result in map(preprocess_v2, element.values()) if result
        ]
        if child_results:
            if all(isinstance(result, str) for result in child_results):
                records.append({
                    'key': 'child_text',
                    'value': ' '.join(child_results),
                })
            else:
                records.append({
                    'key': 'children',
                    'value': extract_entities(child_results),
                })

        # A lone text record is represented by the bare string.
        if len(records) == 1 and records[0]['key'] == 'text':
            return records[0]['value']
        return extract_entities(records)

    if isinstance(element, list):
        kept = [result for result in map(preprocess_v2, element) if result]
        return extract_entities(kept)

    return None



In [84]:
# Run the structured preprocessing over the whole document and show the first
# top-level element of the result.
helloz = preprocess_v2(data)
helloz[0]

{'key': 'children',
 'value': [[{'key': 'children',
    'value': [[{'entity_type': 'index',
       'content': [{'key': 'region_text', 'value': '5118 '},
        {'key': 'row_surname', 'value': '5118'},
        {'key': 'children',
         'value': [[{'key': 'child_text', 'value': '5118'}]]}]}],
     [{'entity_type': 'page_number',
       'content': [{'key': 'region_text', 'value': '1707 '},
        {'key': 'children',
         'value': [[{'key': 'child_text', 'value': '1707'}]]}]}],
     [{'entity_type': 'index',
       'content': [{'key': 'region_text',
         'value': 'J.J. Montagne 5120 Montagne Johannes, Jacobus V. P. '},
        {'key': 'row_surname', 'value': 'Montagne'},
        {'key': 'row_name', 'value': 'Johannes, Jacobus'},
        {'key': 'children',
         'value': [[{'key': 'child_text',
            'value': 'J.J. Montagne 5120 Montagne Johannes, Jacobus V. P.'}]]}]}],
     [{'entity_type': 'index',
       'content': [{'key': 'region_text',
         'value': '63635 5

In [85]:
# Number of top-level elements returned by preprocess_v2.
len(helloz)

1

In [95]:
def flatten_me_like_a_bus(data):
    if isinstance(data, dict):
        if 'entity_type' in data:
            data['content'] = flatten_me_like_a_bus(data['content'])
        elif 'key' in data and data['key'] == 'children':
            data['value'] = flatten_me_like_a_bus(data['value'])
            
            if isinstance(data['value'], dict) and data['value'].get('key', None) == 'children':
                data['value'] = data['value']['value']
                
            data = data['value']
        elif 'key' in data and data['key'] == 'child_text':
            data = data['value']
        elif 'key' in data and data['key'] == 'region_text':
            data = data['value']
        else:
            # data['value'] = flatten_me_like_a_bus(data['value'])
            data = f'{data["key"]}: {flatten_me_like_a_bus(data["value"])}'
    elif isinstance(data, list):
        data = [flatten_me_like_a_bus(item) for item in data]
        if len(data) == 1 and isinstance(data[0], list | str):
            data = data[0]
    elif isinstance(data, str):
        pass
    return data

# flatten_me_like_a_bus rewrites dict entries in place, so it is given a deep
# copy to keep `helloz` intact; the result is then shown as the cell output.
flattened_data = flatten_me_like_a_bus(deepcopy(helloz))
flattened_data

[[{'entity_type': 'index', 'content': ['5118 ', 'row_surname: 5118', '5118']}],
 [{'entity_type': 'page_number', 'content': ['1707 ', '1707']}],
 [{'entity_type': 'index',
   'content': ['J.J. Montagne 5120 Montagne Johannes, Jacobus V. P. ',
    'row_surname: Montagne',
    'row_name: Johannes, Jacobus',
    'J.J. Montagne 5120 Montagne Johannes, Jacobus V. P.']}],
 [{'entity_type': 'index',
   'content': ['63635 5119 56220 Toole Geert ',
    'row_surname: Toole',
    'row_name: 56220',
    '63635 5119 56220 Toole Geert']}],
 [{'entity_type': 'index', 'content': 'Door¬ loopende nummers. 1'}],
 [{'entity_type': 'index',
   'content': ["V.6-2-40 N:o 24 Afwijzing steunverzoek. 5118 56064 't Hart Jan ",
    "row_surname: 't Hart",
    'row_name: 56064',
    "V.6-2-40 N:o 24 Afwijzing steunverzoek. 5118 56064 't Hart Jan"]}],
 [{'entity_type': 'index_military',
   'content': 'Algemeen stamboek nummer in Indië 2'}],
 [{'entity_type': 'name', 'content': 'NAMEN EN VOORNAMEN. 3'}],
 [{'entity_

In [113]:
def clean_me_like_a_firehose(data):
    if isinstance(data, list):
        data = [clean_me_like_a_firehose(item) for item in data]
        data_new = []
        already_seen = set()
        for item in data:
            if isinstance(item, dict | list):
                data_new.append(item)
                continue
            
            if item in already_seen:
                continue
            already_seen.add(item)
            data_new.append(item)
        
        data = data_new
    elif isinstance(data, dict):
        if 'entity_type' in data:
            entity_type = data['entity_type']
            value = None
            if entity_type in {'comments', 'campaign_data', 'muster_data', 'index', 'family_data'}:
                value = data['content'][1:]
            else:
                value = data['content']
            
            data = {
                'entity': entity_type,
                'value': value
            }
    elif isinstance(data, str):
        data = data.strip()
    else:
        pass
    return data

# Clean a deep copy of the flattened data and show the result as cell output.
cleaned_data = clean_me_like_a_firehose(deepcopy(flattened_data))
cleaned_data

[[{'entity': 'index', 'value': ['row_surname: 5118', '5118']}],
 [{'entity': 'page_number', 'value': ['1707 ', '1707']}],
 [{'entity': 'index',
   'value': ['row_surname: Montagne',
    'row_name: Johannes, Jacobus',
    'J.J. Montagne 5120 Montagne Johannes, Jacobus V. P.']}],
 [{'entity': 'index',
   'value': ['row_surname: Toole',
    'row_name: 56220',
    '63635 5119 56220 Toole Geert']}],
 [{'entity': 'index', 'value': 'oor¬ loopende nummers. 1'}],
 [{'entity': 'index',
   'value': ["row_surname: 't Hart",
    'row_name: 56064',
    "V.6-2-40 N:o 24 Afwijzing steunverzoek. 5118 56064 't Hart Jan"]}],
 [{'entity': 'index_military',
   'value': 'Algemeen stamboek nummer in Indië 2'}],
 [{'entity': 'name', 'value': 'NAMEN EN VOORNAMEN. 3'}],
 [{'entity': 'family_data',
   'value': 'amen der ouders, geboorteplaats, datum van geboorte, laatste woonplaats en signalement. 4'}],
 [{'entity': 'family_data',
   'value': ["row_surname: 't Hart",
    'row_name: 56064',
    'Vader Jan Moeder 

In [127]:
def pannekoek(data):
    """Collect entity dicts from a nested list structure into title/people/events.

    All dicts reachable through nested lists are visited in order (dicts are
    not descended into; other values are ignored). Each dict must have an
    ``entity`` key and, for the handled types, a ``value`` key:

    * ``campaign_data`` / ``muster_data``: value appended to ``events``
    * ``family_data``: value appended to ``people``
    * ``index_military``: value becomes ``title`` (a later one overwrites an
      earlier one)
    * any other entity type is skipped

    Args:
        data: Nested lists containing entity dicts.

    Returns:
        ``{'title': ..., 'people': [...], 'events': [...]}``; ``title`` is
        ``None`` when no ``index_military`` entity is present.
    """
    def iter_entities(node):
        # Depth-first, in-order walk that yields every dict found in the lists.
        if isinstance(node, dict):
            yield node
        elif isinstance(node, list):
            for child in node:
                yield from iter_entities(child)

    found = list(iter_entities(data))

    title = None
    people, events = [], []

    for record in found:
        kind = record['entity']
        if kind in ('campaign_data', 'muster_data'):
            events.append(record['value'])
        elif kind in ('family_data',):
            people.append(record['value'])
        elif kind in ('index_military',):
            title = record['value']

    return {
        'title': title,
        'people': people,
        'events': events
    }


# Build the title/people/events summary from a deep copy of the cleaned data
# and show it as the cell output.
result = pannekoek(deepcopy(cleaned_data))
result

{'title': 'Algemeen stamboek nummer in Indië 2',
 'people': ['amen der ouders, geboorteplaats, datum van geboorte, laatste woonplaats en signalement. 4',
  ["row_surname: 't Hart",
   'row_name: 56064',
   'Vader Jan Moeder Catharina van Bemmel Geboren te Leiderdorp /Z.H. den 12:e November 1883 Laatst gewoond te Rotterdam Bij zijne aankomst bij het korps lang 1:667 Meter.'],
  ['row_surname: Toole',
   'row_name: 56220',
   'Vader Hendrik Moeder Jantje Baptist Geboren te Borger (Drenthe) den 15 Augustus 1875 Laatst gewoond te Borger Bij zijne aankomst bij het korp lang 1.708 Meter'],
  ['row_surname: Montagne',
   'row_name: Johannes, Jacobus',
   'Vader Johannes Moeder Jannetje van der Meer Geboren te Leiden (Zuid-Holl:d.) 30:en Januari 1862 den Laatst gewoond te Leiden Bij zijne aankomst bij het korps lang 1.563 Meter. Gehuwd'],
  ['row_surname: Toole',
   'row_name: 56220',
   'Haar Kin Aangezicht Ovaal Voorhoofd smal Oogen bruin Neus gewoon Mond id rond blond Wenkbrauwen id Merkbar

In [124]:
# Persist the summary as pretty-printed JSON in the current working directory.
with open('out.json', mode='w') as f:
    f.write(json.dumps(result, indent=2))