Add:

* Year of the book
* Identification number of the text

In [1]:
import pandas as pd
from os.path import join
import json
import re
from collections import Counter
from tqdm import tqdm



def clean_title(title):
    """Return *title* with whitespace collapsed and stray markup removed.

    Three regex substitutions are applied in order: every run of
    whitespace becomes a single space, each literal ``</b>`` closing tag
    is deleted, and every comma is deleted.
    """
    # Order matters: whitespace is collapsed before the tag is removed, so
    # a tag that sat between two spaces leaves both spaces behind.
    substitutions = (
        (r'\s+', ' '),
        (r'<\/b>', ''),
        (r',', ''),
    )
    for pattern, replacement in substitutions:
        title = re.sub(pattern, replacement, title)
    return title

def regularise_sequence_number(seq_nr):
    """Return *seq_nr* as a string normalised towards a width of four.

    The value is first converted with ``str()``.  A value made up only of
    digits is zero-padded on the left to four characters (``7`` becomes
    ``'0007'``).  Any other value is prefixed with a hyphen, preceded by
    one ``'9'`` for every character it falls short of four (``'3a'``
    becomes ``'99-3a'``); values of four or more characters get only the
    hyphen.
    """
    width = 4
    text = str(seq_nr)

    if re.search(r'^\d+$', text):
        return text.zfill(width)

    # A negative repeat count yields an empty string, so long values are
    # simply prefixed with '-'.
    filler = '9' * (width - len(text))
    return filler + '-' + text

In [2]:
# Load the Heurist JSON export and keep the list of records it contains.
path = join('..','Export','Export_leiden_prayer_20250218131307.json')

# The context manager closes the file handle as soon as parsing is done.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(path, encoding='utf-8') as json_file:
    json_data = json.load(json_file)

records = json_data['heurist']['records']

* 102 - Production layer
* 112 - Expression
* 123 - Text
* 1 - Record relationship

## Collect the titles

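This creates a dictionary connecting Heurist IDs to titles. The same pass over the records also collects the production year and the book type of each production layer.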

In [3]:
# Lookup tables keyed by Heurist record ID.
titles = dict()            # Text and Production layer records -> title
production_dates = dict()  # Production layer records -> year as int
book_types = dict()        # Production layer records -> termLabel of field 1201

# Accepts YYYY, YYYY-MM and YYYY-MM-DD.  'X'/'x' is allowed in the year
# part (presumably as a placeholder for an unknown digit -- TODO confirm).
date_pattern = re.compile(r'^[\dXx]{4}(-\d{2})?(-\d{2})?$')

# (pattern, replacement) pairs that shorten the location prefix of a book
# title.  Each pattern also absorbs the whitespace after the prefix, so the
# abbreviation is joined directly to whatever follows.
# NOTE(review): confirm that the missing separator is intended.
# NOTE(review): 'KOPENHAGEN' is mapped to 'KB DE'; confirm that 'DE' is the
# intended code.
library_abbreviations = (
    (r"'S-GRAVENHAGE, KB\s*", 'KB NL'),
    (r'BRUSSEL, KB\s*', 'KB BE'),
    (r'KOPENHAGEN, KB\s*', 'KB DE'),
    (r'LEIDEN, UB\s*', 'UBL'),
    (r'LONDEN, BL\s*', 'BL'),
    (r'PARIJS, BN\s*', 'BNF'),
)

for record in records:

    # Text
    if record['rec_RecTypeID'] == '123':
        H_ID = int(record['rec_ID'])
        for field in record['details']:
            if field['dty_ID']==1152:
                titles[H_ID] = field['value']

    # Production Layer
    if record['rec_RecTypeID'] == '102':
        H_ID = int(record['rec_ID'])

        for field in record['details']:

            if field['dty_ID']==1201:
                book_types[H_ID] = field['termLabel']

            if field['dty_ID']==1145:
                field_value = field['value']
                date_text = str(field_value)
                year_text = date_text[:4]

                # The pattern tolerates 'X' in the year, which int() cannot
                # convert; only fully numeric years are stored, the rest is
                # reported below instead of raising ValueError.
                if date_pattern.search(date_text) and year_text.isdecimal():
                    production_dates[H_ID] = int(year_text)

                # A date range: store the rounded midpoint.  The isinstance
                # check keeps the membership tests from failing on values
                # that are neither strings nor containers.
                elif (isinstance(field_value, dict)
                      and 'estMinDate' in field_value
                      and 'estMaxDate' in field_value):
                    min_date = field_value['estMinDate']
                    max_date = field_value['estMaxDate']
                    average = (min_date+max_date)/2
                    production_dates[H_ID] = round(average)

                else:
                    print('Problem', end=': ')
                    print(field_value)

            if field['dty_ID']==1098:
                title = field['value']
                for pattern, abbreviation in library_abbreviations:
                    title = re.sub(pattern, abbreviation, title)

                titles[H_ID] = title

                

In [4]:
# One row per Expression record: which text occurs in which book, and at
# which position.
rows = []

for record in records:

    # Expression
    if record['rec_RecTypeID'] != '112':
        continue

    # Reset every value for each record so that nothing leaks over from the
    # previous Expression when a field is absent.
    book = None
    text = None
    text_id = None
    sequence_number = None

    for field in record['details']:

        if field['dty_ID']==1274:
            book = int(field['value']['id'])

        elif field['dty_ID']==1193:
            text = int(field['value']['id'])
            # The identifier is whatever precedes the first colon of the
            # linked record's title; titles without a colon give None.
            linked_title = str(field['value']['title'])
            prefix, colon, _ = linked_title.partition(':')
            text_id = prefix if colon else None

        elif field['dty_ID']==1094:
            sequence_number = field['value']

    # Only expressions linked to both a book and a text are kept.
    if book is not None and text is not None:
        rows.append([book, text, text_id, sequence_number])

texts_in_books = pd.DataFrame(rows,columns=['book','text','text_id','sequence_number'])
# Records without field 1094 pass None to regularise_sequence_number.
texts_in_books['sequence_number'] = texts_in_books['sequence_number'].apply(regularise_sequence_number)

In [5]:
# Distinct book IDs that occur in at least one expression row, converted to
# a plain Python list.
book_column = texts_in_books['book']
books = book_column.unique().tolist()

book_count = len(books)
print(f'{book_count} books were found.')

303 books were found.


In [6]:
# Build one JSON object per book, each holding the texts it contains in
# order of their regularised sequence number, and write the list to disk.
book_json = []

for book in books:

    book_dict = dict()
    book_dict['id'] = book
    book_dict['title'] = titles.get(book,'')
    book_dict['year'] = production_dates.get(book)
    # .get() rather than indexing, as for title and year: a book with no
    # recorded type is exported with null instead of raising KeyError.
    book_dict['type'] = book_types.get(book)

    # Rows of this book only, ordered by the sequence-number strings.
    texts = texts_in_books[texts_in_books['book'] == book]
    texts = texts.sort_values(by='sequence_number')

    book_dict['texts'] = [
        {
            'id': row['text'],
            'title': titles.get(row['text']),
            'text_id': row['text_id'],
            'sequence_number': row['sequence_number'],
        }
        for _, row in texts.iterrows()
    ]

    book_json.append(book_dict)


with open('texts_in_books.json','w',encoding='utf-8') as out:
    json.dump(book_json, out)
