In [54]:
import os
import json
import xml.etree.ElementTree as ET

# Directory containing XML files
# (relative path; it is resolved against the current working directory)
xml_folder = '../bensira_xml'

# Configuration
# These module-level flags are read by the conversion code further down.
# When True, '\n' characters are stripped from every extracted word.
remove_linebreaks = True
# NOTE(review): this flag is not referenced anywhere in the visible code, so
# changing it currently has no effect — confirm whether vocalization removal
# is still to be implemented.
remove_vokalization = True
# When True, every extracted word is passed through
# remove_diacritics_from_text().
remove_diacritics = True
# When False, the text of <g> elements inside a word is kept and wrapped in
# square brackets; when True, that <g> content is left out of the output.
remove_reconstructions = False

# List of diacritic characters to remove
# Code points, in order: U+030A COMBINING RING ABOVE, U+0307 COMBINING DOT
# ABOVE, U+25E6 WHITE BULLET, U+2022 BULLET, U+0336 COMBINING LONG STROKE
# OVERLAY, U+059F HEBREW ACCENT QARNEY PARA.
diacritics_to_remove = ['\u030A', '\u0307', '\u25E6', '\u2022', '\u0336', '\u059F']

# Drop every character listed in diacritics_to_remove from a string
def remove_diacritics_from_text(text):
    """Return ``text`` without the characters in ``diacritics_to_remove``.

    The module-level list is consulted at call time, so later changes to it
    are picked up. All other characters are kept in their original order.
    """
    kept = []
    for ch in text:
        if ch in diacritics_to_remove:
            # Listed mark: leave it out of the result.
            continue
        kept.append(ch)
    return ''.join(kept)

# Initialize the bs_manuscripts dictionary
# It is filled by convert_xml_to_json() as nested dictionaries:
#   manuscript name -> chapter number -> verse number -> verse text
# All keys are the attribute strings read from the XML.
bs_manuscripts = {}

# Helper: flatten one <w> (word) element into its output text
def _extract_word_text(w):
    """Return the text of a single ``<w>`` element as one string.

    The element's own leading text comes first, followed by every descendant
    ``<g>`` element in document order. The text of a ``<g>`` is wrapped in
    square brackets and is only emitted when ``remove_reconstructions`` is
    False. The text that follows a ``<g>`` (its tail) lies outside that
    element, so it is always kept. Whitespace-only leading/tail text is
    ignored.

    The module-level ``remove_diacritics`` and ``remove_linebreaks`` flags
    are applied to the assembled word.
    """
    pieces = []
    if w.text and not w.text.isspace():
        pieces.append(w.text)

    for g in w.findall('.//g'):
        if g.text and not remove_reconstructions:
            pieces.append(f"[{g.text}]")
        # The tail is ordinary word text that merely follows the <g>; it must
        # survive even when the bracketed <g> content itself is dropped.
        if g.tail and not g.tail.isspace():
            pieces.append(g.tail)

    word_text = ''.join(pieces)
    if remove_diacritics:
        word_text = remove_diacritics_from_text(word_text)
    if remove_linebreaks:
        word_text = word_text.replace('\n', '')
    return word_text


# Function to convert XML to JSON
def convert_xml_to_json(xml_folder):
    """Convert every ``*.xml`` file in ``xml_folder`` into one JSON file.

    For each file, the manuscript name is read from the ``name`` attribute of
    the first ``<ms>`` descendant of the root. Each ``<div type="chap">`` and,
    inside it, each ``<div type="verse">`` is keyed by its ``n`` attribute.
    A verse's text is its ``<w>`` words joined by single spaces and stripped.

    Side effects:
        * fills the module-level ``bs_manuscripts`` dictionary
          (manuscript -> chapter -> verse -> text);
        * writes that dictionary to ``bs_manuscripts.json`` in the current
          working directory (UTF-8, non-ASCII characters kept as-is);
        * prints progress messages.

    Args:
        xml_folder: Path of the directory to scan (not recursive).

    Raises:
        AttributeError: if a file has no ``<ms>`` descendant.
        KeyError: if a required ``name`` / ``n`` attribute is missing.
        xml.etree.ElementTree.ParseError: if a file is not well-formed XML.
    """
    for filename in os.listdir(xml_folder):
        if not filename.endswith('.xml'):
            continue

        xml_path = os.path.join(xml_folder, filename)
        print(xml_path)
        print(f"Converting {xml_path} to JSON...")

        # Hand the path to the parser instead of a text-mode file object so
        # the encoding is taken from the XML itself, not from the platform's
        # default locale encoding.
        root = ET.parse(xml_path).getroot()

        # Extract the manuscript name
        ms_name = root.find('.//ms').attrib['name']
        bs_manuscripts[ms_name] = {}

        # Iterate through each chapter
        for chap in root.findall('.//div[@type="chap"]'):
            chap_num = chap.attrib['n']
            bs_manuscripts[ms_name][chap_num] = {}

            # Iterate through each verse in the chapter
            for verse in chap.findall('.//div[@type="verse"]'):
                verse_num = verse.attrib['n']
                words = [_extract_word_text(w) for w in verse.findall('.//w')]
                bs_manuscripts[ms_name][chap_num][verse_num] = ' '.join(words).strip()

    # Write the bs_manuscripts dictionary to a JSON file
    with open('bs_manuscripts.json', 'w', encoding='utf-8') as f:
        json.dump(bs_manuscripts, f, ensure_ascii=False, indent=4)

    # Print confirmation
    print("JSON data has been written to bs_manuscripts.json")

# Convert XML files to JSON
# Runs the conversion immediately for the folder configured in xml_folder.
convert_xml_to_json(xml_folder)

../bensira_xml/ms_b.xml
Converting ../bensira_xml/ms_b.xml to JSON...
../bensira_xml/ms_c.xml
Converting ../bensira_xml/ms_c.xml to JSON...
../bensira_xml/ms_a.xml
Converting ../bensira_xml/ms_a.xml to JSON...
../bensira_xml/ms_d.xml
Converting ../bensira_xml/ms_d.xml to JSON...
../bensira_xml/ms_e.xml
Converting ../bensira_xml/ms_e.xml to JSON...
../bensira_xml/ms_f.xml
Converting ../bensira_xml/ms_f.xml to JSON...
../bensira_xml/ms_11QPsa.xml
Converting ../bensira_xml/ms_11QPsa.xml to JSON...
../bensira_xml/ms_massada.xml
Converting ../bensira_xml/ms_massada.xml to JSON...
../bensira_xml/ms_2Q18.xml
Converting ../bensira_xml/ms_2Q18.xml to JSON...
JSON data has been written to bs_manuscripts.json
