In [1]:
import xml.etree.ElementTree as ET
import re
import pandas as pd
import unicodedata
from docx import Document
from lxml import etree
import zipfile
import os
from win32com import client


In [18]:
from lxml import etree
import re

def convert_custom_xml_to_tei(input_file, output_file, appid=1):
    """ Main function: Converts APPI_CH_01.xml into TEI format. """
    
    # Define TEI and XML namespaces
    tei_ns = "http://www.tei-c.org/ns/1.0"
    xml_ns = "http://www.w3.org/XML/1998/namespace"

    # Parse input XML
    tree = etree.parse(input_file)
    root = tree.getroot()

    # Create TEI root
    tei = etree.Element(f"{{{tei_ns}}}TEI", nsmap={None: tei_ns, "xml": xml_ns})
    
    # Add TEI Header
    teiHeader = etree.SubElement(tei, f"{{{tei_ns}}}teiHeader")
    fileDesc = etree.SubElement(teiHeader, f"{{{tei_ns}}}fileDesc")
    titleStmt = etree.SubElement(fileDesc, f"{{{tei_ns}}}titleStmt")
    title = etree.SubElement(titleStmt, f"{{{tei_ns}}}title")
    title.text = "Converted TEI Document"
    pubStmt = etree.SubElement(fileDesc, f"{{{tei_ns}}}publicationStmt")
    pubStmt.text = "Generated from custom XML"
    sourceDesc = etree.SubElement(fileDesc, f"{{{tei_ns}}}sourceDesc")
    sourceDesc.text = "Converted from APPI_CH_01.xml"

    # Add <text> structure
    text = etree.SubElement(tei, f"{{{tei_ns}}}text")
    body = etree.SubElement(text, f"{{{tei_ns}}}body")
    listApp = etree.SubElement(body, f"{{{tei_ns}}}listApp")

    # Witness and Sigla Sections
    listWit = etree.SubElement(tei, f"{{{tei_ns}}}listWit")
    listPrefixDef = etree.SubElement(tei, f"{{{tei_ns}}}listPrefixDef")

    # Process each entry in the XML
    for entry in root.findall(".//Entry"):
        process_entry(entry, listApp, listWit, listPrefixDef, tei_ns, xml_ns, appid)

    # Save the TEI XML
    tree = etree.ElementTree(tei)
    tree.write(output_file, pretty_print=True, xml_declaration=True, encoding="UTF-8")


def process_entry(entry, listApp, listWit, listPrefixDef, tei_ns, xml_ns, appid):
    """ Processes an individual <Entry> and converts it into <app>. """
    
    entry_id = entry.get("ID")  # Unique ID for the entry

    # ✅ Find the <Chapter> parent
    chapter_element = entry
    while chapter_element is not None and chapter_element.tag != "Chapter":
        chapter_element = chapter_element.getparent()

    # ✅ Extract "Number" attribute safely from <Chapter>
    chapter = chapter_element.get("Number") if chapter_element is not None else "unknown_chap"

    # ✅ Extract <Verse> information from **nested** structure
    verse_element = entry.find(".//Verse/Verse")  # ✅ Extracts inner <Verse>
    verse = verse_element.text.strip() if verse_element is not None else "unknown_verse"

    verse_from = entry.findtext(".//From_verse")  # ✅ Ensure correct placement
    verse_to = entry.findtext(".//To_verse")  # ✅ Ensure correct placement

    # ✅ Build XML ID with range support
    if verse_from and verse_to:
        app_id = f"app{appid}-{chapter}-{verse_from}-{verse_to}-{entry_id}"
    else:
        app_id = f"app{appid}-{chapter}-{verse}-{entry_id}"

    # ✅ Set attributes
    app_attributes = {f"{{{xml_ns}}}id": app_id}
    if verse_from:
        app_attributes["from"] = verse_from
    if verse_to:
        app_attributes["to"] = verse_to

    # ✅ Create <app> element
    app = etree.SubElement(listApp, f"{{{tei_ns}}}app", app_attributes)

    # Add components
    add_lemmas(entry, app, tei_ns)
    add_readings(entry, app, listWit, listPrefixDef, tei_ns, xml_ns)
    add_cross_references(entry, app, tei_ns, chapter, verse, entry_id)  # ✅ Now `verse` is correctly assigned
    add_import_note(entry, app, tei_ns)


def add_cross_references(entry, app, tei_ns, chapter, verse, entry_number):
    """ Converts cross-references into TEI format with proper ID structure. """
    
    roman_numerals = {
        "I": "1", "II": "2", "III": "3", "IV": "4", "V": "5", "VI": "6"
    }

    cross_refs = entry.findall(".//Cross_references/Item")
    
    if cross_refs:
        listRef = etree.SubElement(app, f"{{{tei_ns}}}listRef")
        
        for ref in cross_refs:
            ref_text = ref.text.strip()
            
            # ✅ Convert Roman numeral to app ID format
            ref_appid = roman_numerals.get(ref_text, ref_text)  # Default to text if no match
            
            # ✅ Create <ref> element with correct target
            ref_el = etree.SubElement(
                listRef, f"{{{tei_ns}}}ref",
                {"target": f"app{ref_appid}-{chapter}-{verse}-{entry_number}"}
            )
            ref_el.text = ref_text  # Keep the original Roman numeral for display

    
    
    
def add_lemmas(entry, app, tei_ns):
    """ Extracts and formats lemmas, including handling ranges. """
    lemma_section = entry.find("Lemma")
    
    if lemma_section is not None:
        from_lemmas = lemma_section.findall("From/Detail")
        to_lemmas = lemma_section.findall("To/Detail")

        if from_lemmas and to_lemmas:
            lemGrp = etree.SubElement(app, f"{{{tei_ns}}}lemGrp", {"type": "range"})

            from_grp = etree.SubElement(lemGrp, f"{{{tei_ns}}}lemGrp", {"type": "from"})
            from_lem = etree.SubElement(from_grp, f"{{{tei_ns}}}lem")
            for lemma in from_lemmas:
                text = lemma.text
                if text:
                    etree.SubElement(from_lem, f"{{{tei_ns}}}w").text = text

            to_grp = etree.SubElement(lemGrp, f"{{{tei_ns}}}lemGrp", {"type": "to"})
            to_lem = etree.SubElement(to_grp, f"{{{tei_ns}}}lem")
            for lemma in to_lemmas:
                text = lemma.text
                if text:
                    etree.SubElement(to_lem, f"{{{tei_ns}}}w").text = text

        else:
            lem = etree.SubElement(app, f"{{{tei_ns}}}lem")
            for detail in lemma_section.findall("Detail"):
                text = detail.text
                if text:
                    etree.SubElement(lem, f"{{{tei_ns}}}w").text = text


def add_readings(entry, app, listWit, listPrefixDef, tei_ns, xml_ns):
    """ Processes readings, witness mapping, and sigla handling. """
    rdgGrp = etree.SubElement(app, f"{{{tei_ns}}}rdgGrp", {"type": "variant"})

    for reading in entry.findall(".//Reading"):
        rdg = etree.SubElement(rdgGrp, f"{{{tei_ns}}}rdg")

        text, sigla_ref = handle_sigla(reading.text, listPrefixDef, tei_ns, xml_ns)
        if sigla_ref:
            rdg.set("ref", sigla_ref)

        seg = etree.SubElement(rdg, f"{{{tei_ns}}}seg")
        seg.text = text

        
def add_import_note(entry, app, tei_ns):
    """ Extracts footnotes or commentary and adds them as <note type='import'> inside <app>. """
    
    footnote_text = entry.findtext(".//Footnotes/Footnote")  # Find footnote text if available
    
    if footnote_text:
        note_import = etree.SubElement(app, f"{{{tei_ns}}}note", {"type": "import"})
        note_import.text = footnote_text
        

def handle_sigla(text, listPrefixDef, tei_ns, xml_ns):
    """ Detects, removes, and maps sigla. """
    sigla_map = {"!": "uncertain", "+": "addition", ">": "omission"}
    sigla_refs = []

    for sig, desc in sigla_map.items():
        if sig in text:
            sig_id = f"sig-{desc}"
            if not listPrefixDef.xpath(f"//tei:prefixDef[@xml:id='{sig_id}']", namespaces={"tei": tei_ns}):
                prefix_def = etree.SubElement(listPrefixDef, f"{{{tei_ns}}}prefixDef", {f"{{{xml_ns}}}id": sig_id})
                etree.SubElement(prefix_def, f"{{{tei_ns}}}prefix").text = sig
                etree.SubElement(prefix_def, f"{{{tei_ns}}}desc").text = desc
            sigla_refs.append(f"#{sig_id}")
            text = text.replace(sig, "").strip()

    return text, " ".join(sigla_refs) if sigla_refs else None


In [19]:
# Run the conversion with appid = 1
input_xml = "flattened_verse_with_to_check.xml"
output_tei = "app1_tei.xml"
convert_custom_xml_to_tei(input_xml, output_tei, appid=1)


In [None]:
######## custome-xml parser

In [2]:
def process_footnotes(docx_path):
    # Dictionary to store footnote data
    footnotes_dict = {}

    # Step 1: Extract the footnotes XML from the .docx file
    with zipfile.ZipFile(docx_path, 'r') as docx:
        # Look for footnotes XML part
        if 'word/footnotes.xml' in docx.namelist():
            footnote_xml = docx.read('word/footnotes.xml').decode('utf-8')
        else:
            print("No footnotes.xml found in this document.")
            return footnotes_dict

    # Step 2: Parse the footnote XML and store in dictionary
    if footnote_xml:
        root = ET.fromstring(footnote_xml)
        namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
        
        non_blank_count = 1  # Start counting footnotes for non-blank lines only

        for footnote in root.findall('w:footnote', namespaces):
            footnote_text = ""

            # Extract each run's text, font, and superscript/subscript information in the footnote
            for run in footnote.findall('.//w:r', namespaces):
                text_elem = run.find('w:t', namespaces)
                font_elem = run.find('.//w:rPr//w:rFonts', namespaces)
                vert_align_elem = run.find('.//w:rPr//w:vertAlign', namespaces)

                if text_elem is not None:
                    text = text_elem.text
                    font = font_elem.get(f'{{{namespaces["w"]}}}ascii') if font_elem is not None else "Unknown"

                    # Check for superscript or subscript alignment
                    if vert_align_elem is not None:
                        align_val = vert_align_elem.get(f'{{{namespaces["w"]}}}val')
                        if align_val == "superscript":
                            text = f"<superscript {text} >"
                        elif align_val == "subscript":
                            text = f"<subscript {text} >"

                    # Wrap the text in <specialFont ...> if it’s in the special font
                    if font == "HUBPSigla":  # Replace with your actual font name if different
                        footnote_text += f"<specialFont {text} >"
                    else:
                        footnote_text += text

            # Only store non-blank footnotes in the dictionary
            if footnote_text.strip():  # Check if footnote text is non-blank
                footnotes_dict[f'footnote-{non_blank_count}'] = footnote_text.strip()
                non_blank_count += 1  # Increment count only for non-blank footnotes

    return footnotes_dict


def process_main_text_with_normalized_footnotes(docx_path):
    document_xml = None

    # Step 1: Extract and normalize valid footnotes
    normalized_footnotes = {}  # Map of normalized ID -> actual footnote text
    id_mapping = {}  # Map of actual ID -> normalized ID
    with zipfile.ZipFile(docx_path, 'r') as docx:
        if 'word/footnotes.xml' in docx.namelist():
            footnotes_xml = docx.read('word/footnotes.xml').decode('utf-8')
            root = ET.fromstring(footnotes_xml)
            namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

            normalized_id = 1
            for footnote in root.findall('w:footnote', namespaces):
                actual_id = int(footnote.get(f'{{{namespaces["w"]}}}id'))
                footnote_type = footnote.get(f'{{{namespaces["w"]}}}type', 'regular')
                text = ''.join(run.text or '' for run in footnote.findall('.//w:t', namespaces))

                # Include only regular footnotes with valid IDs
                if actual_id > 0 and footnote_type == 'regular':
                    normalized_footnotes[normalized_id] = text.strip()
                    id_mapping[actual_id] = normalized_id
                    normalized_id += 1

    # Step 2: Process `document.xml` with normalized IDs
    with zipfile.ZipFile(docx_path, 'r') as docx:
        if 'word/document.xml' in docx.namelist():
            document_xml = docx.read('word/document.xml').decode('utf-8')

    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    main_text_dict = {}

    if document_xml:
        document_root = ET.fromstring(document_xml)
        title_and_chapter = ""
        first_row_processed = False

        for i, paragraph in enumerate(document_root.findall('.//w:p', namespaces), start=1):
            paragraph_text = ""
            if not first_row_processed:
                title_and_chapter = ''.join(
                    run.find('w:t', namespaces).text or ''
                    for run in paragraph.findall('.//w:r', namespaces)
                    if run.find('w:t', namespaces) is not None
                ).strip()
                first_row_processed = True
                continue

            if not any(run.find('w:t', namespaces) is not None for run in paragraph.findall('.//w:r', namespaces)):
                continue

            app_id = f"app-{i-2}"
            combined_special_fonts = []  # To collect specialFont text for merging

            for run in paragraph.findall('.//w:r', namespaces):
                text_elem = run.find('w:t', namespaces)
                footnote_ref = run.find('.//w:footnoteReference', namespaces)
                font_elem = run.find('.//w:rPr//w:rFonts', namespaces)
                vert_align_elem = run.find('.//w:rPr//w:vertAlign', namespaces)

                if text_elem is not None:
                    text = text_elem.text
                    font = font_elem.get(f'{{{namespaces["w"]}}}ascii') if font_elem is not None else "Unknown"

                    # Check for superscript or subscript alignment
                    if vert_align_elem is not None:
                        align_val = vert_align_elem.get(f'{{{namespaces["w"]}}}val')
                        if align_val == "superscript":
                            text = f"<superscript {text}>"
                        elif align_val == "subscript":
                            text = f"<subscript {text}>"

                    # Collect special font text instead of appending immediately
                    if font == "HUBPSigla":  # Replace with your actual font name if different
                        combined_special_fonts.append(text)
                    else:
                        # If there's non-specialFont text, finalize combined_special_fonts
                        if combined_special_fonts:
                            paragraph_text += f"<specialFont {''.join(combined_special_fonts)}>"
                            combined_special_fonts = []  # Reset buffer
                        paragraph_text += text

                elif footnote_ref is not None:
                    actual_id = int(footnote_ref.get(f'{{{namespaces["w"]}}}id'))
                    # Use the normalized ID if the actual ID exists in the mapping
                    if actual_id in id_mapping:
                        normalized_id = id_mapping[actual_id]
                        paragraph_text += f"<ref {normalized_id}>"

            # Finalize any remaining combined_special_fonts
            if combined_special_fonts:
                paragraph_text += f"<specialFont {''.join(combined_special_fonts)}>"

            main_text_dict[app_id] = paragraph_text

    return {
        'title': title_and_chapter.split()[0],
        'chapter': title_and_chapter.split()[1],
        'content': main_text_dict
    }

def split_full_entry(text):
    """
    Splits a full entry into lemma and entry. Processes sequences of <subscript> tags
    into a single concatenated <subscript> tag.
    """

    # Split at the first occurrence of ']'
    sliced_entry = text.split(sep=']', maxsplit=1)
    
    # Initialize `lemma` and `entry`
    lemma = sliced_entry[0]
    entry = sliced_entry[1] if len(sliced_entry) > 1 else ""

    # Process subscript sequences in the entry
    subscript_pattern = r'(?:<subscript\s*([^>]+)>)+'
    
    def merge_subscripts(match):
        subscripts = match.group(0)
        combined_content = ''.join(re.findall(r'<subscript\s*([^>]+)>', subscripts))
        return f'<subscript {combined_content}>'

    entry = re.sub(subscript_pattern, merge_subscripts, entry)

    # Check if a superscript follows immediately after the `]`
    superscript_pattern = re.compile(r'^\s*<ref\s*([^>]+)>')
    match = superscript_pattern.match(entry)
    lemma_dict = {'lemma': lemma}
    if match:
        # Append the superscript to the lemma
        lemma_dict.update({'ref': match.group(0)})
        
        # Remove the superscript from the start of the entry
        entry = entry[len(match.group(0)):].strip()

    # Check for specific patterns in the lemma
    to_check_patterns = [
        r'<specialFont\s*<subscript\s*v>>',  # Pattern for subscript v
        r'<specialFont\s*,'                 # Pattern for specialFont ,
        r'<specialFont\s*v'
    ]
    if any(re.search(pattern, lemma) for pattern in to_check_patterns):
        lemma_dict['to_check'] = True  # Add "to_check" flag if any pattern matches
        
    return lemma_dict, entry


def extract_initial_sigla(reading):
       
    # Pattern to match one or more sigla at the start of the reading
    sigla_pattern = re.compile(r'^[\+\>\~]+')
    
    # Dictionary to store extracted sigla and cleaned reading
    result = {
        'sigla': "",
        'cleaned_reading': reading.strip()  # Initialize cleaned reading as the full reading
    }
    
    # Find initial sigla if present
    match = sigla_pattern.match(reading)
    if match:
        # Extract sigla and set them in the result dictionary
        result['sigla'] = match.group(0)
        
        # Remove the matched sigla from the beginning of the reading
        result['cleaned_reading'] = reading[len(result['sigla']):].strip()

    return result

def extract_cross_references(reading):
    # Pattern to find all <subscript ...> tags
    subscript_pattern = re.compile(r'<subscript\s*([^>]+)>')
    
    # List to store cross-references and clean reading with subscripts
    cross_references = []
    cleaned_reading = reading

    # Define a pattern to identify Roman numerals with only I's and V's (e.g., I, II, III, IV, V, VI, VII, VIII)
    roman_numeral_pattern = re.compile(r'[IV\s]{0,}$')

    # Process each subscript match
    matches = subscript_pattern.findall(reading)
    for match in matches:
        if roman_numeral_pattern.match(match):
            for item in match.split():
                cross_references.append(item)  # Store as cross-reference if it's a valid Roman numeral with I's and V's
            cleaned_reading = cleaned_reading.replace(f"<subscript {match}>", "")
        
    # Remove duplicates in cross-references and retain order
    unique_references = sorted(set(cross_references), key=cross_references.index)
    
    # Final cleanup of <subscript ...> tags for Roman numerals in cleaned reading
#     cleaned_reading = roman_numeral_pattern.sub('', cleaned_reading).strip()

    return {
        'cross_references': unique_references,
        'reading': cleaned_reading
    }


def parse_and_classify_entry(entry):
    # Define splitting characters with classifications
    splitters = {
        '|': 'additional_variant',
        '=': 'synonymous_variant',
        ',': 'related_variant',
        '<specialFont +>': 'similar_variant'
    }
    
    special_font_pattern = re.compile(r'<specialFont\s*([^>+]+)>')

    
    # Preprocess: Replace <specialFont ...> with placeholders
    special_font_placeholders = []
    def replace_special_font(match):
        special_font_placeholders.append(match.group(0))  # Store the full tag
        return f"__SPECIAL_FONT_{len(special_font_placeholders) - 1}__"

    entry = special_font_pattern.sub(replace_special_font, entry)

    # Compile regex to split by any of the splitters
    splitter_pattern = re.compile(r'(\||=|<specialFont\s\+>|,)')

    # List to store parsed entries with classifications
    parsed_entries = []

    # Split the entry by the main split characters, keeping split characters separate
    parts = splitter_pattern.split(entry)

    # Initialize a default classification for the first part
    current_classification = "variant"

    # Process each part separately
    for part in parts:
        part = part.strip()  # Remove leading/trailing whitespace
        
        # Skip if the part is a splitter, set classification for the next part
        if part in splitters:
            current_classification = splitters[part]
            continue  # Skip to the next part

        # Restore <specialFont ...> tags in this part
        for i, placeholder in enumerate(special_font_placeholders):
            part = part.replace(f"__SPECIAL_FONT_{i}__", placeholder)

        # Initialize dictionaries to store witnesses, reading, classification, and cross-references
        entry_data = {
            'classification': current_classification,
            'witnesses': [],
            'reading': "",
            'cross_references': [],
            'sigla': ""
        }
                
        combined_witnesses = []  # Collect all witness contents

        i = 0
        while i < len(part):
            # Match <specialFont ...>
            special_font_match = special_font_pattern.match(part, i)
            if special_font_match:
                combined_witnesses.append(special_font_match.group(1))  # Collect witness content
                i = special_font_match.end()
            else:
                # Once done collecting witnesses, finalize the <specialFont> wrapper
                if combined_witnesses:
                    combined_witness_text = ''.join(combined_witnesses)
                    entry_data['witnesses'].append(f"<specialFont {combined_witness_text}>")
                    combined_witnesses = []  # Reset the buffer
                # Anything else becomes the reading
                entry_data['reading'] = part[i:].strip()
                break

        # If any witnesses remain in the buffer, finalize them
        if combined_witnesses:
            combined_witness_text = ''.join(combined_witnesses)
            entry_data['witnesses'].append(f"<specialFont {combined_witness_text}>")
        # process witnesses
        entry_data['witnesses'] = process_combined_witnesses(entry_data['witnesses'])
        
        # Extract cross-references from the reading
        result = extract_cross_references(entry_data['reading'])
        entry_data['cross_references'] = result['cross_references']
        entry_data['reading'] = result['reading']  # Cleaned reading without <subscript ...> tags

        
        sigla_result = extract_initial_sigla(entry_data['reading'])
        entry_data['sigla'] = sigla_result['sigla']
        entry_data['reading'] = sigla_result['cleaned_reading']  # Cleaned reading without sigla

        # Add entry data to the parsed entries list
        parsed_entries.append(entry_data)

    return parsed_entries


def split_and_process_witnesses(witness_text):
    """
    Split witnesses from text, attaching 'h' or '-' to the preceding witness.
    """
    processed_witnesses = []
    
    for char in witness_text:
        if char in ['h', '-'] and processed_witnesses:
            # Attach 'h' or '-' to the last witness
            processed_witnesses[-1] += char
        else:
            # Start a new witness
            processed_witnesses.append(char)

    return processed_witnesses


def process_combined_witnesses(witnesses):
    """
    Process combined witnesses, splitting and wrapping them in <specialFont> tags.
    """
    processed_witnesses = []

    for witness in witnesses:
        # Extract the content within <specialFont>
        match = re.match(r'<specialFont\s*(.*?)>', witness)
        if match:
            content = match.group(1)
            # Split the content into individual witnesses
            split_witnesses = split_and_process_witnesses(content)
            # Wrap each witness back in <specialFont> tags
            processed_witnesses.extend([f"<specialFont {w}>" for w in split_witnesses])
        else:
            # If no <specialFont>, just append as-is
            processed_witnesses.append(witness)

    return processed_witnesses

In [3]:
# def split_verse_lemma(entry_text, previous_verse=None):
#     """
#     Splits the entry into verse and lemma. If a verse range is found, splits into from_verse and to_verse.
#     If no verse is found, it uses the previous entry's verse.
#     Adds a `to_check` flag to the lemma if specific patterns are detected.
#     """
#     # Regex to match verse patterns: digits optionally followed by a dash or range
#     pattern = re.match(r'^(\d+(?:–\d+)?)(.*)$', entry_text.strip())
#     if pattern:
#         verse_part = pattern.group(1).strip()  # Extract verse
#         lemma = pattern.group(2).strip()  # Extract lemma

#         # Check if the verse contains a range
#         if '–' in verse_part:
#             from_verse, to_verse = map(str.strip, verse_part.split('–', 1))
#             verse = {'from_verse': from_verse, 'to_verse': to_verse}
#         else:
#             verse = {'verse': verse_part}  # Single verse
#     else:
#         verse = {'verse': previous_verse}  # Use previous verse if no verse found
#         lemma = entry_text.strip()  # Assume the rest is lemma

#     # Check for specific patterns in the lemma
#     to_check_patterns = [
#         r'<specialFont\s*<subscript\s*v>>',  # Pattern for subscript v
#         r'<specialFont\s*,>'                 # Pattern for specialFont comma
#     ]
#     to_check = any(re.search(pattern, lemma) for pattern in to_check_patterns)

#     # Return the structured data, including `to_check` if applicable
#     return {
#         'verse': verse,
#         'lemma': {'text': lemma, 'to_check': to_check} if to_check else lemma
#     }

def split_verse_lemma(lemma_dict, previous_verse=None):
    """
    Splits the entry into verse and lemma. If a verse range is found, splits into from_verse and to_verse.
    If no verse is found, it uses the previous entry's verse.
    Also carries `ref` and `to_check` from the lemma if present.
    """
    # Extract text and ref from the input dictionary
    lemma_text = lemma_dict.get("lemma", "")
    ref = lemma_dict.get("ref", None)

    # Regex to match verse patterns: digits optionally followed by a dash or range
    pattern = re.match(r'^(\d+(?:–\d+)?)(.*)$', lemma_text.strip())
    if pattern:
        verse_part = pattern.group(1).strip()  # Extract verse
        lemma_text = pattern.group(2).strip()  # Extract lemma

        # Check if the verse contains a range
        if '–' in verse_part:
            from_verse, to_verse = map(str.strip, verse_part.split('–', 1))
            verse = {'from_verse': from_verse, 'to_verse': to_verse}
        else:
            verse = {'verse': verse_part}  # Single verse
    else:
        verse = {'verse': previous_verse}  # Use previous verse if no verse found
        lemma_text = lemma_text.strip()  # Assume the rest is lemma

    # Check for specific patterns in the lemma for `to_check`
    to_check_patterns = [
        r'<specialFont\s*<subscript\s*v>>',  # Pattern for subscript v
        r'<specialFont\s*,>',               # Pattern for specialFont comma
        r'<specialFont\s*v>',               # Standalone specialFont with "v"
        r'<specialFont\s*[^>]+>',           # General case for specialFont with content
    ]
    to_check = any(re.search(pattern, lemma_text) for pattern in to_check_patterns)

    # Return structured data, including `ref` and `to_check` if applicable
    return {
        'verse': verse,
        'lemma': [{'text': lemma_text, 'to_check': to_check, 'ref': ref}]
    }


def process_lemma_specific_range(lemma_text):
    """
    Process a lemma with one or multiple specific ranges (...).
    Handles cases like 'word ... word ... word ... word'.
    """
    # Split the lemma by '...'
    parts = [part.strip() for part in lemma_text.split('...') if part.strip()]

    # Handle cases with multiple specific ranges
    processed_ranges = []
    for i, part in enumerate(parts):
        if i == 0:
            # The first part is the start
            processed_ranges.append({'tag': 'start lemma', 'content': part})
        elif i == len(parts) - 1:
            # The last part is the end
            processed_ranges.append({'tag': 'end lemma', 'content': part})
        else:
            # Intermediate parts are tagged as middle ranges
            processed_ranges.append({'tag': f'middle lemma {i}', 'content': part})

    # Return the processed range structure
    return {
        'type': 'specific_range',
        'parts': processed_ranges
    }

def process_lemma_range(lemma_text):
    """
    Process a lemma with a range (–).
    """
    parts = lemma_text.split('–', 1)
    part_from = parts[0].strip()
    part_to = parts[1].strip()

    return {
        'type': 'full_range',
        'from': part_from,
        'to': part_to
    }

def process_lemma_transposition(lemma_text):
    """
    Process a lemma with a transposition (~).
    """
    parts = lemma_text.split('~', 1)
    part_a = parts[0].strip()
    part_b = parts[1].strip()

    return {
        'type': 'transposition',
        'parts': {
            'a': process_lemma(part_a),
            'b': process_lemma(part_b)
        }
    }

def process_lemma_lex(lemma_text):
    """
    Process a lemma containing 'lex'.
    Removes 'lex' (as a whole word) and processes the remainder.
    """
    # Remove the first occurrence of 'lex' and optional trailing punctuation (e.g., ':', '-', '–', or spaces)
    cleaned_text = re.sub(r'\blex\b[\s:–-]*', '', lemma_text, count=1, flags=re.IGNORECASE)

    return {
        'type': 'lex',
        'text': cleaned_text.strip()
    }



def split_k_q_lemmas(lemma_text): ### check this function. maybe use replace with regex instead of match
    """
    Splits lemmas prefixed by k and q (e.g., 'k עינתם / q עוֹנֹתם')
    into distinct lemmas with their respective tags, dropping k/q prefixes.
    """
    # Regex to match prefixed lemmas and split them
    pattern = r'\b[kq]\s+([^\s/]+)\s*/\s*\b[kq]\s+([^\s]+)'
    match = re.match(pattern, lemma_text)

    if match:
        lemma_k = match.group(1).strip()  # Extract the lemma content after 'k'
        lemma_q = match.group(2).strip()  # Extract the lemma content after 'q'

        return [
            {'tag': 'lemma_k', 'content': lemma_k},
            {'tag': 'lemma_q', 'content': lemma_q}
        ]

    # If no match, return the lemma as-is
    return [{'tag': 'lemma', 'content': lemma_text}]


def process_lemma(lemma_text):
    """
    Process the lemma string. Checks for transposition, range, or specific range
    and delegates to specialized functions. If none found, processes individual lemmas.
    """
    if not lemma_text:
        return None  # Return if lemma is empty
    
    if '/' in lemma_text and re.search(r'\b[kq]\s', lemma_text):
        return split_k_q_lemmas(lemma_text)
    elif '~' in lemma_text:
        return process_lemma_transposition(lemma_text)
    elif '–' in lemma_text:
        return process_lemma_range(lemma_text)
    elif '...' in lemma_text:
        return process_lemma_specific_range(lemma_text)
    elif 'lex' in lemma_text:
        return process_lemma_lex(lemma_text)

    # Default: Process individual lemmas
    return process_individual_lemma(lemma_text)


def process_individual_lemma(lemma_dict):
    """
    Process individual lemmas, handling parentheses, concatenating multiple superscripts
    into a single number if present, and constructing `full_lemma` where appropriate.
    If a `to_check` flag is already present in the lemma_dict, return it unchanged.
    """
    # If `to_check` is present, return the dictionary unchanged
    if isinstance(lemma_dict, dict) and lemma_dict.get("to_check", False):
        return [lemma_dict]

    individual_lemma = lemma_dict.get('lemma', '') if isinstance(lemma_dict, dict) else lemma_dict

    # Regex to match words, parentheses, and superscripts
    lemma_regex = r'([^\s<\(\)]+|\([^\)]+\)|<superscript\s*[^>]+>|>|\s+)'
    superscript_regex = r'<superscript\s*([^>]+)>'

    matches = re.findall(lemma_regex, individual_lemma)

    processed_lemmas = []
    combined_parenthesis_content = ""
    inside_parentheses = False
    outside_parts = []  # Track parts outside parentheses
    inside_parts = []  # Track parts inside parentheses
    has_split_lemma = False  # Flag for split lemmas
    lemma_counter = 1  # Counter for normal lemmas
    parenthesis_first = False  # Flag to check if parentheses are first

    for i, word in enumerate(matches):
        if word.startswith("(") and not word.endswith(")"):
            # Start a parenthetical group
            inside_parentheses = True
            combined_parenthesis_content = word
            has_split_lemma = True
            if i == 0:
                parenthesis_first = True
        elif word.endswith(")") and inside_parentheses:
            # End a parenthetical group
            inside_parentheses = False
            combined_parenthesis_content += f" {word}"
            processed_lemmas.append({'tag': 'parenthesis', 'content': combined_parenthesis_content})
            inside_parts.append(combined_parenthesis_content[1:-1])  # Content inside parentheses
            combined_parenthesis_content = ""
        elif inside_parentheses:
            # Inside a parenthetical group
            combined_parenthesis_content += f" {word}"
        elif word.startswith("(") and word.endswith(")"):
            # Complete parenthetical group
            processed_lemmas.append({'tag': 'parenthesis', 'content': word})
            inside_parts.append(word[1:-1])  # Content inside parentheses
            has_split_lemma = True
            if i == 0:
                parenthesis_first = True
        elif word.startswith("<superscript"):
            # Handle superscripts
            if processed_lemmas and 'content' in processed_lemmas[-1]:
                current_lemma = processed_lemmas[-1]
                superscript_values = re.findall(superscript_regex, word)
                if 'numbers' not in current_lemma:
                    current_lemma['numbers'] = ''.join(superscript_values)
                else:
                    current_lemma['numbers'] += ''.join(superscript_values)
        elif word.strip():
            # Normal word processing
            lemma_dict = {'tag': f'lemma{lemma_counter}', 'content': word}
            processed_lemmas.append(lemma_dict)
            lemma_counter += 1
            outside_parts.append(word)

    # If parentheses remain open, finalize them
    if inside_parentheses and combined_parenthesis_content:
        processed_lemmas.append({'tag': 'parenthesis', 'content': combined_parenthesis_content})
        inside_parts.append(combined_parenthesis_content[1:])  # Remaining content without leading '('

    # Add `full_lemma` if necessary
    if has_split_lemma and outside_parts and inside_parts:
        # Concatenate inside and outside parts based on order
        if parenthesis_first:
            full_lemma_content = ''.join(inside_parts + outside_parts).strip()
        else:
            full_lemma_content = ''.join(outside_parts + inside_parts).strip()

        # Only create `full_lemma` if parentheses are part of a split lemma
        if len(outside_parts) > 0 and len(inside_parts) == 1 and ' ' not in inside_parts[0]:
            processed_lemmas.append({'tag': 'full_lemma', 'content': full_lemma_content})

    return processed_lemmas




In [4]:
import os
import pandas as pd
from docx import Document
import zipfile
import xml.etree.ElementTree as ET
import re
import unicodedata
from lxml import etree
from win32com import client


def process_docx_files(folder_path):
    # Initialize an empty list to collect data for each file
    data = []

    # Iterate over each .docx file in the folder
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.docx'):
            if filename.startswith('~$'):
                continue  # Skip temporary files
            docx_path = os.path.join(folder_path, filename)
            
            # Extract main text and footnotes
            main_text_data = process_main_text_with_normalized_footnotes(docx_path)
            footnotes_data = process_footnotes(docx_path)
            
            # Extract title and chapter from main text data
            title = main_text_data.get('title', 'Unknown Title')
            chapter = main_text_data.get('chapter', 'Unknown Chapter')
            
            # Append to data list with structured dictionary
            data.append({
                'Title': title,
                'Chapter': int(chapter),
                'Main Text': main_text_data.get('content', {}),
                'Footnotes': footnotes_data
            })

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data).sort_values(by=['Title', 'Chapter']).reset_index(drop=True)
    return df

# Folder path containing .docx files
folder_path = 'Hosea.App.1'

# Process and get DataFrame
df = process_docx_files(folder_path)

# Display the DataFrame
df.head()


Unnamed: 0,Title,Chapter,Main Text,Footnotes
0,Hosea,1,{'app-1': '1 יותם אחז יחזקיה] <specialFont ][>...,"{'footnote-1': 'cf app Mic 1<subscript 1 >', '..."
1,Hosea,2,{'app-1': '1–2] <specialFont ]*> 1<subscript 1...,"{'footnote-1': '“there”, for במקום אשר ... שם ..."
2,Hosea,3,{'app-1': '1 אהב] <specialFont ]*h> <specialFo...,{'footnote-1': 'voc אֹהֶבֶת רָע; cf Mic 3<subs...
3,Hosea,4,"{'app-0': '1 (ל)יהוה] <specialFont ]h> div', '...","{'footnote-1': '“(in) the fear of (God)”, theo..."
4,Hosea,5,{'app-1': '1 לְמצפָּה] <specialFont ]> τῇ σκοπ...,"{'footnote-1': '“to the lookout”, similarly <s..."


In [5]:
df[:1]['Main Text'].values[0]

{'app-1': '1 יותם אחז יחזקיה] <specialFont ][> <specialFont &>',
 'app-2': 'מלכי] <specialFont ]h> num<subscript II><ref 1>',
 'app-3': '2 דִּבֶּר] <specialFont ]> λόγου<ref 2> = <specialFont [T>',
 'app-4': 'ב(הושע)] <specialFont ]h> πρός<ref 3> <specialFont +> <specialFont [>ܕܗܘܐ ܥܠ<ref 4>',
 'app-5': 'ו(יאמר)] <specialFont ]h[> >',
 'app-6': 'לֵךְ] <specialFont ]h> ><subscript II>',
 'app-7': '3 ויקח] <specialFont [> + ܠܗ<ref 5>',
 'app-8': 'לו] <specialFont ]h*-> ><subscript II III><subscript  IV><ref 6>',
 'app-9': '4 יהוא] <specialFont ]h> Ιουδα<ref 7>',
 'app-10': '(ו)הִשְׁבַּתִּי] <specialFont ]-> ἀποστρέψω<ref 8>',
 'app-11': 'בית<superscript 2>] <specialFont ]h*hT-> + prep<ref 9>',
 'app-12': '5 והיה (ביום ההוא)] <specialFont [> ><ref 10>',
 'app-13': '(ו)היה] <specialFont *> >',
 'app-14': 'ההוא] <specialFont ]h> + dicit dominus<ref 11>',
 'app-15': '6 עוד<superscript 1>] <specialFont *hT-> ><subscript II><subscript  III IV>',
 'app-16': 'לו] <specialFont ]h> + κύριος<subscr

In [133]:
df['Footnotes'][:1].values

array([{'footnote-1': 'cf app Mic 1<subscript 1 >', 'footnote-2': '(a) voc דְּבַר (יהוה), formula, cf v<subscript 1 > 4<subscript 1 > et al; cf app 13<subscript 1 >; cf תחלת דִּבְרֵי Qoh 10<subscript 13 >; note seq; (b) noun דִּבֵּר, cf app Jer 5<subscript 13 > and Rabb Heb; cf gerund in <specialFont * > loquendi “of speaking”', 'footnote-3': '“to”; main evid, cf v<subscript 1 >', 'footnote-4': '“which was to”, ex v<subscript 1 >', 'footnote-5': '“for himself”, cf v<subscript 2 >', 'footnote-6': 'cf vv<subscript 6,8 >; contrast Hier 10<subscript 154 >', 'footnote-7': 'main evid; inner-Grk (בית יהודה common collocation), cf Hier 12<subscript 208−211 >', 'footnote-8': 'voc הֲשִׁבֹתִי, similarly app 2<subscript 13 > Ezek 7<subscript 24 >; for parall השיב//פקד cf 4<subscript 9 > 12<subscript 3 >; main evid <specialFont ]h ><subscript   >καταπαύσω (=<specialFont x >)', 'footnote-9': 'common formula השבית מן, cf e.g. Lev 26<subscript 6 > Jer 7<subscript 34 >', 'footnote-10': 'formulaic chang

In [134]:
df.to_json('App.1.FULL.json')

In [135]:
#processing trials

In [6]:
import re

def process_lemma(lemma_text):
    if not lemma_text:
        return {"type": None, "parts": []}

    # lex
    if re.search(r'\blex\b', lemma_text, flags=re.IGNORECASE):
        return process_lemma_lex(lemma_text)

    # k/q
    if '/' in lemma_text and re.search(r'\b[kq]\s', lemma_text):
        return process_lemma_kq(lemma_text)

    # transposition
    if '~' in lemma_text:
        return process_lemma_transposition(lemma_text)

    # specific range
    if '...' in lemma_text:
        return process_lemma_specific_range(lemma_text)

    # full range
    if '–' in lemma_text:
        return process_lemma_range(lemma_text)

    # fallback
    return {
        "type": 'Standard',
        "parts": [{
            "content": lemma_text,
            "detail": process_individual_lemma(lemma_text)
        }]
    }


def process_lemma_lex(lemma_text):
    cleaned = re.sub(r'\blex\b[\s:–-]*', '', lemma_text, count=1, flags=re.IGNORECASE)
    return {
        "type": "lex",
        "parts": [{
            "content": cleaned,
            "detail": process_individual_lemma(cleaned)
        }]
    }


def process_lemma_specific_range(lemma_text):
    # split into start/end
    start, end = map(str.strip, lemma_text.split('...', 1))
    return {
        "type": "specific_range",
        "parts": [
            {
                "tag": "start_lemma",
                "content": start,
                "detail": process_individual_lemma(start)
            },
            {
                "tag": "end_lemma",
                "content": end,
                "detail": process_individual_lemma(end)
            }
        ]
    }


def process_lemma_range(lemma_text):
    frm, to = map(str.strip, lemma_text.split('–', 1))
    return {
        "type": "full_range",
        "parts": [
            {
                "tag": "range_from",
                "content": frm,
                "detail": process_individual_lemma(frm)
            },
            {
                "tag": "range_to",
                "content": to,
                "detail": process_individual_lemma(to)
            }
        ]
    }


def process_lemma_transposition(lemma_text):
    a, b = map(str.strip, lemma_text.split('~', 1))
    return {
        "type": "transposition",
        "parts": [
            {
                "tag": "transposition_a",
                "content": a,
                "detail": process_individual_lemma(a)
            },
            {
                "tag": "transposition_b",
                "content": b,
                "detail": process_individual_lemma(b)
            }
        ]
    }


def process_lemma_kq(lemma_text):
    m = re.match(r'\b[kq]\s+([^\s/]+)\s*/\s*\b[kq]\s+([^\s]+)', lemma_text)
    if not m:
        # fallback single
        return {
            "type": "K\Q",
            "parts": [{
                "tag": "lemma",
                "content": lemma_text,
                "detail": process_individual_lemma(lemma_text)
            }]
        }
    k, q = m.group(1), m.group(2)
    return {
        "type": "K\Q",
        "parts": [
            {"tag": "lemma_k", "content": k, "detail": process_individual_lemma(k)},
            {"tag": "lemma_q", "content": q, "detail": process_individual_lemma(q)}
        ]
    }


In [7]:
def enrich_df_with_processed_lemmas(df):
    """
    Given a DataFrame with a 'Main Text' column (dict of entry_id → raw string),
    adds a 'ProcessedEntries' column where each row is a list of dicts:
      - EntryID
      - Verse      : the verse dict from split_verse_lemma
      - LemmaMeta  : original lemma metadata (text, to_check, ref)
      - ProcessedLemma : structured { type, parts } from process_lemma
      - EntryText  : the raw entry string
      - ClassifiedEntry (optional) : output of parse_and_classify_entry
      - ClassifyError   (optional) : error message if classification failed
    """
    processed_rows = []
    
    for idx, row in df.iterrows():
        main_text = row.get("Main Text", {})
        previous_verse = None
        row_entries = []
        
        if isinstance(main_text, dict):
            for entry_id, raw_value in main_text.items():
                try:
                    # 1) split out lemma text vs. entry text
                    lemma_block, entry_text = split_full_entry(raw_value)
                    
                    # 2) extract verse + carry-over
                    split = split_verse_lemma(lemma_block, previous_verse)
                    verse_info = split["verse"]
                    if verse_info.get("verse") or verse_info.get("to_verse"):
                        previous_verse = verse_info
                    
                    # 3) pull off the raw lemma metadata
                    lemma_meta = split["lemma"][0]
                    raw_lemma_text = lemma_meta["text"]
                    
                    # 4) process the lemma into structured parts
                    processed = process_lemma(raw_lemma_text)
                    
                    # 5) build the entry dict
                    entry_dict = {
                        "EntryID": entry_id,
                        "Verse": verse_info,
                        "LemmaMeta": lemma_meta,
                        "ProcessedLemma": processed,
                        "EntryText": entry_text
                    }
                    
                    # 6) classify the entry text if possible
                    try:
                        entry_dict["ClassifiedEntry"] = parse_and_classify_entry(entry_text)
                    except Exception as ce:
                        entry_dict["ClassifiedEntry"] = None
                        entry_dict["ClassifyError"] = str(ce)
                    
                    row_entries.append(entry_dict)
                
                except Exception as e:
                    # On any failure, record the entry ID and error
                    row_entries.append({
                        "EntryID": entry_id,
                        "Error": str(e)
                    })
        
        processed_rows.append(row_entries)
    
    df["ProcessedEntries"] = processed_rows
    return df


In [11]:
enrich_df_with_processed_lemmas(df[:8])['ProcessedEntries'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ProcessedEntries"] = processed_rows


[{'EntryID': 'app-1',
  'Verse': {'verse': '1'},
  'LemmaMeta': {'text': 'יותם אחז יחזקיה', 'to_check': False, 'ref': None},
  'ProcessedLemma': {'type': 'Standard',
   'parts': [{'content': 'יותם אחז יחזקיה',
     'detail': [{'tag': 'lemma1', 'content': 'יותם'},
      {'tag': 'lemma2', 'content': 'אחז'},
      {'tag': 'lemma3', 'content': 'יחזקיה'}]}]},
  'EntryText': ' <specialFont ][> <specialFont &>',
  'ClassifiedEntry': [{'classification': 'variant',
    'witnesses': ['<specialFont ]>', '<specialFont [>'],
    'reading': '<specialFont &>',
    'cross_references': [],
    'sigla': ''}]},
 {'EntryID': 'app-2',
  'Verse': {'verse': {'verse': '1'}},
  'LemmaMeta': {'text': 'מלכי', 'to_check': False, 'ref': None},
  'ProcessedLemma': {'type': 'Standard',
   'parts': [{'content': 'מלכי',
     'detail': [{'tag': 'lemma1', 'content': 'מלכי'}]}]},
  'EntryText': ' <specialFont ]h> num<subscript II><ref 1>',
  'ClassifiedEntry': [{'classification': 'variant',
    'witnesses': ['<specialFon

In [None]:
#old processing and flatenning 

In [73]:
def df_to_xml_with_flattened_verse_and_to_check(df, output_file):
    def extract_innermost_verse(verse_data):
        """
        Recursively extract the innermost 'verse' value from a nested structure.
        """
        if isinstance(verse_data, dict) and "verse" in verse_data:
            return extract_innermost_verse(verse_data["verse"])
        return verse_data

    root = ET.Element("root")

    for row_idx, row in df.iterrows():
        try:
            print(f"Processing Row: {row_idx}, Title: {row['Title']}, Chapter: {row['Chapter']}")

            # Create a Book element for each title
            book_element = ET.SubElement(root, "Book", {"Title": row["Title"]})
            chapter_element = ET.SubElement(book_element, "Chapter", {"Number": str(row["Chapter"])})
            main_text_data = row["Main Text"]

            if isinstance(main_text_data, dict):
                main_text_element = ET.SubElement(chapter_element, "MainText")
                previous_verse = None  # Initialize previous verse
                for key, value in main_text_data.items():
                    try:
                        print(f"Processing Entry ID: {key}")
                        entry_element = ET.SubElement(main_text_element, "Entry", {"ID": key})

                        # Split lemma and entry
                        lemma, entry = split_full_entry(value)
                        print(f"Split Lemma: {lemma}, Entry: {entry}")
                        
#                         lemma, entry = split_full_entry(value)

                        # --- quick sanity check ---------------------------------
                        if not isinstance(entry, str):
                            print(f"[DEBUG] entry for {key!r} is {type(entry).__name__}: {entry}")
                        if not isinstance(value, str):
                            print(f"[DEBUG] value for {key!r} is {type(value).__name__}")
                        # ---------------------------------------------------------
                        
                        if isinstance(lemma, dict) and "lemma" in lemma:
                            # Add Verse to Entry
                            split_verse = split_verse_lemma(lemma, previous_verse)  # Pass the full lemma dict
                            print(f"Split Verse Lemma: {split_verse}")

                            if split_verse["verse"].get("verse") or split_verse["verse"].get("to_verse"):
                                previous_verse = split_verse["verse"]

                            verse_element = ET.SubElement(entry_element, "Verse")
                            for verse_key, verse_value in split_verse["verse"].items():
                                # Flatten the verse structure if necessary
                                flattened_value = extract_innermost_verse(verse_value)
                                ET.SubElement(verse_element, verse_key.capitalize()).text = str(flattened_value)

                            # Extract processed lemma type, if any
                            processed_lemma = process_lemma(split_verse["lemma"][0]["text"])  # Pass only the lemma text
                            print(f"Processed Lemma: {processed_lemma}")

                            # Determine lemma type for the overall Lemma tag
                            lemma_type = processed_lemma.get("type") if isinstance(processed_lemma, dict) else None
                            lemma_attributes = {"Type": lemma_type} if lemma_type else {}

                            # Include `to_check` and `ref` flags if present
                            if split_verse["lemma"][0].get("to_check", False):
                                lemma_attributes["ToCheck"] = "true"
                            if split_verse["lemma"][0].get("ref"):
                                lemma_attributes["Ref"] = split_verse["lemma"][0]["ref"]

                            lemma_element = ET.SubElement(entry_element, "Lemma", lemma_attributes)

                            # Process the lemma content
                            if isinstance(processed_lemma, list):
                                for lemma_detail in processed_lemma:
                                    lemma_detail_element = ET.SubElement(lemma_element, "Detail")
                                    for field, field_value in lemma_detail.items():
                                        ET.SubElement(lemma_detail_element, field.capitalize()).text = str(field_value)
                            elif isinstance(processed_lemma, dict):
                                if processed_lemma.get("type") == "specific_range":
                                    for part in processed_lemma.get("parts", []):
                                        part_element = ET.SubElement(lemma_element, "Part")
                                        part_element.set("Type", processed_lemma["type"])
                                        for field, field_value in part.items():
                                            ET.SubElement(part_element, field.capitalize()).text = str(field_value)
                                elif processed_lemma.get("type") == "transposition":
                                    for part_key, part_value in processed_lemma["parts"].items():
                                        part_element = ET.SubElement(lemma_element, "Part", {"Type": part_key})
                                        if isinstance(part_value, list):
                                            for item in part_value:
                                                detail_element = ET.SubElement(part_element, "Detail")
                                                for sub_key, sub_value in item.items():
                                                    ET.SubElement(detail_element, sub_key.capitalize()).text = str(sub_value)
                                        elif isinstance(part_value, dict):
                                            for field, field_value in part_value.items():
                                                ET.SubElement(part_element, field.capitalize()).text = str(field_value)
                                        else:
                                            ET.SubElement(part_element, "Content").text = str(part_value)
                                elif processed_lemma.get("type") == "full_range":
                                    range_element = ET.SubElement(lemma_element, "Range")
                                    ET.SubElement(range_element, "From").text = str(processed_lemma.get("from"))
                                    ET.SubElement(range_element, "To").text = str(processed_lemma.get("to"))

                        # Step 2: Process Entry
                        if entry:
                            try:
                                classified_entry = parse_and_classify_entry(entry)
                                print(f"Classified Entry: {classified_entry}")

                                entry_details_element = ET.SubElement(entry_element, "EntryDetails")
                                for classified in classified_entry:
                                    classified_element = ET.SubElement(entry_details_element, "ClassifiedEntry")
                                    for field, field_value in classified.items():
                                        if isinstance(field_value, list):  # Handle lists (e.g., witnesses)
                                            list_element = ET.SubElement(classified_element, field.capitalize())
                                            for item in field_value:
                                                ET.SubElement(list_element, "Item").text = str(item)
                                        elif field_value:  # Skip empty fields
                                            ET.SubElement(classified_element, field.capitalize()).text = str(field_value)

                            except Exception as entry_error:
                                print(f"Error processing Entry for ID {key}: {entry_error}")
                                ET.SubElement(entry_element, "Error").text = f"Failed to process entry: {entry_error}"

                    except Exception as entry_id_error:
                        print(f"Error processing Entry ID {key}: {entry_id_error}")
                        ET.SubElement(main_text_element, "Error").text = f"Failed to process Entry ID {key}: {entry_id_error}"

            # Process Footnotes
            footnotes_data = row["Footnotes"]
            if isinstance(footnotes_data, dict):
                footnotes_element = ET.SubElement(chapter_element, "Footnotes")
                for key, value in footnotes_data.items():
                    footnote_element = ET.SubElement(footnotes_element, "Footnote", {"ID": key})
                    footnote_element.text = value

        except Exception as row_error:
            print(f"Error processing Row {row_idx}: {row_error}")
            ET.SubElement(root, "Error").text = f"Failed to process Row {row_idx}: {row_error}"

    # Write to XML file
    tree = ET.ElementTree(root)
    tree.write(output_file, encoding="utf-8", xml_declaration=True)


In [74]:
output_file = "flattened_verse_with_to_check.xml"
df_to_xml_with_flattened_verse_and_to_check(df, output_file)
print(f"XML saved to {output_file}")


Processing Row: 0, Title: Hosea, Chapter: 1
Processing Entry ID: app-1
Split Lemma: {'lemma': '1 יותם אחז יחזקיה'}, Entry:  <specialFont ][> <specialFont &>
Split Verse Lemma: {'verse': {'verse': '1'}, 'lemma': [{'text': 'יותם אחז יחזקיה', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'יותם'}, {'tag': 'lemma2', 'content': 'אחז'}, {'tag': 'lemma3', 'content': 'יחזקיה'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]>', '<specialFont [>'], 'reading': '<specialFont &>', 'cross_references': [], 'sigla': ''}]
Processing Entry ID: app-2
Split Lemma: {'lemma': 'מלכי'}, Entry:  <specialFont ]h> num<subscript II><ref 1>
Split Verse Lemma: {'verse': {'verse': {'verse': '1'}}, 'lemma': [{'text': 'מלכי', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'מלכי'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]h>'], 'reading': 'num<ref 1>', 'cross_references': ['II'], 'sig

Processing Entry ID: app-40
Split Lemma: {'lemma': '11 קראו'}, Entry:  <specialFont ]-> ἐπεκαλεῖτο καί<ref 42> | <specialFont [> ܐܬܘ<ref 43> <specialFont +> <specialFont T> אתקרבו
Split Verse Lemma: {'verse': {'verse': '11'}, 'lemma': [{'text': 'קראו', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'קראו'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]->'], 'reading': 'ἐπεκαλεῖτο καί<ref 42>', 'cross_references': [], 'sigla': ''}, {'classification': 'additional_variant', 'witnesses': ['<specialFont [>'], 'reading': 'ܐܬܘ<ref 43>', 'cross_references': [], 'sigla': ''}, {'classification': 'similar_variant', 'witnesses': ['<specialFont T>'], 'reading': 'אתקרבו', 'cross_references': [], 'sigla': ''}]
Processing Entry ID: app-41
Split Lemma: {'lemma': '12 כאשר'}, Entry:  <specialFont [T> prep | <specialFont *> <specialFont &>
Split Verse Lemma: {'verse': {'verse': '12'}, 'lemma': [{'text': 'כאשר', 'to_check': False, 'ref': No

Split Verse Lemma: {'verse': {'verse': {'verse': {'verse': {'verse': {'verse': {'verse': {'verse': '6'}}}}}}}, 'lemma': [{'text': 'מעצתו', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'מעצתו'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]>', '<specialFont *>', '<specialFont [>'], 'reading': 'prep', 'cross_references': [], 'sigla': ''}]
Processing Entry ID: app-36
Split Lemma: {'lemma': '7 נדמה', 'ref': '<ref 38>'}, Entry: <specialFont ]> ἀπέρριψε<ref 39> = <specialFont [>ܫܕܬ <ref 40> <specialFont +> <specialFont *> transire fecit<ref 41> | <specialFont T> בהיתת ... ב-<ref 42>
Split Verse Lemma: {'verse': {'verse': '7'}, 'lemma': [{'text': 'נדמה', 'to_check': False, 'ref': '<ref 38>'}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'נדמה'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]>'], 'reading': 'ἀπέρριψε<ref 39>', 'cross_references': [], 'sigla': ''}, {'classification': 'synonym

XML saved to flattened_verse_with_to_check.xml


In [12]:
df['Footnotes'][2]['footnote-10']

'theol diffic <specialFont 7 > different solutions in Vrs (for similar enumeration cf <specialFont ][ > Dan 3<subscript 37−38 >); cf Rofé, FS Weinfeld, 135−149'

In [25]:
df['Main Text'][6]['app-56']

'מזעם lex] <specialFont ]> ἀπαιδευσία<ref 61> = <specialFont |> <specialFont +> <specialFont [>'

In [11]:
split_verse_lemma(lemma['lemma'])

{'verse': {'verse': '4'}, 'lemma': 'זֶבַח'}

In [12]:
df['Footnotes'][1]['footnote-44']

'והכרַתי, cf Zech 13<subscript 2 >; cf also v<subscript 20 > וכרתי'

In [13]:
df['Main Text'][3]['app-14']

'כמריבי כהן]<ref 13> <specialFont ]> ὡς ἀντιλεγόμενος ἱερεύς<ref 14> | <specialFont [> ܐܝܟ ܟܗܢܐ ܡܬܚܪܐ<ref 15> | <specialFont T> נצן עם מלפיהון<ref 16> | <specialFont *> sicut hii qui contradicunt sacerdoti<ref 17> <specialFont +> <specialFont ~9>'

In [35]:
lemma, entry = split_full_entry(df['Main Text'][6]['app-56'])
parse_and_classify_entry(entry)


[{'classification': 'variant',
  'witnesses': ['<specialFont ]>'],
  'reading': 'ἀπαιδευσία<ref 61>',
  'cross_references': [],
  'sigla': ''},
 {'classification': 'synonymous_variant',
  'witnesses': ['<specialFont |>'],
  'reading': '',
  'cross_references': [],
  'sigla': ''},
 {'classification': 'similar_variant',
  'witnesses': ['<specialFont [>'],
  'reading': '',
  'cross_references': [],
  'sigla': ''}]

In [48]:
lemma

{'lemma': 'מזעם lex'}

In [56]:
process_lemma_lex(lemma['lemma'])

{'type': 'lex', 'text': [{'tag': 'lemma1', 'content': 'מזעם'}]}

In [38]:
process_lemma_lex(lemma['lemma'])

IndexError: no such group

In [9]:
entry

' <specialFont T> (ד)דָבַח<ref 9>'