In [1]:
import xml.etree.ElementTree as ET
import re
import pandas as pd
import unicodedata
from docx import Document
from lxml import etree
import zipfile
import os
from win32com import client


In [2]:
def process_footnotes(docx_path):
    # Dictionary to store footnote data
    footnotes_dict = {}

    # Step 1: Extract the footnotes XML from the .docx file
    with zipfile.ZipFile(docx_path, 'r') as docx:
        # Look for footnotes XML part
        if 'word/footnotes.xml' in docx.namelist():
            footnote_xml = docx.read('word/footnotes.xml').decode('utf-8')
        else:
            print("No footnotes.xml found in this document.")
            return footnotes_dict

    # Step 2: Parse the footnote XML and store in dictionary
    if footnote_xml:
        root = ET.fromstring(footnote_xml)
        namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
        
        non_blank_count = 1  # Start counting footnotes for non-blank lines only

        for footnote in root.findall('w:footnote', namespaces):
            footnote_text = ""

            # Extract each run's text, font, and superscript/subscript information in the footnote
            for run in footnote.findall('.//w:r', namespaces):
                text_elem = run.find('w:t', namespaces)
                font_elem = run.find('.//w:rPr//w:rFonts', namespaces)
                vert_align_elem = run.find('.//w:rPr//w:vertAlign', namespaces)

                if text_elem is not None:
                    text = text_elem.text
                    font = font_elem.get(f'{{{namespaces["w"]}}}ascii') if font_elem is not None else "Unknown"

                    # Check for superscript or subscript alignment
                    if vert_align_elem is not None:
                        align_val = vert_align_elem.get(f'{{{namespaces["w"]}}}val')
                        if align_val == "superscript":
                            text = f"<superscript {text} >"
                        elif align_val == "subscript":
                            text = f"<subscript {text} >"

                    # Wrap the text in <specialFont ...> if it’s in the special font
                    if font == "HUBPSigla":  # Replace with your actual font name if different
                        footnote_text += f"<specialFont {text} >"
                    else:
                        footnote_text += text

            # Only store non-blank footnotes in the dictionary
            if footnote_text.strip():  # Check if footnote text is non-blank
                footnotes_dict[f'footnote-{non_blank_count}'] = footnote_text.strip()
                non_blank_count += 1  # Increment count only for non-blank footnotes

    return footnotes_dict


def process_main_text_with_normalized_footnotes(docx_path):
    document_xml = None

    # Step 1: Extract and normalize valid footnotes
    normalized_footnotes = {}  # Map of normalized ID -> actual footnote text
    id_mapping = {}  # Map of actual ID -> normalized ID
    with zipfile.ZipFile(docx_path, 'r') as docx:
        if 'word/footnotes.xml' in docx.namelist():
            footnotes_xml = docx.read('word/footnotes.xml').decode('utf-8')
            root = ET.fromstring(footnotes_xml)
            namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

            normalized_id = 1
            for footnote in root.findall('w:footnote', namespaces):
                actual_id = int(footnote.get(f'{{{namespaces["w"]}}}id'))
                footnote_type = footnote.get(f'{{{namespaces["w"]}}}type', 'regular')
                text = ''.join(run.text or '' for run in footnote.findall('.//w:t', namespaces))

                # Include only regular footnotes with valid IDs
                if actual_id > 0 and footnote_type == 'regular':
                    normalized_footnotes[normalized_id] = text.strip()
                    id_mapping[actual_id] = normalized_id
                    normalized_id += 1

    # Step 2: Process `document.xml` with normalized IDs
    with zipfile.ZipFile(docx_path, 'r') as docx:
        if 'word/document.xml' in docx.namelist():
            document_xml = docx.read('word/document.xml').decode('utf-8')

    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    main_text_dict = {}

    if document_xml:
        document_root = ET.fromstring(document_xml)
        title_and_chapter = ""
        first_row_processed = False

        for i, paragraph in enumerate(document_root.findall('.//w:p', namespaces), start=1):
            paragraph_text = ""
            if not first_row_processed:
                title_and_chapter = ''.join(
                    run.find('w:t', namespaces).text or ''
                    for run in paragraph.findall('.//w:r', namespaces)
                    if run.find('w:t', namespaces) is not None
                ).strip()
                first_row_processed = True
                continue

            if not any(run.find('w:t', namespaces) is not None for run in paragraph.findall('.//w:r', namespaces)):
                continue

            app_id = f"app-{i-2}"
            combined_special_fonts = []  # To collect specialFont text for merging

            for run in paragraph.findall('.//w:r', namespaces):
                text_elem = run.find('w:t', namespaces)
                footnote_ref = run.find('.//w:footnoteReference', namespaces)
                font_elem = run.find('.//w:rPr//w:rFonts', namespaces)
                vert_align_elem = run.find('.//w:rPr//w:vertAlign', namespaces)

                if text_elem is not None:
                    text = text_elem.text
                    font = font_elem.get(f'{{{namespaces["w"]}}}ascii') if font_elem is not None else "Unknown"

                    # Check for superscript or subscript alignment
                    if vert_align_elem is not None:
                        align_val = vert_align_elem.get(f'{{{namespaces["w"]}}}val')
                        if align_val == "superscript":
                            text = f"<superscript {text}>"
                        elif align_val == "subscript":
                            text = f"<subscript {text}>"

                    # Collect special font text instead of appending immediately
                    if font == "HUBPSigla":  # Replace with your actual font name if different
                        combined_special_fonts.append(text)
                    else:
                        # If there's non-specialFont text, finalize combined_special_fonts
                        if combined_special_fonts:
                            paragraph_text += f"<specialFont {''.join(combined_special_fonts)}>"
                            combined_special_fonts = []  # Reset buffer
                        paragraph_text += text

                elif footnote_ref is not None:
                    actual_id = int(footnote_ref.get(f'{{{namespaces["w"]}}}id'))
                    # Use the normalized ID if the actual ID exists in the mapping
                    if actual_id in id_mapping:
                        normalized_id = id_mapping[actual_id]
                        paragraph_text += f"<ref {normalized_id}>"

            # Finalize any remaining combined_special_fonts
            if combined_special_fonts:
                paragraph_text += f"<specialFont {''.join(combined_special_fonts)}>"

            main_text_dict[app_id] = paragraph_text

    return {
        'title': title_and_chapter.split()[0],
        'chapter': title_and_chapter.split()[1],
        'content': main_text_dict
    }

def process_lemma_with_superscript(lemma_text):
    # Regex to match verse, lemma, and superscript number
    pattern = r'^(\d+)\s+([^\s<]+)(?:<superscript\s*(\d+)>)?'
    
    # Match the pattern
    match = re.match(pattern, lemma_text)
    if not match:
        return None  # Return None if the pattern doesn't match
    
    # Extract verse, lemma, and superscript number
    verse = int(match.group(1))
    lemma = match.group(2)
    number = int(match.group(3)) if match.group(3) else None
    
    # Return structured data
    return {
        'verse': verse,
        'lemma': lemma,
        'number': number
    }

def process_single_lemma_with_range(lemma_text):
    """
    Process a single lemma unit, checking first for ranges and then tagging appropriately.
    """
    # Check if there's a range (– or -)
    if '–' in lemma_text or '-' in lemma_text:
        delimiter = '–' if '–' in lemma_text else '-'
        parts = lemma_text.split(delimiter, 1)
        part_from = parts[0].strip()
        part_to = parts[1].strip()

        return [
            {'tag': 'from lemma', 'content': part_from},
            {'delimiter': delimiter, 'type': 'range'},
            {'tag': 'to lemma', 'content': part_to}
        ]

    # If no range, split and tag as lemma 1, lemma 2, etc.
    lemma_parts = lemma_text.split()
    structured_parts = []
    for i, part in enumerate(lemma_parts, start=1):
        structured_parts.append({'tag': f'lemma {i}', 'content': part})

    return structured_parts

def process_lemma_with_transposition(lemma, previous_verse=None):
    # Pattern to identify if the lemma starts with a digit (verse number)
    verse_pattern = re.compile(r'^\d+')
    verse_match = verse_pattern.match(lemma)

    # Determine verse number
    if verse_match:
        verse_number = int(verse_match.group(0))
        lemma_text = lemma[verse_match.end():].strip()
    else:
        verse_number = previous_verse
        lemma_text = lemma.strip()

    # Check for transposition (~)
    if '~' in lemma_text:
        parts = lemma_text.split('~', 1)
        part_a = parts[0].strip()
        part_b = parts[1].strip()

        return {
            'verse': verse_number,
            'lemma_parts': [
                {'type': 'transposition', 'part': 'a', 'details': process_single_lemma_with_range(part_a)},
                {'type': 'transposition', 'part': 'b', 'details': process_single_lemma_with_range(part_b)}
            ]
        }
    else:
        # No transposition, process as a single lemma unit
        return {
            'verse': verse_number,
            'lemma_parts': process_single_lemma_with_range(lemma_text)
        }

    
def split_full_entry(text):
    """
    Splits a full entry into lemma and entry. Processes sequences of <subscript> tags
    into a single concatenated <subscript> tag.
    """

    # Split at the first occurrence of ']'
    sliced_entry = text.split(sep=']', maxsplit=1)
    
    # Initialize `lemma` and `entry`
    lemma = sliced_entry[0]
    entry = sliced_entry[1] if len(sliced_entry) > 1 else ""

    # Process subscript sequences in the entry
    subscript_pattern = r'(?:<subscript\s*([^>]+)>)+'
    
    def merge_subscripts(match):
        subscripts = match.group(0)
        combined_content = ''.join(re.findall(r'<subscript\s*([^>]+)>', subscripts))
        return f'<subscript {combined_content}>'

    entry = re.sub(subscript_pattern, merge_subscripts, entry)

    # Check if a superscript follows immediately after the `]`
    superscript_pattern = re.compile(r'^\s*<ref\s*([^>]+)>')
    match = superscript_pattern.match(entry)
    lemma_dict = {'lemma': lemma}
    if match:
        # Append the superscript to the lemma
        lemma_dict.update({'ref': match.group(0)})
        
        # Remove the superscript from the start of the entry
        entry = entry[len(match.group(0)):].strip()

    # Check for specific patterns in the lemma
    to_check_patterns = [
        r'<specialFont\s*<subscript\s*v>>',  # Pattern for subscript v
        r'<specialFont\s*,'                 # Pattern for specialFont ,
        r'<specialFont\s*v'
    ]
    if any(re.search(pattern, lemma) for pattern in to_check_patterns):
        lemma_dict['to_check'] = True  # Add "to_check" flag if any pattern matches
        
    return lemma_dict, entry


def extract_initial_sigla(reading):
       
    # Pattern to match one or more sigla at the start of the reading
    sigla_pattern = re.compile(r'^[\+\>\~]+')
    
    # Dictionary to store extracted sigla and cleaned reading
    result = {
        'sigla': "",
        'cleaned_reading': reading.strip()  # Initialize cleaned reading as the full reading
    }
    
    # Find initial sigla if present
    match = sigla_pattern.match(reading)
    if match:
        # Extract sigla and set them in the result dictionary
        result['sigla'] = match.group(0)
        
        # Remove the matched sigla from the beginning of the reading
        result['cleaned_reading'] = reading[len(result['sigla']):].strip()

    return result

def extract_cross_references(reading):
    # Pattern to find all <subscript ...> tags
    subscript_pattern = re.compile(r'<subscript\s*([^>]+)>')
    
    # List to store cross-references and clean reading with subscripts
    cross_references = []
    cleaned_reading = reading

    # Define a pattern to identify Roman numerals with only I's and V's (e.g., I, II, III, IV, V, VI, VII, VIII)
    roman_numeral_pattern = re.compile(r'[IV\s]{0,}$')

    # Process each subscript match
    matches = subscript_pattern.findall(reading)
    for match in matches:
        if roman_numeral_pattern.match(match):
            for item in match.split():
                cross_references.append(item)  # Store as cross-reference if it's a valid Roman numeral with I's and V's
            cleaned_reading = cleaned_reading.replace(f"<subscript {match}>", "")
        
    # Remove duplicates in cross-references and retain order
    unique_references = sorted(set(cross_references), key=cross_references.index)
    
    # Final cleanup of <subscript ...> tags for Roman numerals in cleaned reading
#     cleaned_reading = roman_numeral_pattern.sub('', cleaned_reading).strip()

    return {
        'cross_references': unique_references,
        'reading': cleaned_reading
    }


def parse_and_classify_entry(entry):
    # Define splitting characters with classifications
    splitters = {
        '|': 'additional_variant',
        '=': 'synonymous_variant',
        ',': 'related_variant',
        '<specialFont +>': 'similar_variant'
    }
    
    special_font_pattern = re.compile(r'<specialFont\s*([^>+]+)>')

    
    # Preprocess: Replace <specialFont ...> with placeholders
    special_font_placeholders = []
    def replace_special_font(match):
        special_font_placeholders.append(match.group(0))  # Store the full tag
        return f"__SPECIAL_FONT_{len(special_font_placeholders) - 1}__"

    entry = special_font_pattern.sub(replace_special_font, entry)

    # Compile regex to split by any of the splitters
    splitter_pattern = re.compile(r'(\||=|<specialFont\s\+>|,)')

    # List to store parsed entries with classifications
    parsed_entries = []

    # Split the entry by the main split characters, keeping split characters separate
    parts = splitter_pattern.split(entry)

    # Initialize a default classification for the first part
    current_classification = "variant"

    # Process each part separately
    for part in parts:
        part = part.strip()  # Remove leading/trailing whitespace
        
        # Skip if the part is a splitter, set classification for the next part
        if part in splitters:
            current_classification = splitters[part]
            continue  # Skip to the next part

        # Restore <specialFont ...> tags in this part
        for i, placeholder in enumerate(special_font_placeholders):
            part = part.replace(f"__SPECIAL_FONT_{i}__", placeholder)

        # Initialize dictionaries to store witnesses, reading, classification, and cross-references
        entry_data = {
            'classification': current_classification,
            'witnesses': [],
            'reading': "",
            'cross_references': [],
            'sigla': ""
        }
                
        combined_witnesses = []  # Collect all witness contents

        i = 0
        while i < len(part):
            # Match <specialFont ...>
            special_font_match = special_font_pattern.match(part, i)
            if special_font_match:
                combined_witnesses.append(special_font_match.group(1))  # Collect witness content
                i = special_font_match.end()
            else:
                # Once done collecting witnesses, finalize the <specialFont> wrapper
                if combined_witnesses:
                    combined_witness_text = ''.join(combined_witnesses)
                    entry_data['witnesses'].append(f"<specialFont {combined_witness_text}>")
                    combined_witnesses = []  # Reset the buffer
                # Anything else becomes the reading
                entry_data['reading'] = part[i:].strip()
                break

        # If any witnesses remain in the buffer, finalize them
        if combined_witnesses:
            combined_witness_text = ''.join(combined_witnesses)
            entry_data['witnesses'].append(f"<specialFont {combined_witness_text}>")
        # process witnesses
        entry_data['witnesses'] = process_combined_witnesses(entry_data['witnesses'])
        
        # Extract cross-references from the reading
        result = extract_cross_references(entry_data['reading'])
        entry_data['cross_references'] = result['cross_references']
        entry_data['reading'] = result['reading']  # Cleaned reading without <subscript ...> tags

        
        sigla_result = extract_initial_sigla(entry_data['reading'])
        entry_data['sigla'] = sigla_result['sigla']
        entry_data['reading'] = sigla_result['cleaned_reading']  # Cleaned reading without sigla

        # Add entry data to the parsed entries list
        parsed_entries.append(entry_data)

    return parsed_entries


def split_and_process_witnesses(witness_text):
    """
    Split witnesses from text, attaching 'h' or '-' to the preceding witness.
    """
    processed_witnesses = []
    
    for char in witness_text:
        if char in ['h', '-'] and processed_witnesses:
            # Attach 'h' or '-' to the last witness
            processed_witnesses[-1] += char
        else:
            # Start a new witness
            processed_witnesses.append(char)

    return processed_witnesses


def process_combined_witnesses(witnesses):
    """
    Process combined witnesses, splitting and wrapping them in <specialFont> tags.
    """
    processed_witnesses = []

    for witness in witnesses:
        # Extract the content within <specialFont>
        match = re.match(r'<specialFont\s*(.*?)>', witness)
        if match:
            content = match.group(1)
            # Split the content into individual witnesses
            split_witnesses = split_and_process_witnesses(content)
            # Wrap each witness back in <specialFont> tags
            processed_witnesses.extend([f"<specialFont {w}>" for w in split_witnesses])
        else:
            # If no <specialFont>, just append as-is
            processed_witnesses.append(witness)

    return processed_witnesses

In [3]:
# def split_verse_lemma(entry_text, previous_verse=None):
#     """
#     Splits the entry into verse and lemma. If a verse range is found, splits into from_verse and to_verse.
#     If no verse is found, it uses the previous entry's verse.
#     """
#     # Regex to match verse patterns: digits optionally followed by a dash or range
#     pattern = re.match(r'^(\d+(?:–\d+)?)(.*)$', entry_text.strip())
#     if pattern:
#         verse_part = pattern.group(1).strip()  # Extract verse
#         lemma = pattern.group(2).strip()  # Extract lemma

#         # Check if the verse contains a range
#         if '–' in verse_part:
#             from_verse, to_verse = map(str.strip, verse_part.split('–', 1))
#             verse = {'from_verse': from_verse, 'to_verse': to_verse}
#         else:
#             verse = {'verse': verse_part}  # Single verse
#     else:
#         verse = {'verse': previous_verse}  # Use previous verse if no verse found
#         lemma = entry_text.strip()  # Assume the rest is lemma

#     return {
#         'verse': verse,
#         'lemma': lemma
#     }

def split_verse_lemma(entry_text, previous_verse=None):
    """
    Splits the entry into verse and lemma. If a verse range is found, splits into from_verse and to_verse.
    If no verse is found, it uses the previous entry's verse.
    Adds a `to_check` flag to the lemma if specific patterns are detected.
    """
    # Regex to match verse patterns: digits optionally followed by a dash or range
    pattern = re.match(r'^(\d+(?:–\d+)?)(.*)$', entry_text.strip())
    if pattern:
        verse_part = pattern.group(1).strip()  # Extract verse
        lemma = pattern.group(2).strip()  # Extract lemma

        # Check if the verse contains a range
        if '–' in verse_part:
            from_verse, to_verse = map(str.strip, verse_part.split('–', 1))
            verse = {'from_verse': from_verse, 'to_verse': to_verse}
        else:
            verse = {'verse': verse_part}  # Single verse
    else:
        verse = {'verse': previous_verse}  # Use previous verse if no verse found
        lemma = entry_text.strip()  # Assume the rest is lemma

    # Check for specific patterns in the lemma
    to_check_patterns = [
        r'<specialFont\s*<subscript\s*v>>',  # Pattern for subscript v
        r'<specialFont\s*,>'                 # Pattern for specialFont comma
    ]
    to_check = any(re.search(pattern, lemma) for pattern in to_check_patterns)

    # Return the structured data, including `to_check` if applicable
    return {
        'verse': verse,
        'lemma': {'text': lemma, 'to_check': to_check} if to_check else lemma
    }

def process_lemma_specific_range(lemma_text):
    """
    Process a lemma with one or multiple specific ranges (...).
    Handles cases like 'word ... word ... word ... word'.
    """
    # Split the lemma by '...'
    parts = [part.strip() for part in lemma_text.split('...') if part.strip()]

    # Handle cases with multiple specific ranges
    processed_ranges = []
    for i, part in enumerate(parts):
        if i == 0:
            # The first part is the start
            processed_ranges.append({'tag': 'start lemma', 'content': part})
        elif i == len(parts) - 1:
            # The last part is the end
            processed_ranges.append({'tag': 'end lemma', 'content': part})
        else:
            # Intermediate parts are tagged as middle ranges
            processed_ranges.append({'tag': f'middle lemma {i}', 'content': part})

    # Return the processed range structure
    return {
        'type': 'specific_range',
        'parts': processed_ranges
    }

def process_lemma_range(lemma_text):
    """
    Process a lemma with a range (–).
    """
    parts = lemma_text.split('–', 1)
    part_from = parts[0].strip()
    part_to = parts[1].strip()

    return {
        'type': 'full_range',
        'from': part_from,
        'to': part_to
    }

def process_lemma_transposition(lemma_text):
    """
    Process a lemma with a transposition (~).
    """
    parts = lemma_text.split('~', 1)
    part_a = parts[0].strip()
    part_b = parts[1].strip()

    return {
        'type': 'transposition',
        'parts': {
            'a': process_lemma(part_a),
            'b': process_lemma(part_b)
        }
    }


def split_k_q_lemmas(lemma_text):
    """
    Splits lemmas prefixed by k and q (e.g., 'k עינתם / q עוֹנֹתם')
    into distinct lemmas with their respective tags, dropping k/q prefixes.
    """
    # Regex to match prefixed lemmas and split them
    pattern = r'\b[kq]\s+([^\s/]+)\s*/\s*\b[kq]\s+([^\s]+)'
    match = re.match(pattern, lemma_text)

    if match:
        lemma_k = match.group(1).strip()  # Extract the lemma content after 'k'
        lemma_q = match.group(2).strip()  # Extract the lemma content after 'q'

        return [
            {'tag': 'lemma_k', 'content': lemma_k},
            {'tag': 'lemma_q', 'content': lemma_q}
        ]

    # If no match, return the lemma as-is
    return [{'tag': 'lemma', 'content': lemma_text}]


def process_lemma(lemma_text):
    """
    Process the lemma string. Checks for transposition, range, or specific range
    and delegates to specialized functions. If none found, processes individual lemmas.
    """
    if not lemma_text:
        return None  # Return if lemma is empty
    
    if '/' in lemma_text and re.search(r'\b[kq]\s', lemma_text):
        return split_k_q_lemmas(lemma_text)
    elif '~' in lemma_text:
        return process_lemma_transposition(lemma_text)
    elif '–' in lemma_text:
        return process_lemma_range(lemma_text)
    elif '...' in lemma_text:
        return process_lemma_specific_range(lemma_text)

    # Default: Process individual lemmas
    return process_individual_lemma(lemma_text)


# def process_individual_lemma(individual_lemma):
#     """
#     Process individual lemmas, concatenating multiple superscripts into a single number if present,
#     and including parentheses when they are part of the lemma.
#     """
#     # Regex to match words (including parentheses as part of the lemma) with optional superscripts
#     lemma_regex = r'([^\s<]+)((?:<superscript\s*[^>]+>)*)'
#     superscript_regex = r'<superscript\s*([^>]+)>'

#     matches = re.findall(lemma_regex, individual_lemma)

#     processed_lemmas = []
#     for i, (word, superscripts) in enumerate(matches, start=1):
#         lemma_dict = {'tag': f'lemma{i}', 'content': word}

#         # Concatenate all superscripts into a single string
#         if superscripts:
#             superscript_values = re.findall(superscript_regex, superscripts)
#             lemma_dict['numbers'] = ''.join(superscript_values)  # Combine into a single string

#         processed_lemmas.append(lemma_dict)

#     return processed_lemmas

def process_individual_lemma(lemma_dict):
    """
    Process individual lemmas, handling parentheses, concatenating multiple superscripts
    into a single number if present, and constructing `full_lemma` where appropriate.
    If a `to_check` flag is already present in the lemma_dict, return it unchanged.
    """
    # If `to_check` is present, return the dictionary unchanged
    if isinstance(lemma_dict, dict) and lemma_dict.get("to_check", False):
        return [lemma_dict]

    individual_lemma = lemma_dict.get('lemma', '') if isinstance(lemma_dict, dict) else lemma_dict

    # Regex to match words, parentheses, and superscripts
    lemma_regex = r'([^\s<\(\)]+|\([^\)]+\)|<superscript\s*[^>]+>|>|\s+)'
    superscript_regex = r'<superscript\s*([^>]+)>'

    matches = re.findall(lemma_regex, individual_lemma)

    processed_lemmas = []
    combined_parenthesis_content = ""
    inside_parentheses = False
    outside_parts = []  # Track parts outside parentheses
    inside_parts = []  # Track parts inside parentheses
    has_split_lemma = False  # Flag for split lemmas
    lemma_counter = 1  # Counter for normal lemmas
    parenthesis_first = False  # Flag to check if parentheses are first

    for i, word in enumerate(matches):
        if word.startswith("(") and not word.endswith(")"):
            # Start a parenthetical group
            inside_parentheses = True
            combined_parenthesis_content = word
            has_split_lemma = True
            if i == 0:
                parenthesis_first = True
        elif word.endswith(")") and inside_parentheses:
            # End a parenthetical group
            inside_parentheses = False
            combined_parenthesis_content += f" {word}"
            processed_lemmas.append({'tag': 'parenthesis', 'content': combined_parenthesis_content})
            inside_parts.append(combined_parenthesis_content[1:-1])  # Content inside parentheses
            combined_parenthesis_content = ""
        elif inside_parentheses:
            # Inside a parenthetical group
            combined_parenthesis_content += f" {word}"
        elif word.startswith("(") and word.endswith(")"):
            # Complete parenthetical group
            processed_lemmas.append({'tag': 'parenthesis', 'content': word})
            inside_parts.append(word[1:-1])  # Content inside parentheses
            has_split_lemma = True
            if i == 0:
                parenthesis_first = True
        elif word.startswith("<superscript"):
            # Handle superscripts
            if processed_lemmas and 'content' in processed_lemmas[-1]:
                current_lemma = processed_lemmas[-1]
                superscript_values = re.findall(superscript_regex, word)
                if 'numbers' not in current_lemma:
                    current_lemma['numbers'] = ''.join(superscript_values)
                else:
                    current_lemma['numbers'] += ''.join(superscript_values)
        elif word.strip():
            # Normal word processing
            lemma_dict = {'tag': f'lemma{lemma_counter}', 'content': word}
            processed_lemmas.append(lemma_dict)
            lemma_counter += 1
            outside_parts.append(word)

    # If parentheses remain open, finalize them
    if inside_parentheses and combined_parenthesis_content:
        processed_lemmas.append({'tag': 'parenthesis', 'content': combined_parenthesis_content})
        inside_parts.append(combined_parenthesis_content[1:])  # Remaining content without leading '('

    # Add `full_lemma` if necessary
    if has_split_lemma and outside_parts and inside_parts:
        # Concatenate inside and outside parts based on order
        if parenthesis_first:
            full_lemma_content = ''.join(inside_parts + outside_parts).strip()
        else:
            full_lemma_content = ''.join(outside_parts + inside_parts).strip()

        # Only create `full_lemma` if parentheses are part of a split lemma
        if len(outside_parts) > 0 and len(inside_parts) == 1 and ' ' not in inside_parts[0]:
            processed_lemmas.append({'tag': 'full_lemma', 'content': full_lemma_content})

    return processed_lemmas


In [4]:
import os
import pandas as pd
from docx import Document

def process_docx_files(folder_path):
    # Initialize an empty list to collect data for each file
    data = []

    # Iterate over each .docx file in the folder
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.docx'):
            if filename.startswith('~$'):
                continue  # Skip temporary files
            docx_path = os.path.join(folder_path, filename)
            
            # Extract main text and footnotes
            main_text_data = process_main_text_with_normalized_footnotes(docx_path)
            footnotes_data = process_footnotes(docx_path)
            
            # Extract title and chapter from main text data
            title = main_text_data.get('title', 'Unknown Title')
            chapter = main_text_data.get('chapter', 'Unknown Chapter')
            
            # Append to data list with structured dictionary
            data.append({
                'Title': title,
                'Chapter': int(chapter),
                'Main Text': main_text_data.get('content', {}),
                'Footnotes': footnotes_data
            })

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data).sort_values(by=['Title', 'Chapter']).reset_index(drop=True)
    return df

# Folder path containing .docx files
folder_path = 'Hosea.App.1'

# Process and get DataFrame
df = process_docx_files(folder_path)

# Display the DataFrame
df.head()


Unnamed: 0,Title,Chapter,Main Text,Footnotes
0,Hosea,1,{'app-1': '1 יותם אחז יחזקיה] <specialFont ][>...,"{'footnote-1': 'cf app Mic 1<subscript 1 >', '..."
1,Hosea,2,{'app-1': '1–2] <specialFont ]*> 1<subscript 1...,"{'footnote-1': '“there”, for במקום אשר ... שם ..."
2,Hosea,3,{'app-1': '1 אהב] <specialFont ]*h> <specialFo...,{'footnote-1': 'voc אֹהֶבֶת רָע; cf Mic 3<subs...
3,Hosea,4,"{'app-0': '1 (ל)יהוה] <specialFont ]h> div', '...","{'footnote-1': '“(in) the fear of (God)”, theo..."
4,Hosea,5,{'app-1': '1 לְמצפָּה] <specialFont ]> τῇ σκοπ...,"{'footnote-1': '“to the lookout”, similarly <s..."


In [199]:
def df_to_xml_with_flattened_verse_and_to_check(df, output_file):
    def extract_innermost_verse(verse_data):
        """
        Recursively extract the innermost 'verse' value from a nested structure.
        """
        if isinstance(verse_data, dict) and "verse" in verse_data:
            return extract_innermost_verse(verse_data["verse"])
        return verse_data

    root = ET.Element("root")

    for row_idx, row in df.iterrows():
        try:
            print(f"Processing Row: {row_idx}, Title: {row['Title']}, Chapter: {row['Chapter']}")

            # Create a Book element for each title
            book_element = ET.SubElement(root, "Book", {"Title": row["Title"]})
            chapter_element = ET.SubElement(book_element, "Chapter", {"Number": str(row["Chapter"])})
            main_text_data = row["Main Text"]

            if isinstance(main_text_data, dict):
                main_text_element = ET.SubElement(chapter_element, "MainText")
                previous_verse = None  # Initialize previous verse
                for key, value in main_text_data.items():
                    try:
                        print(f"Processing Entry ID: {key}")
                        entry_element = ET.SubElement(main_text_element, "Entry", {"ID": key})

                        # Split lemma and entry
                        lemma, entry = split_full_entry(value)
                        print(f"Split Lemma: {lemma}, Entry: {entry}")

                        if isinstance(lemma, dict) and "lemma" in lemma:
                            # Add Verse to Entry
                            split_verse = split_verse_lemma(lemma, previous_verse)  # Pass the full lemma dict
                            print(f"Split Verse Lemma: {split_verse}")

                            if split_verse["verse"].get("verse") or split_verse["verse"].get("to_verse"):
                                previous_verse = split_verse["verse"]

                            verse_element = ET.SubElement(entry_element, "Verse")
                            for verse_key, verse_value in split_verse["verse"].items():
                                # Flatten the verse structure if necessary
                                flattened_value = extract_innermost_verse(verse_value)
                                ET.SubElement(verse_element, verse_key.capitalize()).text = str(flattened_value)

                            # Extract processed lemma type, if any
                            processed_lemma = process_lemma(split_verse["lemma"][0]["text"])  # Pass only the lemma text
                            print(f"Processed Lemma: {processed_lemma}")

                            # Determine lemma type for the overall Lemma tag
                            lemma_type = processed_lemma.get("type") if isinstance(processed_lemma, dict) else None
                            lemma_attributes = {"Type": lemma_type} if lemma_type else {}

                            # Include `to_check` and `ref` flags if present
                            if split_verse["lemma"][0].get("to_check", False):
                                lemma_attributes["ToCheck"] = "true"
                            if split_verse["lemma"][0].get("ref"):
                                lemma_attributes["Ref"] = split_verse["lemma"][0]["ref"]

                            lemma_element = ET.SubElement(entry_element, "Lemma", lemma_attributes)

                            # Process the lemma content
                            if isinstance(processed_lemma, list):
                                for lemma_detail in processed_lemma:
                                    lemma_detail_element = ET.SubElement(lemma_element, "Detail")
                                    for field, field_value in lemma_detail.items():
                                        ET.SubElement(lemma_detail_element, field.capitalize()).text = str(field_value)
                            elif isinstance(processed_lemma, dict):
                                if processed_lemma.get("type") == "specific_range":
                                    for part in processed_lemma.get("parts", []):
                                        part_element = ET.SubElement(lemma_element, "Part")
                                        part_element.set("Type", processed_lemma["type"])
                                        for field, field_value in part.items():
                                            ET.SubElement(part_element, field.capitalize()).text = str(field_value)
                                elif processed_lemma.get("type") == "transposition":
                                    for part_key, part_value in processed_lemma["parts"].items():
                                        part_element = ET.SubElement(lemma_element, "Part", {"Type": part_key})
                                        if isinstance(part_value, list):
                                            for item in part_value:
                                                detail_element = ET.SubElement(part_element, "Detail")
                                                for sub_key, sub_value in item.items():
                                                    ET.SubElement(detail_element, sub_key.capitalize()).text = str(sub_value)
                                        elif isinstance(part_value, dict):
                                            for field, field_value in part_value.items():
                                                ET.SubElement(part_element, field.capitalize()).text = str(field_value)
                                        else:
                                            ET.SubElement(part_element, "Content").text = str(part_value)
                                elif processed_lemma.get("type") == "full_range":
                                    range_element = ET.SubElement(lemma_element, "Range")
                                    ET.SubElement(range_element, "From").text = str(processed_lemma.get("from"))
                                    ET.SubElement(range_element, "To").text = str(processed_lemma.get("to"))

                        # Step 2: Process Entry
                        if entry:
                            try:
                                classified_entry = parse_and_classify_entry(entry)
                                print(f"Classified Entry: {classified_entry}")

                                entry_details_element = ET.SubElement(entry_element, "EntryDetails")
                                for classified in classified_entry:
                                    classified_element = ET.SubElement(entry_details_element, "ClassifiedEntry")
                                    for field, field_value in classified.items():
                                        if isinstance(field_value, list):  # Handle lists (e.g., witnesses)
                                            list_element = ET.SubElement(classified_element, field.capitalize())
                                            for item in field_value:
                                                ET.SubElement(list_element, "Item").text = str(item)
                                        elif field_value:  # Skip empty fields
                                            ET.SubElement(classified_element, field.capitalize()).text = str(field_value)

                            except Exception as entry_error:
                                print(f"Error processing Entry for ID {key}: {entry_error}")
                                ET.SubElement(entry_element, "Error").text = f"Failed to process entry: {entry_error}"

                    except Exception as entry_id_error:
                        print(f"Error processing Entry ID {key}: {entry_id_error}")
                        ET.SubElement(main_text_element, "Error").text = f"Failed to process Entry ID {key}: {entry_id_error}"

            # Process Footnotes
            footnotes_data = row["Footnotes"]
            if isinstance(footnotes_data, dict):
                footnotes_element = ET.SubElement(chapter_element, "Footnotes")
                for key, value in footnotes_data.items():
                    footnote_element = ET.SubElement(footnotes_element, "Footnote", {"ID": key})
                    footnote_element.text = value

        except Exception as row_error:
            print(f"Error processing Row {row_idx}: {row_error}")
            ET.SubElement(root, "Error").text = f"Failed to process Row {row_idx}: {row_error}"

    # Write to XML file
    tree = ET.ElementTree(root)
    tree.write(output_file, encoding="utf-8", xml_declaration=True)

output_file = "flattened_verse_with_to_check.xml"
df_to_xml_with_flattened_verse_and_to_check(df, output_file)
print(f"XML saved to {output_file}")


Processing Row: 0, Title: Hosea, Chapter: 1
Processing Entry ID: app-1
Split Lemma: {'lemma': '1 יותם אחז יחזקיה'}, Entry:  <specialFont ][> <specialFont &>
Split Verse Lemma: {'verse': {'verse': '1'}, 'lemma': [{'text': 'יותם אחז יחזקיה', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'יותם'}, {'tag': 'lemma2', 'content': 'אחז'}, {'tag': 'lemma3', 'content': 'יחזקיה'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]>', '<specialFont [>'], 'reading': '<specialFont &>', 'cross_references': [], 'sigla': ''}]
Processing Entry ID: app-2
Split Lemma: {'lemma': 'מלכי'}, Entry:  <specialFont ]h> num<subscript II><ref 1>
Split Verse Lemma: {'verse': {'verse': {'verse': '1'}}, 'lemma': [{'text': 'מלכי', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'מלכי'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]h>'], 'reading': 'num<ref 1>', 'cross_references': ['II'], 'sig

Processed Lemma: [{'tag': 'lemma1', 'content': 'קֵרבו'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]>'], 'reading': 'ἀνεκαύθησαν<ref 25>', 'cross_references': [], 'sigla': ''}, {'classification': 'similar_variant', 'witnesses': ['<specialFont [>'], 'reading': '', 'cross_references': [], 'sigla': ''}, {'classification': 'additional_variant', 'witnesses': ['<specialFont ~>'], 'reading': 'ἤγγισαν<ref 26>', 'cross_references': [], 'sigla': ''}, {'classification': 'synonymous_variant', 'witnesses': ['<specialFont |>', '<specialFont T>'], 'reading': '', 'cross_references': [], 'sigla': ''}]
Processing Entry ID: app-24
Split Lemma: {'lemma': 'כתנור'}, Entry:  <specialFont T> + בער<ref 27>
Split Verse Lemma: {'verse': {'verse': {'verse': '6'}}, 'lemma': [{'text': 'כתנור', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'כתנור'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont T>'], 'reading': 'בער<

Processed Lemma: [{'tag': 'lemma1', 'content': 'עלוה'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]>'], 'reading': 'ἀδικίας<ref 51>', 'cross_references': ['III'], 'sigla': ''}, {'classification': 'synonymous_variant', 'witnesses': ['<specialFont *>', '<specialFont [>'], 'reading': '', 'cross_references': [], 'sigla': ''}, {'classification': 'additional_variant', 'witnesses': ['<specialFont T>'], 'reading': 'סליקו<ref 52>', 'cross_references': [], 'sigla': ''}]
Processing Entry ID: app-46
Split Lemma: {'lemma': '10 בְּאַוָּתִי'}, Entry:  <specialFont ]-> ἦλθον<ref 53> | <specialFont ]h><subscript 1> ἦλθε(ν)<ref 54> | <specialFont ]h><subscript 2> > | <specialFont [> ܒܟܐܬܝ<ref 55>
Split Verse Lemma: {'verse': {'verse': '10'}, 'lemma': [{'text': 'בְּאַוָּתִי', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'בְּאַוָּתִי'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]->'], 'reading': 'ἦλθο

Split Lemma: {'lemma': '10 נבון'}, Entry:  <specialFont ]-[> connect
Split Verse Lemma: {'verse': {'verse': '10'}, 'lemma': [{'text': 'נבון', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'נבון'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]->', '<specialFont [>'], 'reading': 'connect', 'cross_references': [], 'sigla': ''}]
Processing Entry ID: app-46
Split Lemma: {'lemma': 'ופשעים'}, Entry:  <specialFont ]> det
Split Verse Lemma: {'verse': {'verse': {'verse': '10'}}, 'lemma': [{'text': 'ופשעים', 'to_check': False, 'ref': None}]}
Processed Lemma: [{'tag': 'lemma1', 'content': 'ופשעים'}]
Classified Entry: [{'classification': 'variant', 'witnesses': ['<specialFont ]>'], 'reading': 'det', 'cross_references': [], 'sigla': ''}]
XML saved to flattened_verse_with_to_check.xml


In [178]:
df['Footnotes'][2]['footnote-10']

'theol diffic <specialFont 7 > different solutions in Vrs (for similar enumeration cf <specialFont ][ > Dan 3<subscript 37−38 >); cf Rofé, FS Weinfeld, 135−149'

In [177]:
df['Main Text'][2]['app-15']

'ואין מצבה ואין אפוד ותרפים]<ref 10>'

In [195]:
def process_individual_lemma(lemma_dict):
    """
    Process individual lemmas, handling parentheses, concatenating multiple superscripts
    into a single number if present, and constructing `full_lemma` where appropriate.
    If a `to_check` flag is already present in the lemma_dict, return it unchanged.
    """
    # If `to_check` is present, return the dictionary unchanged
    if isinstance(lemma_dict, dict) and lemma_dict.get("to_check", True):
        return [lemma_dict]

    individual_lemma = lemma_dict.get('lemma', '') if isinstance(lemma_dict, dict) else lemma_dict

    # Regex to match words, parentheses, and superscripts
    lemma_regex = r'([^\s<\(\)]+|\([^\)]+\)|<superscript\s*[^>]+>|>|\s+)'
    superscript_regex = r'<superscript\s*([^>]+)>'

    matches = re.findall(lemma_regex, individual_lemma)

    processed_lemmas = []
    combined_parenthesis_content = ""
    inside_parentheses = False
    outside_parts = []  # Track parts outside parentheses
    inside_parts = []  # Track parts inside parentheses
    has_split_lemma = False  # Flag for split lemmas
    lemma_counter = 1  # Counter for normal lemmas
    parenthesis_first = False  # Flag to check if parentheses are first

    for i, word in enumerate(matches):
        if word.startswith("(") and not word.endswith(")"):
            # Start a parenthetical group
            inside_parentheses = True
            combined_parenthesis_content = word
            has_split_lemma = True
            if i == 0:
                parenthesis_first = True
        elif word.endswith(")") and inside_parentheses:
            # End a parenthetical group
            inside_parentheses = False
            combined_parenthesis_content += f" {word}"
            processed_lemmas.append({'tag': 'parenthesis', 'content': combined_parenthesis_content})
            inside_parts.append(combined_parenthesis_content[1:-1])  # Content inside parentheses
            combined_parenthesis_content = ""
        elif inside_parentheses:
            # Inside a parenthetical group
            combined_parenthesis_content += f" {word}"
        elif word.startswith("(") and word.endswith(")"):
            # Complete parenthetical group
            processed_lemmas.append({'tag': 'parenthesis', 'content': word})
            inside_parts.append(word[1:-1])  # Content inside parentheses
            has_split_lemma = True
            if i == 0:
                parenthesis_first = True
        elif word.startswith("<superscript"):
            # Handle superscripts
            if processed_lemmas and 'content' in processed_lemmas[-1]:
                current_lemma = processed_lemmas[-1]
                superscript_values = re.findall(superscript_regex, word)
                if 'numbers' not in current_lemma:
                    current_lemma['numbers'] = ''.join(superscript_values)
                else:
                    current_lemma['numbers'] += ''.join(superscript_values)
        elif word.strip():
            # Normal word processing
            lemma_dict = {'tag': f'lemma{lemma_counter}', 'content': word}
            processed_lemmas.append(lemma_dict)
            lemma_counter += 1
            outside_parts.append(word)

    # If parentheses remain open, finalize them
    if inside_parentheses and combined_parenthesis_content:
        processed_lemmas.append({'tag': 'parenthesis', 'content': combined_parenthesis_content})
        inside_parts.append(combined_parenthesis_content[1:])  # Remaining content without leading '('

    # Add `full_lemma` if necessary
    if has_split_lemma and outside_parts and inside_parts:
        # Concatenate inside and outside parts based on order
        if parenthesis_first:
            full_lemma_content = ''.join(inside_parts + outside_parts).strip()
        else:
            full_lemma_content = ''.join(outside_parts + inside_parts).strip()

        # Only create `full_lemma` if parentheses are part of a split lemma
        if len(outside_parts) > 0 and len(inside_parts) == 1 and ' ' not in inside_parts[0]:
            processed_lemmas.append({'tag': 'full_lemma', 'content': full_lemma_content})

    return processed_lemmas


def split_verse_lemma(lemma_dict, previous_verse=None):
    """
    Splits the entry into verse and lemma. If a verse range is found, splits into from_verse and to_verse.
    If no verse is found, it uses the previous entry's verse.
    Also carries `ref` and `to_check` from the lemma if present.
    """
    # Extract text and ref from the input dictionary
    lemma_text = lemma_dict.get("lemma", "")
    ref = lemma_dict.get("ref", None)

    # Regex to match verse patterns: digits optionally followed by a dash or range
    pattern = re.match(r'^(\d+(?:–\d+)?)(.*)$', lemma_text.strip())
    if pattern:
        verse_part = pattern.group(1).strip()  # Extract verse
        lemma_text = pattern.group(2).strip()  # Extract lemma

        # Check if the verse contains a range
        if '–' in verse_part:
            from_verse, to_verse = map(str.strip, verse_part.split('–', 1))
            verse = {'from_verse': from_verse, 'to_verse': to_verse}
        else:
            verse = {'verse': verse_part}  # Single verse
    else:
        verse = {'verse': previous_verse}  # Use previous verse if no verse found
        lemma_text = lemma_text.strip()  # Assume the rest is lemma

    # Check for specific patterns in the lemma for `to_check`
    to_check_patterns = [
        r'<specialFont\s*<subscript\s*v>>',  # Pattern for subscript v
        r'<specialFont\s*,>',               # Pattern for specialFont comma
        r'<specialFont\s*v>',               # Standalone specialFont with "v"
        r'<specialFont\s*[^>]+>',           # General case for specialFont with content
    ]
    to_check = any(re.search(pattern, lemma_text) for pattern in to_check_patterns)

    # Return structured data, including `ref` and `to_check` if applicable
    return {
        'verse': verse,
        'lemma': [{'text': lemma_text, 'to_check': to_check, 'ref': ref}]
    }



def split_full_entry(text):
    """
    Splits a full entry into lemma and entry. Processes sequences of <subscript> tags
    into a single concatenated <subscript> tag. Adds `to_check` flag for problematic patterns.
    """
    # Split at the first occurrence of ']'
    sliced_entry = text.split(sep=']', maxsplit=1)
    
    # Initialize `lemma` and `entry`
    lemma = sliced_entry[0]
    entry = sliced_entry[1] if len(sliced_entry) > 1 else ""

    # Process subscript sequences in the entry
    subscript_pattern = r'(?:<subscript\s*([^>]+)>)+'    
    def merge_subscripts(match):
        subscripts = match.group(0)
        combined_content = ''.join(re.findall(r'<subscript\s*([^>]+)>', subscripts))
        return f'<subscript {combined_content}>'

    entry = re.sub(subscript_pattern, merge_subscripts, entry)

    # Check if a superscript follows immediately after the `]`
    superscript_pattern = re.compile(r'^\s*<ref\s*([^>]+)>')
    match = superscript_pattern.match(entry)
    lemma_dict = {'lemma': lemma}
    if match:
        # Append the superscript to the lemma
        lemma_dict.update({'ref': match.group(0)})
        
        # Remove the superscript from the start of the entry
        entry = entry[len(match.group(0)):].strip()

    # Enhanced `to_check` pattern detection
    to_check_patterns = [
        r'<specialFont\s*<subscript\s*v>>',  # Subscript v inside specialFont
        r'<specialFont\s*,>',               # Comma inside specialFont
        r'<specialFont\s*v>',               # Any standalone specialFont with "v"
        r'<specialFont\s*[^>]+>',           # General case for specialFont with content
    ]
    if any(re.search(pattern, lemma) for pattern in to_check_patterns):
        lemma_dict['to_check'] = True  # Add "to_check" flag if any pattern matches
        
    return lemma_dict, entry


In [196]:
lemma, entry = split_full_entry(df['Main Text'][2]['app-15'])
parse_and_classify_entry(entry)


[{'classification': 'variant',
  'witnesses': [],
  'reading': '',
  'cross_references': [],
  'sigla': ''}]

In [197]:
lemma

{'lemma': 'ואין מצבה ואין אפוד ותרפים', 'ref': '<ref 10>'}

In [141]:
lemma # Ων v,

{'lemma': '2 פרצו<specialFont v>', 'to_check': True}

In [12]:
from docx2python import docx2python

# Path to the .docx file
docx_path = 'Hosea.1.App I.Full.docx'

# Load the document using docx2python
doc_content = docx2python(docx_path)

# Print a portion of the structure to examine it
print("Sample structure of doc_content.body_runs:")
print(doc_content.body_runs[:5])  # Print the first few elements for inspection


Sample structure of doc_content.body_runs:
[[[[['\t', 'Hosea 1'], [], ['1 יותם אחז יחזקיה] ][ &'], ['מלכי] ]h numII', '----footnote1----'], ['2 דִּבֶּר] ] λόγου', '----footnote2----', ' = [T'], ['ב(הושע)] ]h πρός', '----footnote3----', ' + [ܕܗܘܐ ܥܠ', '----footnote4----'], ['ו(יאמר)] ]h[ >'], ['לֵךְ] ]h >II'], ['3 ויקח] [ + ܠܗ', '----footnote5----'], ['לו] ]h*- >II III IV', '----footnote6----'], ['4 יהוא] ]h Ιουδα', '----footnote7----'], ['(ו)הִשְׁבַּתִּי] ]- ἀποστρέψω', '----footnote8----'], ['בית2] ]h*hT- + prep', '----footnote9----'], ['5 והיה (ביום ההוא)] [ >', '----footnote10----'], ['(ו)היה] * >'], ['ההוא] ]h + dicit dominus', '----footnote11----'], ['6 עוד1] *hT- >II III IV'], ['לו] ]h + κύριοςII', '----footnote12----', ' = [| [ pron', '----footnote13----'], ['נשׂא אשׂא] ] ἀντιτασσόμενος ἀντιτάξομαι', '----footnote14----', ' | * oblivione obliviscar', '----footnote15----', ' + ~'], ['7 בית] ] υἱούς', '----footnote16----'], ['יהודה] ]h >'], ['ולא] ][ rep'], ['(ו)במלחמה] ]- + (οὐδὲ

In [13]:
import zipfile
import xml.etree.ElementTree as ET

# Path to your .docx file
docx_path = 'Hosea.1.App I.Full.docx'

# Temporary storage for extracted XML content
document_xml = None
footnotes_xml = None

# Step 1: Extract `document.xml` and `footnotes.xml` from the .docx file
with zipfile.ZipFile(docx_path, 'r') as docx:
    if 'word/document.xml' in docx.namelist():
        document_xml = docx.read('word/document.xml').decode('utf-8')
    if 'word/footnotes.xml' in docx.namelist():
        footnotes_xml = docx.read('word/footnotes.xml').decode('utf-8')

# Step 2: Parse `footnotes.xml` to create a dictionary of footnote content
footnotes_dict = {}
if footnotes_xml:
    footnotes_root = ET.fromstring(footnotes_xml)
    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    for footnote in footnotes_root.findall('w:footnote', namespaces):
        footnote_id = footnote.get(f'{{{namespaces["w"]}}}id')
        footnote_text = ''.join(node.text or '' for node in footnote.findall('.//w:t', namespaces))
        footnotes_dict[footnote_id] = footnote_text

# Step 3: Parse `document.xml` to identify and tag footnote references
formatted_text = ""
if document_xml:
    document_root = ET.fromstring(document_xml)
    for paragraph in document_root.findall('.//w:p', namespaces):
        paragraph_text = ""
        
        # Process each run in the paragraph
        for run in paragraph.findall('.//w:r', namespaces):
            text_elem = run.find('w:t', namespaces)
            footnote_ref = run.find('.//w:footnoteReference', namespaces)
            
            if text_elem is not None:
                paragraph_text += text_elem.text or ''
            elif footnote_ref is not None:
                # Get the ID of the footnote reference and wrap it in <ref ...> tags
                footnote_id = footnote_ref.get(f'{{{namespaces["w"]}}}id')
                footnote_content = footnotes_dict.get(footnote_id, "")
                paragraph_text += f"<ref {footnote_id}>{footnote_content}</ref>"

        # Add formatted paragraph text to the main text
        formatted_text += f"<p>{paragraph_text}</p>\n"

print("Formatted Document with Footnote References:")
print(formatted_text)


Formatted Document with Footnote References:
<p>Hosea 1</p>
<p></p>
<p>1 יותם אחז יחזקיה] ][ &</p>
<p>מלכי] ]h numII<ref 1> cf app Mic 11</ref></p>
<p>2 דִּבֶּר] ] λόγου<ref 2> (a) voc דְּבַר (יהוה), formula, cf v1 41 et al; cf app 131; cf תחלת דִּבְרֵי Qoh 1013; note seq; (b) noun דִּבֵּר, cf app Jer 513 and Rabb Heb; cf gerund in * loquendi “of speaking”</ref> = [T</p>
<p>ב(הושע)] ]h πρός<ref 3> “to”; main evid, cf v1</ref> + [ܕܗܘܐ ܥܠ<ref 4> “which was to”, ex v1</ref></p>
<p>ו(יאמר)] ]h[ ></p>
<p>לֵךְ] ]h >II</p>
<p>3 ויקח] [ + ܠܗ<ref 5> “for himself”, cf v2</ref></p>
<p>לו] ]h*- >II III IV<ref 6> cf vv6,8; contrast Hier 10154</ref></p>
<p>4 יהוא] ]h Ιουδα<ref 7> main evid; inner-Grk (בית יהודה common collocation), cf Hier 12208−211</ref></p>
<p>(ו)הִשְׁבַּתִּי] ]- ἀποστρέψω<ref 8> voc הֲשִׁבֹתִי, similarly app 213 Ezek 724; for parall השיב//פקד cf 49 123; main evid ]h καταπαύσω (=x)</ref></p>
<p>בית2] ]h*hT- + prep<ref 9> common formula השבית מן, cf e.g. Lev 266 Jer 734</ref></p>
<

In [6]:
import zipfile
import xml.etree.ElementTree as ET
from docx import Document
import shutil
import os

# Paths for input and output files
docx_path = 'Hosea.1.App I.Full.docx'  # Original document path
output_docx_path = 'Hosea.1.App I.Modified.docx'  # Output document with modified content
special_font_name = 'HUBPSigla'  # Replace with actual font name for special characters
replacement_font_name = 'Times New Roman'  # Font to replace special font

# Temporary directory to modify the .docx content
temp_dir = "temp_docx"
if not os.path.exists(temp_dir):
    os.makedirs(temp_dir)

# Step 1: Extract .docx contents to a temporary directory
with zipfile.ZipFile(docx_path, 'r') as zip_ref:
    zip_ref.extractall(temp_dir)

# Step 2: Access and modify footnotes.xml in the extracted content
footnotes_path = os.path.join(temp_dir, "word", "footnotes.xml")
if os.path.exists(footnotes_path):
    tree = ET.parse(footnotes_path)
    root = tree.getroot()
    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    # Modify each footnote element
    for footnote in root.findall('w:footnote', namespaces):
        footnote_id = footnote.get(f'{{{namespaces["w"]}}}id')
        wrapped_text = f"<footnote ID='{footnote_id}'>"

        # Append each run's text with special font tags and close the footnote tag
        for run in footnote.findall('.//w:r', namespaces):
            text_elem = run.find('w:t', namespaces)
            font_elem = run.find('.//w:rPr//w:rFonts', namespaces)
            
            if text_elem is not None:
                text = text_elem.text or ""
                font = font_elem.get(f'{{{namespaces["w"]}}}ascii') if font_elem is not None else "Unknown"
                
                # Wrap text in <specialFont ...> if in special font, otherwise add it normally
                if font == special_font_name:
                    wrapped_text += f"<specialFont {text} >"
                else:
                    wrapped_text += text
                
        wrapped_text += "</footnote>"

        # Replace the original footnote text with wrapped text in the first <w:t> element
        for elem in footnote.findall('.//w:t', namespaces):
            elem.text = wrapped_text  # Replace with modified wrapped text
            break  # Only replace the first <w:t> element to avoid duplicating

    # Save the modified footnotes.xml
    tree.write(footnotes_path, encoding="utf-8", xml_declaration=True)

# Step 3: Modify main text with python-docx to add <specialFont ...> tags
document = Document(docx_path)

for para in document.paragraphs:
    for run in para.runs:
        font_name = run.font.name if run.font else "Unknown"
        
        # Wrap text in <specialFont ...> tags if it is in the special font
        if font_name == special_font_name:
            modified_text = ''.join(f"<specialFont {char} >" for char in run.text)
            run.text = modified_text
            run.font.name = replacement_font_name

# Save the modified main text as a new temporary .docx file
temp_main_docx_path = os.path.join(temp_dir, "modified_main.docx")
document.save(temp_main_docx_path)

# Step 4: Repackage the modified contents into a new .docx file
# Replace the main document (document.xml) from the modified_main.docx in the temp directory
with zipfile.ZipFile(temp_main_docx_path, 'r') as temp_main_zip:
    temp_main_zip.extract('word/document.xml', temp_dir)

# Create the final modified .docx by repackaging
with zipfile.ZipFile(output_docx_path, 'w') as zip_out:
    for foldername, subfolders, filenames in os.walk(temp_dir):
        for filename in filenames:
            file_path = os.path.join(foldername, filename)
            arcname = os.path.relpath(file_path, temp_dir)
            zip_out.write(file_path, arcname)

# Cleanup temporary directory
shutil.rmtree(temp_dir)

print(f"Modified document saved as {output_docx_path}")


Modified document saved as Hosea.1.App I.Modified.docx


In [21]:
doc_content.body_runs

[[[[['\t', 'Hosea 1'],
    [],
    ['1 יותם אחז יחזקיה] ][ &'],
    ['מלכי] ]h numII', '----footnote1----'],
    ['2 דִּבֶּר] ] λόγου', '----footnote2----', ' = [T'],
    ['ב(הושע)] ]h πρός',
     '----footnote3----',
     ' + [ܕܗܘܐ ܥܠ',
     '----footnote4----'],
    ['ו(יאמר)] ]h[ >'],
    ['לֵךְ] ]h >II'],
    ['3 ויקח] [ + ܠܗ', '----footnote5----'],
    ['לו] ]h*- >II III IV', '----footnote6----'],
    ['4 יהוא] ]h Ιουδα', '----footnote7----'],
    ['(ו)הִשְׁבַּתִּי] ]- ἀποστρέψω', '----footnote8----'],
    ['בית2] ]h*hT- + prep', '----footnote9----'],
    ['5 והיה (ביום ההוא)] [ >', '----footnote10----'],
    ['(ו)היה] * >'],
    ['ההוא] ]h + dicit dominus', '----footnote11----'],
    ['6 עוד1] *hT- >II III IV'],
    ['לו] ]h + κύριοςII',
     '----footnote12----',
     ' = [| [ pron',
     '----footnote13----'],
    ['נשׂא אשׂא] ] ἀντιτασσόμενος ἀντιτάξομαι',
     '----footnote14----',
     ' | * oblivione obliviscar',
     '----footnote15----',
     ' + ~'],
    ['7 בית] ] υἱούς

In [21]:
from docx import Document

# Load the document
docx_path = 'Hosea.1.App I.Full.docx'  # Replace with the actual file path
document = Document(docx_path)

# Function to get font name from a run
def get_font_name(run):
    try:
        if run.font and run.font.name:
            return run.font.name
        else:
            # If the font is not set explicitly, it might be inherited from the style
            if run.style and run.style.font and run.style.font.name:
                return run.style.font.name
    except AttributeError:
        pass
    return "Unknown"

# Iterate over paragraphs and runs to print the font of each word
for para in document.paragraphs:
    for run in para.runs:
        font_name = get_font_name(run)
        words = run.text.split()  # Split run text into individual words
        for word in words:
            print(f"Word: '{word}' - Font: {font_name}")


Word: 'Hos' - Font: Unknown
Word: 'ea' - Font: Unknown
Word: '1' - Font: Unknown
Word: '1' - Font: Unknown
Word: 'יותם' - Font: Unknown
Word: 'אחז' - Font: Unknown
Word: 'יחזקיה' - Font: Unknown
Word: ']' - Font: Unknown
Word: ']' - Font: HUBPSigla
Word: '[' - Font: HUBPSigla
Word: '&' - Font: HUBPSigla
Word: 'מלכי' - Font: Unknown
Word: ']' - Font: Unknown
Word: ']' - Font: HUBPSigla
Word: 'h' - Font: HUBPSigla
Word: 'num' - Font: Unknown
Word: 'II' - Font: Unknown
Word: '2' - Font: Unknown
Word: 'דִּבֶּר' - Font: Unknown
Word: ']' - Font: Unknown
Word: ']' - Font: HUBPSigla
Word: 'λόγου' - Font: GFS Porson
Word: '=' - Font: Unknown
Word: '[' - Font: HUBPSigla
Word: 'T' - Font: HUBPSigla
Word: 'ב' - Font: Unknown
Word: '(' - Font: Unknown
Word: 'הושע' - Font: Unknown
Word: ')' - Font: Unknown
Word: ']' - Font: Unknown
Word: ']h' - Font: HUBPSigla
Word: 'πρός' - Font: GFS Porson
Word: '+' - Font: HUBPSigla
Word: '[' - Font: HUBPSigla
Word: 'ܕܗܘܐ' - Font: Unknown
Word: 'ܥܠ' - Font: Unkn

In [23]:
from docx import Document
import xml.etree.ElementTree as ET
from fontTools.ttLib import TTFont

# Load the custom HUBPSigla font to access glyph mappings
hubps_font_path = 'HUBPS_.ttf'
hubps_font = TTFont(hubps_font_path)

# Load mappings from the HUBPSigla font
hubps_glyph_map = {code: glyph_name for cmap in hubps_font['cmap'].tables for code, glyph_name in cmap.cmap.items()}

# Load the .docx file
docx_path = 'Hosea.1.App I.Full.docx'
document = Document(docx_path)

# Function to get font name from a run
def get_font_name(run):
    try:
        if run.font and run.font.name:
            return run.font.name
        elif run.style and run.style.font and run.style.font.name:
            return run.style.font.name
    except AttributeError:
        pass
    return "Unknown"

# Check if a character is a special HUBPSigla glyph
def is_hubps_special(char):
    codepoint = ord(char)
    return codepoint in hubps_glyph_map

# Create the root element for the XML
root = ET.Element("document")

# Process each paragraph in the Word document
for para in document.paragraphs:
    para_elem = ET.SubElement(root, "paragraph")
    
    # Process each run in the paragraph
    for run in para.runs:
        font_name = get_font_name(run)
        run_elem = ET.SubElement(para_elem, "run")
        run_elem.set("font", font_name)

        run_text = ""
        for char in run.text:
            if font_name == "HUBPSigla" and is_hubps_special(char):
                # If it's a HUBPSigla special character, output it as a unique element
                special_char_elem = ET.SubElement(run_elem, "special_char")
                special_char_elem.set("unicode", f"U+{ord(char):04X}")
                special_char_elem.set("font", "HUBPSigla")
                special_char_elem.text = char
            else:
                # Otherwise, treat it as normal text
                run_text += char
        
        # Add normal text to the run element
        if run_text:
            run_elem.text = run_text

# Save the XML to a file
xml_path = 'Hosea1_AppI_Full_withFonts.xml'
tree = ET.ElementTree(root)
tree.write(xml_path, encoding="utf-8", xml_declaration=True)

print(f"XML file with differentiated font encoding saved at: {xml_path}")


XML file with differentiated font encoding saved at: Hosea1_AppI_Full_withFonts.xml


In [12]:
def convert_docx_to_txt(docx_file_path, txt_file_path):
    # Load the .docx file
    doc = Document(docx_file_path)

    # Extract text from each paragraph in the document
    text_content = '\n'.join([paragraph.text for paragraph in doc.paragraphs])

    # Write the extracted text to a .txt file
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text_content)

    print(f"File converted successfully and saved as '{txt_file_path}'")

# Example usage

convert_docx_to_txt('Hosea.1.App I.Full.docx', 'App1-output1.txt')


File converted successfully and saved as 'App1-output1.txt'


In [5]:
from docx import Document

def read_docx(file_path):
    document = Document(file_path)
    text = []
    for paragraph in document.paragraphs:
        text.append(paragraph.text)
    return '\n'.join(text)

# Example usage
file_path = open('Hosea.1.App I.txt'
doc_text = read_docx(file_path)
print(doc_text)


PackageNotFoundError: Package not found at 'Hosea.1.App I.txt'

In [9]:
open('Hosea.1.App I.txt', mode='r', encoding ='utf8').read()

"\ufeffHosea 1\n\n1 יותם אחז יחזקיה] ][ &\nמלכי] ]h numII1\n2 דִּבֶּר] ] λόγου2 = [T\nב(הושע)] ]h πρός3 + [ܕܗܘܐ ܥܠ4\nו(יאמר)] ]h[ >\nלֵךְ] ]h >II\n3 ויקח] [ + ܠܗ5\nלו] ]h*- >II III IV6\n4 יהוא] ]h Ιουδα7\n(ו)הִשְׁבַּתִּי] ]- ἀποστρέψω8\nבית2] ]h*hT- + prep9\n5 והיה (ביום ההוא)] [ >10\n(ו)היה] * >\nההוא] ]h + dicit dominus11\n6 עוד1] *hT- >II III IV\nלו] ]h + κύριοςII12 = [| [ pron13\nנשׂא אשׂא] ] ἀντιτασσόμενος ἀντιτάξομαι14 | * oblivione obliviscar15 + ~\n7 בית] ] υἱούς16\nיהודה] ]h >\nולא] ][ rep\n(ו)במלחמה] ]- + (οὐδὲ) ἐν ἅρμασιν17\nבסוסים] ]*[ &III IV\n8 ותהר] ]- + ἔτιII18 = [\nותלד] ]h + αὐτῷIV19 = *h\n9 ויאמר] ]h + κύριος (αὐτῷ)II20 = Th + [\nכי – עמי ~ ואנכי – לכם] 9 ~\nלכם] ]h + θεός21 = *h + [h\n1 cf app Mic 11\n2 (a) voc דְּבַר (יהוה), formula, cf v1 41 et al; cf app 131; cf תחלת דִּבְרֵי Qoh 1013; note seq; (b) noun דִּבֵּר, cf app Jer 513 and Rabb Heb; cf gerund in * loquendi “of speaking”\n3 “to”; main evid, cf v1\n4 “which was to”, ex v1\n5 “for himself”, cf v2\n6 cf vv6,

In [10]:
lines = open('Hosea.1.App I.txt', mode='r', encoding ='utf8').read().split(sep="\n")
print(lines[2])

1 יותם אחז יחזקיה] ][ &


In [4]:
########## render apparatus 4 ##############

In [5]:
def process_lines(lines):
    # Initialize variables
    chapter = None
    data = []

    # Process each line
    for line in lines:
        if line == '':
            # Skip empty lines
            continue
        elif len(line.split()) == 2:
            # If the line has only two words, save it as the chapter
            chapter = line
        else:
            # If the line is text, couple it with the chapter
            if chapter is not None:
                data.append([chapter, line])
                chapter = None  # Reset the chapter after coupling

    # Create a DataFrame
    df = pd.DataFrame(data, columns=['Chapter', 'Line'])

    return df


def split_entries(df):
    # Define the regular expression pattern to split on
    pattern = r'\xa0+\'\xa0+|\xa0+(?=\s?\d+)'
    
    # Split each line into entries based on the pattern
    df['Entries'] = df['Line'].apply(lambda x: re.split(pattern, x))
    
    return df


In [6]:
df = process_lines(lines)
df = split_entries(df)
df


Unnamed: 0,Chapter,Line,Entries
0,הושע א,"1 (בספר החילופים וברשימת הסדרים שבכ""י ל מצוין ...","[1 (בספר החילופים וברשימת הסדרים שבכ""י ל מצוין..."
1,הושע ב,1 (פ)] ל ל18 פ ר מ: (ס) ' וְהיה(1)] ר': וַ...,"[1 (פ)] ל ל18 פ ר מ: (ס) , וְהיה(1)] ר': וַ, ..."
2,הושע ג,1 (פ)] ל29 פ: (ס) ' אֱהב־] נ: אֶ ' אהבת...,"[1 (פ)] ל29 פ: (ס) , אֱהב־] נ: אֶ, אהבת] ר: ..."
3,הושע ד,"1 (פ)] ל29 37 פ מ: (ס) ; ר: (>) ; ר־מ""ק: ""פרשׄ...","[1 (פ)] ל29 37 פ מ: (ס) ; ר: (>) ; ר־מ""ק: ""פרש..."
4,הושע ה,1 (פ)] ל ל18 פ ק ש מ: (ס) ; נ: (>) ' והקש֣י...,"[1 (פ)] ל ל18 פ ק ש מ: (ס) ; נ: (>), והקש֣יבו..."
...,...,...,...
62,זכריה יג,1 (>)] ל מ: (ס) ' דויד] פ ר: דוד 2 נאם...,"[1 (>)] ל מ: (ס) , דויד] פ ר: דוד , 2 נאם ׀ ..."
63,זכריה יד,1 (פ)] ל ל10 18 פ מ: (ס) 2 הגויִ֥ם ׀ ] ק: ...,"[1 (פ)] ל ל10 18 פ מ: (ס) , 2 הגויִ֥ם ׀ ] ק: ..."
64,מלאכי א,"1 >] ל: יט (התוספת ""יט"" מיד שנייה) 2 הלוֹא־...","[1 >] ל: יט (התוספת ""יט"" מיד שנייה) , 2 הלוֹא־..."
65,מלאכי ב,"1 אליכם] ל20 ק־מ""ק: ""דׄ מטעׄ"" ; ל20־מ""ג: ""אליכ...","[1 אליכם] ל20 ק־מ""ק: ""דׄ מטעׄ"" ; ל20־מ""ג: ""אלי..."


In [7]:
df["Entries"][3]#[5]#.strip()
#22 וארשתיך – יהוה] ר!: כל הפסוק אינו מנוקד  '   

['1 (פ)] ל29 37 פ מ: (ס) ; ר: (>) ; ר־מ"ק: "פרשׄ"',
 " יושבי] פ ק1' ר: ישבי",
 ' ואֵֽין־דַּ֥עת] ל20\': ואֵ֥ין דַּעת־ ; ב"א: ואֵֽין־דַּ֥עת, ב"נ: ואֵ֥ין דַּֽעת־ ',
 '2 אלֹה] נ־מ"ק: "לו קׄ" ; ר־מ"ק: "וׄ קׄ"',
 " ורצח] פ': רצח II; ר: רצוח",
 ' וגנב] ר: וגנוב ',
 '3 כ֣ן ׀ ] ל18־מ"ק: "ל֣גׄ ׀" ; נ ר־מ"ק: "לגׄ"',
 ' יושב] ל29\' פ: ישב ; ק1\' ר: ישבי ; ר־מ"ק: "יתׄ יׄ"III II I',
 ' בחית] ר: בחיית ',
 '4 (>)] מ: (פ) ',
 " איש(2)] ל': אמש",
 ' כִּמְרִיבי] ש!: כמִרְ ',
 '6 מאסת] ר: מאסתה',
 ' וְאמאסאך] מ: וָֽ ; ל18 פ ר\': ואמאסך ; א ל ל29 ק1 מ־מ"ק: "יתיר אׄ" ; ק־מ"ק: "ן לׄ ויתׄ אׄ" ; ל30־מ"ק: "ןׄ ואמאסך קרי" ; ר־מ"ק: "לׄ ויתיר אׄ" ; ל20־מ"ק: "לׄ וכתׄ כן" ; ל37־מ"ק: "יתי א" ; פ־מ"ק: "ואמאסאך כך כתׄ ולא ק א תלתא" ',
 '8 נפשוֹ] ר: נפשֹם ; ר־מ"ק: "נפשו קׄ <...>פשם כתׄ וחד מן <...>לין דכתבין <...>וׄ תיבוׄ ק וׄ"  ; ל־מ"ק: "דׄ מטעׄ" ; ל20 נ־מ"ק: "גׄ מטע" ; ל29־מ"ק: "גׄ דמטעׄ" ; ל29־מ"ג: "נפשו גׄ דמטעׄ..." ; ל30־מ"ג: "נפשוֹ גׄ מטעיין וסימנהׄ..." ; ש־מ"ק: "גׄ מטׄ" ; מ־מ"ק: "ג\' מטעי\' בהון" ; ל20־מ"ג: "נפשו ג מטעׄ

In [8]:
# processing functions for sub-units of app_entry, for which there is matching lemma and verse data processed above

def lemma_verse_processor(text, chapter):
    # Regex to match the verse numbers at the beginning
    verse_regex = r'^(\d+(?:–\d+)?)\s'
    
    # Extract verses
    verses_match = re.match(verse_regex, text)
    if verses_match:
        verse_range = verses_match.group(1).split('–')
        if len(verse_range) == 2 and verse_range[0] != verse_range[1]:
            verses = {'from': int(verse_range[0]), 'to': int(verse_range[1])}
        else:
            verses = int(verse_range[0])
    else:
        verses = None
    
    # Isolate lemmas part by removing the verses
    lemmas_part = text[len(verses_match.group(0)):].strip() if verses_match else text
    
    return chapter, verses, process_lemma_with_range_and_diacritics(lemmas_part)

def parse_reading_entry(entry):
    # Refined regex pattern
    pattern = r"""
        \s?(?P<Sigla>[+<>~\.]*)                         # Captures special sigla
        \s*
        (?P<Reading>[^\(\)]*)                           # Captures reading (excluding parentheses)
        \s*
        (?P<Comment>\(.*\))?                            # Captures comments
    """

    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Extracting groups into a dictionary
    parsed_entry = {k: v.strip() for k, v in match.groupdict().items() if v}

    # Special handling for comments containing specific markers
    if "Comment" in parsed_entry and any(marker in parsed_entry['Comment'] for marker in ["(ס)", "(פ)", "(>)"]):
        parsed_entry['Reading'] = parsed_entry.pop('Comment')

    return parsed_entry

def process_lemma_with_range_and_diacritics(lemma):
    # Adjust regex to include diacritical marks and punctuation within Hebrew words
    
    
    # Check for range indicated by "–" and process accordingly
    if "–" in lemma:
        from_lemma, to_lemma = lemma.split("–")
        return {
            'from': process_individual_lemma(from_lemma.strip()),
            'to': process_individual_lemma(to_lemma.strip())
        }
    if len(lemma.strip().split(' '))>1:#add lemma for a two word lemma separated by a space
        return{'lemma1':process_individual_lemma(lemma.strip().split(' ')[0]),
              'lemma2':process_individual_lemma(lemma.strip().split(' ')[1])}
    else:
        return process_individual_lemma(lemma.strip())


# Function to process individual lemmas or ranges, after the split,
lemma_regex = r'(\(?[^\d\(\)]+\)?)([\(\d\)]*)?(.+?׀)?'#(\d+(?:–\d+)?)\s

def process_individual_lemma(individual_lemma):
    matches = re.findall(lemma_regex, individual_lemma)
    processed_lemmas = []
#     for match in matches[:1]:
#         word, number = match
#         lemma_dict = {'lemma': word}
#         if number: lemma_dict['number'] = (number)
#         processed_lemmas.append(lemma_dict)

    for match in matches[:1]:
        word, number, paseq = match
        lemma_dict = {'lemma': word}
        if number: lemma_dict['number'] = (number)
        if paseq: lemma_dict['paseq'] = (paseq)
        processed_lemmas.append(lemma_dict)
    return processed_lemmas

def extract_cross_references(text): #extract cross-references
    # Regex to match some Roman numerals: sequences of "I"s followed by an optional "V"
    pattern = r'([I]*[V]?)'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Remove empty matches from the list
    found_numerals = [numeral for numeral in found_numerals if numeral]
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

#split the string of the witnesses into each ms and if there is a pm\sm\tm as if there is a מ"ק\מ"ג also
def split_into_witnesses(text):
    witnesses = text.split()
    result = []
    mgk = None
    for witness in witnesses:
        # Splitting the witness based on the special character ־
        parts = witness.split('־')
        base_witness = parts[0]
        if len(parts)>1:
            mgk = parts[1]
        # Handling the ' character
        if base_witness.endswith("'") or base_witness.endswith("''") or base_witness.endswith("'''") or base_witness.endswith("!"):
            base_witness = base_witness.rstrip("'").rstrip("!")
            annotations = witness[len(base_witness):]
            result.append([base_witness, annotations])
        else:
            result.append([base_witness])
    if mgk:
        for ms in result:
            ms.append(mgk)
    return result

def process_input(input_str):
    if "=" in input_str and '(' in input_str:
        # Remove the '=' and return as "affirming variants"
        input_str = input_str.replace('(','').replace(')','')
        key, value = input_str.strip().split("=")
        return {"affirming variants": {key.strip(): value.strip()}}
    else:
        if "=" in input_str:
            # Return as "affirming and unidentified"
            key, value = input_str.split("=")
            return {"affirming and unidentified": {key.strip(): value.strip()}}
        else:
            # Return as comment"
            return {"comment": input_str.strip()}

def process_entry_variants(entry):
    variants = entry.split(';')
    split_variants = []
    for variant in variants:
        if ',' in variant:
            var, related_var = variant.split(',')#
            split_variants.append({'Type':'Variant','Info':var})
            split_variants.append({'Type':'Related Variant','Info':related_var})
        else:
            split_variants.append({'Type':'Variant','Info':variant})          
            
    processed_variants = []
    
    for variant_dict in split_variants:
        variant = variant_dict['Info']
#         print(type(variant['Info']))
        variant_type = variant_dict['Type']
        
        # Extract cross-references from the variant
        clean_variant, cross_reference = extract_cross_references(variant)

        if ':' in clean_variant:
            witnesses, reading = clean_variant.split(':', 1)
            #print(witnesses)
            if '=' in witnesses:
                witnesses = witnesses.replace('=', ' ') #if = in structured witnesses separated by :
                #print(witnesses)
            processed_variants.append({
                'Type': variant_type,
                'Witnesses': split_into_witnesses(witnesses),
                'Reading Info': parse_reading_entry(reading.strip()), #need to handle + and < as well as 
                'CrossReference': cross_reference
            })
        else: # this means that either there is a comment here in parenthesis, or a "= א" format in parenthesis.
            decoded_input = process_input(clean_variant)
            #if 'comment' in 
            if 'comment' in list(decoded_input.keys()):
                processed_variants.append({
                    'Comment': decoded_input['comment'],
                    'CrossReference': cross_reference
                })

            else:
                if "affirming variants" in list(decoded_input.keys()):
                    processed_variants.append({
                        'Type': 'Affirming Variant',
                        'Witnesses': split_into_witnesses(list(decoded_input['affirming variants'].keys())[0]),
                        'CrossReference': cross_reference
                    })
                else: #its a "affirming and unidentified" type, which means there are two variants here
                    witnesses = split_into_witnesses(list(decoded_input['affirming and unidentified'].keys())[0])
                    processed_variants.append({
                        'Type': 'Affirming Variant',
                        'Witnesses': witnesses,
                        'CrossReference': cross_reference
                    })
                    processed_variants.append({
                        'Type': 'Unidentified Variant',
                        'Witnesses': (witnesses[0][0],"'"),
                        'CrossReference': cross_reference
                    })
                    

    return processed_variants


In [9]:
def process_apparatus_entries(entries, chapter):
    processed_entries = []
    current_verse = 1
    for entry in entries:
        if ']' in entry:
            verse_lemma, entry_content = entry.split(']', 1)
            #if digits, so there are verse numbers, if not, take the previous verse number
            match = re.match(r'^(\d+)(.*)', verse_lemma)
            if match:
                chapter, verses, lemmas = lemma_verse_processor(verse_lemma, chapter)
                processed_entries.append({'Verse': verses, 'Lemma Info': lemmas, 'Entry': process_entry_variants(entry_content.strip())})
                if type(verses)=='list':
                    current_verse = verses[-1]
                else:
                    current_verse = verses
            else: # no verse number, so take previous verse and process lemma separately
                processed_entries.append({'Verse': current_verse, 'Lemma Info': process_lemma_with_range_and_diacritics(verse_lemma), 'Entry': process_entry_variants(entry_content.strip())})
    
        else:# no lemma, but still maybe verse number
            match = re.match(r'^(\d+)(.*)', entry)
            if match:
                verse, clean_entry = match.group(1), match.group(2)
                processed_entries.append({'Verse': verse, 'Entry': process_entry_variants(clean_entry.strip())})
                current_verse = verse
            else: #no verse, use previous
                processed_entries.append({'Verse': current_verse, 'Entry': process_entry_variants(entry.strip())})       

    return processed_entries


processed_entries = process_apparatus_entries(df["Entries"][1], 'five')


In [10]:
df["Entries"][1][-6]#.strip()

'22 וארשתיך – יהוה] ר!: כל הפסוק אינו מנוקד'

In [11]:
processed_entries[12]#["Entry"]

{'Verse': 5,
 'Lemma Info': [{'lemma': 'ושתה֙'}],
 'Entry': [{'Type': 'Variant',
   'Witnesses': [['ל18'],
    ['20', "'"],
    ['29', "'"],
    ['נ', "'"],
    ['פ'],
    ['ר'],
    ['ק1', "'"]],
   'Reading Info': {'Reading': 'ושתיה'},
   'CrossReference': []},
  {'Type': 'Variant',
   'Witnesses': [['ל'],
    ['ל18'],
    ['20', "'"],
    ['29'],
    ['30'],
    ['37'],
    ['נ'],
    ['ק'],
    ['מ']],
   'Reading Info': {'Reading': 'ושת֙', 'Comment': '(י)'},
   'CrossReference': []},
  {'Type': 'Affirming Variant',
   'Witnesses': [['ל20', "''"], ['ק1', "''"], ['ש']],
   'CrossReference': []}]}

In [12]:
processed_entries#[0]#["Entry"]#[16]['Reading']

[{'Verse': 1,
  'Lemma Info': [{'lemma': '(פ)'}],
  'Entry': [{'Type': 'Variant',
    'Witnesses': [['ל'], ['ל18'], ['פ'], ['ר'], ['מ']],
    'Reading Info': {'Reading': '(ס)'},
    'CrossReference': []}]},
 {'Verse': 1,
  'Lemma Info': [{'lemma': 'וְהיה', 'number': '(1)'}],
  'Entry': [{'Type': 'Variant',
    'Witnesses': [['ר', "'"]],
    'Reading Info': {'Reading': 'וַ'},
    'CrossReference': []}]},
 {'Verse': 1,
  'Lemma Info': {'lemma1': [{'lemma': 'מספ֤ר'}],
   'lemma2': [{'lemma': 'בני־ישראל֙'}]},
  'Entry': [{'Type': 'Variant',
    'Witnesses': [['ל20', "'"]],
    'Reading Info': {'Reading': 'מספ֞ר בנ֤י ישראל֙'},
    'CrossReference': []}]},
 {'Verse': 2,
  'Lemma Info': [{'lemma': 'י֥ום'}],
  'Entry': [{'Type': 'Variant',
    'Witnesses': [['ל', "'"]],
    'Reading Info': {'Reading': 'י֖'},
    'CrossReference': []}]},
 {'Verse': 3,
  'Lemma Info': [{'lemma': '(>)'}],
  'Entry': [{'Type': 'Variant',
    'Witnesses': [['ל18']],
    'Reading Info': {'Reading': '(ס)'},
    'Cros

In [494]:
processed_entries[0]["Entry"][1]['Reading']

'(>)'

In [486]:
processed_entries[14]["Entry"][-6]['Reading']

'"נפשוֹ גׄ מטעיין וסימנהׄ..."'

In [485]:
processed_entries[14]["Entry"][1]['Reading']

'"<...>שו קׄ" + "נפשו קׄ <...>פשם כתׄ וחד מן <...>לין דכתבין <...>וׄ תיבוׄ ק וׄ"'

In [None]:
#now need to process the reading, including sigla [+< <...>] and comments
#also catch special marks in witnesses, like !

In [None]:
################################# old functions

In [121]:
def process_full_entry(text, previous_verse=None):
    lemma, part_entry = split_full_entry(text)
    lemma_dict = lemma_verse_processor(lemma)

    # Use the previous verse if the current verse list is empty
    if not lemma_dict['verses'] and previous_verse is not None:
        lemma_dict['verses'] = previous_verse

    # Split part_entry by '|'
    if '|' in part_entry:
        entry_parts = part_entry.split('|')
    else:
        entry_parts = [part_entry]

    # Initialize a list to hold all processed parts
    processed_parts = []

    # Process each part separately
    for part in entry_parts:
        # Split part by ',' not inside parentheses
        sub_parts = split_on_comma_not_in_parentheses(part)

        # Process each sub-part using process_comma_entry
        processed_sub_parts = [process_comma_entry(sub_part) for sub_part in sub_parts]

        # Concatenate processed sub-parts for each part
        processed_parts.append(processed_sub_parts)

    # Combine processed parts. Assuming you want them as a nested list
    decoded_entries = processed_parts

    # Return the lemma_dict and decoded_entries, along with the verses used for this entry
    return lemma_dict, decoded_entries, lemma_dict['verses']

def split_on_comma_not_in_parentheses(part):
    """
    Splits the string on ',' not inside parentheses.
    """
    sub_parts = []
    current_part = []
    paren_depth = 0  # Track depth of parentheses

    for char in part:
        if char == '(':
            paren_depth += 1
        elif char == ')':
            paren_depth -= 1
        elif char == ',' and paren_depth == 0:
            # At a top-level comma, split here
            sub_parts.append(''.join(current_part))
            current_part = []
            continue

        current_part.append(char)

    # Add the last part if there's any
    if current_part:
        sub_parts.append(''.join(current_part))

    return sub_parts

def split_full_entry(text):
    sliced_entry = text.split(sep=']')
    lemma, entry = sliced_entry
#         print(f"lemma: {lemma}")
#         print(f"entry: {entry}")
    return lemma, entry    

def lemma_verse_processor(text):
    # Simplified approach: first split into digits and lemmas
    # Regex to match the verse numbers at the beginning
    verse_regex = r'^(\d+(?:–\d+)?)\s'
    
    # Extract verses
    verses_match = re.match(verse_regex, text)
    verses = list(map(int, verses_match.group(1).split('–'))) if verses_match else []
    
    # Isolate lemmas part by removing the verses
    lemmas_part = text[len(verses_match.group(0)):].strip() if verses_match else text
    return {
        'verses': verses,
        'lemmas': process_lemma_with_range_and_diacritics(lemmas_part)
    }

# Function to process individual lemmas or ranges, after the split,
lemma_regex = r'(k|q)?\s*([^\d\s]+)(\d?\,?\d?)'#(\d+(?:–\d+)?)\s

def process_lemma_with_range_and_diacritics(lemma):
    # Adjust regex to include diacritical marks and punctuation within Hebrew words
    
    
    # Check for range indicated by "–" and process accordingly
    if "–" in lemma:
        from_lemma, to_lemma = lemma.split("–")
        return {
            'from': process_individual_lemma(from_lemma.strip()),
            'to': process_individual_lemma(to_lemma.strip())
        }

    # Split lemma if there are separate lemmas with "/"
    split_lemmas = re.split(r'\s*/\s*', lemma) if '/' in lemma else [lemma]
    
    processed_lemmas = []
    for split_lemma in split_lemmas:
        processed = process_individual_lemma(split_lemma)
        processed_lemmas.extend(processed)
    
    return processed_lemmas

def process_individual_lemma(individual_lemma):
    matches = re.findall(lemma_regex, individual_lemma)
    processed_lemmas = []
    for match in matches:
        prefix, word, number = match
        lemma_dict = {'lemma': word}
        if prefix: lemma_dict[prefix] = True
        if number: lemma_dict['number'] = (number)
        processed_lemmas.append(lemma_dict)
    return processed_lemmas

# processing functions for sub-units of app_entry, for which there is matching lemma and verse data processed above

def extract_cross_references(text): #extract cross-references
    # Regex to match some Roman numerals: sequences of "I"s followed by an optional "V"
    pattern = r'([I]*[V]?)'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Remove empty matches from the list
    found_numerals = [numeral for numeral in found_numerals if numeral]
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

def parse_witnesses(text):
    pattern = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d.]+)?\)?', re.DOTALL | re.UNICODE)
    parts = re.findall(pattern, text)
    # Filter out empty tuples
    return [part for part in parts if any(part)]

def parse_comma_witnesses(text):
    pattern = re.compile(r'\s?([^\d]*?)?(\d*)?\s?\(?([^\)\d]+)?\)?', re.DOTALL | re.UNICODE)
    parts = re.findall(pattern, text)
    # Filter out empty tuples
    return [part for part in parts if any(part)]

def parse_reading_entry(entry):
    # Refined regex pattern
    pattern = r"""
        \s?(?P<Sigla>[+<>~\.]*)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]*\s?)[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """
    
    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Extracting groups into a dictionary
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

#splitting entry into witnesses and reading (if only one group assign to witnesses)
def witness_reading_splitter(text):
    pattern = re.compile(r'(.*?)?([\+<>~.]*\s?[kq\u0590-\u05FF]+)(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        pattern = re.compile(r'(.*?)([\+<>~])(.*)?', re.DOTALL)
        match = pattern.match(text)
        if match:
            return match.groups()  # Returns a tuple with the three parts
        else:
            return text
        return text  # No divider matching the pattern was found


def process_entry(entry):
    clean_entry, cross_references = extract_cross_references(entry)
    split_entry = witness_reading_splitter(clean_entry)
    if type(split_entry) is tuple:
        witnesses = {'Witnesses': parse_witnesses(split_entry[0])}
        if len(split_entry) == 2:
            reading = parse_reading_entry(split_entry[1])
        else:  # there are 3 groups:
            reading = parse_reading_entry(split_entry[1] + split_entry[2])
    else:
        witnesses = {'Witnesses': parse_witnesses(split_entry)}
        reading = ''
    # Include "Cross References" only if the list is not empty
    result = [witnesses, {"Reading": reading}]
    if cross_references:
        result.append({"Cross References": cross_references})
    return result

def process_comma_entry(entry):
    clean_entry, cross_references = extract_cross_references(entry)
    split_entry = witness_reading_splitter(clean_entry)
    if type(split_entry) is tuple:
        witnesses = {'Witnesses': parse_comma_witnesses(split_entry[0])}
        if len(split_entry) == 2:
            reading = parse_reading_entry(split_entry[1])
        else:  # there are 3 groups:
            reading = parse_reading_entry(split_entry[1] + split_entry[2])
    else:
        witnesses = {'Witnesses': parse_comma_witnesses(split_entry)}
        reading = ''
    # Include "Cross References" only if the list is not empty
    result = [witnesses, {"Reading": reading}]
    if cross_references:
        result.append({"Cross References": cross_references})
    return result


In [None]:
# iterate over full entries. get verse number from previous if needed. also get reading from lemma. and split on | ,

In [None]:
#process lemma:
#split into digits and lemmas. then process each separately

In [738]:
[process_full_entry(example) for example in full_entries][-1]#[0]['verses']


({'verses': [10], 'lemmas': [{'lemma': 'ואסרם'}]},
 [[({'Witnesses': [('', '96', 'pm')]},
    {'Reading': {'Reading': 'יאשרם '}},
    {'Cross References': []})],
  [({'Witnesses': [('', '150', '')]},
    {'Reading': {'Reading': 'על'}},
    {'Cross References': []})]])

In [731]:
# old function, didnt take the splitting into commas consideration
# def process_full_entry(text):
#     lemma, part_entry = split_full_entry(text)
#     lemma_dict = lemma_verse_processor(lemma)
# #     if len(lemma_dict['verses'])==0: #get verse from previous entry
# #         lemma_dict['verses'] = 
        
#     #entry_units = split_entry_units # splits on | and ,
#     #for entry in entry_units:
#     decoded_entry = process_entry(part_entry)
#     return lemma_dict, decoded_entry

In [639]:
# processing functions for sub-units of app_entry, for which there is matching lemma and verse data processed above

def remove_and_list_roman_numerals(text): #extract cross-references
    # Regex to match some Roman numerals: sequences of "I"s followed by an optional "V"
    pattern = r'([I]*[V]?)'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Remove empty matches from the list
    found_numerals = [numeral for numeral in found_numerals if numeral]
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

def custom_split_string(text): #process witnesses
    pattern = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d]+)?\)?', re.DOTALL|re.UNICODE)    
    #pattern = r'([^,\d]*?)?(\d+)\s?(\(([^\)]+)?\)?)?([\skq]?)+'
    parts = re.findall(pattern, text)
    return parts

def parse_reading_entry(entry):
    # Refined regex pattern
    pattern = r"""
        \s?(?P<Sigla>[+<>~]?)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """
    
    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Extracting groups into a dictionary
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

#splitting entry into witnesses and reading (if only one group assign to witnesses)
def split_string(text):
    pattern = re.compile(r'(.*?)?([\+<>]?\s?[kq\u0590-\u05FF]+)(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        pattern = re.compile(r'(.*?)([\+<>])(.*)?', re.DOTALL)
        match = pattern.match(text)
        if match:
            return match.groups()  # Returns a tuple with the three parts
        else:
            return text
        return text  # No divider matching the pattern was found


def process_entry(entry):
    clean_entry, cross_references = remove_and_list_roman_numerals(entry)
    split_entry = split_string(clean_entry)
    if type(split_entry) is tuple:
        witnesses = {'Witnesses': custom_split_string(split_entry[0])}
        if len(split_entry)==2:
            reading = parse_reading_entry(split_entry[1])
        else: #there are 3 groups:
            reading = parse_reading_entry(split_entry[1]+split_entry[2])
    else:
        witnesses = {'Witnesses': custom_split_string(split_entry)}
        reading = ''
    return witnesses, {"Reading":reading}, {"Cross References":cross_references}


# for entry in sample_texts:
#     print(f"entry: {entry}")
#     clean_entry, cross_references = remove_and_list_roman_numerals(entry)
#     split_entry = split_string(clean_entry)
#     if type(split_entry) is tuple:
#         witnesses = {'witnesses': custom_split_string(split_entry[0])}
#         reading = parse_reading_entry(split_entry[1])
#         print(f"witnesses: {witnesses}")
#         print(f"reading: {reading}")
#     else:
#         witnesses = {'witnesses': custom_split_string(split_entry)}
#         print(f"witnesses: {witnesses}")
#     print(f"references: {cross_references}")


In [640]:
sample_texts = [
    "96 (non voc)",
    "30 (pm) 93 (pm) 150 (pm) + סךII IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))",
    "93 (non voc) 96 150 (non voc) + את",
    "30 (pm) >",
    "30 + לי (non voc)I II",
    "30 (pm) >I II IV (similarly b. Pesaḥim 87bmss)",
    "93 (pm) ביהושעIV (similarly PesiqtaR 33 (153b))",
    "96 >I II IV",
    "130 k",
    "G-B Msr 34 k ממני / q ממנוIV",
    "93 כד..",
    "150 ..דברים",
    "G-B Eb 94 ותָעָד (understood as \עוד (rather than \עדי))",
    "30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"
]

In [643]:
process_entry(sample_texts[6])

({'Witnesses': [('', '93', 'pm')]},
 {'Reading': {'Reading': 'ביהושע ',
   'Comment': '(similarly PesiqtaR 33 (153b))'}},
 {'Cross References': ['IV']})

In [638]:
clean_entry, cross_references = remove_and_list_roman_numerals(sample_texts[6])
split_entry = split_string(clean_entry)
if type(split_entry) is tuple:
    witnesses = {'Witnesses': custom_split_string(split_entry[0])}
    if len(split_entry)==2:
        reading = parse_reading_entry(split_entry[1])
    else: #there are 3 groups:
        reading = parse_reading_entry(split_entry[1]+split_entry[2])

reading

{'Reading': 'ביהושע ', 'Comment': '(similarly PesiqtaR 33 (153b))'}

In [462]:
def custom_split_string(text): #process witnesses
    pattern = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d]+)?\)?', re.DOTALL|re.UNICODE)    
    #pattern = r'([^,\d]*?)?(\d+)\s?(\(([^\)]+)?\)?)?([\skq]?)+'
    parts = re.findall(pattern, text)
    return parts
# "96 (non voc)",
# "30 (pm) 93 (pm) 150 (pm)
test_witness = "30 (pm) 93 150 (pm)"
custom_split_string(test_witness)


[('', '30', 'pm'), ('', '93', ''), ('', '150', 'pm')]

In [623]:
#try parsing single entry app, splitting into witnesses and reading (if only one group assign to witnesses)
def split_string(text):
    pattern = re.compile(r'(.*?)?([\+<>]?\s?[kq\u0590-\u05FF]+)(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        pattern = re.compile(r'(.*?)([\+<>])(.*)?', re.DOTALL)
        match = pattern.match(text)
        if match:
            return match.groups()  # Returns a tuple with the three parts
        else:
            return text
        return text  # No divider matching the pattern was found

# Example usage
processed_sample = [split_string(text) for text in sample_texts]

print(processed_sample)


['96 (non voc)', ('30 (pm) 93 (pm) 150 (pm) ', '+ סך', 'II IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))'), ('93 (non voc) 96 150 (non voc) ', '+ את', ''), ('30 (pm) ', '>', ''), ('30 ', '+ לי', ' (non voc)I II'), ('30 (pm) ', '>', 'I II IV (similarly b. Pesaḥim 87bmss)'), ('93 (pm)', ' ביהושע', 'IV (similarly PesiqtaR 33 (153b))'), ('96 ', '>', 'I II IV'), ('130', ' k', ''), ('G-B Msr 34', ' k', ' ממני / q ממנוIV'), ('93', ' כד', '..'), ('150 ..', 'דברים', ''), ('G-B Eb 94', ' ותָעָד', ' (understood as \\עוד (rather than \\עדי))'), ('30 89 (sm) 93 (pm) 150 (non voc) ', '+ כי', 'I II IV')]


In [619]:
#function for parsing the reading+comment (assumes witnesses and cross references have been removed)
def parse_reading_entry(entry):
    # Refined regex pattern
    pattern = r"""
        \s?(?P<Sigla>[+<>~]?)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """
    
    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Extracting groups into a dictionary
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

# Process the sample reading texts with the refined function
parse_reading_entry = [parse_reading_entry(text) for text in sample_reading_texts]

parse_reading_entry


[{'Reading': 'סך ',
  'Comment': '(See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))'},
 {'Sigla': '+', 'Reading': 'את'},
 {'Sigla': '>'},
 {'Sigla': '+', 'Reading': 'לי ', 'Comment': '(non voc)'},
 {'Sigla': '>', 'Comment': '(similarly b. Pesaḥim 87bmss)'},
 {'Reading': 'ביהושע ', 'Comment': '(similarly PesiqtaR 33 (153b))'},
 {},
 {'Reading': 'k'},
 {'Reading': 'k ממני / q ממנו'},
 {'Reading': 'כד..'},
 {'Reading': '..דברים'},
 {'Reading': 'נַחֵם ',
  'Comment': '(taken as infinitive, see Yeivin, Babylonian Vocalization, 1:542)'},
 {'Reading': 'חכֵם ', 'Comment': '(!)'}]

In [587]:
import re

def custom_string_processor(input_string, regex_pattern):
    # Helper function to apply regex and extract groups
    def apply_regex_and_extract(text):
        matches = re.finditer(regex_pattern, text)
        results = []
        for match in matches:
            results.append({
                'witnesses': match.group(1).strip(),
                'reading': match.group(2).strip(),
                'comments': match.group(3).strip() if match.group(3) else ''
            })
        return results

    # Process splits with "|", then ","
    def process_splits(text, delimiter):
        parts = text.split(delimiter)
        processed_parts = []
        for part in parts:
            # Apply regex to each part
            processed = apply_regex_and_extract(part)
            if processed:
                processed_parts.extend(processed)
        return processed_parts

    # Start processing
    processed_result = process_splits(input_string, '|')  # Start with the highest level of split

    return processed_result

# Custom regex pattern as provided
custom_regex = r'^(.*?)([\+<~>]?\s?[\u0590-\u05FF]+.*?)(.*)$'

# Test with the provided sample input
sample_input = "G-B msr. 30 (pm) G-A 89 (sm?) 150 (non voc) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) + שערורהIV II (bla bla (f)) | 150 >"
processed_sample = [custom_string_processor(text, custom_regex) for text in sample_texts]


print(processed_sample)

def split_string(text):
    pattern = re.compile(r'^(.*?)([\+<>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        return None  # No divider matching the pattern was found

# Example usage
# text = "G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV (bla bla (f))"#"30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"#"93 (pm) < ביהושעIV (similarly PesiqtaR 33 (153b))"
# split_parts = split_string(text)
# if split_parts:
#     print("witnesses:", split_parts[0])
#     print("reading:", split_parts[1])
#     print("After dividers:", split_parts[2])
# else:
#     print("No dividers found.")



[[], [{'witnesses': '30 (pm) 93 (pm) 150 (pm)', 'reading': '+ סך', 'comments': 'II IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))'}], [{'witnesses': '93 (non voc) 96 150 (non voc)', 'reading': '+ את', 'comments': ''}], [], [{'witnesses': '30', 'reading': '+ לי', 'comments': '(non voc)I II'}], [], [{'witnesses': '93 (pm)', 'reading': 'ביהושע', 'comments': 'IV (similarly PesiqtaR 33 (153b))'}], [], [], [{'witnesses': 'G-B Msr 34 k', 'reading': 'ממני', 'comments': '/ q ממנוIV'}], [{'witnesses': '93', 'reading': 'כד', 'comments': '..'}], [{'witnesses': '150 ..', 'reading': 'דברים', 'comments': ''}], [{'witnesses': 'G-B Eb 94', 'reading': 'ותָעָד', 'comments': '(understood as \\עוד (rather than \\עדי))'}], [{'witnesses': '30 89 (sm) 93 (pm) 150 (non voc)', 'reading': '+ כי', 'comments': 'I II IV'}]]


In [37]:

def parse_apparatus_entry(entry):
    """Parse an apparatus entry into lemma(s) and content."""
    parts = entry.split(']')
    lemmas_contents = []
    for part in parts:
        if part.strip():
            lemma, content = part.split('[', 1) if '[' in part else (part, '')
            lemmas_contents.append((lemma.strip(), content.strip()))
    return lemmas_contents

def create_tei_document(apparatus_lines):
    """Create a TEI document from apparatus lines."""
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    NSMAP = {"tei": TEI_NAMESPACE}
    
    tei_root = ET.Element(TEI+"TEI", nsmap=NSMAP)
    tei_header = ET.SubElement(tei_root, TEI+"teiHeader")
    text = ET.SubElement(tei_root, TEI+"text")
    body = ET.SubElement(text, TEI+"body")
    current_chapter = None
    last_verse = None
    
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('Chapter'):
            chapter_number = line.split(' ')[1]
            current_chapter = ET.SubElement(body, TEI+"div", type="chapter", n=chapter_number)
            last_verse = None
            continue
        # Use regex to check if the line starts with a verse number and capture it
        match = re.match(r"^(\d+)\s*(.*)", line)
        if match:
            verse_number, entry = match.groups()
            last_verse = verse_number
        else:
            entry = line
            verse_number = last_verse
        
        if current_chapter is not None and verse_number:
            lemmas_contents = parse_apparatus_entry(entry)
            for lemma, content in lemmas_contents:
                app = ET.SubElement(current_chapter, TEI+"app")
                lem = ET.SubElement(app, TEI+"lem", n=verse_number)
                lem.text = lemma
                if content:
                    rdg = ET.SubElement(app, TEI+"rdg")
                    rdg.text = content

    return ET.ElementTree(tei_root)

def save_tei_file(tree, filename):
    """Save the TEI XML tree to a file."""
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml", short_empty_elements=True)



# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


TEI document has been saved to apparatus_tei.xml.


In [47]:
import xml.etree.ElementTree as ET
import re

def create_tei_document(apparatus_lines):
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    
    tei_root = ET.Element(TEI + "TEI", xmlns=TEI_NAMESPACE)
    tei_header = ET.SubElement(tei_root, TEI + "teiHeader")
    text = ET.SubElement(tei_root, TEI + "text")
    body = ET.SubElement(text, TEI + "body")
    current_chapter = None
    last_verse_number = None
    
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('Chapter'):
            chapter_number = line.split(' ')[1].strip()
            current_chapter = ET.SubElement(body, TEI + "div", type="chapter", n=chapter_number)
        else:
            # Attempt to extract verse number and lemma content
            parts = re.match(r"^(\d+)\s*(.*)", line)
            if parts:
                verse_number, remainder = parts.groups()
                last_verse_number = verse_number  # Update last verse number with current
                
                # Further split to separate lemma from variants, if present
                lemma_section, variants_section = remainder.split(']', 1) if ']' in remainder else (remainder, "")
                lemma_section = lemma_section.strip()
                variants_section = variants_section.strip()

                if current_chapter is not None and verse_number:
                    # Create an apparatus entry for the lemma
                    app = ET.SubElement(current_chapter, TEI + "app")
                    lem = ET.SubElement(app, TEI + "lem", n=verse_number)
                    lem.text = lemma_section
                    
                    # Add variant readings if present
                    if variants_section:
                        rdg = ET.SubElement(app, TEI + "rdg")
                        rdg.text = variants_section
            else:
                print(f"Line does not conform to expected format: {line}")

    return ET.ElementTree(tei_root)

def save_tei_file(tree, filename):
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml", short_empty_elements=True)

# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


Line does not conform to expected format: ﻿App III: Hosea
Line does not conform to expected format: יחזקיה] 30 93 (pm) 96 יחזקיהו
Line does not conform to expected format: ירבעם בן] 30 + נבט (non voc)
Line does not conform to expected format: לו] 96 >I  II IV
Line does not conform to expected format: ממלכוּת] 96 ממלכוֹת
Line does not conform to expected format: יזרעאל] 150 ישראל (parall; but 150-Tg: יזרעאל)
Line does not conform to expected format: כי] 93 (pm) + את
Line does not conform to expected format: אוסיף] 150 (pm) >
Line does not conform to expected format: את] 93 (pm) >
Line does not conform to expected format: בסוסים] 96 ובסוסיםI IV
Line does not conform to expected format: אשר2] 96 + לא
Line does not conform to expected format: אחד] 30 (pm) 150 (pm) >
Line does not conform to expected format: והצגתיה] 93 (pm) + כיום ערומה והצגתיה
Line does not conform to expected format: ושתִּה] 150 (pm) ושמתיה
Line does not conform to expected format: כי] 150 (non voc) + כי
Line does not co

In [10]:
import xml.etree.ElementTree as ET
import re


def create_apparatus_entry(verse_number, content, TEI):
    """Create TEI element for an apparatus entry."""
    app = ET.Element(TEI + "app")
    
    # Extract lemma text and the rest (witnesses, variant reading, and comments)
    lemma_text, _, rest = content.partition(']')
    lem = ET.SubElement(app, TEI + "lem")
    lem.text = lemma_text.strip()
    
    # Extract comments
    comments = re.findall(r'\((.*?)\)', rest)
    for comment in comments:
        note = ET.SubElement(app, TEI + "note")
        note.text = comment
    
    # Remove comments from rest for further processing
    rest = re.sub(r'\(.*?\)', '', rest).strip()
    
    # Extract and process witnesses and cross-references
    if rest:
        rdg = ET.SubElement(app, TEI + "rdg")
        witnesses, _, variant_reading = rest.partition(' ')
        if witnesses:
            rdg.set('wit', witnesses.strip())
        if variant_reading:
            rdg.text = variant_reading.strip()
        
        # Extract cross-references, assuming they are indicated by Roman numerals at the start
        cross_refs = re.findall(r'\bI{1,3}V?|\bIV', rest)
        for ref in cross_refs:
            ref_element = ET.SubElement(rdg, TEI + "ref")
            ref_element.set('target', '#' + ref)  # Assuming target IDs are prefixed with '#'
            ref_element.text = "See apparatus entry " + ref
    
    return app

def create_tei_document(apparatus_lines):
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    root = ET.Element(TEI + "TEI", xmlns=TEI_NAMESPACE)
    header = ET.SubElement(root, TEI + "teiHeader")
    text = ET.SubElement(root, TEI + "text")
    body = ET.SubElement(text, TEI + "body")
    div = ET.SubElement(body, TEI + "div")
    
    last_verse_number = None
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        
        # Determine if the line starts with a verse number
        match = re.match(r'^(\d+)', line)
        if match:
            last_verse_number = match.group(1)
            content = line[len(last_verse_number):].strip()
        else:
            content = line
        
        if last_verse_number:
            entry = create_apparatus_entry(last_verse_number, content, TEI)
            div.append(entry)
    
    return ET.ElementTree(root)

def save_tei_file(tree, filename):
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml")


# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


TEI document has been saved to apparatus_tei.xml.


In [232]:
#split entry into witnesses, reading, and comments 

def split_string(text):
    pattern = re.compile(r'^(.*?)([\+<>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        return None  # No divider matching the pattern was found

# Example usage
text = "G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV (bla bla (f))"#"30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"#"93 (pm) < ביהושעIV (similarly PesiqtaR 33 (153b))"
split_parts = split_string(text)
if split_parts:
    print("witnesses:", split_parts[0])
    print("reading:", split_parts[1])
    print("After dividers:", split_parts[2])
else:
    print("No dividers found.")


witnesses: G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) 
reading: > שערורה
After dividers: IV (bla bla (f))


[('G-B msr. ', '30', 'pm'), ('G-A ', '89', 'pm?'), ('', '150', 'non voc'), ('', '30', 'sm'), ('', '89', 'sm'), ('', '93', 'sm'), ('MS-G ', '150', 'pm')]


In [161]:
#parse comments
import re

def remove_and_list_roman_numerals(text):
    # Regex to match some Roman numerals: sequences of "I"s followed by an optional "V"
    pattern = r'([I]*[V]?)'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Remove empty matches from the list
    found_numerals = [numeral for numeral in found_numerals if numeral]
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

# Example usage
text = "II IV (See b. R.HaŠanamss 23b, LamR Buber 1:16 (40b))"
result_text, numerals_found = remove_and_list_roman_numerals(text)
print("Modified text:", result_text.strip())
print("Numerals found:", numerals_found)



Modified text: (See b. R.HaŠanamss 23b, LamR Buber 1:16 (40b))
Numerals found: ['II', 'IV']


In [178]:
def process_entry(entry):
    split_parts = split_string(entry)
    if split_parts is None:
        return None
    
    witnesses, reading, comments = split_parts

    structured_entry = {
        'witnesses': [],
        'reading': reading,
        'comments': '',
        'cross_references': []
    }

    for part in custom_split_string(witnesses):
        # Assuming part[1] contains the witness number and part[0], part[2], part[3] contain additional info
        witness_info = {
            'n': part[1],
            'text': f"{part[0]}{part[1]} {part[2].strip()}{part[3]}"
        }
        structured_entry['witnesses'].append(witness_info)

    comments_text, numerals_found = remove_and_list_roman_numerals(comments)
    structured_entry['comments'] = comments_text
    structured_entry['cross_references'] = numerals_found

    return structured_entry

def create_apparatus_entry(verse_number, content, TEI):
    """Create TEI element for an apparatus entry."""
    TEI_ns = {'tei': TEI}  # Define the namespace dictionary if needed
    app = ET.Element(f"{{{TEI}}}app")  # Using namespace in the tag
    
    # Extract lemma text and the rest (witnesses, variant reading, and comments)
    lemma_text, _, rest = content.partition(']')
    lem = ET.SubElement(app, f"{{{TEI}}}lem")
    lem.text = lemma_text.strip('[] ')

    structured_entry = process_entry(rest)
    if not structured_entry:
        return None

    for witness in structured_entry['witnesses']:
        wit_element = ET.SubElement(app, f"{{{TEI}}}wit", {'n': witness['n']})
        wit_element.text = witness['text']
    
    rdg_element = ET.SubElement(app, f"{{{TEI}}}rdg")
    rdg_element.text = structured_entry['reading']

    if structured_entry['comments']:
        comment_element = ET.SubElement(app, f"{{{TEI}}}note")
        comment_element.text = structured_entry['comments']

    for ref in structured_entry['cross_references']:
        ref_element = ET.SubElement(app, f"{{{TEI}}}ref")
        ref_element.text = ref

    return app


In [181]:
import xml.etree.ElementTree as ET
import re


def create_tei_document(apparatus_lines):
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    ET.register_namespace('', TEI_NAMESPACE)  # Register the default namespace

    # Create the root element without redundantly specifying the xmlns attribute
    root = ET.Element("{%s}TEI" % TEI_NAMESPACE)
    header = ET.SubElement(root, "{%s}teiHeader" % TEI_NAMESPACE)
    text = ET.SubElement(root, "{%s}text" % TEI_NAMESPACE)
    body = ET.SubElement(text, "{%s}body" % TEI_NAMESPACE)
    div = ET.SubElement(body, "{%s}div" % TEI_NAMESPACE)
    
    last_verse_number = None
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        
        # Determine if the line starts with a verse number
        match = re.match(r'^(\d+)', line)
        if match:
            last_verse_number = match.group(1)
            content = line[len(last_verse_number):].strip()
        else:
            content = line
        
        if last_verse_number:
            entry = create_apparatus_entry(last_verse_number, content, TEI_NAMESPACE)
            if entry is not None:  # Ensure entry creation was successful
                div.append(entry)
    
    return ET.ElementTree(root)

def save_tei_file(tree, filename):
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml")


# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


TEI document has been saved to apparatus_tei.xml.


In [227]:
import re

def split_string(text):
    pattern = re.compile(r'^(.*?)([\+<~>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()
    else:
        return None

def custom_split_string(text):
    pattern = re.compile(r'([^,\d]*?)?(\d+)\s?(\([^\)]+\)?)?([\skq]*)?', re.DOTALL|re.UNICODE)
    parts = re.findall(pattern, text)
    return parts

def remove_and_list_roman_numerals(text):
    pattern = r'([I]*[V]?)'
    found_numerals = re.findall(pattern, text)
    found_numerals = [numeral for numeral in found_numerals if numeral]
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

# def process_entry(entry):
#     split_parts = split_string(entry)
#     if split_parts is None:
#         return "Unable to process entry: No valid dividers found."
    
#     witnesses, reading, comments = split_parts

#     witness_entries = []
#     for part in custom_split_string(witnesses):
#         witness_entry = f'<witness n="{part[1]}">{part[0]}{part[1]} {part[2].strip()}{part[3]}</witness>'
#         witness_entries.append(witness_entry)
#     witnesses_tagged = "\n".join(witness_entries)

#     reading_tagged = f'<reading>{reading}</reading>'

#     comments_text, numerals_found = remove_and_list_roman_numerals(comments)
#     comments_tagged = f'<comment>{comments_text}</comment>'
#     cross_references = "\n".join([f'<ref>{numeral}</ref>' for numeral in numerals_found])

#     # Combine all parts, placing cross_references outside the comment
#     tei_entry = f"{witnesses_tagged}\n{reading_tagged}\n{comments_tagged}\n{cross_references}"
#     return tei_entry

# Example usage
entry ="G-B msr. 30 (pm) G-A 89 (pm?) 150 (non voc) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV II (bla bla (f))"
processed_entry = process_entry(entry)
print(processed_entry)


{'witnesses': [{'n': '30', 'text': 'G-B msr. 30 (pm) '}, {'n': '89', 'text': 'G-A 89 (pm?) '}, {'n': '150', 'text': '150 (non voc) k'}, {'n': '30', 'text': ' 30 (sm) '}, {'n': '89', 'text': '89 (sm) '}, {'n': '93', 'text': '93 (sm) '}, {'n': '96', 'text': '96 '}, {'n': '150', 'text': '150 (pm) q'}, {'n': '93', 'text': ' 93 (pm) '}], 'reading': '> שערורה', 'comments': '  (bla bla (f))', 'cross_references': ['IV', 'II']}


In [60]:
rest = "93 (pm) + ביהושעIV (similarly PesiqtaR 33 (153b))"
rest

'93 (pm) + ביהושעIV (similarly PesiqtaR 33 (153b))'

In [None]:
#### 

In [None]:
####### old stuff

In [41]:

def read_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def strip_non_hebrew(word):
    normalized_word = unicodedata.normalize('NFD', word)
    stripped_word = ''.join(re.findall(r'[\u05D0-\u05EA]', normalized_word))
    return unicodedata.normalize('NFC', stripped_word)

def process_word(token, verse_id, word_id, parent_element):
    parts = token.split('־')
    pe_count = 1  # Counter for 'פ' tags

    for part in parts:
        w = ET.SubElement(parent_element, 'w', id=f'verse{verse_id}_word{word_id}')

        alphabetic = strip_non_hebrew(part)
        non_alphabetic = ''.join(re.findall(r'[^\u05D0-\u05EA]', part))

        original = ET.SubElement(w, 'original')
        original.text = part
        stripped = ET.SubElement(w, 'stripped')
        stripped.text = alphabetic
        punctuation = ET.SubElement(w, 'punctuation')
        punctuation.text = non_alphabetic

        if "פ" in part:
            pe_tag = ET.SubElement(w, 'pe', id=f'verse{verse_id}_pe{pe_count}')
            pe_tag.text = "פ"
            pe_count += 1
        
        word_id += 1
    return word_id

def encode_tei_hebrew_word_details_enhanced(file_path, output_file):
    text = read_text_from_file(file_path)
    TEI = ET.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    text_element = ET.SubElement(TEI, 'text')
    body = ET.SubElement(text_element, 'body')

    chapter_id = 1
    verse_id = 1

    chapters = text.split('פרק')
    for chapter in chapters[1:]:
        div = ET.SubElement(body, 'div', type='chapter', id=f'chapter{chapter_id}')
        chapter_id += 1

        verses = re.split(r'(\[\פ\]|:)', chapter)
        for verse in verses:
            if verse.strip() and verse not in ['[פ]', ':']:
                p = ET.SubElement(div, 'p', type='verse', id=f'verse{verse_id}')
                word_id = 1

                tokens = verse.strip().split()
                for token in tokens:
                    word_id = process_word(token, verse_id, word_id, p)

                verse_id += 1

    tree = ET.ElementTree(TEI)
    with open(output_file, "w", encoding="utf-8") as f:
        tree.write(f, encoding="unicode")

# Specify the file paths
file_path = 'file.txt'  # Replace with your input file path
output_file = 'tei_hebrew_output_enhanced.xml'  # Replace with your output file path

# Run the function
encode_tei_hebrew_word_details_enhanced(file_path, output_file)
output_file

'tei_hebrew_output_enhanced.xml'

In [None]:
[' ', '"', '$', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', 'E', 'I', 'T', '_', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '֑', '֔', '֕', '֖', '֗', '֙', '֛', '֜', '֞', '֣', '֤', '֥', '֨', '֩', 'ְ', 'ֱ', 'ֲ', 'ִ', 'ֵ', 'ֶ', 'ַ', 'ָ', 'ֹ', 'ֻ', 'ּ', 'ֽ', '׀', 'ׁ', 'ׂ']

In [51]:
def extract_consecutive_non_hebrew_groups(file_path):
    text = read_text_from_file(file_path)
    non_hebrew_groups = set()

    # Using a regular expression to find sequences of non-Hebrew characters
    pattern = re.compile(r'([^\u05D0-\u05EA]{,2})')
    matches = pattern.findall(unicodedata.normalize('NFD', text))

    for match in matches:
        non_hebrew_groups.add(match.strip())

    return non_hebrew_groups

# Extract and print groups of consecutive non-Hebrew characters
file_path = 'file.txt'

consecutive_non_hebrew_groups = extract_consecutive_non_hebrew_groups(file_path)
print(sorted(consecutive_non_hebrew_groups))



['', '$', '$1', '$2', '$4', '2', ':', '[', ']', '֑', '֔', '֕', '֖', '֗', '֙', '֜', '֣', '֤', '֥', '֥$', '֨', '֩', 'ְ', 'ְ$', 'ְ֙', 'ְּ', 'ְׁ', 'ְׂ', 'ֱ', 'ֲ', 'ִ', 'ִ$', 'ִ֔', 'ִ֖', 'ִ֜', 'ִ֨', 'ִּ', 'ִֽ', 'ִׁ', 'ֵ', 'ֵ$', 'ֵ֔', 'ֵ֖', 'ֵ֗', 'ֵ֛', 'ֵ֣', 'ֵ֤', 'ֵ֨', 'ֵּ', 'ֵֽ', 'ֵׁ', 'ֶ', 'ֶ֑', 'ֶ֙', 'ֶ֣', 'ֶ֤', 'ֶ֥', 'ֶּ', 'ֶֽ', 'ֶׁ', 'ַ', 'ַ֗', 'ַ֙', 'ַּ', 'ַׁ', 'ָ', 'ָ֑', 'ָ֔', 'ָ֖', 'ָ֗', 'ָ֛', 'ָ֜', 'ָ֞', 'ָ֣', 'ָ֥', 'ָ֨', 'ָּ', 'ָֽ', 'ֹ', 'ֹ֖', 'ֹ֣', 'ֹ֤', 'ֹ֨', 'ֹּ', 'ֹׂ', 'ֻ', 'ּ', 'ּ֣', 'ֽ', '־', '־$', '׀', 'ׁ', 'ׂ֖', '\ufeff']
