In [1]:
import xml.etree.ElementTree as ET
import re
import unicodedata
from docx import Document
from lxml import etree
import pandas as pd
import os
from win32com import client


In [None]:
def convert_doc_to_docx(doc_path):
    """Convert a legacy Word ``.doc`` file to ``.docx`` through Word's COM automation.

    Args:
        doc_path: Path (relative or absolute) of the source document.

    Returns:
        Absolute path of the ``.docx`` file written next to the source.
    """
    absolute_doc_path = os.path.abspath(doc_path)
    # Swap only the extension. str.replace(".doc", ".docx") would also rewrite
    # ".doc" inside directory names and turn "x.docx" into "x.docxx".
    doc_path_new = os.path.splitext(absolute_doc_path)[0] + ".docx"

    word = client.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(absolute_doc_path)
        try:
            doc.SaveAs2(doc_path_new, FileFormat=16)  # FileFormat=16 is for docx
        finally:
            doc.Close()
    finally:
        # Always shut Word down, even when opening or saving raised.
        word.Quit()
    return doc_path_new

# Example usage: convert the legacy .doc file and keep the path of the resulting .docx.
docx_path = convert_doc_to_docx('Hosea.2.doc')

def convert_docx_to_txt(docx_file_path, txt_file_path):
    """Write the paragraph text of a .docx file to a UTF-8 text file.

    Args:
        docx_file_path: Path of the .docx document to read.
        txt_file_path: Path of the text file to create (overwritten if present).

    Only ``doc.paragraphs`` is read; each paragraph becomes one line of output.
    """
    source_document = Document(docx_file_path)

    # Collect the text of every paragraph, in document order.
    paragraph_texts = []
    for para in source_document.paragraphs:
        paragraph_texts.append(para.text)
    joined_text = '\n'.join(paragraph_texts)

    with open(txt_file_path, 'w', encoding='utf-8') as out_handle:
        out_handle.write(joined_text)

    print(f"File converted successfully and saved as '{txt_file_path}'")

# Example usage: pass on the path returned by convert_doc_to_docx above, so both
# steps operate on the same file regardless of the current working directory.
convert_docx_to_txt(docx_path, 'output3.txt')


In [4]:
# 'utf-8-sig' drops the byte-order mark that plain 'utf-8' leaves glued to the
# first line (it showed up as '\ufeffApp III: Hosea\n').
with open('01 Hosea App III - מתוקן.txt', 'r', encoding='utf-8-sig') as file:
    lines = file.readlines()
    # NOTE(review): `txt` refers to a handle that is closed as soon as this block
    # exits, so it cannot be read from later. The name is kept only in case cells
    # outside this view reference it; use `lines` for the content.
    txt = file
lines

['\ufeffApp III: Hosea\n',
 '\n',
 'Chapter 1\n',
 '1 עזיה] 30 עזיהו\n',
 'יחזקיה] 30 93 (pm) 96 יחזקיהו\n',
 'ירבעם בן] 30 + נבט (non voc)\n',
 '2 בהושע] 93 (pm) ביהושעIV (similarly PesiqtaR 33 (153b))\n',
 '3 ותלד] 150 (pm) + בן\n',
 ' לו] 96 >I  II IV\n',
 '4 ופקדתי] 30 + על (non voc)\n',
 'ממלכוּת] 96 ממלכוֹת\n',
 '5 את] 30 >\n',
 'יזרעאל] 150 ישראל (parall; but 150-Tg: יזרעאל)\n',
 '6 עוד1] 30 (pm) >I II IV (similarly b. Pesaḥim 87bmss)\n',
 'כי] 93 (pm) + את\n',
 'אוסיף] 150 (pm) >\n',
 'את] 93 (pm) >\n',
 '7 והושעתים] 93 (pm) והושעתם\n',
 'בסוסים] 96 ובסוסיםI IV\n',
 '8 רֻחמה] 30 לח..\n',
 '9 ואנכי] 150 (pm) אנכי | 93 ואני\n',
 '\n',
 'Chapter 2\n',
 '1 אשר1] 150 + יאמר (non voc)\n',
 'אשר2] 96 + לא\n',
 '2 להם] 30 (pm) לכם\n',
 'אחד] 30 (pm) 150 (pm) >\n',
 '4 ונאפופיה] 150 (pm) ונאפו\n',
 '5 ערֻמה] 96 ערוֹמה \n',
 'והצגתיה] 93 (pm) + כיום ערומה והצגתיה \n',
 'ושתִּה] 150 (pm) ושמתיה\n',
 '7 אלכה] 30 + ואשיבה אל איש (non voc)\n',
 '8 שָׂך] 30 (pm) 93 (pm) 150 (pm) סךII IV (See 

In [None]:
########## render apparatus 3 ##############

In [None]:
# New parsing logic
# 1. Split each line at ']' into the lemma part and the apparatus entry; process each separately.
# Lemma part: split into a verse number (or verse range) and a lemma (or lemma range).
#   Verse numbers look like "7" or "7–8"; a lemma may carry a k/q marker and a trailing word number (e.g. "עוד1").
# Apparatus entry: split into witnesses (mss.), reading, comments, and cross-references.
# 


In [8]:
#processing functions, including chapter information
def process_full_entry(text, chapter, previous_verse=None):
    """Parse one apparatus line into lemma information and decoded variants.

    Args:
        text: A full line of the form ``<lemma part>] <apparatus entry>``.
        chapter: Chapter value stored unchanged in the lemma dictionary.
        previous_verse: Verse value of the preceding entry; used when this
            line carries no verse number of its own.

    Returns:
        A 3-tuple ``(lemma_dict, decoded_entries, verses)``. ``decoded_entries``
        is a nested list: one inner list per '|'-separated part, holding one
        ``process_comma_entry`` result per top-level comma-separated piece.
        ``verses`` is the verse value finally stored in ``lemma_dict``.
    """
    lemma_text, apparatus_text = split_full_entry(text)
    lemma_info = lemma_verse_processor(lemma_text, chapter)

    # Inherit the verse from the previous entry when none was parsed here.
    if previous_verse is not None and not lemma_info['verses']:
        lemma_info['verses'] = previous_verse

    # str.split returns a single-element list when there is no '|', so no
    # special case is needed for entries without alternatives.
    decoded_entries = [
        [process_comma_entry(piece) for piece in split_on_comma_not_in_parentheses(segment)]
        for segment in apparatus_text.split('|')
    ]

    return lemma_info, decoded_entries, lemma_info['verses']

def split_on_comma_not_in_parentheses(part):
    """
    Cut ``part`` at every comma that sits outside parentheses.

    Commas at a non-zero parenthesis depth are kept as ordinary characters.
    An empty trailing piece is dropped; empty pieces elsewhere are kept.
    """
    pieces = []
    depth = 0          # current parenthesis nesting level
    piece_start = 0    # index where the piece being scanned begins

    for position, symbol in enumerate(part):
        if symbol == ',' and depth == 0:
            # Top-level comma: close the current piece and start a new one after it.
            pieces.append(part[piece_start:position])
            piece_start = position + 1
        elif symbol == '(':
            depth += 1
        elif symbol == ')':
            depth -= 1

    remainder = part[piece_start:]
    if remainder:
        pieces.append(remainder)

    return pieces

def split_full_entry(text):
    """Split an apparatus line into its lemma part and its entry part.

    The split happens at the FIRST ']' only, so a ']' that occurs later in
    the entry (for example inside a comment) no longer breaks the unpacking.

    Args:
        text: A line of the form ``<lemma part>] <apparatus entry>``.

    Returns:
        ``(lemma, entry)``: the text before and after the first ']'.

    Raises:
        ValueError: if ``text`` contains no ']' at all.
    """
    lemma, bracket, entry = text.partition(']')
    if not bracket:
        raise ValueError(f"apparatus entry has no ']' lemma delimiter: {text!r}")
    return lemma, entry

# def lemma_verse_processor(text, chapter):
#     # Simplified approach: first split into digits and lemmas
#     # Regex to match the verse numbers at the beginning
#     verse_regex = r'^(\d+(?:–\d+)?)\s'
    
#     # Extract verses
#     verses_match = re.match(verse_regex, text)
#     verses = list(map(int, verses_match.group(1).split('–'))) if verses_match else []
    
#     # Isolate lemmas part by removing the verses
#     lemmas_part = text[len(verses_match.group(0)):].strip() if verses_match else text
#     return {
#         'chapter': chapter,
#         'verses': verses,
#         'lemmas': process_lemma_with_range_and_diacritics(lemmas_part)
#     }

def lemma_verse_processor(text, chapter):
    """Separate the leading verse number (or range) from the lemma text.

    Args:
        text: The lemma part of an entry, e.g. ``"7–8 word – word"``.
        chapter: Value stored unchanged under the 'chapter' key.

    Returns:
        A dict with keys 'chapter', 'verses' and 'lemmas'. 'verses' is an int
        for a single verse, ``{'from': int, 'to': int}`` for an en-dash range
        with two different bounds, or None when the text has no leading
        number. 'lemmas' is the result of
        ``process_lemma_with_range_and_diacritics`` on the remaining text.
    """
    # Digits, optionally followed by an en dash and more digits, then whitespace.
    verse_prefix = re.match(r'^(\d+(?:–\d+)?)\s', text)

    if verse_prefix is None:
        verses = None
        remaining_text = text
    else:
        bounds = verse_prefix.group(1).split('–')
        spans_two_verses = len(bounds) == 2 and bounds[0] != bounds[1]
        if spans_two_verses:
            verses = {'from': int(bounds[0]), 'to': int(bounds[1])}
        else:
            verses = int(bounds[0])
        # Everything after the matched prefix is lemma text.
        remaining_text = text[len(verse_prefix.group(0)):].strip()

    return {
        'chapter': chapter,
        'verses': verses,
        'lemmas': process_lemma_with_range_and_diacritics(remaining_text)
    }


# Pattern for one lemma token inside the lemma part (used by process_individual_lemma):
#   group 1: optional 'k' or 'q' marker (presumably ketiv/qere — confirm)
#   group 2: the word itself, a run of characters that are neither digits nor whitespace
#   group 3: optional trailing number: up to one digit, an optional comma, up to one digit
lemma_regex = r'(k|q)?\s*([^\d\s]+)(\d?\,?\d?)'#(\d+(?:–\d+)?)\s

def process_lemma_with_range_and_diacritics(lemma):
    """Parse the lemma text of an entry into lemma dictionaries.

    Args:
        lemma: Lemma text with the verse number already removed.

    Returns:
        For a range (text containing an en dash): ``{'from': [...], 'to': [...]}``
        with the parsed lemmas on each side of the dash. Otherwise a flat list
        with the parsed lemmas of every '/'-separated alternative.

    Raises:
        ValueError: if the text contains more than one en dash (the two-name
            unpacking below requires exactly two sides).
    """
    if "–" in lemma:
        range_start, range_end = lemma.split("–")
        start_lemmas = process_individual_lemma(range_start.strip())
        end_lemmas = process_individual_lemma(range_end.strip())
        return {'from': start_lemmas, 'to': end_lemmas}

    # re.split yields the whole string as a single item when there is no '/'.
    alternatives = re.split(r'\s*/\s*', lemma)
    return [
        parsed
        for alternative in alternatives
        for parsed in process_individual_lemma(alternative)
    ]

def process_individual_lemma(individual_lemma):
    """Turn every ``lemma_regex`` match in the text into a lemma dictionary.

    Each dictionary has a 'lemma' key with the matched word; a matched 'k' or
    'q' marker is added as ``{marker: True}`` and a non-empty trailing number
    is stored (as a string) under 'number'.
    """
    parsed = []
    for marker, word, number in re.findall(lemma_regex, individual_lemma):
        info = {'lemma': word}
        if marker:
            info[marker] = True
        if number:
            info['number'] = number
        parsed.append(info)
    return parsed

# processing functions for sub-units of app_entry, for which there is matching lemma and verse data processed above

def extract_cross_references(text):
    """Pull the Roman-numeral cross-references out of an entry.

    Returns:
        ``(remaining_text, numerals)``: the text with every matched numeral
        removed, and the list of matched numerals in order of appearance.

    NOTE(review): the pattern matches any run of capital "I" followed by an
    optional "V", wherever it occurs, so capital I/V letters inside ordinary
    Latin words are extracted and removed as well.
    """
    numeral_re = re.compile(r'([I]*[V]?)')
    # The pattern can match the empty string; keep only real numerals.
    numerals = [hit for hit in numeral_re.findall(text) if hit]
    remaining_text = numeral_re.sub('', text)
    return remaining_text, numerals

def parse_witnesses(text):
    """Extract witness triples from the witness portion of an entry.

    Each triple is ``(prefix, number, note)``: the non-digit text before a
    number, the number itself, and the optionally parenthesised text after it
    (which may contain no digit, ')' or '.'). Missing parts are ''.
    """
    witness_re = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d.]+)?\)?', re.DOTALL | re.UNICODE)
    # Drop tuples whose groups are all empty.
    return [groups for groups in witness_re.findall(text) if any(groups)]

def post_process_witnesses(witnesses):
    """Move collection prefixes that were parsed as a note onto the next witness.

    When the third item of a witness triple is one of the known collection
    prefixes, it is blanked on that witness and prepended to the first item of
    the following witness. The last witness is never inspected.

    The list is modified in place and also returned.
    """
    collection_prefixes = {"G-B Ea ","G-B Eb ", "G-B Kb ", "G-B Msr "}

    for position in range(1, len(witnesses)):
        prefix, number, note = witnesses[position - 1]
        if note not in collection_prefixes:
            continue
        # The "note" is really the collection of the next witness: shift it over.
        witnesses[position - 1] = (prefix, number, '')
        following_prefix, following_number, following_note = witnesses[position]
        witnesses[position] = (note + following_prefix, following_number, following_note)

    # Drop a leading triple that is completely empty.
    if witnesses and witnesses[0] == ('', '', ''):
        del witnesses[0]

    return witnesses

def parse_comma_witnesses(text):
    """Extract witness triples, also accepting witnesses without a number.

    Same ``(prefix, number, note)`` shape as ``parse_witnesses``, but the
    number is optional here, so a bare "(sm)" yields ``('', '', 'sm')``.
    """
    witness_re = re.compile(r'\s?([^\d]*?)?(\d*)?\s?\(?([^\)\d]+)?\)?', re.DOTALL | re.UNICODE)
    # Every part of the pattern is optional, so empty matches occur; discard them.
    return [groups for groups in witness_re.findall(text) if any(groups)]

def parse_reading_entry(entry):
    """Split the reading portion of an entry into sigla, reading and comment.

    Args:
        entry: Text that starts at the reading, e.g. ``"+ <Hebrew> (non voc)"``.

    Returns:
        A dict holding only the non-empty parts, under the keys 'Sigla',
        'Reading' and 'GeneralComment'. Every part of the pattern is optional,
        so an input with none of them yields an empty dict.
    """
    # Raw string: the regex escapes (\s, \., \() are invalid escape sequences
    # in a normal string literal and trigger warnings on current Python.
    # The re module itself understands the \uXXXX escapes.
    pattern = r"""
        \s?(?P<Sigla>[+<>~\.]*)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]*\s?)[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        \s*
        (?P<GeneralComment>\(.*\))?                     # Captures comments
    """

    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Keep only the groups that actually captured something.
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

#splitting entry into witnesses and reading (if only one group assign to witnesses)
def witness_reading_splitter(text):
    """Split an entry into witnesses, the start of the reading, and the rest.

    Returns:
        A 3-tuple ``(before, divider, after)`` when a divider is found: first a
        run of k/q/Hebrew letters with optional leading sigla, and failing
        that a single sigla character. When neither is present the input
        string is returned unchanged (a str, not a tuple).
    """
    letter_led = re.match(r'(.*?)?([\+<>~.]*\s?[kq\u0590-\u05FF]+)(.*)?', text, re.DOTALL)
    if letter_led:
        return letter_led.groups()

    # No letters to anchor on: fall back to a bare sigla character.
    sigla_only = re.match(r'(.*?)([\+<>~])(.*)?', text, re.DOTALL)
    return sigla_only.groups() if sigla_only else text


def process_entry(entry):
    """Decode one apparatus entry into witnesses, reading and cross-references.

    Returns:
        A list ``[{'Witnesses': [...]}, {'Rdg': ...}]``, followed by
        ``{'Cross References': [...]}`` only when numerals were found. 'Rdg' is
        '' when the entry could not be split into witnesses and reading.
    """
    stripped_entry, references = extract_cross_references(entry)
    pieces = witness_reading_splitter(stripped_entry)

    if type(pieces) is not tuple:
        # No divider found: the whole text is treated as witnesses.
        witness_block = {'Witnesses': parse_witnesses(pieces)}
        rdg = ''
    else:
        witness_block = {'Witnesses': parse_witnesses(pieces[0])}
        if len(pieces) == 2:
            rdg = parse_reading_entry(pieces[1])
        else:
            # Three groups: the divider and the remainder together form the reading.
            rdg = parse_reading_entry(pieces[1] + pieces[2])

    decoded = [witness_block, {"Rdg": rdg}]
    if references:
        decoded.append({"Cross References": references})
    return decoded

def process_comma_entry(entry):
    """Decode one comma-separated piece of an apparatus entry.

    Works like ``process_entry`` but parses witnesses with
    ``parse_comma_witnesses``, which accepts witnesses without a number.

    Returns:
        A list ``[{'Witnesses': [...]}, {'Rdg': ...}]``, followed by
        ``{'Cross References': [...]}`` only when numerals were found. 'Rdg' is
        '' when the piece could not be split into witnesses and reading.
    """
    stripped_entry, references = extract_cross_references(entry)
    pieces = witness_reading_splitter(stripped_entry)

    if type(pieces) is not tuple:
        # No divider found: the whole text is treated as witnesses.
        witness_block = {'Witnesses': parse_comma_witnesses(pieces)}
        rdg = ''
    else:
        witness_block = {'Witnesses': parse_comma_witnesses(pieces[0])}
        if len(pieces) == 2:
            rdg = parse_reading_entry(pieces[1])
        else:
            # Three groups: the divider and the remainder together form the reading.
            rdg = parse_reading_entry(pieces[1] + pieces[2])

    decoded = [witness_block, {"Rdg": rdg}]
    if references:
        decoded.append({"Cross References": references})
    return decoded


In [9]:
##### 
def compile_and_display_entries(entries):
    """Parse a sequence of apparatus lines, print a readable dump, and return the results.

    Args:
        entries: Iterable of text lines. A line containing "Chapter <n>" sets
            the current chapter; blank lines and digit-only lines are skipped;
            every other line is parsed with process_full_entry.

    Returns:
        A list with one dict per parsed line, with the keys 'Entry' (the
        stripped line), 'Lemma Info' (dict from process_full_entry) and
        'Decoded Entries' (list of {'Type': ..., 'Details': [...]} dicts).
    """
    previous_verse = None
    current_chapter = None
    compiled_results = []
    
    for entry in entries:
        chapter_match = re.search(r'Chapter (\d+)', entry)
        if chapter_match:
            # Chapter heading: remember the number (kept as a string) and move on.
            # NOTE(review): previous_verse is not reset here, so a first entry
            # without its own verse number would inherit the last verse of the
            # previous chapter — confirm this cannot occur in the input.
            current_chapter = chapter_match.group(1)
            continue
        elif entry.strip() and not entry.strip().isdigit():
            clean_entry = entry.strip()
            lemma_dict, decoded_entries, used_verse = process_full_entry(clean_entry, current_chapter, previous_verse)
            # Carry the verse forward for following lines that omit it.
            previous_verse = used_verse

            # Combine the lemma_dict and decoded_entries for display
            entry_result = {
                #'Chapter': current_chapter,
                'Entry': clean_entry,
                'Lemma Info': lemma_dict,
                'Decoded Entries': []
            }
            # Last witness number seen in this entry; shared across all of its
            # '|' parts and comma pieces, so the statement order below matters.
            last_witness_second_group = None
            for i, decoded_entries_list in enumerate(decoded_entries):
                for j, decoded_entry in enumerate(decoded_entries_list):
                    # The first comma piece of each '|' part is the main variant.
                    variant_type = "Variant" if j == 0 else "Related Variant"
                    variant_dict = {'Type': variant_type, 'Details': []}
                    for item in decoded_entry:
                        if isinstance(item, dict) and 'Witnesses' in item:
                            # Update witnesses with last non-empty second group if needed
                            updated_witnesses = []
                            for witness in item['Witnesses']:
                                if witness[1] == '' and last_witness_second_group:
                                    # Witness without a number: reuse the last number seen.
                                    updated_witnesses.append((witness[0], last_witness_second_group, witness[2]))
                                else:
                                    updated_witnesses.append(witness)
                                    if witness[1]:
                                        last_witness_second_group = witness[1]
                            # Apply post-processing to witnesses
                            item['Witnesses'] = post_process_witnesses(updated_witnesses)
                        variant_dict['Details'].append(item)
                    entry_result['Decoded Entries'].append(variant_dict)
            compiled_results.append(entry_result)

    # Display the results in a structured format
    for result in compiled_results:
        print(f"Entry:")
        print(f"{result['Entry']}")
        print("Lemma Info:")
        for key, value in result['Lemma Info'].items():
            print(f"  {key}: {value}")
        print("Decoded Entries:")
        for variant in result['Decoded Entries']:
            print(f"  {variant['Type']}:")
            for detail in variant['Details']:
                if isinstance(detail, dict):
                    for key, value in detail.items():
                        print(f"    {key}: {value}")
                else:
                    print(f"    {detail}")
        print("-" * 50)  # Separator line
    
    return compiled_results

# Example usage: parse the loaded file, leaving out its first two and last seven lines.
# NOTE(review): the slice bounds are hard-coded; presumably they drop the title/blank
# header and trailing non-entry lines of this particular file — confirm before reuse.
test = compile_and_display_entries(lines[2:-7]) # notice
test

Entry:
1 עזיה] 30 עזיהו
Lemma Info:
  chapter: 1
  verses: 1
  lemmas: [{'lemma': 'עזיה'}]
Decoded Entries:
  Variant:
    Witnesses: [('', '30', '')]
    Rdg: {'Reading': 'עזיהו'}
--------------------------------------------------
Entry:
יחזקיה] 30 93 (pm) 96 יחזקיהו
Lemma Info:
  chapter: 1
  verses: 1
  lemmas: [{'lemma': 'יחזקיה'}]
Decoded Entries:
  Variant:
    Witnesses: [('', '30', ''), ('', '93', 'pm'), ('', '96', '')]
    Rdg: {'Reading': 'יחזקיהו'}
--------------------------------------------------
Entry:
ירבעם בן] 30 + נבט (non voc)
Lemma Info:
  chapter: 1
  verses: 1
  lemmas: [{'lemma': 'ירבעם'}, {'lemma': 'בן'}]
Decoded Entries:
  Variant:
    Witnesses: [('', '30', '')]
    Rdg: {'Sigla': '+', 'Reading': 'נבט ', 'GeneralComment': '(non voc)'}
--------------------------------------------------
Entry:
2 בהושע] 93 (pm) ביהושעIV (similarly PesiqtaR 33 (153b))
Lemma Info:
  chapter: 1
  verses: 2
  lemmas: [{'lemma': 'בהושע'}]
Decoded Entries:
  Variant:
    Witnesses: [(''

Lemma Info:
  chapter: 8
  verses: 11
  lemmas: [{'lemma': 'כי'}]
Decoded Entries:
  Variant:
    Witnesses: [('', '89', 'pm')]
    Rdg: {'Sigla': '>'}
--------------------------------------------------
Entry:
אפרים] 30 (pm) ישראל
Lemma Info:
  chapter: 8
  verses: 11
  lemmas: [{'lemma': 'אפרים'}]
Decoded Entries:
  Variant:
    Witnesses: [('', '30', 'pm')]
    Rdg: {'Reading': 'ישראל'}
--------------------------------------------------
Entry:
לו] 93 (pm) לי
Lemma Info:
  chapter: 8
  verses: 11
  lemmas: [{'lemma': 'לו'}]
Decoded Entries:
  Variant:
    Witnesses: [('', '93', 'pm')]
    Rdg: {'Reading': 'לי'}
--------------------------------------------------
Entry:
12 k אכתוב / q אכתב] 93 96 150 (sm) k, 30 89 150 (pm) q
Lemma Info:
  chapter: 8
  verses: 12
  lemmas: [{'lemma': 'אכתוב', 'k': True}, {'lemma': 'אכתב', 'q': True}]
Decoded Entries:
  Variant:
    Witnesses: [('', '93', ''), ('', '96', ''), ('', '150', 'sm')]
    Rdg: {'Reading': 'k'}
  Related Variant:
    Witnesses: [

[{'Entry': '1 עזיה] 30 עזיהו',
  'Lemma Info': {'chapter': '1', 'verses': 1, 'lemmas': [{'lemma': 'עזיה'}]},
  'Decoded Entries': [{'Type': 'Variant',
    'Details': [{'Witnesses': [('', '30', '')]},
     {'Rdg': {'Reading': 'עזיהו'}}]}]},
 {'Entry': 'יחזקיה] 30 93 (pm) 96 יחזקיהו',
  'Lemma Info': {'chapter': '1', 'verses': 1, 'lemmas': [{'lemma': 'יחזקיה'}]},
  'Decoded Entries': [{'Type': 'Variant',
    'Details': [{'Witnesses': [('', '30', ''),
       ('', '93', 'pm'),
       ('', '96', '')]},
     {'Rdg': {'Reading': 'יחזקיהו'}}]}]},
 {'Entry': 'ירבעם בן] 30 + נבט (non voc)',
  'Lemma Info': {'chapter': '1',
   'verses': 1,
   'lemmas': [{'lemma': 'ירבעם'}, {'lemma': 'בן'}]},
  'Decoded Entries': [{'Type': 'Variant',
    'Details': [{'Witnesses': [('', '30', '')]},
     {'Rdg': {'Sigla': '+',
       'Reading': 'נבט ',
       'GeneralComment': '(non voc)'}}]}]},
 {'Entry': '2 בהושע] 93 (pm) ביהושעIV (similarly PesiqtaR 33 (153b))',
  'Lemma Info': {'chapter': '1', 'verses': 2, 'lem

In [10]:
def _append_lemma_element(parent, lemma):
    """Append one <Lemma> element (Text plus optional Number/K/Q children) to parent."""
    lemma_element = ET.SubElement(parent, "Lemma")
    ET.SubElement(lemma_element, "Text").text = lemma.get("lemma", "")
    for key, tag in (("number", "Number"), ("k", "K"), ("q", "Q")):
        if key in lemma:
            ET.SubElement(lemma_element, tag).text = str(lemma[key])
    return lemma_element


def list_to_xml(data, output_file):
    """Serialise the compiled entry dictionaries to an XML file.

    Args:
        data: List of dicts with the keys 'Entry', 'Lemma Info' and
            'Decoded Entries', as returned by compile_and_display_entries.
        output_file: Path of the XML file to write (UTF-8, with declaration).

    Lemmas are written the same way whether they come as a flat list or as a
    ``{'from': [...], 'to': [...]}`` range; in particular a lemma's 'number'
    is now emitted for range lemmas too (it used to be dropped there).
    """
    root = ET.Element("root")

    for idx, entry in enumerate(data, start=1):
        entry_element = ET.SubElement(root, "entry", {"Number": str(idx)})
        ET.SubElement(entry_element, "Entry").text = entry.get("Entry", "")

        lemma_info = entry.get("Lemma Info", {})
        if isinstance(lemma_info, dict):
            lemma_info_element = ET.SubElement(entry_element, "LemmaInfo")
            chapter = lemma_info.get("chapter", "")
            if isinstance(chapter, str):
                ET.SubElement(lemma_info_element, "Chapter").text = chapter
            verses = lemma_info.get("verses", "")
            if isinstance(verses, (str, int)):
                ET.SubElement(lemma_info_element, "Verses").text = str(verses)
            elif isinstance(verses, dict):
                # Verse range: one child per key ('from' -> <From>, 'to' -> <To>).
                verses_element = ET.SubElement(lemma_info_element, "Verses")
                for key, value in verses.items():
                    ET.SubElement(verses_element, key.capitalize()).text = str(value)
            lemmas_element = ET.SubElement(lemma_info_element, "Lemmas")
            lemmas = lemma_info.get("lemmas", [])
            if isinstance(lemmas, list):
                for lemma in lemmas:
                    _append_lemma_element(lemmas_element, lemma)
            elif isinstance(lemmas, dict):
                # Lemma range: group the lemmas under <From> / <To>.
                for key, lemma_list in lemmas.items():
                    lemma_range_element = ET.SubElement(lemmas_element, key.capitalize())
                    for lemma in lemma_list:
                        _append_lemma_element(lemma_range_element, lemma)

        decoded_entries = entry.get("Decoded Entries", [])
        for decoded_entry in decoded_entries:
            if isinstance(decoded_entry, dict):
                decoded_entry_element = ET.SubElement(entry_element, "DecodedEntry")
                ET.SubElement(decoded_entry_element, "Type").text = decoded_entry.get("Type", "")
                details_element = ET.SubElement(decoded_entry_element, "Details")
                for detail in decoded_entry.get("Details", []):
                    if isinstance(detail, dict):
                        if "Witnesses" in detail:
                            witnesses_element = ET.SubElement(details_element, "Witnesses")
                            for witness in detail["Witnesses"]:
                                if isinstance(witness, tuple):
                                    witness_element = ET.SubElement(witnesses_element, "Witness")
                                    ET.SubElement(witness_element, "Collection").text = witness[0]
                                    ET.SubElement(witness_element, "Manuscript").text = witness[1]
                                    ET.SubElement(witness_element, "Comment").text = witness[2]
                        elif "Rdg" in detail:
                            # An <Rdg> element is always written; it stays empty
                            # when the reading is not a dict (e.g. '' or None).
                            rdg_element = ET.SubElement(details_element, "Rdg")
                            rdg = detail["Rdg"]
                            if isinstance(rdg, dict):
                                if "Sigla" in rdg:
                                    ET.SubElement(rdg_element, "Sigla").text = rdg["Sigla"]
                                ET.SubElement(rdg_element, "Reading").text = rdg.get("Reading", "")
                                if "GeneralComment" in rdg:
                                    ET.SubElement(rdg_element, "GeneralComment").text = rdg["GeneralComment"]
                        elif "Cross References" in detail:
                            cross_references_element = ET.SubElement(details_element, "CrossReferences")
                            for reference in detail["Cross References"]:
                                ET.SubElement(cross_references_element, "Reference").text = reference

    tree = ET.ElementTree(root)
    tree.write(output_file, encoding="utf-8", xml_declaration=True)

# Example usage: write the parsed entries held in `test` to an XML file.
list_to_xml(test, "Apparatus III Encoding 2.xml")


In [None]:
def create_multiindex_df(compiled_results):
    """Flatten the compiled entries into a DataFrame indexed by chapter/verse/lemma/variant.

    One row is produced for every (lemma, variant) pair of every entry.

    Args:
        compiled_results: List of dicts with the keys 'Entry', 'Lemma Info'
            and 'Decoded Entries', as returned by compile_and_display_entries.

    Returns:
        A DataFrame whose MultiIndex levels are 'Chapter', 'Verse', 'Lemma'
        and 'Variant Type'. Columns include 'Entry', 'Lemma Range', 'Reading',
        'Sigla' and 'Comment', plus the keys of the other detail dicts
        (e.g. 'Witnesses', 'Cross References').
    """
    # Prepare data for DataFrame
    data = []
    index = []
    for result in compiled_results:
        entry = result['Entry']
        lemma_info = result['Lemma Info']
        chapter = lemma_info['chapter']
        verses = lemma_info['verses']
        if isinstance(verses, dict):  # Check if verses have "from" and "to" keys
            verse_from = verses['from']
            verse_to = verses['to']
        else:
            verse_from = verse_to = verses

        # Two separate lists, so the range sides can never alias each other.
        lemmas_from, lemmas_to = [], []
        if isinstance(lemma_info['lemmas'], list):
            lemmas = [lemma['lemma'] for lemma in lemma_info['lemmas']]
        else:
            lemmas_from = [lemma['lemma'] for lemma in lemma_info['lemmas']['from']]
            lemmas_to = [lemma['lemma'] for lemma in lemma_info['lemmas']['to']]
            lemmas = lemmas_from + lemmas_to

        for variant in result['Decoded Entries']:
            variant_type = variant['Type']
            for lemma in lemmas:
                lemma_range = 'from' if lemma in lemmas_from else 'to' if lemma in lemmas_to else None
                index_key = (chapter, verse_from if lemma_range == 'from' else verse_to, lemma, variant_type)
                row = {'Entry': entry, 'Lemma Range': lemma_range}
                for detail in variant['Details']:
                    if isinstance(detail, dict):
                        if 'Rdg' in detail:
                            if isinstance(detail['Rdg'], dict):
                                rdg = detail['Rdg']
                                reading = rdg.get('Reading', None)
                                if reading == 'k':
                                    reading = f"k ({lemmas_from[0]})" if lemmas_from else 'k'
                                elif reading == 'q':
                                    reading = f"q ({lemma})" if lemma in lemmas_to else 'q'
                                row['Reading'] = reading
                                row['Sigla'] = rdg.get('Sigla', None)
                                # The reading parser stores the parenthesised note under
                                # 'GeneralComment'; looking up only 'Comment' left this
                                # column empty. 'Comment' stays as a fallback for dicts
                                # produced by the older parser variant.
                                row['Comment'] = rdg.get('GeneralComment', rdg.get('Comment', None))
                            else:
                                row['Reading'] = detail['Rdg']
                        else:
                            # Witnesses / Cross References dicts become columns as-is.
                            row.update(detail)
                    else:
                        row['Detail'] = detail
                index.append(index_key)
                data.append(row)

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(data)

    # Create MultiIndex for rows
    multi_index = pd.MultiIndex.from_tuples(index, names=['Chapter', 'Verse', 'Lemma', 'Variant Type'])

    # Assign MultiIndex to DataFrame
    df.index = multi_index

    return df

# Build the MultiIndex DataFrame from the parsed entries held in `test`
compiled_results_df = create_multiindex_df(test)

# Show only the first rows rather than the whole table
compiled_results_df.head()


In [None]:
# Export the full table to an Excel workbook in the working directory.
compiled_results_df.to_excel('Trial Master.xlsx')

In [None]:
# Display the complete list of parsed entries (long output).
test

In [None]:
# TODO: figure out how to present the results for human QA

In [None]:
# Sample input lines for trying out the parser by hand. The first item is a
# chapter heading; the others are apparatus lines ("<verse> <lemma>] <entry>"),
# some without a verse number.
full_entries = [
    "Chapter 1",
    "8 k ואמאסאך / q ואמאסך] 30 93 (sm) 96 150 (pm) q IV | 93 (pm) אמסך",
    "8 שָׂך] 30 (pm) 93 (pm) 150 (pm) סךII IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))",
    "6 עוד1] 30 (pm) >I II IV (similarly b. Pesaḥim 87bmss)",
    "6 הדעת1] 93 (pm) 150 (pm) דעתII",
    "14 זעקו] 30 (pm) יזעקו | 150 (pm) >",
    "10 k שעריריה / q שערוריה] 30 (pm) 89 (pm) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) שערורהIV",
    "2 ללבבם] 96 (pm) ללבם | 30 93 150 (pm) בלבבםI",
    "12 עליהם] 96 (pm) להם",
    "איסירם] 30 89 איסרם | 96 (pm) אייסרם, (sm) אייסירים | 150 (pm) אסירים",
    "7 נדמֶה] 93 נדמָה | 30 (pm) + אפרים",
    "10 ואסרם] 96 (pm) יאשרם.. | 150  ..על",
    "8 רֻחמה] 30 לח..",
    "והצגתיה] 93 (pm) + כיום ערומה והצגתיה ",
    "4 דברו] 150 דברים",
    "7–8 ילכו ינקותיו – כיין לבנון] 30 > ",
    "3 כי1] 150 + לא (non voc)",
    "וכְמריו] 93 G-B Msr 34 כֹ",
    "6 בָשנה] G-B Msr 34 בָשְנָה ל ומדׄ מיש ביה בֹשְנָה",
    "k עינתם / q עונֹתם] 30 k, 89 G-B Eb 16 q, 93 96 150 (pm) עונותם",
    "10 לה] 93 (non voc) 96 150 (non voc) + את"
    
]

In [None]:
#process lemma:
#split into digits and lemmas. then process each separately

In [None]:
# process_full_entry needs a chapter argument and cannot parse "Chapter N"
# heading rows (they contain no ']'), so track the heading and the carried-over
# verse here in the same way compile_and_display_entries does.
sample_chapter = None
sample_verse = None
sample_results = []
for example in full_entries:
    heading = re.search(r'Chapter (\d+)', example)
    if heading:
        sample_chapter = heading.group(1)
        continue
    sample_lemma_info, sample_decoded, sample_verse = process_full_entry(example, sample_chapter, sample_verse)
    sample_results.append((sample_lemma_info, sample_decoded, sample_verse))
sample_results[-1]#[0]['verses']


In [None]:
# Old version of process_full_entry: it did not take the splitting on commas into consideration
# def process_full_entry(text):
#     lemma, part_entry = split_full_entry(text)
#     lemma_dict = lemma_verse_processor(lemma)
# #     if len(lemma_dict['verses'])==0: #get verse from previous entry
# #         lemma_dict['verses'] = 
        
#     #entry_units = split_entry_units # splits on | and ,
#     #for entry in entry_units:
#     decoded_entry = process_entry(part_entry)
#     return lemma_dict, decoded_entry

In [None]:
# processing functions for sub-units of app_entry, for which there is matching lemma and verse data processed above

def remove_and_list_roman_numerals(text):
    """Pull the Roman-numeral cross-references out of an entry.

    Returns:
        ``(remaining_text, numerals)``: the text with every matched numeral
        removed, and the list of matched numerals in order of appearance.

    NOTE(review): the pattern matches any run of capital "I" followed by an
    optional "V", wherever it occurs, so capital I/V letters inside ordinary
    Latin words are extracted and removed as well.
    """
    numeral_re = re.compile(r'([I]*[V]?)')
    # The pattern can match the empty string; keep only real numerals.
    numerals = [hit for hit in numeral_re.findall(text) if hit]
    remaining_text = numeral_re.sub('', text)
    return remaining_text, numerals

def custom_split_string(text):
    """Extract ``(prefix, number, note)`` witness triples from the witness text.

    ``prefix`` is the non-digit text before a number, ``number`` the digits,
    and ``note`` the optionally parenthesised text after it (no digits or ')').
    Missing parts are ''. Every match is returned, unfiltered.
    """
    # Alternative pattern tried earlier:
    # r'([^,\d]*?)?(\d+)\s?(\(([^\)]+)?\)?)?([\skq]?)+'
    witness_re = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d]+)?\)?', re.DOTALL|re.UNICODE)
    return witness_re.findall(text)

def parse_reading_entry(entry):
    """Split the reading portion of an entry into sigla, reading and comment.

    NOTE(review): this is a second definition of ``parse_reading_entry`` in
    this notebook. Once this cell runs it replaces the earlier one, and it
    differs from it: at most one sigla character is captured, and the comment
    is stored under 'Comment' rather than 'GeneralComment'.

    Args:
        entry: Text that starts at the reading, e.g. ``"+ <Hebrew> (non voc)"``.

    Returns:
        A dict holding only the non-empty parts, under the keys 'Sigla',
        'Reading' and 'Comment'. Every part of the pattern is optional, so an
        input with none of them yields an empty dict.
    """
    # Raw string: the regex escapes (\s, \() are invalid escape sequences in a
    # normal string literal and trigger warnings on current Python. The re
    # module itself understands the \uXXXX escapes.
    pattern = r"""
        \s?(?P<Sigla>[+<>~]?)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """

    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Keep only the groups that actually captured something.
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

#splitting entry into witnesses and reading (if only one group assign to witnesses)
def split_string(text):
    """Split an entry into witnesses, the start of the reading, and the rest.

    Returns:
        A 3-tuple ``(before, divider, after)`` when a divider is found: first a
        run of k/q/Hebrew letters with an optional leading sigla character,
        and failing that a single sigla character. When neither is present the
        input string is returned unchanged (a str, not a tuple).
    """
    letter_led = re.match(r'(.*?)?([\+<>]?\s?[kq\u0590-\u05FF]+)(.*)?', text, re.DOTALL)
    if letter_led:
        return letter_led.groups()

    # No letters to anchor on: fall back to a bare sigla character.
    sigla_only = re.match(r'(.*?)([\+<>])(.*)?', text, re.DOTALL)
    return sigla_only.groups() if sigla_only else text


def process_entry(entry):
    """Decode one apparatus entry into witnesses, reading and cross references.

    Roman-numeral cross references are stripped first, then the remainder is
    split into a witness part and a reading part.

    Returns:
        A 3-tuple of single-key dicts: ``{'Witnesses': [...]}``,
        ``{'Reading': ...}`` and ``{'Cross References': [...]}``. When no
        reading can be separated out, the whole text is treated as witnesses
        and the reading is ``''``.
    """
    stripped, cross_references = remove_and_list_roman_numerals(entry)
    pieces = split_string(stripped)

    if type(pieces) is not tuple:
        # split_string handed back the plain string: witnesses only.
        return (
            {'Witnesses': custom_split_string(pieces)},
            {"Reading": ''},
            {"Cross References": cross_references},
        )

    witnesses = {'Witnesses': custom_split_string(pieces[0])}
    # With three groups the text after the divider belongs to the reading too.
    reading_text = pieces[1] if len(pieces) == 2 else pieces[1] + pieces[2]
    reading = parse_reading_entry(reading_text)
    return witnesses, {"Reading": reading}, {"Cross References": cross_references}


# for entry in sample_texts:
#     print(f"entry: {entry}")
#     clean_entry, cross_references = remove_and_list_roman_numerals(entry)
#     split_entry = split_string(clean_entry)
#     if type(split_entry) is tuple:
#         witnesses = {'witnesses': custom_split_string(split_entry[0])}
#         reading = parse_reading_entry(split_entry[1])
#         print(f"witnesses: {witnesses}")
#         print(f"reading: {reading}")
#     else:
#         witnesses = {'witnesses': custom_split_string(split_entry)}
#         print(f"witnesses: {witnesses}")
#     print(f"references: {cross_references}")


In [None]:
# Hand-picked apparatus entries (lemma already removed) used to exercise the
# splitting / parsing helpers in the cells below.
sample_texts = [
    "96 (non voc)",
    "30 (pm) 93 (pm) 150 (pm) + סךII IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))",
    "93 (non voc) 96 150 (non voc) + את",
    "30 (pm) >",
    "30 + לי (non voc)I II",
    "30 (pm) >I II IV (similarly b. Pesaḥim 87bmss)",
    "93 (pm) ביהושעIV (similarly PesiqtaR 33 (153b))",
    "96 >I II IV",
    "130 k",
    "G-B Msr 34 k ממני / q ממנוIV",
    "93 כד..",
    "150 ..דברים",
    # Backslashes are doubled: a lone backslash before a Hebrew letter is an
    # invalid escape sequence. The resulting string value is unchanged.
    "G-B Eb 94 ותָעָד (understood as \\עוד (rather than \\עדי))",
    "30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"
]

In [None]:
# Run the full entry pipeline on a single sample (index 6 of sample_texts).
process_entry(sample_texts[6])

In [None]:
# Step-by-step replay of process_entry() on sample 6, kept for inspecting the
# intermediate values.
clean_entry, cross_references = remove_and_list_roman_numerals(sample_texts[6])
split_entry = split_string(clean_entry)
# NOTE: `reading` is only assigned inside this branch; if split_string returns
# a plain string, the last line shows a stale value or raises NameError.
if type(split_entry) is tuple:
    witnesses = {'Witnesses': custom_split_string(split_entry[0])}
    if len(split_entry)==2:
        reading = parse_reading_entry(split_entry[1])
    else: #there are 3 groups:
        reading = parse_reading_entry(split_entry[1]+split_entry[2])

reading

In [None]:
def custom_split_string(text): #process witnesses
    """Break a witness string into ``(prefix, number, annotation)`` tuples.

    For every run of digits in ``text`` the tuple holds the non-digit text
    immediately before it, the digits themselves, and the following run of
    characters that are neither digits nor ``)`` (an optional surrounding
    pair of parentheses is not captured). Parts that did not take part in a
    match are returned as ``''``.
    """
    witness_re = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d]+)?\)?', re.DOTALL|re.UNICODE)
    # groups(default='') reports unmatched groups as '' exactly like re.findall.
    return [hit.groups(default='') for hit in witness_re.finditer(text)]
# Other witness strings worth trying:
# "96 (non voc)",
# "30 (pm) 93 (pm) 150 (pm)"
# Quick check: a witness list where the middle witness has no annotation.
test_witness = "30 (pm) 93 150 (pm)"
custom_split_string(test_witness)


In [None]:
#try parsing single entry app, splitting into witnesses and reading (if only one group assign to witnesses)
def split_string(text):
    """Split an apparatus entry into witnesses, reading and trailing text.

    Two patterns are tried in order, each anchored at the start of ``text``:

    1. an optional sigla (``+ < >``), an optional whitespace character and a
       run of ``k``/``q``/Hebrew characters;
    2. a bare sigla character (``+ < >``) when no such run exists.

    Returns:
        A 3-tuple ``(before, divider, after)`` when either pattern matches,
        otherwise the unchanged ``text`` string (callers distinguish the two
        cases by type).
    """
    pattern = re.compile(r'(.*?)?([\+<>]?\s?[kq\u0590-\u05FF]+)(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts

    # Fallback: entries such as "30 (pm) >" carry only a sigla, no reading.
    pattern = re.compile(r'(.*?)([\+<>])(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts

    return text  # No divider matching either pattern was found

# Example usage: run the splitter over every sample entry.
# Each element is a 3-tuple on success or the original string when nothing matched.
processed_sample = [split_string(text) for text in sample_texts]

print(processed_sample)


In [None]:
#function for parsing the reading+comment (assumes witnesses and cross references have been removed)
def parse_reading_entry(entry):
    """Split a reading string into its sigla, reading and comment parts.

    The string is matched from its start against an optional sigla character
    (``+ < > ~``), an optional Hebrew reading (which may carry ``k``/``q``
    markers and a ``/``-separated alternative) and an optional parenthesised
    comment.

    Returns a dict containing only the non-empty parts, keyed ``'Sigla'``,
    ``'Reading'`` and ``'Comment'``; ``None`` if the pattern does not match.
    """
    reading_regex = r"""
        \s?(?P<Sigla>[+<>~]?)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """

    # VERBOSE lets the pattern above carry its own layout and comments.
    hit = re.match(reading_regex, entry, re.VERBOSE)
    if hit is None:
        return None

    # Keep only the named groups that actually captured some text.
    fields = {}
    for name, captured in hit.groupdict().items():
        if captured:
            fields[name] = captured
    return fields

# Process the sample reading texts with the refined function.
# The results get their own name: assigning them to `parse_reading_entry`
# would replace the function with a list, breaking every later call and any
# re-run of this cell.
# NOTE(review): `sample_reading_texts` is not defined in the visible cells;
# confirm it is created earlier in the notebook.
parsed_reading_entries = [parse_reading_entry(text) for text in sample_reading_texts]

parsed_reading_entries


In [None]:
import re

def custom_string_processor(input_string, regex_pattern):
    """Apply a three-group regex to every ``|``-separated segment of a string.

    Args:
        input_string: Text whose variants are separated by ``|``.
        regex_pattern: Pattern with at least three groups, read as
            witnesses, reading and comments respectively.

    Returns:
        A flat list with one dict per regex match, in segment order, keyed
        ``'witnesses'``, ``'reading'`` and ``'comments'`` (all stripped;
        ``'comments'`` is ``''`` when the third group is empty or unmatched).
        Segments without a match contribute nothing.
    """
    records = []
    # '|' is the only delimiter handled; each segment is scanned on its own.
    for segment in input_string.split('|'):
        for hit in re.finditer(regex_pattern, segment):
            trailing = hit.group(3)
            records.append({
                'witnesses': hit.group(1).strip(),
                'reading': hit.group(2).strip(),
                'comments': trailing.strip() if trailing else ''
            })
    return records

# Custom regex pattern as provided: (witnesses)(optional sigla + Hebrew reading)(rest)
custom_regex = r'^(.*?)([\+<~>]?\s?[\u0590-\u05FF]+.*?)(.*)$'

# Test with the provided sample input
# NOTE: `sample_input` is defined but not used below; the comprehension runs
# over `sample_texts` instead.
sample_input = "G-B msr. 30 (pm) G-A 89 (sm?) 150 (non voc) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) + שערורהIV II (bla bla (f)) | 150 >"
processed_sample = [custom_string_processor(text, custom_regex) for text in sample_texts]


print(processed_sample)

def split_string(text):
    """Split ``text`` at its first Hebrew reading.

    Returns a 3-tuple ``(before, reading, after)`` where ``reading`` is an
    optional sigla (``+ < >``), an optional whitespace character and a run of
    Hebrew characters; returns ``None`` when ``text`` contains no such run.
    """
    divider_re = re.compile(r'^(.*?)([\+<>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    found = divider_re.match(text)
    # None signals that no divider was found.
    return found.groups() if found else None

# Example usage
# text = "G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV (bla bla (f))"#"30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"#"93 (pm) < ביהושעIV (similarly PesiqtaR 33 (153b))"
# split_parts = split_string(text)
# if split_parts:
#     print("witnesses:", split_parts[0])
#     print("reading:", split_parts[1])
#     print("After dividers:", split_parts[2])
# else:
#     print("No dividers found.")



In [None]:

def parse_apparatus_entry(entry):
    """Parse an apparatus entry into lemma(s) and content.

    Two layouts are understood:

    * ``lemma] content`` -- the layout used by the apparatus text file: the
      text before the first ``]`` is the lemma, everything after it is the
      content.
    * ``lemma[content]`` (possibly repeated) -- each bracketed segment yields
      one ``(lemma, content)`` pair.

    Args:
        entry: One apparatus line with any leading verse number removed.

    Returns:
        A list of ``(lemma, content)`` tuples with surrounding whitespace
        stripped; an empty list when the entry has no text.
    """
    if '[' not in entry:
        # "lemma] content": splitting on every ']' would turn the content
        # into a second lemma with empty content, so split once only.
        lemma, _, content = entry.partition(']')
        lemma, content = lemma.strip(), content.strip()
        return [(lemma, content)] if (lemma or content) else []

    lemmas_contents = []
    for part in entry.split(']'):
        if part.strip():
            lemma, content = part.split('[', 1) if '[' in part else (part, '')
            lemmas_contents.append((lemma.strip(), content.strip()))
    return lemmas_contents

def create_tei_document(apparatus_lines):
    """Create a TEI document from apparatus lines.

    Lines starting with ``Chapter`` open a new ``<div type="chapter">``.
    Every other non-empty line is an apparatus entry: a leading number is
    taken as the verse, and lines without one inherit the verse of the
    previous entry in the same chapter. Entries seen before the first chapter
    or before any verse number are ignored.

    Args:
        apparatus_lines: Iterable of text lines (trailing newlines allowed).

    Returns:
        An ``xml.etree.ElementTree.ElementTree`` rooted at ``<TEI>``.
    """
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    # xml.etree has no lxml-style ``nsmap`` keyword: passing it would store a
    # dict as an attribute and make serialisation fail. Registering the
    # namespace as the default prefix gives the same un-prefixed output.
    ET.register_namespace('', TEI_NAMESPACE)

    tei_root = ET.Element(TEI+"TEI")
    ET.SubElement(tei_root, TEI+"teiHeader")
    text = ET.SubElement(tei_root, TEI+"text")
    body = ET.SubElement(text, TEI+"body")
    current_chapter = None
    last_verse = None

    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('Chapter'):
            # Tolerate a bare "Chapter" heading instead of raising IndexError.
            fields = line.split()
            chapter_number = fields[1] if len(fields) > 1 else ''
            current_chapter = ET.SubElement(body, TEI+"div", type="chapter", n=chapter_number)
            last_verse = None  # verse numbering restarts with each chapter
            continue
        # Use regex to check if the line starts with a verse number and capture it
        match = re.match(r"^(\d+)\s*(.*)", line)
        if match:
            verse_number, entry = match.groups()
            last_verse = verse_number
        else:
            # Continuation line: belongs to the most recent verse.
            entry = line
            verse_number = last_verse

        if current_chapter is not None and verse_number:
            lemmas_contents = parse_apparatus_entry(entry)
            for lemma, content in lemmas_contents:
                app = ET.SubElement(current_chapter, TEI+"app")
                lem = ET.SubElement(app, TEI+"lem", n=verse_number)
                lem.text = lemma
                if content:
                    rdg = ET.SubElement(app, TEI+"rdg")
                    rdg.text = content

    return ET.ElementTree(tei_root)

def save_tei_file(tree, filename):
    """Write the TEI XML tree to ``filename`` as UTF-8 with an XML declaration."""
    write_options = {
        "encoding": "UTF-8",
        "xml_declaration": True,
        "method": "xml",
        "short_empty_elements": True,
    }
    tree.write(filename, **write_options)



# Driver: read the apparatus text file, build the TEI tree and write it out.
# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

# Read all lines up front; create_tei_document strips each line itself.
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


In [None]:
####### old stuff

In [None]:
import xml.etree.ElementTree as ET
import re

def create_tei_document(apparatus_lines):
    """Build a TEI tree with one ``<app>`` per apparatus line.

    ``Chapter N`` lines open a ``<div type="chapter">``. Each entry line is
    split at its first ``]`` into a lemma (``<lem n=verse>``) and an optional
    reading (``<rdg>``). A leading number sets the verse; lines without one
    are continuation entries and inherit the last verse number seen in the
    current chapter. Lines that have no verse to inherit are reported with a
    printed message and skipped.

    Args:
        apparatus_lines: Iterable of text lines (trailing newlines allowed).

    Returns:
        An ``xml.etree.ElementTree.ElementTree`` rooted at ``<TEI>``.
    """
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    # The namespaced tag already declares the namespace; an explicit ``xmlns``
    # attribute would declare it a second time. Register it as the default
    # prefix instead so the output is un-prefixed.
    ET.register_namespace('', TEI_NAMESPACE)

    tei_root = ET.Element(TEI + "TEI")
    ET.SubElement(tei_root, TEI + "teiHeader")
    text = ET.SubElement(tei_root, TEI + "text")
    body = ET.SubElement(text, TEI + "body")
    current_chapter = None
    last_verse_number = None

    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('Chapter'):
            # Tolerate a bare "Chapter" heading instead of raising IndexError.
            fields = line.split()
            chapter_number = fields[1] if len(fields) > 1 else ''
            current_chapter = ET.SubElement(body, TEI + "div", type="chapter", n=chapter_number)
            last_verse_number = None  # verse numbering restarts per chapter
            continue

        # Attempt to extract verse number and lemma content
        parts = re.match(r"^(\d+)\s*(.*)", line)
        if parts:
            verse_number, remainder = parts.groups()
            last_verse_number = verse_number  # Update last verse number with current
        elif last_verse_number is not None:
            # Continuation entry: same verse as the previous line.
            verse_number, remainder = last_verse_number, line
        else:
            print(f"Line does not conform to expected format: {line}")
            continue

        # Further split to separate lemma from variants, if present
        lemma_section, variants_section = remainder.split(']', 1) if ']' in remainder else (remainder, "")
        lemma_section = lemma_section.strip()
        variants_section = variants_section.strip()

        if current_chapter is not None and verse_number:
            # Create an apparatus entry for the lemma
            app = ET.SubElement(current_chapter, TEI + "app")
            lem = ET.SubElement(app, TEI + "lem", n=verse_number)
            lem.text = lemma_section

            # Add variant readings if present
            if variants_section:
                rdg = ET.SubElement(app, TEI + "rdg")
                rdg.text = variants_section

    return ET.ElementTree(tei_root)

def save_tei_file(tree, filename):
    """Write the TEI XML tree to ``filename`` as UTF-8 with an XML declaration."""
    write_options = {
        "encoding": "UTF-8",
        "xml_declaration": True,
        "method": "xml",
        "short_empty_elements": True,
    }
    tree.write(filename, **write_options)

# Driver: read the apparatus text file, build the TEI tree and write it out.
# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

# Read all lines up front; create_tei_document strips each line itself.
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


In [None]:
import xml.etree.ElementTree as ET
import re


def create_apparatus_entry(verse_number, content, TEI):
    """Build an ``<app>`` element for one apparatus line.

    Args:
        verse_number: Verse the entry belongs to (accepted but not used here).
        content: Entry text of the form ``lemma] witnesses reading (comment)``.
        TEI: Tag prefix prepended to every element name (e.g. ``"{ns}"``).

    Returns:
        The ``<app>`` element holding a ``<lem>``, one ``<note>`` per
        parenthesised comment and, when text remains after the comments are
        removed, a ``<rdg>`` whose ``wit`` attribute is the first
        space-separated token, whose text is the remainder, and which holds a
        ``<ref>`` child per Roman numeral found.
    """
    app = ET.Element(TEI + "app")

    # Everything up to the first ']' is the lemma.
    lemma_text, _, remainder = content.partition(']')
    ET.SubElement(app, TEI + "lem").text = lemma_text.strip()

    # Each (non-nested) parenthesised span becomes its own note ...
    for remark in re.findall(r'\((.*?)\)', remainder):
        ET.SubElement(app, TEI + "note").text = remark

    # ... and is then dropped from the text that is processed further.
    remainder = re.sub(r'\(.*?\)', '', remainder).strip()
    if not remainder:
        return app

    rdg = ET.SubElement(app, TEI + "rdg")
    first_token, _, trailing = remainder.partition(' ')
    if first_token:
        rdg.set('wit', first_token.strip())
    if trailing:
        rdg.text = trailing.strip()

    # Cross references are Roman numerals starting at a word boundary.
    for numeral in re.findall(r'\bI{1,3}V?|\bIV', remainder):
        ref_element = ET.SubElement(rdg, TEI + "ref")
        ref_element.set('target', '#' + numeral)  # target IDs are prefixed with '#'
        ref_element.text = "See apparatus entry " + numeral

    return app

def create_tei_document(apparatus_lines):
    """Build a TEI tree with every apparatus entry under a single ``<div>``.

    A leading number on a line sets the current verse; lines without one
    reuse the last verse seen. Lines that occur before any verse number are
    ignored. Each remaining line is converted by ``create_apparatus_entry``.

    Args:
        apparatus_lines: Iterable of text lines (trailing newlines allowed).

    Returns:
        An ``xml.etree.ElementTree.ElementTree`` rooted at ``<TEI>``.
    """
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    # The namespaced tag already declares the namespace. An extra ``xmlns``
    # attribute is redundant and, once the default prefix is registered,
    # serialises as a duplicate attribute; so register the prefix instead.
    ET.register_namespace('', TEI_NAMESPACE)
    root = ET.Element(TEI + "TEI")
    ET.SubElement(root, TEI + "teiHeader")
    text = ET.SubElement(root, TEI + "text")
    body = ET.SubElement(text, TEI + "body")
    div = ET.SubElement(body, TEI + "div")

    last_verse_number = None
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue

        # Determine if the line starts with a verse number
        match = re.match(r'^(\d+)', line)
        if match:
            last_verse_number = match.group(1)
            content = line[len(last_verse_number):].strip()
        else:
            content = line

        if last_verse_number:
            entry = create_apparatus_entry(last_verse_number, content, TEI)
            div.append(entry)

    return ET.ElementTree(root)

def save_tei_file(tree, filename):
    """Write the TEI XML tree to ``filename`` as UTF-8 with an XML declaration."""
    write_options = {
        "encoding": "UTF-8",
        "xml_declaration": True,
        "method": "xml",
    }
    tree.write(filename, **write_options)


# Driver: read the apparatus text file, build the TEI tree and write it out.
# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

# Read all lines up front; create_tei_document strips each line itself.
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


In [None]:
#split entry into witnesses, reading, and comments 

def split_string(text):
    """Split ``text`` at its first Hebrew reading.

    Returns a 3-tuple ``(before, reading, after)`` where ``reading`` is an
    optional sigla (``+ < >``), an optional whitespace character and a run of
    Hebrew characters; returns ``None`` when ``text`` contains no such run.
    """
    divider_re = re.compile(r'^(.*?)([\+<>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    found = divider_re.match(text)
    # None signals that no divider was found.
    return found.groups() if found else None

# Example usage: split one long synthetic entry and show the three parts.
# (The trailing comment on the next line keeps two alternative test strings.)
text = "G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV (bla bla (f))"#"30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"#"93 (pm) < ביהושעIV (similarly PesiqtaR 33 (153b))"
split_parts = split_string(text)
# split_string returns None when no Hebrew reading is present.
if split_parts:
    print("witnesses:", split_parts[0])
    print("reading:", split_parts[1])
    print("After dividers:", split_parts[2])
else:
    print("No dividers found.")


In [None]:
#parse comments
import re

def remove_and_list_roman_numerals(text):
    """Strip Roman-numeral cross references from ``text`` and list them.

    A numeral is one or more ``I`` optionally followed by ``V`` (``I``,
    ``II``, ``III``, ``IV``, ...) or a lone ``V``. It counts only when it is
    not adjacent to an ASCII letter, so numerals glued to Hebrew text or
    punctuation are found while capital ``I``/``V`` inside ordinary Latin
    words (for instance in a comment) are left untouched.

    Args:
        text: Entry text possibly containing cross-reference numerals.

    Returns:
        ``(result_text, found_numerals)``: the text with the numerals removed
        (surrounding whitespace is kept) and the numerals in order of
        appearance.
    """
    # Lookarounds instead of \b: Hebrew letters are word characters, so \b
    # would miss numerals attached directly to a Hebrew reading.
    pattern = r'(?<![A-Za-z])(I+V?|V)(?![A-Za-z])'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

# Example usage: strip the leading cross references from a comment string.
text = "II IV (See b. R.HaŠanamss 23b, LamR Buber 1:16 (40b))"
result_text, numerals_found = remove_and_list_roman_numerals(text)
# The numerals are removed in place, so leftover whitespace is stripped for display.
print("Modified text:", result_text.strip())
print("Numerals found:", numerals_found)



In [None]:
def process_entry(entry):
    """Turn one apparatus entry into a structured dict.

    Returns ``None`` when ``split_string`` finds no divider. Otherwise returns
    a dict with the keys ``'witnesses'`` (list of ``{'n', 'text'}`` dicts),
    ``'reading'``, ``'comments'`` (text after the reading with Roman numerals
    removed) and ``'cross_references'`` (the removed numerals).
    """
    parts = split_string(entry)
    if parts is None:
        return None
    witnesses, reading, comments = parts

    # Each witness part is a 4-tuple; index 1 is the witness number and the
    # other fields are the text around it.
    witness_records = [
        {
            'n': part[1],
            'text': f"{part[0]}{part[1]} {part[2].strip()}{part[3]}"
        }
        for part in custom_split_string(witnesses)
    ]

    comments_text, numerals_found = remove_and_list_roman_numerals(comments)

    return {
        'witnesses': witness_records,
        'reading': reading,
        'comments': comments_text,
        'cross_references': numerals_found
    }

def create_apparatus_entry(verse_number, content, TEI):
    """Build a namespaced ``<app>`` element for one apparatus line.

    Args:
        verse_number: Verse the entry belongs to (accepted but not used here).
        content: Entry text; everything before the first ``]`` is the lemma.
        TEI: Namespace URI; tags are written as ``{TEI}name``.

    Returns:
        The ``<app>`` element with ``<lem>``, one ``<wit>`` per witness, a
        ``<rdg>``, an optional ``<note>`` and one ``<ref>`` per cross
        reference; ``None`` when ``process_entry`` yields nothing for the
        text after the lemma.
    """
    app = ET.Element(f"{{{TEI}}}app")  # Using namespace in the tag

    # Lemma first; stray brackets and spaces are trimmed from its ends.
    lemma_text, _, rest = content.partition(']')
    ET.SubElement(app, f"{{{TEI}}}lem").text = lemma_text.strip('[] ')

    parsed = process_entry(rest)
    if not parsed:
        return None

    for witness in parsed['witnesses']:
        ET.SubElement(app, f"{{{TEI}}}wit", {'n': witness['n']}).text = witness['text']

    ET.SubElement(app, f"{{{TEI}}}rdg").text = parsed['reading']

    note_text = parsed['comments']
    if note_text:
        ET.SubElement(app, f"{{{TEI}}}note").text = note_text

    for numeral in parsed['cross_references']:
        ET.SubElement(app, f"{{{TEI}}}ref").text = numeral

    return app


In [None]:
import xml.etree.ElementTree as ET
import re


def create_tei_document(apparatus_lines):
    """Build a TEI tree with every apparatus entry under a single ``<div>``.

    A leading number on a line sets the current verse; lines without one
    reuse the last verse seen, and lines before any verse number are ignored.
    Entries for which ``create_apparatus_entry`` returns ``None`` are skipped.

    Returns an ``ElementTree`` rooted at ``<TEI>`` in the TEI namespace, which
    is registered as the default (un-prefixed) namespace.
    """
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    ET.register_namespace('', TEI_NAMESPACE)  # default namespace, no prefix

    # The namespaced tag is enough; no explicit xmlns attribute is set.
    root = ET.Element("{%s}TEI" % TEI_NAMESPACE)
    ET.SubElement(root, "{%s}teiHeader" % TEI_NAMESPACE)
    text = ET.SubElement(root, "{%s}text" % TEI_NAMESPACE)
    body = ET.SubElement(text, "{%s}body" % TEI_NAMESPACE)
    div = ET.SubElement(body, "{%s}div" % TEI_NAMESPACE)

    current_verse = None
    for raw_line in apparatus_lines:
        stripped = raw_line.strip()
        if not stripped:
            continue

        # A leading run of digits starts a new verse; otherwise keep the old one.
        leading_digits = re.match(r'^(\d+)', stripped)
        if leading_digits:
            current_verse = leading_digits.group(1)
            content = stripped[len(current_verse):].strip()
        else:
            content = stripped

        if not current_verse:
            continue

        entry = create_apparatus_entry(current_verse, content, TEI_NAMESPACE)
        if entry is not None:  # entry creation may decline a line
            div.append(entry)

    return ET.ElementTree(root)

def save_tei_file(tree, filename):
    """Write the TEI XML tree to ``filename`` as UTF-8 with an XML declaration."""
    write_options = {
        "encoding": "UTF-8",
        "xml_declaration": True,
        "method": "xml",
    }
    tree.write(filename, **write_options)


# Driver: read the apparatus text file, build the TEI tree and write it out.
# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

# Read all lines up front; create_tei_document strips each line itself.
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


In [None]:
import re

def split_string(text):
    """Split ``text`` at its first Hebrew reading.

    Returns a 3-tuple ``(before, reading, after)`` where ``reading`` is an
    optional sigla (``+ < ~ >``), an optional whitespace character and a run
    of Hebrew characters; returns ``None`` when no such run exists.
    """
    divider_re = re.compile(r'^(.*?)([\+<~>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    found = divider_re.match(text)
    return found.groups() if found else None

def custom_split_string(text):
    """Break a witness string into 4-tuples, one per number found.

    Each tuple is ``(prefix, number, parenthetical, markers)``: the text
    before the number (no commas or digits), the digits, an optional
    ``(...)`` annotation including its parentheses, and any trailing run of
    whitespace / ``k`` / ``q`` characters. Missing parts are ``''``.
    """
    witness_re = re.compile(r'([^,\d]*?)?(\d+)\s?(\([^\)]+\)?)?([\skq]*)?', re.DOTALL|re.UNICODE)
    # groups(default='') reports unmatched groups as '' exactly like re.findall.
    return [hit.groups(default='') for hit in witness_re.finditer(text)]

def remove_and_list_roman_numerals(text):
    """Strip Roman-numeral cross references from ``text`` and list them.

    A numeral is one or more ``I`` optionally followed by ``V`` (``I``,
    ``II``, ``III``, ``IV``, ...) or a lone ``V``. It counts only when it is
    not adjacent to an ASCII letter, so numerals glued to Hebrew text or
    punctuation are found while capital ``I``/``V`` inside ordinary Latin
    words (for instance in a comment) are left untouched.

    Returns:
        ``(result_text, found_numerals)``: the text with the numerals removed
        (surrounding whitespace is kept) and the numerals in order of
        appearance.
    """
    # Lookarounds instead of \b: Hebrew letters are word characters, so \b
    # would miss numerals attached directly to a Hebrew reading.
    pattern = r'(?<![A-Za-z])(I+V?|V)(?![A-Za-z])'
    found_numerals = re.findall(pattern, text)
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

# def process_entry(entry):
#     split_parts = split_string(entry)
#     if split_parts is None:
#         return "Unable to process entry: No valid dividers found."
    
#     witnesses, reading, comments = split_parts

#     witness_entries = []
#     for part in custom_split_string(witnesses):
#         witness_entry = f'<witness n="{part[1]}">{part[0]}{part[1]} {part[2].strip()}{part[3]}</witness>'
#         witness_entries.append(witness_entry)
#     witnesses_tagged = "\n".join(witness_entries)

#     reading_tagged = f'<reading>{reading}</reading>'

#     comments_text, numerals_found = remove_and_list_roman_numerals(comments)
#     comments_tagged = f'<comment>{comments_text}</comment>'
#     cross_references = "\n".join([f'<ref>{numeral}</ref>' for numeral in numerals_found])

#     # Combine all parts, placing cross_references outside the comment
#     tei_entry = f"{witnesses_tagged}\n{reading_tagged}\n{comments_tagged}\n{cross_references}"
#     return tei_entry

# Example usage
# NOTE: the process_entry defined in this cell is commented out, so this call
# uses whichever process_entry an earlier cell left in the kernel.
entry ="G-B msr. 30 (pm) G-A 89 (pm?) 150 (non voc) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV II (bla bla (f))"
processed_entry = process_entry(entry)
print(processed_entry)


In [None]:
#### 

In [None]:
[' ', '"', '$', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', 'E', 'I', 'T', '_', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '֑', '֔', '֕', '֖', '֗', '֙', '֛', '֜', '֞', '֣', '֤', '֥', '֨', '֩', 'ְ', 'ֱ', 'ֲ', 'ִ', 'ֵ', 'ֶ', 'ַ', 'ָ', 'ֹ', 'ֻ', 'ּ', 'ֽ', '׀', 'ׁ', 'ׂ']

In [None]:
# Old function: didn't handle the "E-G misr" etc.
def compile_and_display_entries(entries):
    """Process every entry, then print the results in a readable layout.

    Each entry is passed to ``process_full_entry`` together with the verse
    returned for the previous entry. After all entries are processed, the
    lemma and decoded variants are printed. While printing, a witness tuple
    whose second element is ``''`` is given the most recent non-empty second
    element seen within the same entry; the updated list is written back into
    the decoded item. Nothing is returned.
    """
    previous_verse = None
    compiled_results = []

    # Phase 1: process everything before printing anything.
    for entry in entries:
        lemma_dict, decoded_entries, used_verse = process_full_entry(entry, previous_verse)
        # The verse used for this entry is passed on to the next one.
        previous_verse = used_verse

        # Combine the lemma_dict and decoded_entries for display
        entry_result = {
            'Lemma': lemma_dict,
            'Decoded Entries': decoded_entries
        }
        compiled_results.append(entry_result)

    # Display the results in a structured format
    for result in compiled_results:
        print("Lemma:")
        for key, value in result['Lemma'].items():
            print(f"  {key}: {value}")
        print("Decoded Entries:")
        # Carry-over value for empty witness numbers; reset for every entry.
        last_witness_second_group = None
        for i, decoded_entries_list in enumerate(result['Decoded Entries']):
            for j, decoded_entry in enumerate(decoded_entries_list):
                # The first item of each list is the variant; the rest relate to it.
                variant_type = "Variant" if j == 0 else "Related Variant"
                print(f"  {variant_type}:")
                for item in decoded_entry:
                    if isinstance(item, dict) and 'Witnesses' in item:
                        # Update witnesses with last non-empty second group if needed
                        updated_witnesses = []
                        for witness in item['Witnesses']:
                            if witness[1] == '' and last_witness_second_group:
                                updated_witnesses.append((witness[0], last_witness_second_group, witness[2]))
                            else:
                                updated_witnesses.append(witness)
                                if witness[1]:
                                    last_witness_second_group = witness[1]
                        # In-place update: the stored result now holds the filled-in list.
                        item['Witnesses'] = updated_witnesses
                    if isinstance(item, dict):
                        for key, value in item.items():
                            print(f"    {key}: {value}")
                    else:
                        print(f"    {item}")
        print("-" * 50)  # Separator line

# Example usage
# NOTE(review): `full_entries` is not defined in the visible cells; confirm it
# is created earlier in the notebook.
compile_and_display_entries(full_entries)


In [None]:
# Old processing that doesn't include chapter extraction.
def post_process_witnesses(witnesses):
    """Move misplaced collection prefixes onto the witness that follows them.

    ``witnesses`` is a list of 3-tuples. When the third element of a tuple
    (other than the last) is one of the known prefixes, that element is
    replaced with ``None`` and the prefix is prepended to the first element
    of the next tuple. The list is modified in place and also returned.
    """
    # Prefixes that belong to the following witness, not the preceding one.
    movable_prefixes = {"G-B Eb ", "G-B Kb ", "G-B Msr "}  # Replace with the actual values

    # Walk adjacent pairs by index so each step sees earlier updates.
    for nxt in range(1, len(witnesses)):
        cur = nxt - 1
        head, number, tail = witnesses[cur]
        if tail not in movable_prefixes:
            continue
        witnesses[cur] = (head, number, None)
        following_head, following_number, following_tail = witnesses[nxt]
        witnesses[nxt] = (tail + following_head, following_number, following_tail)

    # Drop a leading tuple that consists of nothing but None values.
    if witnesses and witnesses[0] == (None, None, None):
        del witnesses[0]

    return witnesses

# Display helper that also applies post_process_witnesses to each witness list.
def compile_and_display_entries(entries):
    """Process every entry, print the results and return them.

    Each entry is passed to ``process_full_entry`` together with the verse
    returned for the previous entry. After all entries are processed, the
    lemma and decoded variants are printed. While printing, a witness tuple
    whose second element is ``''`` is given the most recent non-empty second
    element seen within the same entry; the list is then run through
    ``post_process_witnesses`` and written back into the decoded item.

    Returns:
        The list of ``{'Lemma': ..., 'Decoded Entries': ...}`` dicts, whose
        witness lists have been updated in place as described above.
    """
    previous_verse = None
    compiled_results = []

    # Phase 1: process everything before printing anything.
    for entry in entries:
        lemma_dict, decoded_entries, used_verse = process_full_entry(entry, previous_verse)
        # The verse used for this entry is passed on to the next one.
        previous_verse = used_verse

        # Combine the lemma_dict and decoded_entries for display
        entry_result = {
            'Lemma': lemma_dict,
            'Decoded Entries': decoded_entries
        }
        compiled_results.append(entry_result)

    # Display the results in a structured format
    for result in compiled_results:
        print("Lemma:")
        for key, value in result['Lemma'].items():
            print(f"  {key}: {value}")
        print("Decoded Entries:")
        # Carry-over value for empty witness numbers; reset for every entry.
        last_witness_second_group = None
        for i, decoded_entries_list in enumerate(result['Decoded Entries']):
            for j, decoded_entry in enumerate(decoded_entries_list):
                # The first item of each list is the variant; the rest relate to it.
                variant_type = "Variant" if j == 0 else "Related Variant"
                print(f"  {variant_type}:")
                for item in decoded_entry:
                    if isinstance(item, dict) and 'Witnesses' in item:
                        # Update witnesses with last non-empty second group if needed
                        updated_witnesses = []
                        for witness in item['Witnesses']:
                            if witness[1] == '' and last_witness_second_group:
                                updated_witnesses.append((witness[0], last_witness_second_group, witness[2]))
                            else:
                                updated_witnesses.append(witness)
                                if witness[1]:
                                    last_witness_second_group = witness[1]
                        # Apply post-processing to witnesses
                        item['Witnesses'] = post_process_witnesses(updated_witnesses)
                    if isinstance(item, dict):
                        for key, value in item.items():
                            print(f"    {key}: {value}")
                    else:
                        print(f"    {item}")
        print("-" * 50)  # Separator line
    return compiled_results
# Example usage
# NOTE(review): `full_entries` is not defined in the visible cells; confirm it
# is created earlier in the notebook.
test = compile_and_display_entries(full_entries)
