In [1]:
import xml.etree.ElementTree as ET
import re
import unicodedata
from docx import Document
from lxml import etree
import os
from win32com import client


In [7]:
def convert_doc_to_docx(doc_path):
    word = client.Dispatch("Word.Application")
    absolute_doc_path = os.path.abspath(doc_path)
    doc = word.Documents.Open(absolute_doc_path)
    doc_path_new = absolute_doc_path.replace(".doc", ".docx")
    doc.SaveAs2(doc_path_new, FileFormat=16)  # FileFormat=16 is for docx
    doc.Close()
    word.Quit()
    return doc_path_new

# Example Usage
docx_path = convert_doc_to_docx('Hosea.2.doc')

def convert_docx_to_txt(docx_file_path, txt_file_path):
    # Load the .docx file
    doc = Document(docx_file_path)

    # Extract text from each paragraph in the document
    text_content = '\n'.join([paragraph.text for paragraph in doc.paragraphs])

    # Write the extracted text to a .txt file
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text_content)

    print(f"File converted successfully and saved as '{txt_file_path}'")

# Example usage

convert_docx_to_txt('Hosea.2.docx', 'output3.txt')


File converted successfully and saved as 'output3.txt'


In [9]:
with open('output3.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
    txt = file
lines

['Hosea 2\n',
 '\n',
 '1–2] ]* 110–11\n',
 '1 ימד] * verb/nom\n',
 'ולא יספר] ]h >\n',
 'יֵאָמֵר להם2] ][ diath | ]h pr ἐκεῖ = [\n',
 '3 לַאֲחֵיכם] ] num\n',
 'ולאחותיכם] ]* num\n',
 '4 ו(תסר)] *[ >\n',
 '(ו)תסר ... מפניה] ] ἐξαρῶ … ἐκ προσώπου μου\n',
 '5 פן] ] ὅπως ἂν\n',
 'כ(יום)] ]h καθὼς ἐν\n',
 'ושמתיה ... ושתה] [ condens\n',
 'כ(מדבר) ... כ(ארץ)] ]h prep\n',
 '7 נתני] ] + μοι\n',
 'שמני] ]-[ & | ]h pr καὶ τὸν οἶνόν μου | T- > pron\n',
 '(ו)שקויָי] ] πάντα ὅσα μοι καθήκει = [, T 8\n',
 '8 דרכך] ][ pron | *h[T num\n',
 'גדרהּ] ] τὰς ὁδοὺς αὐτῆς = [ | ]hT > pron\n',
 'ונתיבותיה] ][ num | [Th > pron \n',
 '9 תמצא] ][ + pron\n',
 'אלכה ואשובה] [ ~\n',
 '10 וכסף] ]h + καὶ χρυσίον\n',
 'vוזהב,] ] vαὐτὴ δὲ,ἀργυρᾶ καὶ χρυσᾶ \n',
 'עשו] ] pers | [ pr ܘܡܢܗ + T\n',
 'ל(בעל)] ] τῇ\n',
 '11 ותירושי] ]h + καὶ τὸ ἔλαιόν μου\n',
 'לכסות] ] τοῦ μὴ καλύπτειν | * quae operiebant\n',
 '12 נבלתה] ]h τὴν ἀσχημοσύνην αὐτῆς = [T \n',
 'מידי] ~ pron\n',
 '13 (ו)הִשְׁבַּתִּי] ] ἀποστρέψω\n',
 'חגה – מועדה

In [13]:
def process_biblical_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    books = {}
    current_book = None
    current_chapter = None

    for line in lines:
        line = line.strip()
        if line.startswith("ספר"):
            current_book = line
            books[current_book] = {}
        elif line.startswith("פרק"):
            current_chapter = line
            books[current_book][current_chapter] = []
        else:
            if current_book is not None and current_chapter is not None:
                books[current_book][current_chapter].append(line)

    return books

# Example usage
file_path = 'output.txt'
books = process_biblical_text(file_path)

# # Example to print the structure
# for book, chapters in books.items():
#     for chapter, texts in chapters.items():
#         print(f'{book} {chapter}: {texts[:2]}...')  # print first two lines of each chapter


In [21]:
cantillation_marks = {
    '\u05BD': 'Meteg',
    '\u0594': 'Zaqef Qatan',
    '\u0597': 'Revia',
    '\u0598': 'Tsinnorit',
    '\u05A1': 'Pazer',
    '\u059F': 'Qarney Para',
    '\u0595': 'Zaqef Gadol',
    '\u059B': 'Tvir',
    '\u0591': 'Atnach',
    '\u05AA': 'Galgal',
    '\u05A7': 'Darga',
    '\u05BE': 'Maqqef',
    '\u05C3': 'Sof Pasuq',
    '\u0592': 'Segol',
    '\u05AE': 'Zarqa',
    '\u0599': 'Pashta',
    '\u05A9': 'Tlisha Qtana',
    '\u059A': 'Yetiv',
    '\u059D': 'Geresh Muqdam',
    '\u05AD': 'Dechi',
    '\u05A0': 'Tlisha Gdola',  
    '\u05AB': 'Ole',
    '\u059C': 'Geresh',
    '\u059E': 'Gershayim',
    '\u05A8': 'Qadma',  
    '\u05AC': 'Illuy',
    '\u0593': 'Shalshelet',
    '\u05A4': 'Mahpakh',
    '\u05A5': 'Merkha',
    '\u05A6': 'Merkha Khfula',
    '\u0596': 'Tipcha',
    '\u05A3': 'Munnach',
}

def tag_word_with_cantillation(word, cantillation_marks):
    for name, mark in cantillation_marks.items():
        if mark in word:
            return (mark, name)  # Returns the word and the cantillation mark's name
    return (None, None)  # Returns the word with None if no mark is found

In [32]:
#render main text into TEI
def create_tei_xml(books):
    root = etree.Element('TEI', xmlns="http://www.tei-c.org/ns/1.0")
    teiHeader = etree.SubElement(root, 'teiHeader')
    # Add header elements as needed here...

    text_element = etree.SubElement(root, 'text')
    body = etree.SubElement(text_element, 'body')

    for book_name, chapters in books.items():
        book_div = etree.SubElement(body, 'div', type='book', n=book_name)
        for chapter_name, verses in chapters.items():
            chapter_div = etree.SubElement(book_div, 'div', type='chapter', n=chapter_name)
            for verse in verses:
                lg = etree.SubElement(chapter_div, 'lg', type='verse')
                process_verse(verse, lg)

    return etree.tostring(root, pretty_print=True, encoding='UTF-8', xml_declaration=True)

def process_verse(verse_text, chapter_div):
    # Regular expression to capture verses and dividers
    pattern = r'(:|\[פ\]|\[ס\])'
    parts = re.split(pattern, verse_text)

    verse_counter = 1
    for part in parts:
        if part in [":", "[פ]", "[ס]"]:
            # If the part is a divider, add it as a separate element
            divider = etree.SubElement(chapter_div, 'divider')
            divider.text = part
        else:
            # Process normal verse text
            if part.strip():  # ignore empty strings resulting from split
                verse_number = f"פסוק {verse_counter}"
                lg = etree.SubElement(chapter_div, 'lg', type='verse', n=verse_number)
                add_tokens_to_verse(part, lg)
                verse_counter += 1

def add_tokens_to_verse(verse, lg_element):
    tokens = verse.split()
    for token in tokens:
        # Handle special signs within the token
        process_word(token, lg_element)

def process_word(word, lg_element):
    # Check for the "׀" sign and tag it as a "טעם" type "פסק"
    if "׀" in word:
        taam_element = etree.SubElement(lg_element, 'taam', type='פסק')
        taam_element.text = "׀"
    else:
        # Process word for hyphen-like signs
        process_hyphenated_word(word, lg_element)

def process_hyphenated_word(word, lg_element):
    # Check if the word contains "־" and process accordingly
    if "־" in word:
        # Create the full word element without special signs
        full_word_element = etree.SubElement(lg_element, 'w')
        full_word_element.text = re.sub(r'\$[1-4]', '', word)
        # Process each part of the hyphenated word
        subwords = word.split("־")
        for subword in subwords:
            nested_word_element = etree.SubElement(full_word_element, 'nestedWord')
            # Remove special signs from the nested word and add them as nested elements
            nested_special_signs(subword, nested_word_element)
    else:
        # If not a hyphenated word, process normally
        word_element = etree.SubElement(lg_element, 'w')
        nested_special_signs(word, word_element)

def nested_special_signs(word, word_element):
    # First, tag the word with its cantillation mark if present
    mark, cantillation_name = tag_word_with_cantillation(word, cantillation_marks)
    if cantillation_name:
        # Create a subelement for the cantillation mark
        cantillation_element = etree.SubElement(word_element, 'taam', type=cantillation_name)
        cantillation_element.text = mark
    
    # Then, process the special signs within the word
    special_signs = re.finditer(r'\$[1-4]', word)
    start = 0
    for match in special_signs:
        before_sign = word[start:match.start()]
        if before_sign:
            word_element.text = (word_element.text or '') + before_sign
        
        sign_element = etree.SubElement(word_element, 'specialSign')
        sign_element.text = match.group()
        start = match.end()
    
    remaining_text = word[start:]
    if remaining_text:
        word_element.text = (word_element.text or '') + remaining_text

def tag_word_with_cantillation(word, cantillation_marks):
    for mark, names in cantillation_marks.items():
        if mark in word:
            # Assuming multiple names might be associated with a single mark, join them with a comma
            name = ', '.join(names) if isinstance(names, list) else names
            return (mark, name)  # Remove the mark from the word
    return (word, None)  # No cantillation mark found, return the word as is
            
def is_special_token(token):
    # Define your logic to identify a special token here
    # For example, if special tokens are enclosed in brackets:
    return token.startswith('[') and token.endswith(']')

# Example usage
tei_xml = create_tei_xml(books)
with open('output.tei.xml', 'wb') as file:
    file.write(tei_xml)


In [None]:
def nested_special_signs(word, word_element, cantillation_marks):
    # First, tag the word with its cantillation mark if present
    processed_word, cantillation_name = tag_word_with_cantillation(word, cantillation_marks)
    if cantillation_name:
        # Create a subelement for the cantillation mark
        cantillation_element = etree.SubElement(word_element, 'taam', type=cantillation_name)
        cantillation_element.text = processed_word
    
    # Then, process the special signs within the word
    special_signs = re.finditer(r'\$[1-4]', processed_word)
    start = 0
    for match in special_signs:
        before_sign = processed_word[start:match.start()]
        if before_sign:
            word_element.text = (word_element.text or '') + before_sign
        
        sign_element = etree.SubElement(word_element, 'specialSign')
        sign_element.text = match.group()
        start = match.end()
    
    remaining_text = processed_word[start:]
    if remaining_text:
        word_element.text = (word_element.text or '') + remaining_text

def tag_word_with_cantillation(word, cantillation_marks):
    for mark, names in cantillation_marks.items():
        if mark in word:
            # Assuming multiple names might be associated with a single mark, join them with a comma
            name = ', '.join(names) if isinstance(names, list) else names
            return (word.replace(mark, ''), name)  # Remove the mark from the word
    return (word, None)  # No cantillation mark found, return the word as is


In [25]:
def identify_cantillation(word, cantillation_marks):
    for mark, name in cantillation_marks.items():
        if mark in word:
            return name+' '+mark  # Return the name of the first cantillation mark found
    return "No cantillation mark found"  # Return this if no cantillation mark is present in the word

# Example word
example_word = "הָיָ֗ה"

# Let's identify the cantillation mark in the example word
cantillation_name = identify_cantillation(example_word, cantillation_marks)
cantillation_name


'Revia ֗'

In [None]:
########## render apparatus 3 ##############

In [None]:
#new logic
#1. split into lemma and app_entry. process each separately
# lemma: split into number range and lemma range. 
#numbers include \d+'-', lemmas need to include potential k\q attribute, and word number.
# app_entry: splits into mss., reading, comments, cross-reference.
# 
#to do: 
#1. get_verse, if verse is empty. same for get_witness (if its inside a comma)
#2. QA: "איסירם] 30 89 איסרם | 96 (pm) אייסרם, (sm) אייסירים | 150 (pm) אסירים" (when split on comma allow for witness completion)
#3. include more special signs ~ and include .. in things that could appear at the end of the hebrew reading

In [7]:
full_entries = [
"8 k ואמאסאך / q ואמאסך] 30 93 (sm) 96 150 (pm) q IV | 93 (pm) אמסך",
"8 שָׂך] 30 (pm) 93 (pm) 150 (pm) סךII IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))",
"6 עוד1] 30 (pm) >I II IV (similarly b. Pesaḥim 87bmss)",
"6 הדעת1] 93 (pm) 150 (pm) דעתII",
"14 זעקו] 30 (pm) יזעקו | 150 (pm) >",
"10 k שעריריה / q שערוריה] 30 (pm) 89 (pm) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) שערורהIV",
"2 ללבבם] 96 (pm) ללבם | 30 93 150 (pm) בלבבםI",
    "איסירם] 30 89 איסרם | 96 (pm) אייסרם, (sm) אייסירים | 150 (pm) אסירים",
    "7 נדמֶה] 93 נדמָה | 30 (pm) + אפרים",
    "10 ואסרם] 96 (pm) יאשרם | 150 ..על"
]

In [8]:
[process_full_entry(example) for example in full_entries]#[-1]#[0]['verses']


[({'verses': [8],
   'lemmas': [{'lemma': 'ואמאסאך', 'k': True},
    {'lemma': 'ואמאסך', 'q': True}]},
  [[({'Witnesses': [('', '30', ''),
       ('', '93', 'sm'),
       ('', '96', ''),
       ('', '150', 'pm')]},
     {'Reading': {'Reading': 'q  '}},
     {'Cross References': ['IV']})],
   [({'Witnesses': [('', '93', 'pm')]},
     {'Reading': {'Reading': 'אמסך'}},
     {'Cross References': []})]]),
 ({'verses': [8], 'lemmas': [{'lemma': 'שָׂך'}]},
  [[({'Witnesses': [('', '30', 'pm'), ('', '93', 'pm'), ('', '150', 'pm')]},
     {'Reading': {'Reading': 'סך  ',
       'Comment': '(See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))'}},
     {'Cross References': ['II', 'IV']})]]),
 ({'verses': [6], 'lemmas': [{'lemma': 'עוד', 'number': '1'}]},
  [[({'Witnesses': [('', '30', 'pm')]},
     {'Reading': {'Sigla': '>', 'Comment': '(similarly b. Pesaḥim 87bmss)'}},
     {'Cross References': ['I', 'II', 'IV']})]]),
 ({'verses': [6], 'lemmas': [{'lemma': 'הדעת', 'number': '1'}]},
  [[({'Witnesses'

In [3]:
def process_full_entry(text):
    lemma, part_entry = split_full_entry(text)
    lemma_dict = lemma_verse_processor(lemma)

    # Split part_entry by '|'
    if '|' in part_entry:
        entry_parts = part_entry.split('|')
    else:
        entry_parts = [part_entry]

    # Initialize a list to hold all processed parts
    processed_parts = []

    # Process each part separately
    for part in entry_parts:
        # Split part by ',' not inside parentheses
        sub_parts = split_on_comma_not_in_parentheses(part)

        # Process each sub-part and collect the results
        processed_sub_parts = [process_entry(sub_part) for sub_part in sub_parts]

        # Assuming you want to concatenate processed sub-parts for each part
        processed_parts.append(processed_sub_parts)

    # Flatten the list if necessary or keep it nested based on your requirement
    decoded_entries = processed_parts

    return lemma_dict, decoded_entries

def split_on_comma_not_in_parentheses(part):
    """
    Splits the string on ',' not inside parentheses.
    """
    sub_parts = []
    current_part = []
    paren_depth = 0  # Track depth of parentheses

    for char in part:
        if char == '(':
            paren_depth += 1
        elif char == ')':
            paren_depth -= 1
        elif char == ',' and paren_depth == 0:
            # At a top-level comma, split here
            sub_parts.append(''.join(current_part))
            current_part = []
            continue

        current_part.append(char)

    # Add the last part if there's any
    if current_part:
        sub_parts.append(''.join(current_part))

    return sub_parts


In [4]:
def split_full_entry(text):
    sliced_entry = text.split(sep=']')
    lemma, entry = sliced_entry
#         print(f"lemma: {lemma}")
#         print(f"entry: {entry}")
    return lemma, entry    

def lemma_verse_processor(text):
    # Simplified approach: first split into digits and lemmas
    # Regex to match the verse numbers at the beginning
    verse_regex = r'^(\d+(?:–\d+)?)\s'
    
    # Extract verses
    verses_match = re.match(verse_regex, text)
    verses = list(map(int, verses_match.group(1).split('–'))) if verses_match else []
    
    # Isolate lemmas part by removing the verses
    lemmas_part = text[len(verses_match.group(0)):].strip() if verses_match else text
    return {
        'verses': verses,
        'lemmas': process_lemma_with_range_and_diacritics(lemmas_part)
    }

# Function to process individual lemmas or ranges, after the split,
lemma_regex = r'(k|q)?\s*([^\d\s]+)(\d?\,?\d?)'#(\d+(?:–\d+)?)\s

def process_lemma_with_range_and_diacritics(lemma):
    # Adjust regex to include diacritical marks and punctuation within Hebrew words
    
    
    # Check for range indicated by "–" and process accordingly
    if "–" in lemma:
        from_lemma, to_lemma = lemma.split("–")
        return {
            'from': process_individual_lemma(from_lemma.strip()),
            'to': process_individual_lemma(to_lemma.strip())
        }

    # Split lemma if there are separate lemmas with "/"
    split_lemmas = re.split(r'\s*/\s*', lemma) if '/' in lemma else [lemma]
    
    processed_lemmas = []
    for split_lemma in split_lemmas:
        processed = process_individual_lemma(split_lemma)
        processed_lemmas.extend(processed)
    
    return processed_lemmas

def process_individual_lemma(individual_lemma):
    matches = re.findall(lemma_regex, individual_lemma)
    processed_lemmas = []
    for match in matches:
        prefix, word, number = match
        lemma_dict = {'lemma': word}
        if prefix: lemma_dict[prefix] = True
        if number: lemma_dict['number'] = (number)
        processed_lemmas.append(lemma_dict)
    return processed_lemmas

# processing functions for sub-units of app_entry, for which there is matching lemma and verse data processed above

def extract_cross_references(text): #extract cross-references
    # Regex to match some Roman numerals: sequences of "I"s followed by an optional "V"
    pattern = r'([I]*[V]?)'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Remove empty matches from the list
    found_numerals = [numeral for numeral in found_numerals if numeral]
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

def parse_witnesses(text): #process witnesses
    pattern = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d\.]+)?\)?', re.DOTALL|re.UNICODE)    
    #pattern = r'([^,\d]*?)?(\d+)\s?(\(([^\)]+)?\)?)?([\skq]?)+'
    parts = re.findall(pattern, text)
    return parts

def parse_comma_witnesses(text): #process witnesses within comma, i.e., consider it is only mentioned once
    pattern = re.compile(r'\s?([^\d]*?)?(\d+)?\s?\(?([^\)\d]+)?\)?', re.DOTALL|re.UNICODE)    
    #pattern = r'([^,\d]*?)?(\d+)\s?(\(([^\)]+)?\)?)?([\skq]?)+'
    parts = re.findall(pattern, text)
    return parts


def parse_reading_entry(entry):
    # Refined regex pattern
    pattern = r"""
        \s?(?P<Sigla>[+<>~.]?)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """
    
    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Extracting groups into a dictionary
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

#splitting entry into witnesses and reading (if only one group assign to witnesses)
def witness_reading_splitter(text):
    pattern = re.compile(r'(.*?)?([\+<>~]?\s?[kq\u0590-\u05FF]+)(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        pattern = re.compile(r'(.*?)([\+<>~])(.*)?', re.DOTALL)
        match = pattern.match(text)
        if match:
            return match.groups()  # Returns a tuple with the three parts
        else:
            return text
        return text  # No divider matching the pattern was found


def process_entry(entry):
    clean_entry, cross_references = extract_cross_references(entry)
    split_entry = witness_reading_splitter(clean_entry)
    if type(split_entry) is tuple:
        witnesses = {'Witnesses': parse_witnesses(split_entry[0])}
        if len(split_entry)==2:
            reading = parse_reading_entry(split_entry[1])
        else: #there are 3 groups:
            reading = parse_reading_entry(split_entry[1]+split_entry[2])
    else:
        witnesses = {'Witnesses': parse_witnesses(split_entry)}
        reading = ''
    return witnesses, {"Reading":reading}, {"Cross References":cross_references}

In [None]:
# iterate over full entries. get verse number from previous if needed. also get reading from lemma. and split on | ,

In [None]:
#process lemma:
#split into digits and lemmas. then process each separately

In [738]:
[process_full_entry(example) for example in full_entries][-1]#[0]['verses']


({'verses': [10], 'lemmas': [{'lemma': 'ואסרם'}]},
 [[({'Witnesses': [('', '96', 'pm')]},
    {'Reading': {'Reading': 'יאשרם '}},
    {'Cross References': []})],
  [({'Witnesses': [('', '150', '')]},
    {'Reading': {'Reading': 'על'}},
    {'Cross References': []})]])

In [731]:
# old function, didnt take the splitting into commas consideration
# def process_full_entry(text):
#     lemma, part_entry = split_full_entry(text)
#     lemma_dict = lemma_verse_processor(lemma)
# #     if len(lemma_dict['verses'])==0: #get verse from previous entry
# #         lemma_dict['verses'] = 
        
#     #entry_units = split_entry_units # splits on | and ,
#     #for entry in entry_units:
#     decoded_entry = process_entry(part_entry)
#     return lemma_dict, decoded_entry

In [639]:
# processing functions for sub-units of app_entry, for which there is matching lemma and verse data processed above

def remove_and_list_roman_numerals(text): #extract cross-references
    # Regex to match some Roman numerals: sequences of "I"s followed by an optional "V"
    pattern = r'([I]*[V]?)'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Remove empty matches from the list
    found_numerals = [numeral for numeral in found_numerals if numeral]
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

def custom_split_string(text): #process witnesses
    pattern = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d]+)?\)?', re.DOTALL|re.UNICODE)    
    #pattern = r'([^,\d]*?)?(\d+)\s?(\(([^\)]+)?\)?)?([\skq]?)+'
    parts = re.findall(pattern, text)
    return parts

def parse_reading_entry(entry):
    # Refined regex pattern
    pattern = r"""
        \s?(?P<Sigla>[+<>~]?)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """
    
    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Extracting groups into a dictionary
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

#splitting entry into witnesses and reading (if only one group assign to witnesses)
def split_string(text):
    pattern = re.compile(r'(.*?)?([\+<>]?\s?[kq\u0590-\u05FF]+)(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        pattern = re.compile(r'(.*?)([\+<>])(.*)?', re.DOTALL)
        match = pattern.match(text)
        if match:
            return match.groups()  # Returns a tuple with the three parts
        else:
            return text
        return text  # No divider matching the pattern was found


def process_entry(entry):
    clean_entry, cross_references = remove_and_list_roman_numerals(entry)
    split_entry = split_string(clean_entry)
    if type(split_entry) is tuple:
        witnesses = {'Witnesses': custom_split_string(split_entry[0])}
        if len(split_entry)==2:
            reading = parse_reading_entry(split_entry[1])
        else: #there are 3 groups:
            reading = parse_reading_entry(split_entry[1]+split_entry[2])
    else:
        witnesses = {'Witnesses': custom_split_string(split_entry)}
        reading = ''
    return witnesses, {"Reading":reading}, {"Cross References":cross_references}


# for entry in sample_texts:
#     print(f"entry: {entry}")
#     clean_entry, cross_references = remove_and_list_roman_numerals(entry)
#     split_entry = split_string(clean_entry)
#     if type(split_entry) is tuple:
#         witnesses = {'witnesses': custom_split_string(split_entry[0])}
#         reading = parse_reading_entry(split_entry[1])
#         print(f"witnesses: {witnesses}")
#         print(f"reading: {reading}")
#     else:
#         witnesses = {'witnesses': custom_split_string(split_entry)}
#         print(f"witnesses: {witnesses}")
#     print(f"references: {cross_references}")


In [640]:
sample_texts = [
    "96 (non voc)",
    "30 (pm) 93 (pm) 150 (pm) + סךII IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))",
    "93 (non voc) 96 150 (non voc) + את",
    "30 (pm) >",
    "30 + לי (non voc)I II",
    "30 (pm) >I II IV (similarly b. Pesaḥim 87bmss)",
    "93 (pm) ביהושעIV (similarly PesiqtaR 33 (153b))",
    "96 >I II IV",
    "130 k",
    "G-B Msr 34 k ממני / q ממנוIV",
    "93 כד..",
    "150 ..דברים",
    "G-B Eb 94 ותָעָד (understood as \עוד (rather than \עדי))",
    "30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"
]

In [643]:
process_entry(sample_texts[6])

({'Witnesses': [('', '93', 'pm')]},
 {'Reading': {'Reading': 'ביהושע ',
   'Comment': '(similarly PesiqtaR 33 (153b))'}},
 {'Cross References': ['IV']})

In [638]:
clean_entry, cross_references = remove_and_list_roman_numerals(sample_texts[6])
split_entry = split_string(clean_entry)
if type(split_entry) is tuple:
    witnesses = {'Witnesses': custom_split_string(split_entry[0])}
    if len(split_entry)==2:
        reading = parse_reading_entry(split_entry[1])
    else: #there are 3 groups:
        reading = parse_reading_entry(split_entry[1]+split_entry[2])

reading

{'Reading': 'ביהושע ', 'Comment': '(similarly PesiqtaR 33 (153b))'}

In [462]:
def custom_split_string(text): #process witnesses
    pattern = re.compile(r'\s?([^\d]*?)?(\d+)\s?\(?([^\)\d]+)?\)?', re.DOTALL|re.UNICODE)    
    #pattern = r'([^,\d]*?)?(\d+)\s?(\(([^\)]+)?\)?)?([\skq]?)+'
    parts = re.findall(pattern, text)
    return parts
# "96 (non voc)",
# "30 (pm) 93 (pm) 150 (pm)
test_witness = "30 (pm) 93 150 (pm)"
custom_split_string(test_witness)


[('', '30', 'pm'), ('', '93', ''), ('', '150', 'pm')]

In [623]:
#try parsing single entry app, splitting into witnesses and reading (if only one group assign to witnesses)
def split_string(text):
    pattern = re.compile(r'(.*?)?([\+<>]?\s?[kq\u0590-\u05FF]+)(.*)?', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        pattern = re.compile(r'(.*?)([\+<>])(.*)?', re.DOTALL)
        match = pattern.match(text)
        if match:
            return match.groups()  # Returns a tuple with the three parts
        else:
            return text
        return text  # No divider matching the pattern was found

# Example usage
processed_sample = [split_string(text) for text in sample_texts]

print(processed_sample)


['96 (non voc)', ('30 (pm) 93 (pm) 150 (pm) ', '+ סך', 'II IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))'), ('93 (non voc) 96 150 (non voc) ', '+ את', ''), ('30 (pm) ', '>', ''), ('30 ', '+ לי', ' (non voc)I II'), ('30 (pm) ', '>', 'I II IV (similarly b. Pesaḥim 87bmss)'), ('93 (pm)', ' ביהושע', 'IV (similarly PesiqtaR 33 (153b))'), ('96 ', '>', 'I II IV'), ('130', ' k', ''), ('G-B Msr 34', ' k', ' ממני / q ממנוIV'), ('93', ' כד', '..'), ('150 ..', 'דברים', ''), ('G-B Eb 94', ' ותָעָד', ' (understood as \\עוד (rather than \\עדי))'), ('30 89 (sm) 93 (pm) 150 (non voc) ', '+ כי', 'I II IV')]


In [619]:
#function for parsing the reading+comment (assumes witnesses and cross references have been removed)
def parse_reading_entry(entry):
    # Refined regex pattern
    pattern = r"""
        \s?(?P<Sigla>[+<>~]?)                         # Captures special sigla
        \s*
        (?P<Reading>(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*    # Hebrew reading, including 'k', 'q'
                   (?:/\s(?:[kq]?\s?)?[\u0590-\u05FF\uFB1D-\uFB4F\s.]*)?)  # Allows for 'k'/'q' followed by Hebrew, separated by '/'
        \s*
        (?P<Comment>\(.*\))?                     # Captures comments
    """
    
    # Compiling regex with VERBOSE flag for better readability and explanation
    compiled_pattern = re.compile(pattern, re.VERBOSE)
    match = compiled_pattern.match(entry)

    if not match:
        return None  # Return None if no match is found

    # Extracting groups into a dictionary
    parsed_entry = {k: v for k, v in match.groupdict().items() if v}

    return parsed_entry

# Process the sample reading texts with the refined function
parse_reading_entry = [parse_reading_entry(text) for text in sample_reading_texts]

parse_reading_entry


[{'Reading': 'סך ',
  'Comment': '(See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))'},
 {'Sigla': '+', 'Reading': 'את'},
 {'Sigla': '>'},
 {'Sigla': '+', 'Reading': 'לי ', 'Comment': '(non voc)'},
 {'Sigla': '>', 'Comment': '(similarly b. Pesaḥim 87bmss)'},
 {'Reading': 'ביהושע ', 'Comment': '(similarly PesiqtaR 33 (153b))'},
 {},
 {'Reading': 'k'},
 {'Reading': 'k ממני / q ממנו'},
 {'Reading': 'כד..'},
 {'Reading': '..דברים'},
 {'Reading': 'נַחֵם ',
  'Comment': '(taken as infinitive, see Yeivin, Babylonian Vocalization, 1:542)'},
 {'Reading': 'חכֵם ', 'Comment': '(!)'}]

In [587]:
import re

def custom_string_processor(input_string, regex_pattern):
    # Helper function to apply regex and extract groups
    def apply_regex_and_extract(text):
        matches = re.finditer(regex_pattern, text)
        results = []
        for match in matches:
            results.append({
                'witnesses': match.group(1).strip(),
                'reading': match.group(2).strip(),
                'comments': match.group(3).strip() if match.group(3) else ''
            })
        return results

    # Process splits with "|", then ","
    def process_splits(text, delimiter):
        parts = text.split(delimiter)
        processed_parts = []
        for part in parts:
            # Apply regex to each part
            processed = apply_regex_and_extract(part)
            if processed:
                processed_parts.extend(processed)
        return processed_parts

    # Start processing
    processed_result = process_splits(input_string, '|')  # Start with the highest level of split

    return processed_result

# Custom regex pattern as provided
custom_regex = r'^(.*?)([\+<~>]?\s?[\u0590-\u05FF]+.*?)(.*)$'

# Test with the provided sample input
sample_input = "G-B msr. 30 (pm) G-A 89 (sm?) 150 (non voc) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) + שערורהIV II (bla bla (f)) | 150 >"
processed_sample = [custom_string_processor(text, custom_regex) for text in sample_texts]


print(processed_sample)

def split_string(text):
    pattern = re.compile(r'^(.*?)([\+<>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        return None  # No divider matching the pattern was found

# Example usage
# text = "G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV (bla bla (f))"#"30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"#"93 (pm) < ביהושעIV (similarly PesiqtaR 33 (153b))"
# split_parts = split_string(text)
# if split_parts:
#     print("witnesses:", split_parts[0])
#     print("reading:", split_parts[1])
#     print("After dividers:", split_parts[2])
# else:
#     print("No dividers found.")



[[], [{'witnesses': '30 (pm) 93 (pm) 150 (pm)', 'reading': '+ סך', 'comments': 'II IV (See b. R.HaŠanamss 23b, (LamR) Buber 1:16 (40b))'}], [{'witnesses': '93 (non voc) 96 150 (non voc)', 'reading': '+ את', 'comments': ''}], [], [{'witnesses': '30', 'reading': '+ לי', 'comments': '(non voc)I II'}], [], [{'witnesses': '93 (pm)', 'reading': 'ביהושע', 'comments': 'IV (similarly PesiqtaR 33 (153b))'}], [], [], [{'witnesses': 'G-B Msr 34 k', 'reading': 'ממני', 'comments': '/ q ממנוIV'}], [{'witnesses': '93', 'reading': 'כד', 'comments': '..'}], [{'witnesses': '150 ..', 'reading': 'דברים', 'comments': ''}], [{'witnesses': 'G-B Eb 94', 'reading': 'ותָעָד', 'comments': '(understood as \\עוד (rather than \\עדי))'}], [{'witnesses': '30 89 (sm) 93 (pm) 150 (non voc)', 'reading': '+ כי', 'comments': 'I II IV'}]]


In [37]:

def parse_apparatus_entry(entry):
    """Parse an apparatus entry into lemma(s) and content."""
    parts = entry.split(']')
    lemmas_contents = []
    for part in parts:
        if part.strip():
            lemma, content = part.split('[', 1) if '[' in part else (part, '')
            lemmas_contents.append((lemma.strip(), content.strip()))
    return lemmas_contents

def create_tei_document(apparatus_lines):
    """Create a TEI document from apparatus lines."""
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    NSMAP = {"tei": TEI_NAMESPACE}
    
    tei_root = ET.Element(TEI+"TEI", nsmap=NSMAP)
    tei_header = ET.SubElement(tei_root, TEI+"teiHeader")
    text = ET.SubElement(tei_root, TEI+"text")
    body = ET.SubElement(text, TEI+"body")
    current_chapter = None
    last_verse = None
    
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('Chapter'):
            chapter_number = line.split(' ')[1]
            current_chapter = ET.SubElement(body, TEI+"div", type="chapter", n=chapter_number)
            last_verse = None
            continue
        # Use regex to check if the line starts with a verse number and capture it
        match = re.match(r"^(\d+)\s*(.*)", line)
        if match:
            verse_number, entry = match.groups()
            last_verse = verse_number
        else:
            entry = line
            verse_number = last_verse
        
        if current_chapter is not None and verse_number:
            lemmas_contents = parse_apparatus_entry(entry)
            for lemma, content in lemmas_contents:
                app = ET.SubElement(current_chapter, TEI+"app")
                lem = ET.SubElement(app, TEI+"lem", n=verse_number)
                lem.text = lemma
                if content:
                    rdg = ET.SubElement(app, TEI+"rdg")
                    rdg.text = content

    return ET.ElementTree(tei_root)

def save_tei_file(tree, filename):
    """Save the TEI XML tree to a file."""
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml", short_empty_elements=True)



# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


TEI document has been saved to apparatus_tei.xml.


In [47]:
import xml.etree.ElementTree as ET
import re

def create_tei_document(apparatus_lines):
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    
    tei_root = ET.Element(TEI + "TEI", xmlns=TEI_NAMESPACE)
    tei_header = ET.SubElement(tei_root, TEI + "teiHeader")
    text = ET.SubElement(tei_root, TEI + "text")
    body = ET.SubElement(text, TEI + "body")
    current_chapter = None
    last_verse_number = None
    
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith('Chapter'):
            chapter_number = line.split(' ')[1].strip()
            current_chapter = ET.SubElement(body, TEI + "div", type="chapter", n=chapter_number)
        else:
            # Attempt to extract verse number and lemma content
            parts = re.match(r"^(\d+)\s*(.*)", line)
            if parts:
                verse_number, remainder = parts.groups()
                last_verse_number = verse_number  # Update last verse number with current
                
                # Further split to separate lemma from variants, if present
                lemma_section, variants_section = remainder.split(']', 1) if ']' in remainder else (remainder, "")
                lemma_section = lemma_section.strip()
                variants_section = variants_section.strip()

                if current_chapter is not None and verse_number:
                    # Create an apparatus entry for the lemma
                    app = ET.SubElement(current_chapter, TEI + "app")
                    lem = ET.SubElement(app, TEI + "lem", n=verse_number)
                    lem.text = lemma_section
                    
                    # Add variant readings if present
                    if variants_section:
                        rdg = ET.SubElement(app, TEI + "rdg")
                        rdg.text = variants_section
            else:
                print(f"Line does not conform to expected format: {line}")

    return ET.ElementTree(tei_root)

def save_tei_file(tree, filename):
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml", short_empty_elements=True)

# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


Line does not conform to expected format: ﻿App III: Hosea
Line does not conform to expected format: יחזקיה] 30 93 (pm) 96 יחזקיהו
Line does not conform to expected format: ירבעם בן] 30 + נבט (non voc)
Line does not conform to expected format: לו] 96 >I  II IV
Line does not conform to expected format: ממלכוּת] 96 ממלכוֹת
Line does not conform to expected format: יזרעאל] 150 ישראל (parall; but 150-Tg: יזרעאל)
Line does not conform to expected format: כי] 93 (pm) + את
Line does not conform to expected format: אוסיף] 150 (pm) >
Line does not conform to expected format: את] 93 (pm) >
Line does not conform to expected format: בסוסים] 96 ובסוסיםI IV
Line does not conform to expected format: אשר2] 96 + לא
Line does not conform to expected format: אחד] 30 (pm) 150 (pm) >
Line does not conform to expected format: והצגתיה] 93 (pm) + כיום ערומה והצגתיה
Line does not conform to expected format: ושתִּה] 150 (pm) ושמתיה
Line does not conform to expected format: כי] 150 (non voc) + כי
Line does not co

In [10]:
import xml.etree.ElementTree as ET
import re


def create_apparatus_entry(verse_number, content, TEI):
    """Create TEI element for an apparatus entry."""
    app = ET.Element(TEI + "app")
    
    # Extract lemma text and the rest (witnesses, variant reading, and comments)
    lemma_text, _, rest = content.partition(']')
    lem = ET.SubElement(app, TEI + "lem")
    lem.text = lemma_text.strip()
    
    # Extract comments
    comments = re.findall(r'\((.*?)\)', rest)
    for comment in comments:
        note = ET.SubElement(app, TEI + "note")
        note.text = comment
    
    # Remove comments from rest for further processing
    rest = re.sub(r'\(.*?\)', '', rest).strip()
    
    # Extract and process witnesses and cross-references
    if rest:
        rdg = ET.SubElement(app, TEI + "rdg")
        witnesses, _, variant_reading = rest.partition(' ')
        if witnesses:
            rdg.set('wit', witnesses.strip())
        if variant_reading:
            rdg.text = variant_reading.strip()
        
        # Extract cross-references, assuming they are indicated by Roman numerals at the start
        cross_refs = re.findall(r'\bI{1,3}V?|\bIV', rest)
        for ref in cross_refs:
            ref_element = ET.SubElement(rdg, TEI + "ref")
            ref_element.set('target', '#' + ref)  # Assuming target IDs are prefixed with '#'
            ref_element.text = "See apparatus entry " + ref
    
    return app

def create_tei_document(apparatus_lines):
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    TEI = "{%s}" % TEI_NAMESPACE
    root = ET.Element(TEI + "TEI", xmlns=TEI_NAMESPACE)
    header = ET.SubElement(root, TEI + "teiHeader")
    text = ET.SubElement(root, TEI + "text")
    body = ET.SubElement(text, TEI + "body")
    div = ET.SubElement(body, TEI + "div")
    
    last_verse_number = None
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        
        # Determine if the line starts with a verse number
        match = re.match(r'^(\d+)', line)
        if match:
            last_verse_number = match.group(1)
            content = line[len(last_verse_number):].strip()
        else:
            content = line
        
        if last_verse_number:
            entry = create_apparatus_entry(last_verse_number, content, TEI)
            div.append(entry)
    
    return ET.ElementTree(root)

def save_tei_file(tree, filename):
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml")


# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


TEI document has been saved to apparatus_tei.xml.


In [232]:
#split entry into witnesses, reading, and comments 

def split_string(text):
    pattern = re.compile(r'^(.*?)([\+<>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()  # Returns a tuple with the three parts
    else:
        return None  # No divider matching the pattern was found

# Example usage
text = "G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV (bla bla (f))"#"30 89 (sm) 93 (pm) 150 (non voc) + כיI II IV"#"93 (pm) < ביהושעIV (similarly PesiqtaR 33 (153b))"
split_parts = split_string(text)
if split_parts:
    print("witnesses:", split_parts[0])
    print("reading:", split_parts[1])
    print("After dividers:", split_parts[2])
else:
    print("No dividers found.")


witnesses: G-B msr. 30 (pm) G-A 89 (pm?) 150 (sm) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) 
reading: > שערורה
After dividers: IV (bla bla (f))


[('G-B msr. ', '30', 'pm'), ('G-A ', '89', 'pm?'), ('', '150', 'non voc'), ('', '30', 'sm'), ('', '89', 'sm'), ('', '93', 'sm'), ('MS-G ', '150', 'pm')]


In [161]:
#parse comments
import re

def remove_and_list_roman_numerals(text):
    # Regex to match some Roman numerals: sequences of "I"s followed by an optional "V"
    pattern = r'([I]*[V]?)'
    # Find all occurrences of the pattern
    found_numerals = re.findall(pattern, text)
    # Remove empty matches from the list
    found_numerals = [numeral for numeral in found_numerals if numeral]
    # Replace found Roman numerals with an empty string
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

# Example usage
text = "II IV (See b. R.HaŠanamss 23b, LamR Buber 1:16 (40b))"
result_text, numerals_found = remove_and_list_roman_numerals(text)
print("Modified text:", result_text.strip())
print("Numerals found:", numerals_found)



Modified text: (See b. R.HaŠanamss 23b, LamR Buber 1:16 (40b))
Numerals found: ['II', 'IV']


In [178]:
def process_entry(entry):
    split_parts = split_string(entry)
    if split_parts is None:
        return None
    
    witnesses, reading, comments = split_parts

    structured_entry = {
        'witnesses': [],
        'reading': reading,
        'comments': '',
        'cross_references': []
    }

    for part in custom_split_string(witnesses):
        # Assuming part[1] contains the witness number and part[0], part[2], part[3] contain additional info
        witness_info = {
            'n': part[1],
            'text': f"{part[0]}{part[1]} {part[2].strip()}{part[3]}"
        }
        structured_entry['witnesses'].append(witness_info)

    comments_text, numerals_found = remove_and_list_roman_numerals(comments)
    structured_entry['comments'] = comments_text
    structured_entry['cross_references'] = numerals_found

    return structured_entry

def create_apparatus_entry(verse_number, content, TEI):
    """Create TEI element for an apparatus entry."""
    TEI_ns = {'tei': TEI}  # Define the namespace dictionary if needed
    app = ET.Element(f"{{{TEI}}}app")  # Using namespace in the tag
    
    # Extract lemma text and the rest (witnesses, variant reading, and comments)
    lemma_text, _, rest = content.partition(']')
    lem = ET.SubElement(app, f"{{{TEI}}}lem")
    lem.text = lemma_text.strip('[] ')

    structured_entry = process_entry(rest)
    if not structured_entry:
        return None

    for witness in structured_entry['witnesses']:
        wit_element = ET.SubElement(app, f"{{{TEI}}}wit", {'n': witness['n']})
        wit_element.text = witness['text']
    
    rdg_element = ET.SubElement(app, f"{{{TEI}}}rdg")
    rdg_element.text = structured_entry['reading']

    if structured_entry['comments']:
        comment_element = ET.SubElement(app, f"{{{TEI}}}note")
        comment_element.text = structured_entry['comments']

    for ref in structured_entry['cross_references']:
        ref_element = ET.SubElement(app, f"{{{TEI}}}ref")
        ref_element.text = ref

    return app


In [181]:
import xml.etree.ElementTree as ET
import re


def create_tei_document(apparatus_lines):
    TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
    ET.register_namespace('', TEI_NAMESPACE)  # Register the default namespace

    # Create the root element without redundantly specifying the xmlns attribute
    root = ET.Element("{%s}TEI" % TEI_NAMESPACE)
    header = ET.SubElement(root, "{%s}teiHeader" % TEI_NAMESPACE)
    text = ET.SubElement(root, "{%s}text" % TEI_NAMESPACE)
    body = ET.SubElement(text, "{%s}body" % TEI_NAMESPACE)
    div = ET.SubElement(body, "{%s}div" % TEI_NAMESPACE)
    
    last_verse_number = None
    for line in apparatus_lines:
        line = line.strip()
        if not line:
            continue
        
        # Determine if the line starts with a verse number
        match = re.match(r'^(\d+)', line)
        if match:
            last_verse_number = match.group(1)
            content = line[len(last_verse_number):].strip()
        else:
            content = line
        
        if last_verse_number:
            entry = create_apparatus_entry(last_verse_number, content, TEI_NAMESPACE)
            if entry is not None:  # Ensure entry creation was successful
                div.append(entry)
    
    return ET.ElementTree(root)

def save_tei_file(tree, filename):
    tree.write(filename, encoding="UTF-8", xml_declaration=True, method="xml")


# Replace 'your_input_file.txt' with the path to your actual input file
input_file = '01 Hosea App III - מתוקן.txt'
output_file = 'apparatus_tei.xml'

with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

tei_tree = create_tei_document(lines)
save_tei_file(tei_tree, output_file)

print(f"TEI document has been saved to {output_file}.")


TEI document has been saved to apparatus_tei.xml.


In [227]:
import re

def split_string(text):
    pattern = re.compile(r'^(.*?)([\+<~>]?\s?[\u0590-\u05FF]+.*?)(.*)$', re.DOTALL)
    match = pattern.match(text)
    if match:
        return match.groups()
    else:
        return None

def custom_split_string(text):
    pattern = re.compile(r'([^,\d]*?)?(\d+)\s?(\([^\)]+\)?)?([\skq]*)?', re.DOTALL|re.UNICODE)
    parts = re.findall(pattern, text)
    return parts

def remove_and_list_roman_numerals(text):
    pattern = r'([I]*[V]?)'
    found_numerals = re.findall(pattern, text)
    found_numerals = [numeral for numeral in found_numerals if numeral]
    result_text = re.sub(pattern, '', text)
    return result_text, found_numerals

# def process_entry(entry):
#     split_parts = split_string(entry)
#     if split_parts is None:
#         return "Unable to process entry: No valid dividers found."
    
#     witnesses, reading, comments = split_parts

#     witness_entries = []
#     for part in custom_split_string(witnesses):
#         witness_entry = f'<witness n="{part[1]}">{part[0]}{part[1]} {part[2].strip()}{part[3]}</witness>'
#         witness_entries.append(witness_entry)
#     witnesses_tagged = "\n".join(witness_entries)

#     reading_tagged = f'<reading>{reading}</reading>'

#     comments_text, numerals_found = remove_and_list_roman_numerals(comments)
#     comments_tagged = f'<comment>{comments_text}</comment>'
#     cross_references = "\n".join([f'<ref>{numeral}</ref>' for numeral in numerals_found])

#     # Combine all parts, placing cross_references outside the comment
#     tei_entry = f"{witnesses_tagged}\n{reading_tagged}\n{comments_tagged}\n{cross_references}"
#     return tei_entry

# Example usage
entry ="G-B msr. 30 (pm) G-A 89 (pm?) 150 (non voc) k, 30 (sm) 89 (sm) 93 (sm) 96 150 (pm) q, 93 (pm) > שערורהIV II (bla bla (f))"
processed_entry = process_entry(entry)
print(processed_entry)


{'witnesses': [{'n': '30', 'text': 'G-B msr. 30 (pm) '}, {'n': '89', 'text': 'G-A 89 (pm?) '}, {'n': '150', 'text': '150 (non voc) k'}, {'n': '30', 'text': ' 30 (sm) '}, {'n': '89', 'text': '89 (sm) '}, {'n': '93', 'text': '93 (sm) '}, {'n': '96', 'text': '96 '}, {'n': '150', 'text': '150 (pm) q'}, {'n': '93', 'text': ' 93 (pm) '}], 'reading': '> שערורה', 'comments': '  (bla bla (f))', 'cross_references': ['IV', 'II']}


In [60]:
rest = "93 (pm) + ביהושעIV (similarly PesiqtaR 33 (153b))"
rest

'93 (pm) + ביהושעIV (similarly PesiqtaR 33 (153b))'

In [None]:
#### 

In [None]:
####### old stuff

In [41]:

def read_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def strip_non_hebrew(word):
    normalized_word = unicodedata.normalize('NFD', word)
    stripped_word = ''.join(re.findall(r'[\u05D0-\u05EA]', normalized_word))
    return unicodedata.normalize('NFC', stripped_word)

def process_word(token, verse_id, word_id, parent_element):
    parts = token.split('־')
    pe_count = 1  # Counter for 'פ' tags

    for part in parts:
        w = ET.SubElement(parent_element, 'w', id=f'verse{verse_id}_word{word_id}')

        alphabetic = strip_non_hebrew(part)
        non_alphabetic = ''.join(re.findall(r'[^\u05D0-\u05EA]', part))

        original = ET.SubElement(w, 'original')
        original.text = part
        stripped = ET.SubElement(w, 'stripped')
        stripped.text = alphabetic
        punctuation = ET.SubElement(w, 'punctuation')
        punctuation.text = non_alphabetic

        if "פ" in part:
            pe_tag = ET.SubElement(w, 'pe', id=f'verse{verse_id}_pe{pe_count}')
            pe_tag.text = "פ"
            pe_count += 1
        
        word_id += 1
    return word_id

def encode_tei_hebrew_word_details_enhanced(file_path, output_file):
    text = read_text_from_file(file_path)
    TEI = ET.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    text_element = ET.SubElement(TEI, 'text')
    body = ET.SubElement(text_element, 'body')

    chapter_id = 1
    verse_id = 1

    chapters = text.split('פרק')
    for chapter in chapters[1:]:
        div = ET.SubElement(body, 'div', type='chapter', id=f'chapter{chapter_id}')
        chapter_id += 1

        verses = re.split(r'(\[\פ\]|:)', chapter)
        for verse in verses:
            if verse.strip() and verse not in ['[פ]', ':']:
                p = ET.SubElement(div, 'p', type='verse', id=f'verse{verse_id}')
                word_id = 1

                tokens = verse.strip().split()
                for token in tokens:
                    word_id = process_word(token, verse_id, word_id, p)

                verse_id += 1

    tree = ET.ElementTree(TEI)
    with open(output_file, "w", encoding="utf-8") as f:
        tree.write(f, encoding="unicode")

# Specify the file paths
file_path = 'file.txt'  # Replace with your input file path
output_file = 'tei_hebrew_output_enhanced.xml'  # Replace with your output file path

# Run the function
encode_tei_hebrew_word_details_enhanced(file_path, output_file)
output_file

'tei_hebrew_output_enhanced.xml'

In [None]:
[' ', '"', '$', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', 'E', 'I', 'T', '_', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '֑', '֔', '֕', '֖', '֗', '֙', '֛', '֜', '֞', '֣', '֤', '֥', '֨', '֩', 'ְ', 'ֱ', 'ֲ', 'ִ', 'ֵ', 'ֶ', 'ַ', 'ָ', 'ֹ', 'ֻ', 'ּ', 'ֽ', '׀', 'ׁ', 'ׂ']

In [51]:
def extract_consecutive_non_hebrew_groups(file_path):
    text = read_text_from_file(file_path)
    non_hebrew_groups = set()

    # Using a regular expression to find sequences of non-Hebrew characters
    pattern = re.compile(r'([^\u05D0-\u05EA]{,2})')
    matches = pattern.findall(unicodedata.normalize('NFD', text))

    for match in matches:
        non_hebrew_groups.add(match.strip())

    return non_hebrew_groups

# Extract and print groups of consecutive non-Hebrew characters
file_path = 'file.txt'

consecutive_non_hebrew_groups = extract_consecutive_non_hebrew_groups(file_path)
print(sorted(consecutive_non_hebrew_groups))



['', '$', '$1', '$2', '$4', '2', ':', '[', ']', '֑', '֔', '֕', '֖', '֗', '֙', '֜', '֣', '֤', '֥', '֥$', '֨', '֩', 'ְ', 'ְ$', 'ְ֙', 'ְּ', 'ְׁ', 'ְׂ', 'ֱ', 'ֲ', 'ִ', 'ִ$', 'ִ֔', 'ִ֖', 'ִ֜', 'ִ֨', 'ִּ', 'ִֽ', 'ִׁ', 'ֵ', 'ֵ$', 'ֵ֔', 'ֵ֖', 'ֵ֗', 'ֵ֛', 'ֵ֣', 'ֵ֤', 'ֵ֨', 'ֵּ', 'ֵֽ', 'ֵׁ', 'ֶ', 'ֶ֑', 'ֶ֙', 'ֶ֣', 'ֶ֤', 'ֶ֥', 'ֶּ', 'ֶֽ', 'ֶׁ', 'ַ', 'ַ֗', 'ַ֙', 'ַּ', 'ַׁ', 'ָ', 'ָ֑', 'ָ֔', 'ָ֖', 'ָ֗', 'ָ֛', 'ָ֜', 'ָ֞', 'ָ֣', 'ָ֥', 'ָ֨', 'ָּ', 'ָֽ', 'ֹ', 'ֹ֖', 'ֹ֣', 'ֹ֤', 'ֹ֨', 'ֹּ', 'ֹׂ', 'ֻ', 'ּ', 'ּ֣', 'ֽ', '־', '־$', '׀', 'ׁ', 'ׂ֖', '\ufeff']
