In [None]:
import re
from lxml import etree

In [4]:
# Hebrew letter to numerical value (based on gematria)
hebrew_to_num = {
    'א': '1', 'ב': '2', 'ג': '3', 'ד': '4', 'ה': '5', 'ו': '6', 'ז': '7', 'ח': '8', 'ט': '9', 'י': '10',
    'כ': '20', 'ך': '20', 'ל': '30', 'מ': '40', 'ם': '40', 'נ': '50', 'ן': '50', 'ס': '60', 'ע': '70',
    'פ': '80', 'ף': '80', 'צ': '90', 'ץ': '90', 'ק': '100', 'ר': '200', 'ש': '300', 'ת': '400'
}

def convert_hebrew_to_num(hebrew_str):
    # Convert each Hebrew character to its corresponding number
    num_str = ''.join(hebrew_to_num.get(char, char) for char in hebrew_str)
    return num_str


In [8]:


def create_tei_xml_from_text(text_lines):
    root = etree.Element('TEI', xmlns="http://www.tei-c.org/ns/1.0")
    teiHeader = create_tei_header(root)
    text_element = etree.SubElement(root, 'text')
    body = etree.SubElement(text_element, 'body')

    current_chapter = None
    current_verse = None
    verse_counter = 1

    for line in text_lines:
        line = line.strip()
        print(f"Processing line: {line}")

        if line.startswith('פרק'):
            if current_chapter:
                close_chapter(current_chapter)
            chapter_letter = line.split()[1]
            chapter_num = convert_hebrew_to_num(chapter_letter)
            current_chapter = etree.SubElement(body, 'div', type='chapter', n=chapter_num)
            verse_counter = 1
            print(f"Started chapter {chapter_num}")
        elif line:
            # Ensure we have a valid chapter before processing verses
            if current_chapter is None:
                print("Error: No chapter started before encountering text.")
                continue
            # Process each line for verses and markers
            process_text(line, current_chapter, chapter_num, verse_counter)
            verse_counter += 1

    if current_chapter:
        close_chapter(current_chapter)

    return etree.tostring(root, pretty_print=True, encoding='UTF-8', xml_declaration=True)

def create_tei_header(root):
    teiHeader = etree.SubElement(root, 'teiHeader')
    fileDesc = etree.SubElement(teiHeader, 'fileDesc')
    titleStmt = etree.SubElement(fileDesc, 'titleStmt')
    title = etree.SubElement(titleStmt, 'title')
    title.text = "Book of Hosea"
    publicationStmt = etree.SubElement(titleStmt, 'publicationStmt')
    publisher = etree.SubElement(publicationStmt, 'publisher')
    publisher.text = "Digital Edition"
    date = etree.SubElement(publicationStmt, 'date')
    date.text = "2024"
    sourceDesc = etree.SubElement(fileDesc, 'sourceDesc')
    source = etree.SubElement(sourceDesc, 'source')
    source.text = "Manuscript"
    return teiHeader

def close_chapter(chapter_element):
    print(f"Closed chapter {chapter_element.get('n')}")

def process_text(text, current_chapter, chapter_num, verse_counter):
    global word_counter  # This keeps track of the word within the verse

    # Reset the word counter for each new verse
    word_counter = 0  # Restart the word counter for each verse
    
    # Split the text into words and special markers
    parts = re.split(r'(\s+|:|\[פ\]|\[ס\]|\$[1]|־|׀)', text)

    current_verse = etree.SubElement(current_chapter, 'lg', type='verse', n=f'{chapter_num}:{verse_counter}')
    print(f"Started verse {verse_counter} in chapter {chapter_num}")

    for part in parts:
        part = part.strip()  # Remove unnecessary whitespace
        if not part:
            continue  # Skip empty parts

        if part in [':', '[פ]', '[ס]', '$1', '׀', '־']:
            add_special_marker(part, current_verse)
            if part in [':', '$1']:  # If a verse-ending marker is found
                verse_counter += 1
                current_verse = etree.SubElement(current_chapter, 'lg', type='verse', n=f'{chapter_num}:{verse_counter}')
                print(f"Started new verse {verse_counter} in chapter {chapter_num}")
                word_counter = 0  # Reset the word counter when a new verse starts

        else:
            # Treat overdots as part of the word, so they get processed here
            process_word(part, current_verse, chapter_num, verse_counter)

    return verse_counter  # Return the updated verse counter



def process_word(word, lg_element, chapter_num, verse_counter):
    global word_counter
    word_counter += 1  # Increment the counter for each word

    # Split the word based on the special overdots ($2, $3, $4, $5)
    parts = re.split(r'(\$[2-5])', word)  # Keep the overdot marker as part of the split

    # Create the <w> element for the word, including a unique ID with chapter and verse information
    w_element = etree.SubElement(lg_element, 'w', id=f'word_{chapter_num}_{verse_counter}_{word_counter}')
    print(f"Created <w> element with ID: word_{chapter_num}_{verse_counter}_{word_counter}")

    w_text = ""  # Buffer for word text

    for part in parts:
        if re.match(r'\$[2-5]', part):
            # Append the buffer to the <w> element before the overdot
            if w_text:
                w_element.text = (w_element.text or '') + w_text  # Set or append to the <w>'s text
                print(f"Appending text before overdot: '{w_text}' to <w> element")
                w_text = ""  # Reset buffer

            # Insert the overdot as a sub-element inside the same <w> element
            overdot_type = part[-1]  # Get the overdot type (2, 3, 4, or 5)
            overdot_element = etree.SubElement(w_element, 'overdot', type=overdot_type)
            print(f"Overdot encountered: {part} - Added overdot type {overdot_type} inside <w> element")
        else:
            # Accumulate text for the word
            w_text += part
            print(f"Accumulating text: '{part}' - Current buffer: '{w_text}'")

    # Append the remaining text after the overdot using the .tail property
    if w_text:
        if w_element.text:  # If text already exists, add the remaining text after the overdot
            overdot_element.tail = w_text  # Place text after the overdot
            print(f"Appending remaining text after overdot: '{w_text}' to <w> element")
        else:
            w_element.text = w_text  # If no text exists, just set the main text

    print(f"--- Final word text: '{w_element.text}', with ID: word_{chapter_num}_{verse_counter}_{word_counter} ---")

   
def add_special_marker(marker, current_verse):
    if marker == ':':
        verse_end = etree.SubElement(current_verse, 'verseEnd', type='standard')
        verse_end.text = marker
        print("Added standard verse end.")
    elif marker == '$1':
        verse_end = etree.SubElement(current_verse, 'verseEnd', type='special')
        verse_end.text = marker
        print("Added special verse end.")
    elif marker in ['[פ]', '[ס]', '׀', '־']:
        punctuation = etree.SubElement(current_verse, 'punctuation', type=marker.strip('[]'))
        punctuation.text = marker
        print(f"Added special marker: {marker}")

# Example usage
with open('hosea_full.txt', 'r', encoding='utf-8') as file:
    text_lines = file.readlines()

tei_xml = create_tei_xml_from_text(text_lines)

with open('output.tei.xml', 'wb') as output_file:
    output_file.write(tei_xml)


Processing line: ספר הושע
Error: No chapter started before encountering text.
Processing line: פרק א
Started chapter 1
Processing line: דְּבַר־יְהוָ֣ה ׀ אֲשֶׁ֣ר הָיָ֗ה אֶל־הוֹשֵׁעַ֙ בֶּן־בְּאֵ$2רִ֔י בִּימֵ֨י עֻזִּיָּ֥ה יוֹתָ֛ם אָחָ֥ז יְחִזְקִ$2יָּ֖ה מַלְכֵ֣י יְהוּדָ֑ה וּבִימֵ֛י יָרָבְעָ֥ם בֶּן־יוֹאָ֖שׁ מֶ֥לֶךְ יִשְׂרָאֵֽל$1 תְּחִלַּ֥$2ת דִּבֶּר־יְהוָ֖ה בְּ$2הוֹשֵׁ֑עַ     [פ]     וַיֹּ֨אמֶר יְהוָ֜ה אֶל־הוֹשֵׁ֗עַ לֵ֣ךְ קַח־לְךָ֞ אֵ֤שֶׁת זְנוּנִים֙ וְיַלְ$2דֵ֣י זְנוּנִ֔ים כִּֽי־זָנֹ֤ה$2 תִזְנֶה֙ הָאָ֔רֶץ מֵאַחֲרֵ֖י יְהוָֽה: וַיֵּ֙לֶךְ֙ וַיִּקַּ֔ח אֶת־גֹּ֖מֶר בַּת־דִּבְ$2לָ֑יִם וַתַּ֥הַר וַתֵּלֶד־$2ל֖וֹ $4בֵּֽן: וַיֹּ֤אמֶר $2יְהוָה֙ $4אֵלָ֔יו קְרָ֥א שְׁמ֖וֹ יִזְרְעֶ֑אל כִּי־ע֣וֹד מְעַ֗ט וּפָ֨קַדְתִּ֜י אֶת־דְּמֵ֤י יִזְרְעֶאל֙ עַל־בֵּ֣ית יֵה֔וּא וְהִ֨שְׁבַּתִּ֔י מַמְ$2לְכ֖וּת בֵּ֥ית יִשְׂרָאֵֽל: וְהָיָ֖ה בַּיּ֣וֹם הַה֑וּא וְשָֽׁבַרְתִּי֙ אֶת־קֶ֣שֶׁת יִשְׂרָאֵ֔ל בְּעֵ֖מֶק יִזְרְעֶֽאל$1 וַתַּ֤הַר $2עוֹד֙ $4וַתֵּ֣לֶד בַּ֔ת וַיֹּ֣אמֶר ל֔וֹ קְרָ֥א שְׁמָ֖הּ לֹ֣א רֻחָ֑מָה כִּי֩ לֹ֨א אוֹסִ֜יף ע֗וֹד אֲרַחֵם֙ אֶת־בֵּ֣ית יִשְׂרָאֵ֔ל כִּֽי־נָשֹׂ֥א אֶשָּׂ֖א לָהֶֽם: וְאֶת־בֵּ֤ית יְהוּדָה֙

  if current_chapter:


Accumulating text: 'בָּאָ֔רֶץ' - Current buffer: 'בָּאָ֔רֶץ'
--- Final word text: 'בָּאָ֔רֶץ', with ID: word_2_25_3 ---
Created <w> element with ID: word_2_25_4
Accumulating text: 'וְרִחַמְתִּ֖י' - Current buffer: 'וְרִחַמְתִּ֖י'
--- Final word text: 'וְרִחַמְתִּ֖י', with ID: word_2_25_4 ---
Created <w> element with ID: word_2_25_5
Accumulating text: 'אֶת' - Current buffer: 'אֶת'
--- Final word text: 'אֶת', with ID: word_2_25_5 ---
Added special marker: ־
Created <w> element with ID: word_2_25_6
Accumulating text: 'לֹ֣א' - Current buffer: 'לֹ֣א'
--- Final word text: 'לֹ֣א', with ID: word_2_25_6 ---
Created <w> element with ID: word_2_25_7
Accumulating text: 'רֻחָ֑מָה' - Current buffer: 'רֻחָ֑מָה'
--- Final word text: 'רֻחָ֑מָה', with ID: word_2_25_7 ---
Created <w> element with ID: word_2_25_8
Accumulating text: 'וְאָמַרְתִּ֤י' - Current buffer: 'וְאָמַרְתִּ֤י'
--- Final word text: 'וְאָמַרְתִּ֤י', with ID: word_2_25_8 ---
Created <w> element with ID: word_2_25_9
Accumulating text: 'לְלֹֽ' - Cu

Accumulating text: 'כַשַּׁ֙חַל֙' - Current buffer: 'כַשַּׁ֙חַל֙'
--- Final word text: 'כַשַּׁ֙חַל֙', with ID: word_5_14_3 ---
Created <w> element with ID: word_5_14_4
Accumulating text: 'לְאֶפְרַ֔יִם' - Current buffer: 'לְאֶפְרַ֔יִם'
--- Final word text: 'לְאֶפְרַ֔יִם', with ID: word_5_14_4 ---
Created <w> element with ID: word_5_14_5
Accumulating text: 'וְכַכְּפִ֖' - Current buffer: 'וְכַכְּפִ֖'
Appending text before overdot: 'וְכַכְּפִ֖' to <w> element
Overdot encountered: $2 - Added overdot type 2 inside <w> element
Accumulating text: 'יר' - Current buffer: 'יר'
Appending remaining text after overdot: 'יר' to <w> element
--- Final word text: 'וְכַכְּפִ֖', with ID: word_5_14_5 ---
Created <w> element with ID: word_5_14_6
Accumulating text: 'לְבֵ֣ית' - Current buffer: 'לְבֵ֣ית'
--- Final word text: 'לְבֵ֣ית', with ID: word_5_14_6 ---
Created <w> element with ID: word_5_14_7
Accumulating text: 'יְהוּדָ֑ה' - Current buffer: 'יְהוּדָ֑ה'
--- Final word text: 'יְהוּדָ֑ה', with ID: word_5_14_7 ---
Creat

Created <w> element with ID: word_8_11_4
Accumulating text: 'מִזְבְּח֖וֹת' - Current buffer: 'מִזְבְּח֖וֹת'
--- Final word text: 'מִזְבְּח֖וֹת', with ID: word_8_11_4 ---
Created <w> element with ID: word_8_11_5
Accumulating text: 'לַחֲטֹ֑א' - Current buffer: 'לַחֲטֹ֑א'
--- Final word text: 'לַחֲטֹ֑א', with ID: word_8_11_5 ---
Created <w> element with ID: word_8_11_6
Accumulating text: 'הָיוּ' - Current buffer: 'הָיוּ'
--- Final word text: 'הָיוּ', with ID: word_8_11_6 ---
Added special marker: ־
Created <w> element with ID: word_8_11_7
Accumulating text: 'ל֥וֹ' - Current buffer: 'ל֥וֹ'
--- Final word text: 'ל֥וֹ', with ID: word_8_11_7 ---
Created <w> element with ID: word_8_11_8
Accumulating text: 'מִזְבְּח֖וֹת' - Current buffer: 'מִזְבְּח֖וֹת'
--- Final word text: 'מִזְבְּח֖וֹת', with ID: word_8_11_8 ---
Created <w> element with ID: word_8_11_9
Accumulating text: 'לַחֲטֹֽא' - Current buffer: 'לַחֲטֹֽא'
--- Final word text: 'לַחֲטֹֽא', with ID: word_8_11_9 ---
Added standard verse end.
Started new verse

Created <w> element with ID: word_10_11_17
Accumulating text: 'יַעֲקֹֽב' - Current buffer: 'יַעֲקֹֽב'
--- Final word text: 'יַעֲקֹֽב', with ID: word_10_11_17 ---
Added standard verse end.
Started new verse 12 in chapter 10
Created <w> element with ID: word_10_12_1
Accumulating text: 'זִרְע֨וּ' - Current buffer: 'זִרְע֨וּ'
--- Final word text: 'זִרְע֨וּ', with ID: word_10_12_1 ---
Created <w> element with ID: word_10_12_2
Accumulating text: 'לָכֶ֤ם' - Current buffer: 'לָכֶ֤ם'
--- Final word text: 'לָכֶ֤ם', with ID: word_10_12_2 ---
Created <w> element with ID: word_10_12_3
Accumulating text: 'לִצְדָקָה֙' - Current buffer: 'לִצְדָקָה֙'
--- Final word text: 'לִצְדָקָה֙', with ID: word_10_12_3 ---
Created <w> element with ID: word_10_12_4
Accumulating text: 'קִצְר֣וּ' - Current buffer: 'קִצְר֣וּ'
--- Final word text: 'קִצְר֣וּ', with ID: word_10_12_4 ---
Created <w> element with ID: word_10_12_5
Accumulating text: 'לְפִי' - Current buffer: 'לְפִי'
--- Final word text: 'לְפִי', with ID: word_10_1

Accumulating text: 'בְעֶזְ' - Current buffer: 'בְעֶזְ'
Appending text before overdot: 'בְעֶזְ' to <w> element
Overdot encountered: $2 - Added overdot type 2 inside <w> element
Accumulating text: 'רֶֽךָ' - Current buffer: 'רֶֽךָ'
Appending remaining text after overdot: 'רֶֽךָ' to <w> element
--- Final word text: 'בְעֶזְ', with ID: word_103_9_5 ---
Added standard verse end.
Started new verse 10 in chapter 103
Created <w> element with ID: word_103_10_1
Accumulating text: 'אֱהִ֤י' - Current buffer: 'אֱהִ֤י'
--- Final word text: 'אֱהִ֤י', with ID: word_103_10_1 ---
Created <w> element with ID: word_103_10_2
Accumulating text: 'מַלְכְּךָ֙' - Current buffer: 'מַלְכְּךָ֙'
--- Final word text: 'מַלְכְּךָ֙', with ID: word_103_10_2 ---
Created <w> element with ID: word_103_10_3
Accumulating text: 'אֵפ֔וֹא' - Current buffer: 'אֵפ֔וֹא'
--- Final word text: 'אֵפ֔וֹא', with ID: word_103_10_3 ---
Created <w> element with ID: word_103_10_4
Accumulating text: 'וְיוֹשִׁי' - Current buffer: 'וְיוֹשִׁי'
Appending te

  if current_chapter:
