In [41]:
import xml.etree.ElementTree as ET
import re
import unicodedata

def read_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def strip_non_hebrew(word):
    normalized_word = unicodedata.normalize('NFD', word)
    stripped_word = ''.join(re.findall(r'[\u05D0-\u05EA]', normalized_word))
    return unicodedata.normalize('NFC', stripped_word)

def process_word(token, verse_id, word_id, parent_element):
    parts = token.split('־')
    pe_count = 1  # Counter for 'פ' tags

    for part in parts:
        w = ET.SubElement(parent_element, 'w', id=f'verse{verse_id}_word{word_id}')

        alphabetic = strip_non_hebrew(part)
        non_alphabetic = ''.join(re.findall(r'[^\u05D0-\u05EA]', part))

        original = ET.SubElement(w, 'original')
        original.text = part
        stripped = ET.SubElement(w, 'stripped')
        stripped.text = alphabetic
        punctuation = ET.SubElement(w, 'punctuation')
        punctuation.text = non_alphabetic

        if "פ" in part:
            pe_tag = ET.SubElement(w, 'pe', id=f'verse{verse_id}_pe{pe_count}')
            pe_tag.text = "פ"
            pe_count += 1
        
        word_id += 1
    return word_id

def encode_tei_hebrew_word_details_enhanced(file_path, output_file):
    text = read_text_from_file(file_path)
    TEI = ET.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    text_element = ET.SubElement(TEI, 'text')
    body = ET.SubElement(text_element, 'body')

    chapter_id = 1
    verse_id = 1

    chapters = text.split('פרק')
    for chapter in chapters[1:]:
        div = ET.SubElement(body, 'div', type='chapter', id=f'chapter{chapter_id}')
        chapter_id += 1

        verses = re.split(r'(\[\פ\]|:)', chapter)
        for verse in verses:
            if verse.strip() and verse not in ['[פ]', ':']:
                p = ET.SubElement(div, 'p', type='verse', id=f'verse{verse_id}')
                word_id = 1

                tokens = verse.strip().split()
                for token in tokens:
                    word_id = process_word(token, verse_id, word_id, p)

                verse_id += 1

    tree = ET.ElementTree(TEI)
    with open(output_file, "w", encoding="utf-8") as f:
        tree.write(f, encoding="unicode")

# Specify the file paths
file_path = 'file.txt'  # Replace with your input file path
output_file = 'tei_hebrew_output_enhanced.xml'  # Replace with your output file path

# Run the function
encode_tei_hebrew_word_details_enhanced(file_path, output_file)
output_file

'tei_hebrew_output_enhanced.xml'

In [None]:
[' ', '"', '$', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', 'E', 'I', 'T', '_', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '֑', '֔', '֕', '֖', '֗', '֙', '֛', '֜', '֞', '֣', '֤', '֥', '֨', '֩', 'ְ', 'ֱ', 'ֲ', 'ִ', 'ֵ', 'ֶ', 'ַ', 'ָ', 'ֹ', 'ֻ', 'ּ', 'ֽ', '׀', 'ׁ', 'ׂ']

In [51]:
def extract_consecutive_non_hebrew_groups(file_path):
    text = read_text_from_file(file_path)
    non_hebrew_groups = set()

    # Using a regular expression to find sequences of non-Hebrew characters
    pattern = re.compile(r'([^\u05D0-\u05EA]{,2})')
    matches = pattern.findall(unicodedata.normalize('NFD', text))

    for match in matches:
        non_hebrew_groups.add(match.strip())

    return non_hebrew_groups

# Extract and print groups of consecutive non-Hebrew characters
file_path = 'file.txt'

consecutive_non_hebrew_groups = extract_consecutive_non_hebrew_groups(file_path)
print(sorted(consecutive_non_hebrew_groups))



['', '$', '$1', '$2', '$4', '2', ':', '[', ']', '֑', '֔', '֕', '֖', '֗', '֙', '֜', '֣', '֤', '֥', '֥$', '֨', '֩', 'ְ', 'ְ$', 'ְ֙', 'ְּ', 'ְׁ', 'ְׂ', 'ֱ', 'ֲ', 'ִ', 'ִ$', 'ִ֔', 'ִ֖', 'ִ֜', 'ִ֨', 'ִּ', 'ִֽ', 'ִׁ', 'ֵ', 'ֵ$', 'ֵ֔', 'ֵ֖', 'ֵ֗', 'ֵ֛', 'ֵ֣', 'ֵ֤', 'ֵ֨', 'ֵּ', 'ֵֽ', 'ֵׁ', 'ֶ', 'ֶ֑', 'ֶ֙', 'ֶ֣', 'ֶ֤', 'ֶ֥', 'ֶּ', 'ֶֽ', 'ֶׁ', 'ַ', 'ַ֗', 'ַ֙', 'ַּ', 'ַׁ', 'ָ', 'ָ֑', 'ָ֔', 'ָ֖', 'ָ֗', 'ָ֛', 'ָ֜', 'ָ֞', 'ָ֣', 'ָ֥', 'ָ֨', 'ָּ', 'ָֽ', 'ֹ', 'ֹ֖', 'ֹ֣', 'ֹ֤', 'ֹ֨', 'ֹּ', 'ֹׂ', 'ֻ', 'ּ', 'ּ֣', 'ֽ', '־', '־$', '׀', 'ׁ', 'ׂ֖', '\ufeff']
