# Script to automatically make diacritical corrections in Hebrew text

## Defines letters and words

In [147]:
# Table of Hebrew letters: the base letters and final forms come first,
# followed by ligatures, presentation forms and letters carrying a dagesh,
# rafe or shin/sin dot.
# NOTE: entries are addressed by position elsewhere in this file (for example
# aleph_bet[0], aleph_bet[4], aleph_bet[9], aleph_bet[10], aleph_bet[12] and
# aleph_bet[52]), so the order must not change and nothing may be inserted
# in the middle.
aleph_bet = ['א','ב','ג','ד','ה','ו','ז','ח','ט','י','כ','ך','ל','מ','ם',
             'נ','ן','ס','ע','פ','ף','צ','ק','ר','ש','ת','װ','ױ','ײ','יִ',
             'ﬡ','ﬢ','ﬣ','ﬤ','ﬥ','ﬦ','ﬧ','ﬨ','שׁ','שׂ','שּׁ','שּׂ','אַ','אָ',
             'גּ','דּ','הּ','וּ','זּ','טּ','יּ','ךּ','כּ','לּ','מּ','נּ','סּ','ףּ',
             'פּ','צּ','שּ','תּ','וֹ','בֿ','כֿ','פֿ','ﭏ','בּ', 'קּ']

# Characters treated as cantillation (trop); trop_remover() strips exactly
# these characters from a word.
# NOTE(review): the second-to-last entry looks identical to `dot` below, so
# trop_remover() presumably strips the dot marker as well -- confirm.
cant = ['֑','֒','֓','֔','֕','֖','֗','֘','֙','֚','֛','֜',
             '֝','֞','֠','֡','֢','֣','֤','֥','֦','֧','֨','֩','֪','֫','֬','֭','֯','׃']

# Vowel points and other diacritics.  Also addressed by position: vowel[0]
# is used as the shva, vowel[8] is the kamatz that the kamatz-katan code
# looks for, and vowel[20] is what it is replaced with.
vowel = ['ְ','ֱ','ֲ','ֳ','ִ','ֵ','ֶ','ַ','ָ','ֹ','ֺ','ֻ','ּ','ֽ','־','ֿ','ׁ','ׂ','ׄ','ׅ','ׇ']

# Letters that carry a dagesh; membership is tested against single
# characters taken from a word.
# NOTE(review): an entry stored as base letter + combining dagesh (two code
# points) can never equal a single character -- confirm how these literals
# are encoded.
letter_with_dagesh = ['שּׁ','שּׂ','גּ','דּ','זּ','טּ','יּ','כּ','לּ','מּ','נּ','סּ','פּ','צּ','שּ','תּ','בּ','קּ']
rafe = 'ֿ'
shva = vowel[0]
chataf_vowels = vowel[1:4]
short_vowels = [vowel[4],vowel[6],vowel[7],vowel[11],vowel[20]]
long_vowels = ['וֹ','וּ', vowel[5], vowel[8], aleph_bet[9], 'ֹ']
# Every mark the kamatz/shva routines treat as "a vowel" when scanning a word.
vowels_limited = vowel[0:4]+short_vowels+long_vowels
dagesh = 'ּ'
maqqaf = '־'
meteg = 'ֽ'
# Alternative shva-na' marker; shva_na_mark_changer() swaps rafe for this.
dot = '֯'

# Guttural letters (plus resh) and their presentation forms; read by
# kamatz_katan_adder().  The spelling of the name is kept because other code
# refers to it.
gutterals = ['א','ה','ח','ע','ר' ,'ﬡ','ﬣ','ﬧ']             

In [148]:
# The Tetragrammaton without points (matched against words whose
# non-alphabetic characters have been stripped) and two pointed spellings
# that shva_na_function() skips.
shem = 'יהוה'
shem_vowels = 'יְהֹוָה'
yy_vowels = aleph_bet[9]+vowel[0]+aleph_bet[9]+vowel[8]

# Spellings of "kal"/"khal" (with a regular kamatz) that kol_kamatz_katan()
# looks for, and the "kol"/"khol" forms (with vowel[20]) it produces.
# NOTE(review): kal_backwards_dagesh and kal_dagesh render identically;
# presumably they differ only in the order of the combining marks -- confirm.
kal_backwards_dagesh = 'כָּל'
kal_dagesh = 'כָּל'
kal = aleph_bet[52] + vowel[8] + aleph_bet[12]
khal = aleph_bet[10] + vowel[8] + aleph_bet[12]
kol = aleph_bet[52] + vowel[20] + aleph_bet[12]
kol_maqqaf = aleph_bet[52] + vowel[20] + aleph_bet[12] + '־'
khol_maqqaf = aleph_bet[10] + vowel[20] + aleph_bet[12] + '־'
kol_space = aleph_bet[52] + vowel[20] + aleph_bet[12] + ' '
khol_space = aleph_bet[10] + vowel[20] + aleph_bet[12] + ' '

# "et" / "ve-et" with a segol, as bare words, surrounded by spaces (the
# search patterns used by et_fixer) and followed by a maqqaf (the
# replacements used by et_fixer).
et = 'אֶת'
ve_et = 'וְאֶת'
space_et_space = ' ' + 'אֶת' + ' '
space_ve_et_space = ' ' + 'וְאֶת' + ' '
et_maqqaf = ' ' + 'אֶת' + '־'
ve_et_maqqaf = ' ' + 'וְאֶת' + '־'

In [149]:
# Words that shva_na_function() returns unchanged (compared after the trop
# has been removed).  Some spellings are listed twice; that is harmless for
# the membership test.
shva_exceptions = ['שְׁתֵּי','שְׁתָּיִם','שְׁתַּיִם','שְׁנִַים','שְׁנֵי','שְׁתֵּים','שְׁנֵים','שְׁנֵי','שְׁנֵים']
battim = 'בָּתִּים'
vattim = 'בָתִּים'
ana = 'אָנָּא'
anah = 'אָנָּה'
# Words containing any of these are left untouched by kamatz_katan_adder().
kamatz_katan_exceptions = [battim, vattim, ana, anah]

In [150]:
# shem_converter() rebuilds a word from scratch and would otherwise drop
# these characters when they follow a Shem; it copies any of them found
# after the final he into the suffix of the rebuilt word.
# This is only a partial fix, so manual intervention is often still necessary.
special_characters = ['{', '}','\\']

## Strips vowels and cantillation marks from words

In [151]:
#removes trop and vowels
def nonalpha_remover(word):
    """Return *word* reduced to its alphabetic characters.

    Every character for which ``str.isalpha()`` is false (vowel points,
    trop, punctuation, digits, ...) is dropped; the remaining characters
    keep their original order.
    """
    return ''.join(filter(str.isalpha, word))

In [152]:
#removes trop but not vowels
def trop_remover(word):
    """Return *word* without the characters listed in ``cant``.

    Vowel points and everything else not in ``cant`` are kept, in order.
    """
    return ''.join(character for character in word if character not in cant)

## Converts Shem-Havaya to double-yud while preserving cantillation marks

In [153]:
def shem_converter(word):
    """Rewrite a word containing the Shem as a double-yud form.

    The word is rebuilt as ``prefix + yud + yud-with-kamatz + suffix`` and
    the cantillation marks found in the word are re-attached:

    * prefix: the first character when it is not a yud, plus up to two
      non-alphabetic characters that directly follow it (for example a
      prefixed letter with its vowel and dagesh);
    * first yud: carries a shva only when there is no prefix;
    * suffix: every character from ``special_characters`` that appears
      after the last he of the word.

    Cantillation marks (characters in ``cant``) are placed as follows: a
    single mark goes after the second yud; with two or more, the first goes
    after the first yud and all remaining ones after the second yud.

    Anything else in *word* is discarded -- this is the known "eats special
    characters" limitation of the converter.

    An empty *word* is returned unchanged.
    """
    if not word:
        return word

    yud = aleph_bet[9]
    he = aleph_bet[4]

    # Collect the prefix; the length checks keep very short words from
    # raising IndexError.
    prefix = ''
    if word[0] != yud:
        prefix = word[0]
        if len(word) > 1 and not word[1].isalpha():
            prefix += word[1]
            if len(word) > 2 and not word[2].isalpha():
                prefix += word[2]
    no_prefix_word = word[len(prefix):]

    yud1 = yud if prefix else yud + vowel[0]
    yud2 = yud + vowel[8]

    # Special characters after the last he are preserved.  If the word has
    # no he at all, rfind() returns -1 and the whole word is scanned instead
    # of indexing past the start of the string.
    tail = word[word.rfind(he) + 1:]
    suffix = ''.join(character for character in tail
                     if character in special_characters)

    cant_marks = [character for character in no_prefix_word
                  if character in cant]

    if not cant_marks:
        return prefix + yud1 + yud2 + suffix
    if len(cant_marks) == 1:
        return prefix + yud1 + yud2 + cant_marks[0] + suffix
    # Two or more marks: previously only exactly two were handled and any
    # larger number left the result undefined.
    return (prefix + yud1 + cant_marks[0] + yud2
            + ''.join(cant_marks[1:]) + suffix)

In [154]:
## Creates new paragraph with double-yud in place of Shem Havaya

In [155]:
#this needs to be fixed to stop eating special characters (i.e. brackets, \)

In [156]:
def convert_shem(paragraph):
    """Return *paragraph* with every Shem replaced by its double-yud form.

    The paragraph is split on whitespace (maqqafs are first padded with
    spaces so they become tokens of their own); every token whose letters
    contain ``shem`` is passed through shem_converter(), and the tokens are
    joined again with single spaces.  As a consequence runs of whitespace,
    including a trailing newline, are collapsed.
    """
    paragraph = str.replace(paragraph, '־', ' ־ ')

    # Each token is converted in place by position.  Looking the token up
    # with list.index() would be quadratic and would write to the first
    # equal token rather than the current one.
    par_list = [
        shem_converter(word) if shem in nonalpha_remover(word) else word
        for word in paragraph.split()
    ]

    new_par = ' '.join(par_list)
    new_par = str.replace(new_par, ' ־ ','־')
    # NOTE(review): both arguments below render identically; kept verbatim --
    # confirm whether they differ in code points or whether this is a no-op.
    new_par = str.replace(new_par, '׃', '׃')
    return new_par

In [157]:
## use https://he.wikisource.org/wiki/%D7%9E%D7%A7%D7%A8%D7%90 for testing texts

## Puts a kamatz katan and maqqaf in "kol"

In [158]:
#this script does not work on words with trope
#Since pesukim from wikisource already have maqqafs and kamatz katans, this limitation shouldn't make the script less useful

In [159]:
def kamatz_exception(word):
    """Return True if *word* is a likely false positive for the word "kol".

    A word counts as a false positive when it does not end in a lamed
    (aleph_bet[12]) or when it contains a yud (aleph_bet[9]) or an aleph
    (aleph_bet[0]) anywhere.
    """
    ends_in_lamed = word[-1] == aleph_bet[12]
    return (not ends_in_lamed) or aleph_bet[9] in word or aleph_bet[0] in word

In [160]:
def kol_kamatz_katan(paragraph):
    """Give "kol" a kamatz katan and join it to the next word with a maqqaf.

    Every whitespace-separated token that contains one of the "kal"/"khal"
    spellings and is not rejected by kamatz_exception() has all of its
    kamatz marks (vowel[8]) replaced with kamatz katan (vowel[20]).  After
    re-joining, a space that follows "kol"/"khol" is replaced by a maqqaf.

    Tokens are re-joined with single spaces, so runs of whitespace
    (including a trailing newline) are collapsed.
    """
    # Pad maqqafs so the words on either side become separate tokens.
    paragraph = str.replace(paragraph, '־', ' ־ ')
    par_list = paragraph.split()

    # enumerate() gives the position of the token being examined;
    # list.index() would be quadratic and returns the first equal token,
    # which is not necessarily the current one.
    for index, word in enumerate(par_list):
        has_kal = (kal in word or khal in word
                   or kal_backwards_dagesh in word or kal_dagesh in word)
        if not has_kal or kamatz_exception(word):
            continue
        # Normalise the literal spellings first, then turn every remaining
        # kamatz in the token into a kamatz katan.
        word = word.replace(kal_backwards_dagesh, kol)
        word = word.replace(kal_dagesh, kol)
        par_list[index] = word.replace(vowel[8], vowel[20])

    new_par = ' '.join(par_list)
    new_par = str.replace(new_par, ' ־ ','־')

    # Swap the space following "kol"/"khol" for a maqqaf.
    new_par = new_par.replace(kol_space, kol_maqqaf)
    new_par = new_par.replace(khol_space, khol_maqqaf)

    # NOTE(review): both arguments below render identically; kept verbatim --
    # confirm whether they differ in code points or whether this is a no-op.
    new_par = str.replace(new_par, '׃', '׃')
    return new_par

## Puts a Maqqaf after "et" (when it has a segol)

In [161]:
#this script does not work on words with trope
#Since pesukim from wikisource have maqqafs, this shouldn't be a problem.

In [162]:
def et_fixer(paragraph):
    """Replace the space after "et" / "ve-et" (with segol) by a maqqaf.

    Only occurrences surrounded by single spaces are changed; an "et" at
    the very start or end of *paragraph* is left alone.  The paragraph is
    returned unchanged when neither pattern occurs.

    Both patterns are handled independently, so "ve-et" is fixed even in a
    paragraph that contains no bare "et".
    """
    new_paragraph = paragraph.replace(space_et_space, et_maqqaf)
    new_paragraph = new_paragraph.replace(space_ve_et_space, ve_et_maqqaf)
    return new_paragraph

## Puts a kamatz katan in common kamatz-katan words and situations

In [163]:
#this script does not work on words with trope
#Since pesukim from wikisource have kamatz katans, this shouldn't be too much of a problem.

In [164]:
#this imports a list of kamatz-katan words to fix and their replacements
#import csv
import csv
# Load the correction table: column 0 holds the spelling to look for and
# column 1 its replacement.  Every row (including the first) must have at
# least two columns.
with open('kamatz_correction_list.csv', 'r', encoding='utf-8') as csvDataFile:
    correction_pairs = [(row[0], row[1]) for row in csv.reader(csvDataFile)]

# The first row of the file is a header and is not part of the table.
words_to_fix = [wrong for wrong, _ in correction_pairs[1:]]
corrected_words = [right for _, right in correction_pairs[1:]]

In [165]:
#removes shva marks, to check for kamatz katans
#Since some may precede erroneously marked shvas
#This will need to be adjusted if the shva na marker is not a rafe
def shva_mark_remover(word):
    """Return *word* with every ``rafe`` character removed."""
    return word.replace(rafe, '')

In [178]:
def kamatz_list_tester(word):
    """Return the correction for *word* from the kamatz correction table.

    *word* is compared, with its rafe marks removed, against each entry of
    ``words_to_fix`` in order.  On a match it is replaced by the entry at
    the same position in ``corrected_words`` and the scan continues with
    the replaced word.  A word without a match is returned unchanged.
    """
    for position, candidate in enumerate(words_to_fix):
        if shva_mark_remover(word) == candidate:
            word = corrected_words[position]
    return word

In [180]:
def kamatz_katan_adder(word):    
    """Return *word* with kamatz marks changed to kamatz katan where rules apply.

    Steps, in order:

    1. If the word (trop removed) contains one of ``kamatz_katan_exceptions``
       it is returned unchanged.
    2. If the word (rafe removed) is in ``words_to_fix`` the table
       correction from kamatz_list_tester() is returned.
    3. Otherwise each kamatz (vowel[8]) found at positions
       1 .. len(word)-3 is examined against several rules.

    Some rules replace the kamatz with vowel[20]; others insert vowel[20]
    in front of the kamatz and leave the kamatz in place, producing two
    adjacent marks (kamatz_katan() and mistake_fixer() collapse such pairs).
    """
    #prevents script from erroneously fixing false positives
    nt_word = trop_remover(word)
    for index in range(0,len(kamatz_katan_exceptions)):
        if kamatz_katan_exceptions[index] in nt_word:
            return word
    
    #runs word through list of predefined kamatz words to check
    if shva_mark_remover(word) in words_to_fix:
        return kamatz_list_tester(word)

    # NOTE(review): the range is computed once from the initial length, but
    # the insertions below make the word longer while the loop is running.
    for index in range(1,len(word)-2):
        if word[index] == vowel[8]:
            # Rule 1: scan forward from the kamatz to the next character that
            # is in aleph_bet; if that letter is in letter_with_dagesh the
            # kamatz is replaced by a kamatz katan.
            i = index
            while word[i] not in aleph_bet:
                i = i+1
                if i>len(word)-1:
                    break
            if i<len(word)-1:
                next_consonant = word[i]
                if next_consonant in letter_with_dagesh:
                    word = word[:index]+vowel[20]+word[index+1:]
            # NOTE(review): next_consonant is only assigned inside the branch
            # above.  When that branch is skipped, the guttural test further
            # down reads either an unbound local (first kamatz) or the value
            # left over from an earlier kamatz.
            
            # Find the next vowel after the kamatz; give up on the whole word
            # if there is none.
            i = index+1
            while word[i] not in vowels_limited:
                i = i+1
                if i>len(word)-1:
                    return word
            next_vowel = word[i]
            # Rule 2: chataf_vowels[2] on a following guttural -> kamatz katan,
            # and the word is returned immediately.
            if next_vowel == chataf_vowels[2] and next_consonant in gutterals:
                word = word[:index]+vowel[20]+word[index+1:]
                return word
            #finds the vowel after the next one.  If both are shvas, the kamatz is katan.
            if next_vowel == shva and shva in word[index+1:]:
                if i<len(word)-1:
                    i = i+1
                    while i<len(word)-2 and word[i] not in vowels_limited:
                        i = i+1
                    sec_vowel = word[i]
                    if sec_vowel == shva:
                        # Inserts vowel[20] before the kamatz (word[index:]
                        # keeps the kamatz itself).
                        word = word[:index]+vowel[20]+word[index:]

            #if there's a kamatz under the second-to-last consonant, and a meteg earlier in the word, the kamatz is katan
            # NOTE(review): index never exceeds the initial len(word)-3 and the
            # word never gets shorter, so `index == len(word)-2` looks
            # unreachable -- confirm the intended bound.
            if meteg in word and index == len(word)-2:
                #finds index of previous consonant.  Adds kamatz katan if meteg is before that.
                while word[i] not in aleph_bet:
                    i = i-1
                while i>0 and word[i] != meteg:
                    i = i-1
                if word[i] == meteg:
                    word = word[:index]+vowel[20]+word[index:]
    return word

In [186]:
def kamatz_katan(paragraph):
    """Apply kamatz_katan_adder() to every word of *paragraph* that has a kamatz.

    This covers the common kamatz-katan words (besides "kol") and the
    rule-based cases handled by kamatz_katan_adder().  Words are split on
    whitespace and re-joined with single spaces.  Afterwards any kamatz
    directly next to a kamatz katan (in either order) is collapsed into a
    single kamatz katan.
    """
    kamatz = vowel[8]
    katan = vowel[20]

    fixed_words = [
        kamatz_katan_adder(token) if kamatz in token else token
        for token in paragraph.split()
    ]
    paragraph = ' '.join(fixed_words)

    # katan+kamatz is collapsed first, then kamatz+katan.
    for doubled in (katan + kamatz, kamatz + katan):
        paragraph = paragraph.replace(doubled, katan)
    return paragraph

## Marks shva na'

In [21]:
# Character that na_marker() inserts to flag a shva na'.
mark = rafe
# Alternative marker (disabled): mark = dot

In [22]:
#Inserts the marker to mark a shva na'
def na_marker(paragraph,index):
    """Insert ``mark`` into *paragraph* in front of position *index*.

    Nothing is inserted when a ``mark`` already sits within two characters
    on either side of *index*, so calling this again after an insertion
    does not add a second mark for the same shva.

    Only positions that actually exist are inspected: the look-behind does
    not wrap around to the end of the string for small indexes, and the
    look-ahead does not raise IndexError near the end of the string.

    Returns the (possibly unchanged) string.
    """
    before = paragraph[max(0, index - 2):index]
    after = paragraph[index + 1:index + 3]
    if any(character == mark for character in before + after):
        # Already marked nearby.
        return paragraph
    return paragraph[:index] + mark + paragraph[index:]

In [23]:
#Determines what shvas are na' in a word, and calls the program to mark them
def shva_na_function(word):
    """Return *word* with ``mark`` inserted at every shva judged to be na'.

    Words containing ``shem_vowels`` or ``yy_vowels`` and words listed in
    ``shva_exceptions`` (all compared with the trop removed) are returned
    unchanged.  Otherwise each shva at positions 1 .. len(word)-3 is tested
    against the rules below and marked via na_marker() when one applies.
    """
    nt_word = trop_remover(word)
    #this skips instances of shem hashem
    if shem_vowels in nt_word or yy_vowels in nt_word:
        return word
    if nt_word in shva_exceptions:
        return word
    # NOTE(review): the range is computed once from the initial length.
    # After na_marker() inserts a mark the shva moves one position to the
    # right and is seen again on the next iteration; na_marker() then finds
    # the mark right next to it and leaves the word unchanged.
    for index in range(1,len(word)-2):
        i = index
        if word[index] != shva:
            continue
            #this doesn't bother with the loop if the letter isn't a shva
            
        # Walk backwards from the shva to the nearest character in aleph_bet.
        # NOTE(review): if no such character precedes the shva, i becomes
        # negative and the search continues from the end of the word.
        while word[i] not in aleph_bet:
            i = i-1
        previous_consonant = word[i]
        #if the previous consonant is the beginning of the word, the shva is na'
        if i == 0:
            word = na_marker(word,index)
            continue
        # A consonant directly after a maqqaf is treated like a word start.
        # There is no `continue` here, so the rules below are still checked.
        if word[i-1] == maqqaf:
            word = na_marker(word,index)
        #if the previous consonant has a dagesh, the shva must be na'
        # NOTE(review): previous_consonant is a single character, so this only
        # matches entries of letter_with_dagesh that are single code points.
        if previous_consonant in letter_with_dagesh:
            word = na_marker(word,index)
            continue
        # Dagesh written as a separate mark, either right after the consonant
        # or right after the shva.
        if word[i+1] == dagesh or word[index+1] == dagesh:
            word = na_marker(word,index)
            continue
            
        #determines the next consonant
        i = index + 1
        while word[i] not in aleph_bet and i<len(word)-1:
            i = i+1
        next_consonant = word[i]
        # Same consonant before and after the shva -> na'.
        if next_consonant == previous_consonant:
            word = na_marker(word,index)
            continue

        #determines the previous vowel
        i = index-2
        while word[i] not in vowels_limited and i>0:
            i = i-1
        # NOTE(review): reaching position 0 returns from the whole function,
        # so any later shvas in the word are not examined.
        if i == 0:
            return word
        else:
            previous_vowel = word[i]
            #if the previous vowel is a shva, the current shva is na'
            if previous_vowel == shva:
                word = na_marker(word,index)
            if previous_vowel in long_vowels:
                word = na_marker(word,index)    
        
        #a shva following a long vowel is na', unless the long vowel is word-initial shuruk
        # NOTE(review): no code here implements the word-initial shuruk exception.
   

    #determine how to mark סקינמלוי letters
    return word

In [24]:
#Calls the shva_na_function for each word in the input paragraph
def shva_na_converter(paragraph):
    """Return *paragraph* with shva_na_function() applied to every word.

    Words are split on whitespace and re-joined with single spaces, so runs
    of whitespace (including a trailing newline) are collapsed.

    Each word is converted by position.  Looking it up with list.index()
    would be quadratic and would write the result to the first equal token
    instead of the current one.
    """
    return ' '.join(shva_na_function(word) for word in paragraph.split())

## Changes shva marking character

In [25]:
#this program allows the changing of all rafes used to mark shva into another character
#in this case a circle above the letter
#this could be adapted to change a text that uses the dot to using something else
def shva_na_mark_changer(paragraph):
    """Return *paragraph* with every ``rafe`` replaced by ``dot``."""
    pieces = paragraph.split(rafe)
    return dot.join(pieces)

## Fixes mistakes

In [22]:
def mistake_fixer(paragraph):
    """Collapse duplicated or misplaced marks left behind by the other passes.

    Three groups of clean-ups are applied, each unconditionally (a
    replacement whose pattern is absent is a no-op):

    * a kamatz next to a kamatz katan (either order), or two kamatz katans
      in a row, become a single kamatz katan;
    * doubled or misplaced shva-na' markers written with ``dot``;
    * the same patterns written with ``rafe``.

    The kamatz clean-up does not depend on any other vowel being present,
    and the rafe patterns are derived from the dot patterns by swapping
    ``dot`` for ``rafe``, so rafe-marked text never gets a ``dot`` written
    into it.
    """
    kamatz = vowel[8]
    katan = vowel[20]
    badkamatz1 = kamatz + katan
    badkamatz2 = katan + kamatz
    badkamatz3 = katan + katan

    baddot = dot+dot
    baddot2 = dot+shva+dagesh+dot
    # The literals below are sequences of combining marks; they are kept
    # exactly as written.
    baddot3 = 'ְּׁ֯֯'
    fixdot3 = baddot3[1:6]
    baddot4 = 'ְׁ֯֯'
    fixdot4 = 'ְׁ֯'

    badrafe = rafe+rafe
    badrafe2 = rafe+shva+dagesh+rafe
    # Rafe variants of the dot patterns: swap dot -> rafe.  (Swapping
    # rafe -> dot would leave the dot patterns unchanged.)
    badrafe3 = baddot3.replace(dot, rafe)
    fixrafe3 = fixdot3.replace(dot, rafe)
    badrafe4 = baddot4.replace(dot, rafe)
    fixrafe4 = fixdot4.replace(dot, rafe)

    # Kamatz / kamatz katan doubles.
    paragraph = paragraph.replace(badkamatz1, katan)
    paragraph = paragraph.replace(badkamatz2, katan)
    paragraph = paragraph.replace(badkamatz3, katan)

    # Dot-marked shva na'.
    paragraph = paragraph.replace(baddot, dot)
    paragraph = paragraph.replace(baddot2, dagesh+shva+dot)
    paragraph = paragraph.replace(baddot3, fixdot3)
    paragraph = paragraph.replace(baddot4, fixdot4)

    # Rafe-marked shva na'.
    paragraph = paragraph.replace(badrafe, rafe)
    paragraph = paragraph.replace(badrafe2, dagesh+shva+rafe)
    paragraph = paragraph.replace(badrafe3, fixrafe3)
    paragraph = paragraph.replace(badrafe4, fixrafe4)
    return paragraph

## Runs Paragraph through all converters

In [34]:
def text_converter(paragraph):
    """Run *paragraph* through the enabled correction passes, in order."""
    # Comment out (or re-enable) entries to choose which passes run.
    enabled_passes = (
        convert_shem,
        #kol_kamatz_katan,
        #et_fixer,
        #kamatz_katan,
        #shva_na_converter,
        shva_na_mark_changer,
        mistake_fixer,
    )
    for correction in enabled_passes:
        paragraph = correction(paragraph)
    return paragraph

## Kamatz-check list

In [59]:
#This outputs a list of lines and words that have ambiguous kamatz cases
#These are words with a kamatz that isn't marked katan followed by a shva
#Once the list is output the user can manually go through the list, find words that need a kamatz katan, and make the change
def kamatz_inspector(word):
    """Return True if *word* has a regular kamatz whose next vowel is a shva.

    Only positions 1 .. len(word)-7 are checked for a kamatz (a word cannot
    start with one).  For each kamatz the scan for the following vowel
    starts two characters later and stops at the first character in
    ``vowels_limited`` or at position len(word)-2, whichever comes first.
    """
    for position in range(1, len(word) - 6):
        if word[position] != vowel[8]:
            continue
        probe = position + 2
        while word[probe] not in vowels_limited and probe < len(word) - 2:
            probe += 1
        if word[probe] == shva:
            return True
    return False
            

In [60]:
def kamatz_finder(line):
    """Return the words of *line* that kamatz_inspector() flags, in order."""
    return [token for token in line.split() if kamatz_inspector(token)]

## Reading file from disk

In [56]:
# Path of the text file to convert; it is read as UTF-8, one line at a time.
input_file = 'siddur.tex'

In [57]:
# Read the whole input file into memory as a list of lines (line endings kept).
with open(input_file, 'r', encoding='utf-8') as infile:
    lines = infile.readlines()

In [61]:
# Convert every input line and report the words with an ambiguous kamatz
# (found in the unconverted line) together with their 1-based line number.
new_lines = []
line_number = 1
for line in lines:
    new_lines.append(text_converter(line))
    # For troubleshooting, print line_number here to find the last line
    # that was converted successfully.

    ambiguous_words = kamatz_finder(line)
    if ambiguous_words != []:
        print(line_number, ambiguous_words)

    line_number += 1
    

141 ['\\firstword{בָּרוּךְ}']
237 ['אָ֥זְֿרוּ', 'יָלְֿדָ֣ה']
241 ['יָֽדְֿךָ֙', 'וְֿהִתְבָּרְֿכ֣וּ']
287 ['וְֿזָרְֿק֡וּ']
352 ['\\firstword{בָּרוּךְ}']
357 ['וְ֝צִדְקָֽתְךָ֗']
362 ['\\firstword{בָּרוּךְ}']
366 ['\\firstword{בָּרוּךְ}']
377 ['עַל־יָדְֿךָ֗']
379 ['כִּֽי־יִשְׁאָלְֿךָ֥', 'עַל־יָ֣דְֿכָ֔ה']
490 ['בָטָֽחְֿנוּ׃']
509 ['בָּרְֿכ֥וּ']
523 ['צָֽדְק֥וּ']
534 ['וַ֝יְגָרְשֵׁ֗הוּ']
535 ['אֲבָרְכָ֣ה']
567 ['נִבְהָֽלְנוּ׃']
592 ['יִשָּׂא֑וּנְךָ']
619 ['בָּרְכ֣וּ', 'בָּרְכ֥וּ']
620 ['בָּרְכ֣וּ', 'בָּרְכ֥וּ']
659 ['מָלְאָ֥ה']
675 ['בָטָֽחְנוּ׃']
684 ['מַה־גָּדְל֣וּ', 'עָמְק֥וּ']
686 ['לְהִשָּׁמְדָ֥ם']
688 ['יִ֝תְפָּרְד֗וּ']
701 ['נָשְׂא֤וּ', 'נָשְׂא֣וּ']
720 ['אָבְֿד֥וּ']
737 ['וַאֲבָרְֿכָ֥ה']
738 ['אֲבָֽרְֿכֶ֑ךָּ']
742 ['וּגְדֻלָּתְךָ֥']
743 ['וְֿצִדְקָתְֿךָ֥']
746 ['יְֿבָרְֿכֽוּכָה׃\\hfill']
747 ['וּגְבוּרָתְֿךָ֥']
756 ['כׇּל־הָרְֿשָׁעִ֣ים']
769 ['אָבְֿד֥וּ']
772 ['לָרְֿעֵבִ֑ים']
793 ['הַשָּׂם־גְּֿֿבוּלֵ֥ךְ']
841 ['\\firstword{בָּר֖וּךְ}\\source{תהלים']
854 ['וּבְיָ֣דְךָ֔']
884 ['יָרְֿד֥

In [38]:
# Path the converted lines are written to (UTF-8).
output_file = 'siddur_converted.tex'

In [39]:
# Write the converted lines, making sure each one ends with exactly the
# newline it already has or, if it has none (including empty lines), a
# single added '\n'.
with open(output_file, 'w', encoding='utf-8') as outfile:
    for converted in new_lines:
        if converted.endswith('\n'):
            outfile.write(converted)
        else:
            outfile.write(converted + '\n')