# Script to automatically make diacritical corrections in Hebrew text

## Defines letters and words

In [1]:
# Characters that the scanning code treats as letters (consonants).
# Entries are also fetched by position elsewhere in this file
# (aleph_bet[9], aleph_bet[10], aleph_bet[12], aleph_bet[52]), so the order
# of this list must not change.
aleph_bet = ['א','ב','ג','ד','ה','ו','ז','ח','ט','י','כ','ך','ל','מ','ם',
             'נ','ן','ס','ע','פ','ף','צ','ק','ר','ש','ת','װ','ױ','ײ','יִ',
             'ﬡ','ﬢ','ﬣ','ﬤ','ﬥ','ﬦ','ﬧ','ﬨ','שׁ','שׂ','שּׁ','שּׂ','אַ','אָ',
             'גּ','דּ','הּ','וּ','זּ','טּ','יּ','ךּ','כּ','לּ','מּ','נּ','סּ','ףּ',
             'פּ','צּ','שּ','תּ','וֹ','בֿ','כֿ','פֿ','ﭏ','בּ', 'קּ']

# Characters that trop_remover() deletes from a word.
cant = ['֑','֒','֓','֔','֕','֖','֗','֘','֙','֚','֛','֜',
             '֝','֞','֠','֡','֢','֣','֤','֥','֦','֧','֨','֩','֪','֫','֬','֭','֯','׃']

# Pointing marks. Individual marks are picked out of this list by position
# below (vowel[0], vowel[1:4], vowel[4] ... vowel[20]), so the order must not
# change.
vowel = ['ְ','ֱ','ֲ','ֳ','ִ','ֵ','ֶ','ַ','ָ','ֹ','ֺ','ֻ','ּ','ֽ','־','ֿ','ׁ','ׂ','ׄ','ׅ','ׇ']

# Letters that carry a dagesh; shva_na_function() tests the consonant that
# precedes a shva for membership in this list.
letter_with_dagesh = ['שּׁ','שּׂ','גּ','דּ','זּ','טּ','יּ','כּ','לּ','מּ','נּ','סּ','פּ','צּ','שּ','תּ','בּ','קּ']
# Default marker character (see `mark`); also the character that
# shva_na_mark_changer() replaces.
rafe = 'ֿ'
# The shva is the first entry of `vowel`.
shva = vowel[0]
# Entries 1-3 of `vowel`; not referenced again except through vowels_limited.
chataf_vowels = vowel[1:4]
# Entries 4, 6, 7, 11 and 20 of `vowel`.
short_vowels = [vowel[4],vowel[6],vowel[7],vowel[11],vowel[20]]
# Two literal entries, entries 5 and 8 of `vowel`, entry 9 of `aleph_bet`,
# and one literal mark.
long_vowels = ['וֹ','וּ', vowel[5], vowel[8], aleph_bet[9], 'ֹ']
# Everything shva_na_function() accepts as "the previous vowel" when it scans
# backwards from a shva.
vowels_limited = vowel[0:4]+short_vowels+long_vowels
dagesh = 'ּ'
maqqaf = '־'
# Not referenced by the code visible in this part of the file.
meteg = 'ֽ'
# Replacement character used by shva_na_mark_changer(); also the commented-out
# alternative for `mark`.
dot = '֯'

# Not referenced by the code visible in this part of the file.
gutterals = ['א','ה','ח','ע','ר' ,'ﬡ','ﬣ','ﬧ']

In [2]:
# `shem` is not referenced by the code visible in this part of the file.
shem = 'יהוה'
# shva_na_function() returns a word untouched when it contains either of the
# next two strings.
shem_vowels = 'יְהֹוָה'
# Built by position: aleph_bet[9], vowel[0], aleph_bet[9], vowel[8].
yy_vowels = aleph_bet[9]+vowel[0]+aleph_bet[9]+vowel[8]

# Spellings of one short word, none of which is referenced by the code
# visible in this part of the file.
# NOTE(review): the two literals below render identically; presumably they
# differ only in the order of the combining marks -- confirm before editing.
kal_backwards_dagesh = 'כָּל'
kal_dagesh = 'כָּל'
# The variants below are assembled by position:
#   first letter : aleph_bet[52] (kal / kol) or aleph_bet[10] (khal / khol)
#   vowel        : vowel[8] (kal / khal) or vowel[20] (kol / khol)
#   last letter  : aleph_bet[12]
# The *_maqqaf and *_space variants append '־' or a space respectively.
kal = aleph_bet[52] + vowel[8] + aleph_bet[12]
khal = aleph_bet[10] + vowel[8] + aleph_bet[12]
kol = aleph_bet[52] + vowel[20] + aleph_bet[12]
kol_maqqaf = aleph_bet[52] + vowel[20] + aleph_bet[12] + '־'
khol_maqqaf = aleph_bet[10] + vowel[20] + aleph_bet[12] + '־'
kol_space = aleph_bet[52] + vowel[20] + aleph_bet[12] + ' '
khol_space = aleph_bet[10] + vowel[20] + aleph_bet[12] + ' '

# Search strings for two short words, bare and with surrounding context.
# None of them is referenced by the code visible in this part of the file.
et = 'אֶת'
ve_et = 'וְאֶת'
# Stand-alone occurrences: a space on both sides.
space_et_space = ' ' + 'אֶת' + ' '
space_ve_et_space = ' ' + 'וְאֶת' + ' '
# Occurrences joined to the next word: leading space, trailing '־'.
et_maqqaf = ' ' + 'אֶת' + '־'
ve_et_maqqaf = ' ' + 'וְאֶת' + '־'

In [3]:
# Words that shva_na_function() returns untouched. The comparison is made
# against the word after trop_remover() has been applied to it.
shva_exceptions = ['שְׁתֵּי','שְׁתָּיִם','שְׁתַּיִם','שְׁנִַים','שְׁנֵי','שְׁתֵּים','שְׁנֵים','שְׁנֵי','שְׁנֵים']
battim = 'בָּתִּים'
vattim = 'בָתִּים'
ana = 'אָנָּא'
anah = 'אָנָּה'
# Not referenced by the code visible in this part of the file.
kamatz_katan_exceptions = [battim, vattim, ana, anah]

In [4]:
# The shem converter "eats" some special characters that sit next to
# occurrences of the shem; this list is meant to stop it from doing that.
# It does not work very reliably, so manual intervention is often necessary.
# (The shem converter itself is not part of the code visible here.)
special_characters = ['{', '}','\\']

In [14]:
## Strips trop (cantillation marks) from words

In [15]:
def trop_remover(word):
    """Return `word` with every character listed in `cant` removed.

    All remaining characters (letters, vowel points, punctuation) are kept
    in their original order.
    """
    return ''.join(ch for ch in word if ch not in cant)

## Marks shva na

In [5]:
# Character that na_marker() inserts in front of a shva to flag it as na'.
mark = rafe
# Alternative marker, currently disabled:
#mark = dot

In [6]:
def na_marker(paragraph, index):
    """Insert `mark` into `paragraph` immediately before position `index`.

    The string is returned unchanged when `mark` already occurs within two
    positions on either side of `index` (position `index` itself is not
    examined), so a shva that has already been marked is not marked twice.

    Args:
        paragraph: the string to mark (callers pass a single word).
        index: position of the character the mark is placed in front of.

    Returns:
        The original string, or a copy with `mark` inserted at `index`.
    """
    # Slices clamp at the ends of the string, so a position near the start
    # or the end neither raises IndexError nor wraps around to the opposite
    # end of the string the way a negative subscript would.
    before = paragraph[max(0, index - 2):index]
    after = paragraph[index + 1:index + 3]
    if any(ch == mark for ch in before + after):
        return paragraph
    return paragraph[:index] + mark + paragraph[index:]

In [7]:
def shva_na_function(word):
    """Mark the shvas in `word` that the rules below classify as na'.

    The word is scanned left to right. For every shva character the function
    looks at the consonant the shva follows, the next consonant and the
    preceding vowel, and calls na_marker() to insert `mark` in front of the
    shva as soon as one rule applies.

    Args:
        word: a single whitespace-delimited token.

    Returns:
        The token with marks inserted. The token is returned unchanged when,
        after trop_remover(), it contains `shem_vowels` or `yy_vowels`, or is
        listed in `shva_exceptions`.
    """
    nt_word = trop_remover(word)
    # Words containing the vocalised shem are skipped entirely.
    if shem_vowels in nt_word or yy_vowels in nt_word:
        return word
    if nt_word in shva_exceptions:
        return word

    # The first character and the last two characters are never examined.
    # NOTE(review): the upper bound is computed once from the original
    # length; every inserted mark shifts the remaining text right by one, so
    # after insertions the tail of a long word may go unscanned -- confirm
    # whether that matters for the input texts.
    for index in range(1, len(word) - 2):
        if word[index] != shva:
            continue

        # Walk back to the consonant this shva belongs to. The scan stops at
        # the start of the word; a shva with no letter in front of it is
        # skipped instead of wrapping around to the end of the word.
        i = index
        while i >= 0 and word[i] not in aleph_bet:
            i = i - 1
        if i < 0:
            continue
        previous_consonant = word[i]

        # Rule: the consonant is the first character of the word.
        if i == 0:
            word = na_marker(word, index)
            continue
        # Rule: the consonant directly follows a maqqaf. The `continue` is
        # required: after the insertion `index` points at the mark itself,
        # which na_marker() does not inspect, so falling through to the
        # rules below could insert a second mark.
        if word[i-1] == maqqaf:
            word = na_marker(word, index)
            continue
        # Rule: the consonant carries a dagesh (listed form, or a dagesh
        # character right after the consonant or right after the shva).
        if previous_consonant in letter_with_dagesh:
            word = na_marker(word, index)
            continue
        if word[i+1] == dagesh or word[index+1] == dagesh:
            word = na_marker(word, index)
            continue

        # Rule: the next consonant is the same letter as this one.
        i = index + 1
        while word[i] not in aleph_bet and i < len(word) - 1:
            i = i + 1
        next_consonant = word[i]
        if next_consonant == previous_consonant:
            word = na_marker(word, index)
            continue

        # Rule: the previous vowel is a shva or a long vowel.
        i = index - 2
        while word[i] not in vowels_limited and i > 0:
            i = i - 1
        if i == 0:
            # The backward scan reached the first character of the word.
            # NOTE(review): this abandons the rest of the word, not just this
            # shva. It looks like the way a word-initial shuruk is exempted
            # (see the note below) -- confirm before changing it.
            return word
        previous_vowel = word[i]
        if previous_vowel == shva or previous_vowel in long_vowels:
            word = na_marker(word, index)

    # Original author's notes:
    # - a shva following a long vowel is na', unless the long vowel is a
    #   word-initial shuruk
    # - still to be decided: how to mark the סקינמלוי letters
    return word

In [8]:
def shva_na_converter(paragraph):
    """Apply shva_na_function() to every word of `paragraph`.

    The paragraph is split on whitespace and the converted words are joined
    back with single spaces, so runs of whitespace (including a trailing
    newline) are not preserved.

    Args:
        paragraph: the text to convert.

    Returns:
        The converted words joined by single spaces.
    """
    # Each word is converted in place by position. Looking the word up with
    # list.index() would be quadratic and would target the first equal token
    # rather than the one currently being processed.
    return ' '.join(shva_na_function(word) for word in paragraph.split())

## Changes shva marking character

In [9]:
def shva_na_mark_changer(paragraph):
    """Return `paragraph` with every `rafe` character replaced by `dot`.

    Every occurrence is replaced, not only those inserted as shva markers.
    The same approach could be adapted to switch a text between other
    marker characters.
    """
    return paragraph.replace(rafe, dot)

In [10]:
def text_converter(paragraph):
    """Run the full conversion on `paragraph`.

    First marks the shvas with shva_na_converter(), then swaps the marker
    character with shva_na_mark_changer(), and returns the result.
    """
    return shva_na_mark_changer(shva_na_converter(paragraph))

## Reading file from disk

In [16]:
# Path of the file to convert; it is opened as UTF-8 text in the next cell.
input_file = 'chumash.tex'

In [17]:
# Load the whole input file into memory as a list of lines.
with open(input_file, mode='r', encoding='utf-8') as infile:
    lines = infile.readlines()

In [18]:
# Convert the input line by line, collecting the results in new_lines.
new_lines = []
line_number = 1
for line_number, line in enumerate(lines, start=1):
    new_lines.append(text_converter(line))
    # Progress trace for troubleshooting: shows the last line that converted.
    print('line number', line_number, 'outputted successfully')
# Leave the counter pointing one past the last processed line.
line_number = len(new_lines) + 1
    
    #if kamatz_finder(line) != []:
        #print(line_number,kamatz_finder(line))
        
    

    

line number 1 outputted successfully
line number 2 outputted successfully
line number 3 outputted successfully
line number 4 outputted successfully
line number 5 outputted successfully
line number 6 outputted successfully
line number 7 outputted successfully
line number 8 outputted successfully
line number 9 outputted successfully
line number 10 outputted successfully
line number 11 outputted successfully
line number 12 outputted successfully
line number 13 outputted successfully
line number 14 outputted successfully
line number 15 outputted successfully
line number 16 outputted successfully
line number 17 outputted successfully
line number 18 outputted successfully
line number 19 outputted successfully
line number 20 outputted successfully
line number 21 outputted successfully
line number 22 outputted successfully
line number 23 outputted successfully
line number 24 outputted successfully
line number 25 outputted successfully
line number 26 outputted successfully
line number 27 output

line number 216 outputted successfully
line number 217 outputted successfully
line number 218 outputted successfully
line number 219 outputted successfully
line number 220 outputted successfully
line number 221 outputted successfully
line number 222 outputted successfully
line number 223 outputted successfully
line number 224 outputted successfully
line number 225 outputted successfully
line number 226 outputted successfully
line number 227 outputted successfully
line number 228 outputted successfully
line number 229 outputted successfully
line number 230 outputted successfully
line number 231 outputted successfully
line number 232 outputted successfully
line number 233 outputted successfully
line number 234 outputted successfully
line number 235 outputted successfully
line number 236 outputted successfully
line number 237 outputted successfully
line number 238 outputted successfully
line number 239 outputted successfully
line number 240 outputted successfully
line number 241 outputted

line number 464 outputted successfully
line number 465 outputted successfully
line number 466 outputted successfully
line number 467 outputted successfully
line number 468 outputted successfully
line number 469 outputted successfully
line number 470 outputted successfully
line number 471 outputted successfully
line number 472 outputted successfully
line number 473 outputted successfully
line number 474 outputted successfully
line number 475 outputted successfully
line number 476 outputted successfully
line number 477 outputted successfully
line number 478 outputted successfully
line number 479 outputted successfully
line number 480 outputted successfully
line number 481 outputted successfully
line number 482 outputted successfully
line number 483 outputted successfully
line number 484 outputted successfully
line number 485 outputted successfully
line number 486 outputted successfully
line number 487 outputted successfully
line number 488 outputted successfully
line number 489 outputted

line number 736 outputted successfully
line number 737 outputted successfully
line number 738 outputted successfully
line number 739 outputted successfully
line number 740 outputted successfully
line number 741 outputted successfully
line number 742 outputted successfully
line number 743 outputted successfully
line number 744 outputted successfully
line number 745 outputted successfully
line number 746 outputted successfully
line number 747 outputted successfully
line number 748 outputted successfully
line number 749 outputted successfully
line number 750 outputted successfully
line number 751 outputted successfully
line number 752 outputted successfully
line number 753 outputted successfully
line number 754 outputted successfully
line number 755 outputted successfully
line number 756 outputted successfully
line number 757 outputted successfully
line number 758 outputted successfully
line number 759 outputted successfully
line number 760 outputted successfully
line number 761 outputted

line number 1016 outputted successfully
line number 1017 outputted successfully
line number 1018 outputted successfully
line number 1019 outputted successfully
line number 1020 outputted successfully
line number 1021 outputted successfully
line number 1022 outputted successfully
line number 1023 outputted successfully
line number 1024 outputted successfully
line number 1025 outputted successfully
line number 1026 outputted successfully
line number 1027 outputted successfully
line number 1028 outputted successfully
line number 1029 outputted successfully
line number 1030 outputted successfully
line number 1031 outputted successfully
line number 1032 outputted successfully
line number 1033 outputted successfully
line number 1034 outputted successfully
line number 1035 outputted successfully
line number 1036 outputted successfully
line number 1037 outputted successfully
line number 1038 outputted successfully
line number 1039 outputted successfully
line number 1040 outputted successfully


In [19]:
# Path the converted lines are written to (UTF-8) in the next cell.
output_file = 'chumash_converted.tex'

In [20]:
# Write the converted lines, making sure each one ends with a newline.
with open(output_file, mode='w', encoding='utf-8') as outfile:
    for line in new_lines:
        # An empty string becomes a bare newline; a line that already ends
        # with a newline is written as it is.
        outfile.write(line if line.endswith('\n') else line + '\n')