# Script to automatically make diacritical corrections in Hebrew text

## Defines letters and words

In [71]:
aleph_bet = ['א','ב','ג','ד','ה','ו','ז','ח','ט','י','כ','ך','ל','מ','ם',
             'נ','ן','ס','ע','פ','ף','צ','ק','ר','ש','ת','װ','ױ','ײ','יִ',
             'ﬡ','ﬢ','ﬣ','ﬤ','ﬥ','ﬦ','ﬧ','ﬨ','שׁ','שׂ','שּׁ','שּׂ','אַ','אָ',
             'גּ','דּ','הּ','וּ','זּ','טּ','יּ','ךּ','כּ','לּ','מּ','נּ','סּ','ףּ',
             'פּ','צּ','שּ','תּ','וֹ','בֿ','כֿ','פֿ','ﭏ','בּ', 'קּ']

cant = ['֑','֒','֓','֔','֕','֖','֗','֘','֙','֚','֛','֜',
             '֝','֞','֠','֡','֢','֣','֤','֥','֦','֧','֨','֩','֪','֫','֬','֭','֯','׃']

vowel = ['ְ','ֱ','ֲ','ֳ','ִ','ֵ','ֶ','ַ','ָ','ֹ','ֺ','ֻ','ּ','ֽ','־','ֿ','ׁ','ׂ','ׄ','ׅ','ׇ']

letter_with_dagesh = ['שּׁ','שּׂ','גּ','דּ','זּ','טּ','יּ','כּ','לּ','מּ','נּ','סּ','פּ','צּ','שּ','תּ','בּ','קּ']
rafe = 'ֿ'
shva = vowel[0]
chataf_vowels = vowel[1:4]
short_vowels = [vowel[4],vowel[6],vowel[7],vowel[11],vowel[20]]
long_vowels = ['וֹ','וּ', vowel[5], vowel[8], aleph_bet[9], 'ֹ']
vowels_limited = chataf_vowels+short_vowels+long_vowels
dagesh = 'ּ'

gutterals = ['א','ה','ח','ע','ר' ,'ﬡ','ﬣ','ﬧ']             

In [70]:
chataf_vowels[2]

'ֳ'

In [35]:
shem = 'יהוה'
shem_vowels = 'יְהֹוָה'
yy_vowels = aleph_bet[9]+vowel[0]+aleph_bet[9]+vowel[8]

kal = aleph_bet[52] + vowel[8] + aleph_bet[12]
khal = aleph_bet[10] + vowel[8] + aleph_bet[12]
kol_maqqaf = aleph_bet[52] + vowel[20] + aleph_bet[12] + '־'
khol_maqqaf = aleph_bet[10] + vowel[20] + aleph_bet[12] + '־'
kol_space = aleph_bet[52] + vowel[20] + aleph_bet[12] + ' '
khol_space = aleph_bet[10] + vowel[20] + aleph_bet[12] + ' '

et = 'אֶת'
ve_et = 'וְאֶת'
space_et_space = ' ' + 'אֶת' + ' '
space_ve_et_space = ' ' + 'וְאֶת' + ' '
et_maqqaf = ' ' + 'אֶת' + '־'
ve_et_maqqaf = ' ' + 'וְאֶת' + '־'

In [60]:
shva_exceptions = ['שְׁתֵּי','שְׁתָּיִם','שְׁתַּיִם','שְׁנִַים','שְׁנֵי','שְׁתֵּים','שְׁנֵים']
battim = 'בָּתִּים'
vattim = 'בָתִּים'
kamatz_katan_exceptions = [battim, vattim]

## Strips vowels and cantelation marks from words

In [37]:
#removes trop and vowels
def nonalpha_remover(word):
    no_cant_word = ''
    for letter in word:
        if letter.isalpha() == True:
            no_cant_word = no_cant_word + letter
    return no_cant_word

In [38]:
#removes trop but not vowels
def trop_remover(word):
    no_cant_word = ''
    for letter in word:
        if letter not in cant:
            no_cant_word = no_cant_word + letter
    return no_cant_word

## Converts Shem-Havaya to double-yud while preserving cantelation marks

In [39]:
def shem_converter(word):
    prefix = ''
    if word[0] != aleph_bet[9]:
        prefix = prefix + word[0]
        if word[1].isalpha() == False:
            prefix = prefix + word[1]
            if word[2].isalpha() == False:
                prefix = prefix + word[2]
    no_prefix_word = word[len(prefix):]

    if len(prefix) > 0:
        yud1 = aleph_bet[9]
    else:
        yud1 = aleph_bet[9]+vowel[0]

    yud2 = aleph_bet[9]+vowel[8]

    cant_marks = []

    for character in no_prefix_word:
        if character in cant:
            cant_marks.append(character)
    if len(cant_marks) == 0:
        new_shem = prefix + yud1 + yud2
    elif len(cant_marks) == 1:
        new_shem = prefix + yud1 + yud2 + cant_marks[0]
    elif len(cant_marks) == 2:
        new_shem = prefix + yud1 + cant_marks[0] + yud2 + cant_marks[1]

    return new_shem

In [40]:
## Creates new paragraph with double-yud in place of Shem Havaya

In [41]:
def convert_shem(paragraph):
    paragraph = str.replace(paragraph, '־', ' ־ ')
    par_list = paragraph.split()
    for word in par_list:
        index = par_list.index(word)
        if shem in nonalpha_remover(word):
            double_yud = shem_converter(word)
            par_list[index] = double_yud

    new_par = ' '.join(par_list)
    new_par = str.replace(new_par, ' ־ ','־')
    new_par = str.replace(new_par, '׃', '׃')
    return new_par

In [42]:
## use https://he.wikisource.org/wiki/%D7%9E%D7%A7%D7%A8%D7%90 for testing texts

## Puts a kamatz katan and maqqaf in "kol"

In [43]:
#this script does not work on words with trope
#Since pesukim from wikisource have maqqafs and kamatz katans, it shouldn't make it less useful

In [44]:
def kamatz_exception(word):
    if word[-1] != aleph_bet[12]:
        return True
    elif aleph_bet[9] in word:
        return True
    else:
        return False
    #returns "true" if the word is a likely false positive for the word "kol"

In [45]:
def kol_kamatz_katan(paragraph):
    paragraph = str.replace(paragraph, '־', ' ־ ')
    par_list = paragraph.split()
    #Paragraph is now split into words
    for word in par_list:
        index = par_list.index(word)
        if (kal in word or khal in word) and kamatz_exception(word) == False:
            split_word = word.split(vowel[8])
            word = vowel[20].join(split_word)
            par_list[index] = word
            #If "k(h)al" appears in a word, change it to a kamatz katan
    new_par = ' '.join(par_list)
    new_par = str.replace(new_par, ' ־ ','־')
    #Put paragraph back together
    if kol_space in new_par: 
        new_par_split = new_par.split(kol_space)
        new_par = kol_maqqaf.join(new_par_split)
    if khol_space in new_par:
        new_par_split = new_par.split(khol_space)
        new_par = khol_maqqaf.join(new_par_split)
    #Previous if-statements swap a space following "kol" to a maqqaf
    new_par = str.replace(new_par, '׃', '׃')
    return new_par

## Puts a Maqqaf after "et" (when it has a segol)

In [46]:
#this script does not work on words with trope
#Since pesukim from wikisource have maqqafs, this shouldn't be a problem.

In [47]:
def et_fixer(paragraph):
    par_list = paragraph.split()
    if et in par_list:
        new_par_split = paragraph.split(space_et_space)
        new_paragraph = et_maqqaf.join(new_par_split)
        new_par_split = new_paragraph.split(space_ve_et_space)
        new_paragraph = ve_et_maqqaf.join(new_par_split)
    else:
        new_paragraph = paragraph
    return new_paragraph

## Puts a kamatz katan in common kamatz-katan words and situations

In [48]:
#this script does not work on words with trope
#Since pesukim from wikisource have kamatz katans, this shouldn't be too much of a problem.

In [65]:
chakhma = 'חָכְמָה'
chokhma = 'חׇכְמָה'
karban = 'קָרְבַּן'
korban = 'קׇרְבַּן'
kareinu = 'קָרְאֵֽנוּ'
koreinu = 'קׇרְאֵֽנוּ'
karbe_ = 'קָרְבְּ'
korbe_ = 'קׇרְבְּ'
kadshe_ = 'קָדְשְׁ'
kodshe_ = 'קׇדְשְׁ'
kadashim = 'קׇדָשִׁים'
kodashim = 'קׇדָשִׁים'

words_to_fix = [chakhma, karban, kareinu, karbe_, kadshe_, kadashim]
corrected_words = [chokhma, korban, koreinu, korbe_, kodshe_, kodashim]

In [73]:
def kamatz_katan_adder(word):    
    #prevents script from erroneously fixing false positives
    nt_word = trop_remover(word)
    for index in range(0,index(kamatz_katan_exceptions)):
        if kamatz_katan_exceptions[index] in nt_word:
            return word
    
    for index in range(1,len(word)-2):
        if word[index] == vowel[8]:
            i = index
            while word[i] not in aleph_bet:
                i = i+1
            next_consonant = word[i]
            if next_consonant in letter_with_dagesh:
                word[index] = vowel[20]
                
            while word[i] not in vowels_limited:
                i = i+1
            next_vowel = word[i]
            if next_vowel == chataf_vowels[2] and next_consonant in gutterals:
                word[index] = vowel[20]
    
    #add kamatz katan if next vowel is chataf-kamatz and the next consonant is a gutteral
    return word

In [67]:
def kamatz_katan(paragraph):
    #This goes through the list of common words with kamatz katan (besides "kol")
    #and corrects them if they are present.
    #This list can be added to as needed.
    for index in range(len(words_to_fix)):
        paragraph = paragraph.replace(words_to_fix[index],corrected_words[index])
    par_list = paragraph.split()
    
    #Calls the kamatz_katan_adder for each word in paragraph, if that word has a kamatz at all
    for index in range(len(par_list)):
        if vowel[8] in par_list[index]:
            par_list[index] = kamatz_katan_adder(par_list[index])

                #paragraph = [paragraph[0:index]+vowel[20]+paragraph[index+1:len(paragraph)]]
    paragraph = ' '.join(par_list)
    return paragraph  

In [None]:
#kamatz katan adder needs to be tested

## Marks shva na'

In [51]:
#Inserts a rafe to mark a shva na'
def na_marker(paragraph,index):
    if paragraph[index-1] == rafe or paragraph[index+1] == rafe:
        return paragraph
        #does nothing, if the shva is already marked with a rafe
    else:
        par_start = paragraph[:index]
        par_end = paragraph[index:]
        new_par = par_start+rafe+par_end
        return new_par
        #adds a rafe over the input letter

In [52]:
#Determines what shvas are na' in a word, and calls the program to mark them
def shva_na_function(word):
    nt_word = trop_remover(word)
    #this skips instances of shem hashem
    if nt_word == shem_vowels or nt_word == yy_vowels:
        return word
    if nt_word in shva_exceptions:
        return word
    for index in range(1,len(word)-2):
        i = index
        if word[index] != shva:
            continue
            #this doesn't bother with the loop if the letter isn't a shva
            
        while word[i] not in aleph_bet:
            i = i-1
        previous_consonant = word[i]
        #if the previous consonant is the beginning of the word, the shva is na'
        if i == 0:
            word = na_marker(word,index)
            continue
        #if the previous consonant has a dagesh, the shva must be na'
        if previous_consonant in letter_with_dagesh:
            word = na_marker(word,index)
            continue
        
        #determines the previous vowel
        while word[i] not in vowels_limited:
            i = i-1
        previous_vowel = word[i]
        #if the previous vowel is a shva, the current shva is na'
        if previous_vowel == shva:
            word = na_marker(word,index)
        if previous_vowel in long_vowels and i != 0:
            word = na_marker(word,index)    
        
        #a shva following a long vowel is na', unless the long vowel is word-initial shuruk
    
    return word

In [53]:
#Calls the shva_na_function for each word in the input paragraph
def shva_na_converter(paragraph):
    par_list = paragraph.split()
    for word in par_list:
        index = par_list.index(word)
        par_list[index] = shva_na_function(word)
    new_par = ' '.join(par_list)
    return new_par

In [54]:
paragraph = 'בַּחֹ֤דֶשׁ הָרִאשׁוֹן֙ הוּא־חֹ֣דֶשׁ נִיסָ֔ן בִּשְׁנַת֙ שְׁתֵּ֣ים עֶשְׂרֵ֔ה לַמֶּ֖לֶךְ אֲחַשְׁוֵר֑וֹשׁ הִפִּ֣יל פּוּר֩ ה֨וּא הַגּוֹרָ֜ל לִפְנֵ֣י הָמָ֗ן מִיּ֧וֹם ׀ לְי֛וֹם וּמֵחֹ֛דֶשׁ לְחֹ֥דֶשׁ שְׁנֵים־עָשָׂ֖ר הוּא־חֹ֥דֶשׁ אֲדָֽר׃'

In [55]:
shva_na_converter(paragraph)

'בַּחֹ֤דֶשׁ הָרִאשׁוֹן֙ הוּא־חֹ֣דֶשׁ נִיסָ֔ן בִּשְׁנַת֙ שְׁתֵּ֣ים עֶשְׂרֵ֔ה לַמֶּ֖לֶךְ אֲחַשְׁוֵר֑וֹשׁ הִפִּ֣יל פּוּר֩ ה֨וּא הַגּוֹרָ֜ל לִפְנֵ֣י הָמָ֗ן מִיּ֧וֹם ׀ לְֿי֛וֹם וּמֵחֹ֛דֶשׁ לְֿחֹ֥דֶשׁ שְֿׁנֵים־עָשָׂ֖ר הוּא־חֹ֥דֶשׁ אֲדָֽר׃'

## Runs Paragraph through all converters

In [56]:
def text_converter(paragraph):
    #Comment out components of the script you don't want to run
    paragraph = convert_shem(paragraph)
    paragraph = kol_kamatz_katan(paragraph)
    paragraph = et_fixer(paragraph)
    paragraph = kamatz_katan(paragraph)
    paragraph = shva_na_converter(paragraph)
    return paragraph

## Reading file from disk

In [57]:
input_file = 'olat_tamid_2.1.tex'

In [58]:
with open(input_file, 'r', encoding='utf-8') as infile:
    lines = list(infile.readlines())

In [59]:
new_lines = []
for line in lines:
    new_lines.append(text_converter(line))

UnboundLocalError: local variable 'new_paragraph' referenced before assignment

In [30]:
output_file = 'olat_tamid_converted.tex'

In [31]:
with open(output_file, 'w', encoding='utf-8') as outfile:
    for line in new_lines:
        if line != '':
            if line[-1] != '\n':
                outfile.write(line + '\n')
            else:
                outfile.write(line)
        else:
            outfile.write('\n')