In [2]:
import csv, json

In [3]:
# Input CSV files, one per group of books. All paths are relative to the
# working directory; get_sedras() reads the fourth column of each row.
torah = './tanakh_texts/torah.csv'
megillot = './tanakh_texts/chamesh_megillot.csv'
navi1 = './tanakh_texts/neviim_rishonim.csv'
navi2 = './tanakh_texts/neviim_achronim.csv'
emet = './tanakh_texts/sifrei_emet.csv'
ketuvim = './tanakh_texts/ketuvim.csv'

# Every input file, in the order the export loop walks them.
all_tanakh = [torah,megillot,navi1,navi2,emet,ketuvim]

# Numeric value of each Hebrew letter used when reading chapter, verse and
# seder numerals. The alphabet is taken in three runs: the first nine letters
# count 1-9, the next nine count 10-90, and the last four count 100-400.
# Final letter forms are not mapped.
letters_to_numbers = {
    letter: rank * scale
    for scale, letters in (
        (1, "אבגדהוזחט"),
        (10, "יכלמנסעפצ"),
        (100, "קרשת"),
    )
    for rank, letter in enumerate(letters, start=1)
}

# Hebrew book name -> English book name written to the output CSV.
# Keys must match the book field of the source verse templates exactly:
# reference_parser() looks the book up here and raises KeyError on a miss.
sefer_names = {
    "בראשית":"Genesis",
    "שמות":"Exodus",
    "ויקרא":"Leviticus",
    "במדבר":"Numbers",
    "דברים":"Deuteronomy",
    "יהושע":"Joshua",
    "שופטים":"Judges",
    "שמואל א":"1Samuel",
    "שמואל ב":"2Samuel",
    "מלכים א":"1Kings",
    "מלכים ב":"2Kings",
    "ישעיהו":"Isaiah",
    "ירמיהו":"Jeremiah",
    "יחזקאל":"Ezekiel",
    "הושע":"Hosea",
    "יואל":"Joel",
    "עמוס":"Amos",
    "עובדיה":"Obadiah",
    "יונה":"Jonah",
    "מיכה":"Micah",
    "נחום":"Nahum",
    "חבקוק":"Habakkuk",
    "צפניה":"Zephaniah",
    "חגי":"Haggai",
    "זכריה":"Zechariah",
    "מלאכי":"Malachi",
    "שיר השירים":"Song of Songs",
    "רות":"Ruth",
    "אסתר":"Esther",
    "קהלת":"Ecclesiastes",
    "איכה":"Lamentations",
    "תהלים":"Psalms",
    "איוב":"Job",
    "משלי":"Proverbs",
    "דניאל":"Daniel",
    "עזרא":"Ezra",
    "נחמיה":"Nehemiah",
    "דברי הימים א":"1Chronicles",
    "דברי הימים ב":"2Chronicles"
}

def gematria_parser(string):
    """Add up the numeric values of the Hebrew letters in ``string``.

    Each character is looked up in ``letters_to_numbers``; any character
    without an entry there (punctuation, markup, unmapped letters)
    contributes nothing.

    Args:
        string: Text to evaluate, e.g. a Hebrew numeral.

    Returns:
        int: The total of the mapped characters, 0 if none are mapped.
    """
    return sum(letters_to_numbers.get(char, 0) for char in string)

In [4]:
def reference_parser(refstring):
    """Parse the first verse template in ``refstring`` into an English reference.

    The template has the form ``{{...פסוק|<book>|<chapter>|<verse>}}``,
    optionally with more ``|``-separated fields after the verse, for example
    ``{{מ:פסוק|בראשית|כ|יב}}``. Text before and after the template is ignored.

    Args:
        refstring: Text containing the verse template.

    Returns:
        dict: ``book_en`` is the English book name from ``sefer_names``;
        ``chap_en`` and ``verse_en`` are the chapter and verse as integers
        computed by ``gematria_parser``.

    Raises:
        ValueError: If the template marker or one of the field delimiters
            is missing.
        KeyError: If the Hebrew book name has no entry in ``sefer_names``.
    """
    marker = "פסוק|"
    book_start = refstring.index(marker) + len(marker)
    # Searching from book_start + 1 means the book field is at least one
    # character long.
    book_end = refstring.index("|", book_start + 1)
    book = refstring[book_start:book_end]

    chap_start = book_end + 1
    chap_end = refstring.index("|", chap_start)
    chapter = refstring[chap_start:chap_end]

    # The verse field stops at whichever comes first after it starts: a "|"
    # (another template field follows) or a "}" (the template closes).
    # Both searches begin at the verse itself, so braces earlier in the
    # string and pipes belonging to later templates cannot stretch or
    # empty the slice.
    verse_start = chap_end + 1
    pipe_at = refstring.find("|", verse_start)
    brace_at = refstring.find("}", verse_start)
    terminators = [pos for pos in (pipe_at, brace_at) if pos != -1]
    if not terminators:
        raise ValueError(
            "no '|' or '}' terminates the verse field in %r" % (refstring,)
        )
    verse = refstring[verse_start:min(terminators)]

    return {
        "book_en": sefer_names[book],
        "chap_en": gematria_parser(chapter),
        "verse_en": gematria_parser(verse),
    }

In [5]:
def get_sedras(bookset):
    """Collect every seder marker found in one CSV file.

    Only the fourth column of each row is examined. A row counts when that
    column contains ``סדר=``; the seder numeral is the text between that
    marker and the next ``|`` or ``}``, and the seder's starting verse is
    whatever ``reference_parser`` extracts from the same column.

    Args:
        bookset: Path of the CSV file to read (UTF-8).

    Returns:
        list[dict]: One dict per matching row, in file order, with keys
        ``eng_num`` (an int, or a string such as ``"12*"`` when the Hebrew
        numeral contains ``*``), ``heb_num`` (the raw numeral text) and
        ``start`` (the dict returned by ``reference_parser``).

    Raises:
        ValueError: If a seder marker is not followed by ``|`` or ``}``, or
            the verse template in the row is malformed.
        KeyError: If the row names a book missing from ``sefer_names``.
    """
    seder_marker = "סדר="
    sedras = []
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handed to the reader intact.
    with open(bookset, 'r', encoding='utf-8', newline='') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            # Blank lines come back as [] and short rows have no fourth
            # column; neither can carry a seder marker.
            if len(row) < 4 or seder_marker not in row[3]:
                continue
            text = row[3]
            start_seder_i = text.index(seder_marker) + len(seder_marker)

            # The numeral ends at the first "|" or "}" at or after its start.
            # Both searches begin at the numeral, so a brace earlier in the
            # cell is never mistaken for its end.
            pipe_i = text.find("|", start_seder_i)
            brace_i = text.find("}", start_seder_i)
            ends = [pos for pos in (pipe_i, brace_i) if pos != -1]
            if not ends:
                raise ValueError(
                    "no '|' or '}' terminates the seder number in %r" % (text,)
                )
            seder_num_heb = text[start_seder_i:min(ends)]

            seder_num = gematria_parser(seder_num_heb)
            # A starred numeral keeps its star, which turns the number
            # into a string.
            if "*" in seder_num_heb:
                seder_num = str(seder_num) + "*"

            sedras.append({
                "eng_num": seder_num,
                "heb_num": seder_num_heb,
                "start": reference_parser(text),
            })
    return sedras

In [6]:
# Parse the Torah file on its own and keep the result for inspection.
# Nothing else in the visible cells reads `sedarim_torah`; the export loop
# calls get_sedras(torah) again as part of `all_tanakh`.
sedarim_torah = get_sedras(torah)

In [17]:
def write_csv(sedra_list,output_name):
    """Write a list of seder records to a CSV file.

    The file is created (or overwritten) as UTF-8 with a leading byte-order
    mark. It starts with the header row
    ``sedra_num,heb_num,book,chap,verse`` followed by one row per record.

    Args:
        sedra_list: Iterable of dicts with keys ``eng_num``, ``heb_num`` and
            ``start``, where ``start`` is a dict with keys ``book_en``,
            ``chap_en`` and ``verse_en``.
        output_name: Path of the file to write.
    """
    header = ['sedra_num','heb_num','book','chap','verse']
    # newline='' leaves line-ending handling to the csv writer.
    with open(output_name, 'w', encoding='utf-8-sig', newline='') as out_file:
        writer = csv.writer(out_file, delimiter=',')
        writer.writerow(header)
        for entry in sedra_list:
            record = [entry['eng_num'], entry['heb_num']]
            start = entry['start']
            record += [start['book_en'], start['chap_en'], start['verse_en']]
            writer.writerow(record)

In [18]:
# Export one seder list per input file. The output name is the input path
# with its directory prefix swapped for "sedarim_list_", so
# './tanakh_texts/torah.csv' is written to 'sedarim_list_torah.csv'
# (a relative path, resolved against the current working directory).
for bookset in all_tanakh:
    csv_name = bookset.replace("./tanakh_texts/","sedarim_list_")
    sedarim = get_sedras(bookset)
    write_csv(sedarim,csv_name)