In [1]:
import json
from collections import defaultdict

In [2]:
def get_verse_info(src_text):
    # Get Book with number of chapters and number of verses
    # dict of book name to array of numbers for # of verses
    with open(src_text) as f:
        text = json.load(f)
        info = defaultdict(list)
        for verse_info in text:
            book_name = verse_info['book_name']
            chap_num = int(verse_info['chapter_id']) - 1
            if chap_num >= len(info[book_name]):
                info[book_name].append(0)
            info[book_name][chap_num] += 1
    
    return info
ot = get_verse_info('esv_ot.txt')
# for book_name, verses in ot.items():
#     print(book_name, len(verses))
nt = get_verse_info('esv_nt.txt')
# for book_name, verses in nt.items():
#     print(book_name, len(verses))
bible = {**ot, **nt}
for book_name, verses in bible.items():
    print(book_name, len(verses))

Genesis 50
Exodus 40
Leviticus 27
Numbers 36
Deuteronomy 34
Joshua 24
Judges 21
Ruth 4
1 Samuel 31
2 Samuel 24
1 Kings 22
2 Kings 25
1 Chronicles 29
2 Chronicles 36
Ezra 10
Nehemiah 13
Esther 10
Job 42
Psalm 150
Proverbs 31
Ecclesiastes 12
Song of Solomon 8
Isaiah 66
Jeremiah 52
Lamentations 5
Ezekiel 48
Daniel 12
Hosea 14
Joel 3
Amos 9
Obadiah 1
Jonah 4
Micah 7
Nahum 3
Habakkuk 3
Zephaniah 3
Haggai 2
Zechariah 14
Malachi 4
Matthew 28
Mark 16
Luke 24
John 21
Acts 28
Romans 16
1 Corinthians 16
2 Corinthians 13
Galatians 6
Ephesians 6
Philippians 4
Colossians 4
1 Thessalonians 5
2 Thessalonians 3
1 Timothy 6
2 Timothy 4
Titus 3
Philemon 1
Hebrews 13
James 5
1 Peter 5
2 Peter 3
1 John 5
2 John 1
3 John 1
Jude 1
Revelation 22


The rule is: if the line only consists of numbers, then the next line consists of the verse
If it starts with a number and then text, it will be that verse's text.
Maybe get the # of verse per chapter from ESV, and apply it here. Hard to distinguish between chapters in this text

In [3]:
def start_new_verse_or_chap(line, expected_num):
    prefix_len = get_int_prefix(line)
    if prefix_len == -1:
        return False, -1, -1
    else:
        if int(line[:prefix_len]) == expected_num:
            return True, prefix_len, 0
        elif int(line[:prefix_len]) == expected_num + 1:
            return True, prefix_len, 1
        else:
            return False, prefix_len, -1

def get_int_prefix(line):
    try: 
        int_prefix = int(line)
        return len(line)
    except:
        for length in range(3, 0, -1):
            try:
                int_prefix = int(line[:length])
                return length
            except:
                pass
        return -1

def clean_nlt(bible_info, nlt_src, nlt_dest):
    with open(nlt_src, 'r') as src:
        lines = src.readlines()
    
    nlt = defaultdict(lambda: defaultdict(list))
    # dict of book name to chapter to list of verses
    cur_book_name = ''
    book_names = []
    expected_book_idx = 0
    is_read_book_names = False
    current_verse = ""
    current_chapter = 1
    chapter_verse_offset = 0
    for line in lines:
        line = line.strip()
        if line == 'Book Names':
            is_read_book_names = True
        elif is_read_book_names:
            if line:
                book_names.append(line)
            else:
                is_read_book_names = False
        else: # Reading verses
            if expected_book_idx < len(book_names) and line == book_names[expected_book_idx]:
                if current_verse.strip():
                    nlt[cur_book_name][current_chapter].append(current_verse.strip())
                cur_book_name = line
                current_chapter = 1
                expected_book_idx += 1
                current_verse = ''
                chapter_verse_offset = 0
            else:
                target_verse = len(nlt[cur_book_name][current_chapter]) + 2
                new_verse, length, offset = start_new_verse_or_chap(line, target_verse)
                if new_verse: # starting a new verse
                    current_verse = current_verse.strip()
                    if current_verse:
                        nlt[cur_book_name][current_chapter].append(current_verse)
                    chapter_verse_offset += offset
                    
                    # Update chapter after because line will refer to next chapter
                    if current_chapter - 1 < len(bible[cur_book_name]):
                        if len(nlt[cur_book_name][current_chapter]) == bible[cur_book_name][current_chapter - 1]:
                            current_chapter += 1
                            chapter_verse_offset = 0
                    current_verse = line[length:]
                else:
                    # If it's a parseable string but unexpected (new chapter!!)
                    if length > 0:
                        if current_chapter - 1 < len(bible[cur_book_name]) and len(nlt[cur_book_name][current_chapter]) + 1 == bible[cur_book_name][current_chapter - 1]:
                            current_verse = current_verse.strip()
                            if current_verse:
                                nlt[cur_book_name][current_chapter].append(current_verse)  
                            current_chapter += 1
                            chapter_verse_offset = 0
                            

                            current_verse = line[length:]
                    # If we continue the current verse
                    else:
                        current_verse += ' {}'.format(line)
    return nlt

def is_valid(gt, nlt):
    if len(gt) != len(nlt): # Number of books
        print(f'Exptected {len(gt)} # of books, but got {len(nlt)} # of books')
        return False
    
    for book_name in gt:
        if book_name not in nlt:
            print(f'{book_name} not found in NLT')
            return False
        if len(gt[book_name]) != len(nlt[book_name]):
            print(f'Expected {len(gt[book_name])} chapters for {book_name} but got {len(nlt[book_name])}')
            return False
        for chap_idx, verse_counts in enumerate(gt[book_name], 1):
            if len(nlt[book_name][chap_idx]) != verse_counts:
                print(f'Expected {verse_counts} verses for {book_name} {chap_idx} but got {len(nlt[book_name][chap_idx])} verses')
                return False
    
    return True

nlt = clean_nlt(bible, 'nlt_new.txt', 'nlt_clean.txt')
# for name, data in nlt.items():
#     print(name, len(data))
print(is_valid(bible, nlt))
print(bible['Genesis'][0], len(nlt['Genesis'][1]))
for verse_idx, verse in enumerate(nlt['Genesis'][1], 1):
    print(verse_idx, verse)
    print()
# for chapter, verses in nlt['Zechariah'].items():
#     print(chapter, verses)
# for name, data in nlt.items():
#     print(name, len(data))

Expected 16 chapters for Mark but got 12
False
31 31
1 heavens and the earth.

2 The earth was empty, a formless mass cloaked in darkness. And the Spirit of God was hovering over its surface.

3 Then God said, "Let there be light," and there was light.

4 And God saw that it was good. Then he separated the light from the darkness.

5 God called the light "day" and the darkness "night." Together these made up one day.

6 And God said, "Let there be space between the waters, to separate water from water."

7 And so it was. God made this space to separate the waters above from the waters below.

8 And God called the space "sky." This happened on the second day.

9 And God said, "Let the waters beneath the sky be gathered into one place so dry ground may appear." And so it was.

10 God named the dry ground "land" and the water "seas." And God saw that it was good.

11 Then God said, "Let the land burst forth with every sort of grass and seedbearing plant. And let there be trees that grow s

In [4]:
def convert_nlt(bible_info, nlt_src, nlt_dest):
    with open(nlt_src, 'r') as src:
        lines = src.readlines()
        
    nlt = []
    # dict of book name to chapter to list of verses
    cur_book_name = ''
    book_names = []
    expected_book_idx = 0
    is_read_book_names = False
    current_verse = ""
    current_chapter = 1
    for line in lines:
        line = line.strip()
        if line == 'Book Names':
            is_read_book_names = True
        elif is_read_book_names:
            if line:
                book_names.append(line)
            else:
                is_read_book_names = False
        else: # Reading verses
            if expected_book_idx < len(book_names) and line == book_names[expected_book_idx]:
                if current_verse.strip():
                    nlt.append(current_verse.strip())
                cur_book_name = line
                current_chapter = 1
                expected_book_idx += 1
                current_verse = ''
            else:
                prefix_len = get_int_prefix(line)
                if prefix_len >= 0: # new verse
                    nlt.append(current_verse)
                    current_verse = line[prefix_len:]
                else:
                    current_verse += ' {}'.format(line)
                    
    with open(nlt_dest, 'w') as dest:
        for verse in nlt:
            dest.write(verse)
            dest.write('\n')

convert_nlt(bible, 'nlt_new.txt', 'nlt_something.txt')
            

In [122]:
def num_chapters(nlt, book_name):
    return len(nlt[book_name])

def num_verses(nlt, book_name, chapter):
    return len(nlt[book_name][chapter])

def read_nlt_something(bible_info, nlt_src):
    nlt = defaultdict(lambda : defaultdict(list))
    book_names = list(bible_info.keys())
    name_idx = 0
    chapter_idx = 0
    with open(nlt_src, 'r') as f:
        lines = f.readlines()
    
    line_idx = 0
    for book_name in book_names:
        for chapter_idx in range(num_chapters(bible_info, book_name)):
            for verse_idx in range(bible_info[book_name][chapter_idx]):
#                 if line_idx < len(lines):
                nlt[book_name][chapter_idx].append(lines[line_idx])
                line_idx += 1

    return book_names, nlt

book_names, nlt = read_nlt_something(bible, 'nlt_something.txt')

In [127]:
with open('nlt_final.txt', 'w') as f:
    json.dump(nlt, f)

In [129]:
with open('nlt_final.txt') as f:
    data = json.load(f)
    for name in data.keys():
        print(name)

Genesis
Exodus
Leviticus
Numbers
Deuteronomy
Joshua
Judges
Ruth
1 Samuel
2 Samuel
1 Kings
2 Kings
1 Chronicles
2 Chronicles
Ezra
Nehemiah
Esther
Job
Psalm
Proverbs
Ecclesiastes
Song of Solomon
Isaiah
Jeremiah
Lamentations
Ezekiel
Daniel
Hosea
Joel
Amos
Obadiah
Jonah
Micah
Nahum
Habakkuk
Zephaniah
Haggai
Zechariah
Malachi
Matthew
Mark
Luke
John
Acts
Romans
1 Corinthians
2 Corinthians
Galatians
Ephesians
Philippians
Colossians
1 Thessalonians
2 Thessalonians
1 Timothy
2 Timothy
Titus
Philemon
Hebrews
James
1 Peter
2 Peter
1 John
2 John
3 John
Jude
Revelation
