In [1]:
import requests
import json
from bs4 import BeautifulSoup

In [2]:
# Starting point of the scrape: the site's index page of Bible versions.
url = 'https://www.biblestudytools.com/bible-versions/'

In [3]:
def fetch_html_from_url(url):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed document, or ``None`` when the request or the parsing
        failed. Failures are printed, not raised.
    """
    # Bound before the ``try`` so the final ``return`` cannot raise
    # UnboundLocalError after one of the handlers below swallows an error.
    soup = None
    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=10)

        # Turn 4xx/5xx responses into an exception.
        response.raise_for_status()

        # Parse the HTML content of the page.
        soup = BeautifulSoup(response.content, 'html.parser')

    except requests.exceptions.RequestException as e:
        print(f"Failed to make the request: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    return soup

# Parsed HTML of the versions index page.
soup_obj = fetch_html_from_url(url)

In [4]:
# Containers that each wrap one version link (9 matched when this was run).
version_body = soup_obj.find_all("div", class_="p-2 w-full")

In [5]:
len(version_body)

9

In [6]:
# Build {alias: [full name, landing-page URL]} from the version containers.
version_dict = {}
for version in version_body:
    # The anchor text reads "<full name> <ALIAS>"; its href is the version page.
    version_info = version.find("a")
    if version_info is None:
        # Skip containers that carry no link instead of failing on None.
        continue

    version_name_list = version_info.text.strip().split()
    if not version_name_list:
        # Blank link text: there is no alias to key the entry by.
        continue

    # Last word is the alias, everything before it is the version name.
    *name_words, version_alias = version_name_list
    version_name = ' '.join(name_words)
    version_link = version_info.get('href')
    version_dict[version_alias] = [version_name, version_link]

    print(version_name)
    print(version_link)
    print(version_alias)
    print()

Holman Christian Standard Bible
https://www.biblestudytools.com/csb/
CSB

English Standard Version
https://www.biblestudytools.com/esv/
ESV

King James Version
https://www.biblestudytools.com/kjv/
KJV

The Message Bible
https://www.biblestudytools.com/msg/
MSG

New American Standard Bible
https://www.biblestudytools.com/nas/
NAS

New International Version
https://www.biblestudytools.com/niv/
NIV

New King James Version
https://www.biblestudytools.com/nkjv/
NKJV

New Living Translation
https://www.biblestudytools.com/nlt/
NLT

New Revised Standard
https://www.biblestudytools.com/nrs/
NRS



In [7]:
version_dict['ESV'][1]

'https://www.biblestudytools.com/esv/'

In [8]:
# Index 1 of each version_dict entry is the version's landing-page URL.
url_new = version_dict['ESV'][1]

In [9]:
# Parsed HTML of the ESV landing page.
esv_soup_obj = fetch_html_from_url(url_new)

In [10]:
# Candidate book containers on the ESV page (71 matched when this was run).
books_in_esv = esv_soup_obj.find_all("div", class_="text-center")

In [11]:
# Build {alias: [(book name, book URL), ...]} for every scraped version.
books_dict = {}

for alias in version_dict.keys():
    # Landing page of this version, which lists its books.
    alias_link = version_dict[alias][1]

    # Load the version page.
    book_data = fetch_html_from_url(alias_link)

    # A failed fetch may yield None; treat that as "no books found" instead
    # of crashing on ``None.find_all``.
    if book_data is not None:
        books_in_vers = book_data.find_all("div", class_="text-center")
    else:
        books_in_vers = []

    # Collect (name, link) for every container that actually holds a link.
    books_list = []
    for book in books_in_vers:
        book_info = book.find('a')
        if book_info:
            books_list.append((book_info.text.strip(), book_info.get('href')))

    # The final match is discarded (presumably a trailing non-book link --
    # TODO confirm).
    # NOTE(review): only the last entry is trimmed; a leading non-book link
    # such as a promotional banner is kept, so index 0 is not guaranteed to
    # be the first book.
    books_dict[alias] = books_list[:-1]

In [12]:
len(books_in_esv)

71

In [13]:
books_dict.keys()

dict_keys(['CSB', 'ESV', 'KJV', 'MSG', 'NAS', 'NIV', 'NKJV', 'NLT', 'NRS'])

In [14]:
books_dict['CSB'][:3]

[('Operation Christmas Child – Shoebox Collection Week is Here!',
  'https://samaritanspurse.org/what-we-do/operation-christmas-child/?utm_source=OCC23-Salem-Web-Alert-Bar&utm_medium=referral&utm_campaign=m_YOCC-SALW&utm_content=salem-alert-bar'),
 ('Genesis', 'https://www.biblestudytools.com/csb/genesis/'),
 ('Exodus', 'https://www.biblestudytools.com/csb/exodus/')]

In [15]:
books_dict['CSB'][0][0]

'Operation Christmas Child – Shoebox Collection Week is Here!'

In [16]:
import re

In [17]:
def get_chapters(book, book_link):
    """Fetch the verse text of every chapter of one book, sequentially.

    Parameters
    ----------
    book : str
        Name of the book. Not used by the function; kept for callers.
    book_link : str
        URL of the book page. Chapter pages are requested at
        ``f'{book_link}{chapter}.html'``.

    Returns
    -------
    list
        One entry per chapter, in order. Each entry is a list of cleaned
        verse strings, or ``None`` when that chapter page could not be
        fetched. The list is empty when the book page is unavailable or
        has no chapter grid.
    """
    chapter_store = []

    # Fetch the book page; a failed fetch may yield None.
    book_info = fetch_html_from_url(book_link)
    if book_info is None:
        return chapter_store

    # The chapter count is the number of links inside the 'grid' container.
    # Pages without that container are not book pages, so there is nothing
    # to download.
    chapter_grid = book_info.find('div', class_='grid')
    if chapter_grid is None:
        return chapter_store
    num_chaps = len(chapter_grid.find_all('a', class_='text-center'))

    for chapter in range(1, num_chaps + 1):
        chap_data = fetch_html_from_url(f'{book_link}{chapter}.html')
        if chap_data is None:
            # Keep positions aligned with chapter numbers.
            chapter_store.append(None)
            continue

        # Each 'leading-8' div is stored as one verse: newlines become
        # spaces and runs of spaces are collapsed. An empty match simply
        # yields an empty list.
        verses = [
            re.sub(' +', ' ', verse.text.replace('\n', ' ').strip())
            for verse in chap_data.find_all('div', class_='leading-8')
        ]
        chapter_store.append(verses)

    return chapter_store

In [18]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import re

def fetch_html_from_url(url):
    """Request ``url`` with a browser-like User-Agent and parse the reply.

    Returns the BeautifulSoup tree of the response body. Any failure is
    printed and ``None`` is returned instead of raising.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        # Non-2xx responses are handled by the request-error branch below.
        page.raise_for_status()
        return BeautifulSoup(page.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Failed to make the request: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Reached only when one of the handlers above ran.
    return None

def fetch_chapter_data(chapter, book_link):
    """Fetch one chapter page and return its cleaned verse strings.

    Parameters
    ----------
    chapter : int
        Chapter number; the page requested is
        ``f'{book_link}{chapter}.html'``.
    book_link : str
        URL of the book page.

    Returns
    -------
    list of str or None
        One cleaned string per ``div.leading-8`` on the page (an empty
        list when none match), or ``None`` when the page could not be
        fetched.
    """
    chap_data = fetch_html_from_url(f'{book_link}{chapter}.html')
    if chap_data is None:
        return None

    # Newlines become spaces and runs of spaces are collapsed. The former
    # heading lookup on the first div was never used and raised IndexError
    # on pages without verse divs, so it is gone.
    return [
        re.sub(' +', ' ', verse.text.replace('\n', ' ').strip())
        for verse in chap_data.find_all('div', class_='leading-8')
    ]

def get_chapters(book, book_link):
    """Fetch every chapter of one book concurrently.

    Parameters
    ----------
    book : str
        Name of the book. Not used by the function; kept for callers.
    book_link : str
        URL of the book page.

    Returns
    -------
    list
        One entry per chapter, in chapter order, as returned by
        ``fetch_chapter_data`` (a list of verse strings, or ``None`` for a
        chapter that could not be fetched). The list is empty when the
        book page is unavailable or has no chapter grid.
    """
    book_info = fetch_html_from_url(book_link)
    if book_info is None:
        return []

    # The chapter count is the number of links inside the 'grid' container.
    # A page without that container is not a book page; return nothing
    # instead of failing on ``None.find_all``.
    chapter_grid = book_info.find('div', class_='grid')
    if chapter_grid is None:
        return []
    num_chaps = len(chapter_grid.find_all('a', class_='text-center'))

    # Executor.map yields results in input order, so entry i holds
    # chapter i + 1. The repeated book_link supplies the second argument.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return list(
            executor.map(
                fetch_chapter_data,
                range(1, num_chaps + 1),
                [book_link] * num_chaps,
            )
        )

# # Example usage:
# book_link = "https://example.com/book/"
# chapters = get_chapters("Example Book", book_link)
# print(chapters)

In [35]:
# Second-to-last NKJV entry, used as a small sample book (Jude, per the output shown below).
book_name, book_link = books_dict['NKJV'][-2]

In [36]:
# Chapters of the sample book: one list of verse strings per chapter.
chaps_gotten = get_chapters(book_name, book_link)

In [37]:
chaps_gotten[0]

['1 Jude, a bondservant of Jesus Christ, and brother of James, To those who are called, sanctified [a] by God the Father, and preserved in Jesus Christ:',
 '2 Mercy, peace, and love be multiplied to you.',
 '3 Beloved, while I was very diligent to write to you concerning our common salvation, I found it necessary to write to you exhorting you to contend earnestly for the faith which was once for all delivered to the saints.',
 '4 For certain men have crept in unnoticed, who long ago were marked out for this condemnation, ungodly men, who turn the grace of our God into lewdness and deny the only Lord God [b] and our Lord Jesus Christ.',
 '5 But I want to remind you, though you once knew this, that the Lord, having saved the people out of the land of Egypt, afterward destroyed those who did not believe.',
 '6 And the angels who did not keep their proper domain, but left their own abode, He has reserved in everlasting chains under darkness for the judgment of the great day;',
 '7 as Sodom

In [None]:
books_dict.keys()

In [None]:
# (name, link) pairs scraped for the CSB; the scraping loop further down reassigns bible_books per version.
bible_books = books_dict['CSB']

In [None]:
# Get all chapters for all books
import os
import time

# Create the output folder before scraping, so a missing directory cannot
# throw away the results at the very end of a long run.
os.makedirs('data', exist_ok=True)

bible_versions_books = {}
# Only the second version in books_dict is scraped; widen the slice to
# scrape more.
for version in list(books_dict.keys())[1:2]:
    saved_book = {}

    # Retrieve books in version
    bible_books = books_dict[version]

    for book, book_link in bible_books:
        # Retrieve and store the chapters of this book
        saved_book[book] = get_chapters(book, book_link)

        # Introduce a time delay to avoid burdening the server
        time.sleep(1)  # You can adjust the sleep duration as needed

    bible_versions_books[version] = saved_book

# Persist the accumulated {version: {book: chapters}} mapping. Dumping
# ``saved_book`` would keep only the last version scraped and would fail
# with NameError when no version was processed.
with open('data/bible_versions_book_new.json', 'w', encoding='utf-8') as f:
    json.dump(bible_versions_books, f)
    

In [None]:
bible_versions_books.keys()

In [None]:
list(books_dict.keys())[:1]

In [None]:
list(books_dict.keys())

In [None]:
# NOTE(review): get_chapters returns lists of verse strings (or None) per chapter,
# so calling .find_all on chaps_gotten[1] raises; this looks like a leftover debugging cell.
chaps_gotten[1].find_all('div', class_='leading-8')[0].text.strip()

In [None]:
chaps_gotten[4]