In [22]:
import requests
import time 
import concurrent.futures
import re
import os
import json
import shutil
from bs4 import BeautifulSoup

In [2]:
# Index page of biblestudytools.com that lists the available Bible translations.
url = 'https://www.biblestudytools.com/bible-versions/'

In [3]:
def fetch_html_from_url(url):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` when the request or the parsing fails. In the failure case a
        short message describing the error is printed.
    """
    # Bind the result up front: without this, any exception below left `soup`
    # undefined and `return soup` raised UnboundLocalError.
    soup = None

    try:
        # A timeout keeps a stalled connection from blocking the notebook forever.
        response = requests.get(url, timeout=10)

        # Turn 4xx/5xx responses into exceptions handled below.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

    except requests.exceptions.RequestException as e:
        print(f"Failed to make the request: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    return soup

# Fetch and parse the translations index page.
soup_obj = fetch_html_from_url(url)

In [4]:
# Every <div class="p-2 w-full"> on the index page; the loop below reads one translation from each.
version_body = soup_obj.find_all("div", class_="p-2 w-full")

In [5]:
len(version_body)

9

In [6]:
# Map each translation's abbreviation (e.g. "ESV") to [full name, landing-page URL].
version_dict = {}
for version in version_body:
    version_info = version.find("a")
    if version_info is None:
        # find() returns None when the block contains no link; nothing to record.
        continue

    # The link text reads "<full name> <ALIAS>", so the last token is the alias.
    version_name_list = version_info.text.strip().split()
    if not version_name_list:
        # A link without any text has neither a name nor an alias.
        continue

    version_name = ' '.join(version_name_list[:-1])
    version_link = version_info.get('href')
    version_alias = version_name_list[-1]
    version_dict[version_alias] = [version_name, version_link]
    print(version_name)
    print(version_link)
    print(version_alias)
    print()

Holman Christian Standard Bible
https://www.biblestudytools.com/csb/
CSB

English Standard Version
https://www.biblestudytools.com/esv/
ESV

King James Version
https://www.biblestudytools.com/kjv/
KJV

The Message Bible
https://www.biblestudytools.com/msg/
MSG

New American Standard Bible
https://www.biblestudytools.com/nas/
NAS

New International Version
https://www.biblestudytools.com/niv/
NIV

New King James Version
https://www.biblestudytools.com/nkjv/
NKJV

New Living Translation
https://www.biblestudytools.com/nlt/
NLT

New Revised Standard
https://www.biblestudytools.com/nrs/
NRS



In [7]:
version_dict['ESV'][1]

'https://www.biblestudytools.com/esv/'

In [10]:
# Landing-page URL of the ESV (index 1 of each entry is the link).
url_new = version_dict['ESV'][1]

In [11]:
# Parsed ESV landing page, used to explore how the book list is laid out.
esv_soup_obj = fetch_html_from_url(url_new)

In [12]:
# All <div class="text-center"> elements on the ESV page — presumably one per book link (TODO confirm).
# NOTE(review): not referenced by the later cells visible here.
books_in_esv = esv_soup_obj.find_all("div", class_="text-center")

In [13]:
# Build {version alias: [(book name, book URL), ...]} by scraping the landing
# page of every translation collected in version_dict.
books_dict = {}

for alias in version_dict.keys():
    books_list = []
    # Landing-page URL of this translation
    alias_link = version_dict[alias][1]

    # Load the page that lists the translation's books
    version_page = fetch_html_from_url(alias_link)

    # Candidate containers for the book list; none when the page failed to load
    book_grid = version_page.find_all("div", class_="mt-3") if version_page is not None else []

    if not book_grid:
        # min() below would raise on an empty sequence; record the translation
        # with no books so the remaining translations are still processed.
        print(f"No book list found for {alias} at {alias_link}")
        books_dict[alias] = books_list
        continue

    # Keep the "mt-3" block that is smallest by len() (for a BeautifulSoup tag,
    # len() is the number of direct children).
    book_data = min(book_grid, key=len)
    books_in_vers = book_data.find_all("div", class_="text-center")

    # Get book names and their links
    for book in books_in_vers:
        book_info = book.find('a')
        if book_info:
            book_link = book_info.get('href')
            book_name = book_info.text.strip()
            books_list.append((book_name, book_link))

    books_dict[alias] = books_list

In [14]:
books_dict.keys()

dict_keys(['CSB', 'ESV', 'KJV', 'MSG', 'NAS', 'NIV', 'NKJV', 'NLT', 'NRS'])

In [15]:
books_dict['CSB'][-3:]

[('3 John', 'https://www.biblestudytools.com/csb/3-john/'),
 ('Jude', 'https://www.biblestudytools.com/csb/jude/'),
 ('Revelation', 'https://www.biblestudytools.com/csb/revelation/')]

In [16]:
books_dict['CSB'][0][0]

'Genesis'

In [17]:
def fetch_html_from_url(url):
    """Fetch ``url`` with a browser-like User-Agent and parse it as HTML.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` tree (``html.parser`` backend) on success, or
        ``None`` if the request or the parsing raised. The error is printed
        rather than propagated.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        reply = requests.get(url, headers=browser_headers, timeout=10)
        # Non-2xx statuses become exceptions handled just below.
        reply.raise_for_status()
        return BeautifulSoup(reply.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Failed to make the request: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Reached only after one of the handlers above has reported a failure.
    return None

def fetch_chapter_data(chapter, book_link):
    """Download one chapter page and return its text blocks.

    Args:
        chapter: Chapter number; the page requested is
            ``{book_link}{chapter}.html``.
        book_link: URL of the book's landing page, used as the prefix of the
            chapter URL.

    Returns:
        A list with one cleaned string per ``<div class="leading-8">`` on the
        page (empty if the page has none), or ``None`` when the page could not
        be fetched.
    """
    temp_chap_link = f'{book_link}{chapter}.html'
    chap_data = fetch_html_from_url(temp_chap_link)

    if chap_data is None:
        return None

    # The previous version also read headings from the first block into two
    # unused variables, which raised IndexError on pages without any
    # "leading-8" block. That lookup is gone, so such pages now yield [].
    verses = []
    for verse in chap_data.find_all('div', class_='leading-8'):
        # Drop <sup> elements so their text does not leak into the verse.
        for sup_tag in verse.find_all('sup'):
            sup_tag.decompose()

        # Flatten newlines and collapse runs of spaces into a single space.
        verse_text = re.sub(' +', ' ', verse.text.replace('\n', ' ').strip())
        verses.append(verse_text)

    return verses

def get_chapters(book, book_link, max_workers=4):
    """Fetch every chapter of one book, in chapter order.

    Args:
        book: Name of the book. Not used by the lookup itself; kept so
            existing callers keep working.
        book_link: URL of the book's landing page. Chapter pages are fetched
            through ``fetch_chapter_data(chapter, book_link)``.
        max_workers: Upper bound on simultaneous chapter downloads. Kept small
            by default to limit the number of parallel connections opened to
            the server; pass ``None`` for the executor's own default.

    Returns:
        A list with one entry per chapter (whatever ``fetch_chapter_data``
        returned for it), or an empty list when the book page cannot be
        loaded, has no chapter grid, or lists no chapters.
    """
    book_info = fetch_html_from_url(book_link)
    if book_info is None:
        return []

    # find() returns None when the page has no <div class="grid">; calling
    # find_all on that None used to raise AttributeError.
    chapter_grid = book_info.find('div', class_='grid')
    if chapter_grid is None:
        return []

    # One link per chapter inside the grid.
    num_chaps = len(chapter_grid.find_all('a', class_='text-center'))
    if num_chaps == 0:
        return []

    chapters = range(1, num_chaps + 1)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() yields results in input order, so the list stays chapter-ordered.
        return list(executor.map(fetch_chapter_data, chapters, [book_link] * num_chaps))


def make_dir(path):
    """Create a fresh, empty directory at ``path``.

    Anything already present at ``path`` is removed with ``shutil.rmtree``
    first, so existing contents are lost. Missing parent directories are
    created as needed.
    """
    stale = os.path.exists(path)
    if stale:
        # Start from a clean slate: drop the old tree before recreating it.
        shutil.rmtree(path)
    os.makedirs(path)
    
    

def make_path(root_dir, version, book=None, chapter=None, file_extension=".txt"):
    """Build the storage path for a version, book or chapter file.

    Args:
        root_dir: Directory under which everything is stored.
        version: Translation folder name; always part of the result.
        book: Optional book folder name; skipped when falsy.
        chapter: Optional chapter identifier; when not ``None`` the path ends
            with ``chap_<chapter><file_extension>``.
        file_extension: Suffix appended to the chapter file name.

    Returns:
        The joined path as a string.
    """
    tail = []

    if book:
        tail.append(book)

    if chapter is not None:
        tail.append(f"chap_{chapter}{file_extension}")

    return os.path.join(root_dir, version, *tail)


# # Example usage:
# book_link = "https://example.com/book/"
# chapters = get_chapters("Example Book", book_link)
# print(chapters)

In [18]:
# Take the last (name, link) pair scraped for the ESV to try the chapter fetcher on a single book.
book_name, book_link = books_dict['ESV'][-1]

In [19]:
# One entry per chapter of the selected book, as returned by get_chapters.
chaps_gotten = get_chapters(book_name, book_link)

In [20]:
chaps_gotten[-1][:2]

['The River of Life 1 Then the angel showed me the river of the water of life, bright as crystal, flowing from the throne of God and of the Lamb',
 '2 through the middle of the street of the city; also, on either side of the river, the tree of life with its twelve kinds of fruit, yielding its fruit each month. The leaves of the tree were for the healing of the nations.']

In [68]:
books_dict.keys()

dict_keys(['CSB', 'ESV', 'KJV', 'MSG', 'NAS', 'NIV', 'NKJV', 'NLT', 'NRS'])

In [None]:
# (name, link) pairs for the CSB.
# NOTE(review): the full-scrape cell below reassigns bible_books for every version, so this value is only a scratch lookup.
bible_books = books_dict['CSB']

In [129]:
# Scrape every chapter of every book for every translation and store the
# result as JSON shaped like {version alias: {book name: [chapter, ...]}}.
cwd = os.getcwd()
root_data_dir = os.path.join(cwd, 'data')
output_path = os.path.join(root_data_dir, 'bible_vers_all_2.json')

# Create the output folder before scraping so the run cannot end in a
# FileNotFoundError when the results are written.
os.makedirs(root_data_dir, exist_ok=True)

bible_versions_books = {}
for version in list(books_dict.keys()):
    saved_book = {}

    # Retrieve books in version
    bible_books = books_dict[version]

    for book, book_link in bible_books:
        # Retrieve bible book chapters
        book_chaps_gotten = get_chapters(book, book_link)

        # Store book chapters in dictionary
        saved_book[book] = book_chaps_gotten

        # Introduce a time delay to avoid burdening the server
        time.sleep(1)  # You can adjust the sleep duration as needed

    bible_versions_books[version] = saved_book

    # Checkpoint after each version: an interrupted run keeps everything
    # gathered so far instead of losing it all.
    with open(output_path, 'w') as f:
        json.dump(bible_versions_books, f)

    time.sleep(5)
    

Failed to make the request: HTTPSConnectionPool(host='www.biblestudytools.com', port=443): Read timed out. (read timeout=10)
Failed to make the request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))Failed to make the request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Failed to make the request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

Failed to make the request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Failed to make the request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Failed to make the request: ('Connection aborted.'

KeyboardInterrupt: 

In [127]:
len(bible_versions_books['NRS'].keys())


66

In [70]:
bible_versions_books.keys()

dict_keys([])

In [74]:
version

'ESV'

In [None]:
list(books_dict.keys())[:1]

In [None]:
list(books_dict.keys())

In [None]:
# NOTE(review): get_chapters returns, per chapter, the list of verse strings (or None)
# produced by fetch_chapter_data, so chaps_gotten[1] has no find_all(). This looks like a
# leftover from an earlier version that returned parsed pages — confirm and remove.
chaps_gotten[1].find_all('div', class_='leading-8')[0].text.strip()

In [None]:
chaps_gotten[4]