# Web Scraping the Bible (Book of Deuteronomy) 

In [10]:
from bs4 import BeautifulSoup
import requests
# Here, we're just importing both Beautiful Soup and the Requests library

In [11]:
page_link = 'http://www.gutenberg.org/files/10/10-h/10-h.htm#The_Fifth_Book_of_Moses_Called_Deuteronomy'
# This is the URL that we've already determined is safe and legal to scrape from.
page_response = requests.get(page_link, timeout=5)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page_response.raise_for_status()
# Parse the fetched HTML with the built-in "html.parser" backend.
page_content = BeautifulSoup(page_response.content, "html.parser")
# Collect every <p> tag once; calling find_all() inside the loop would
# re-scan the whole document on every iteration.
all_paragraphs = page_content.find_all("p")
Deuteronomy = []
# Subset to only the book of Deuteronomy by paragraph position.
# NOTE(review): these indices are hard-coded against the current page layout,
# and the recorded notebook output suggests the last index yields verse 34:9;
# confirm the upper bound really reaches the end of the book.
for i in range(3775, 4438):
    paragraphs = all_paragraphs[i].text
    # Append to the list initialised above (previously this appended to an
    # undefined name, so `Deuteronomy` was never filled).
    Deuteronomy.append(paragraphs)

In [12]:
# Look up the <p> tag at the index left over from the loop and keep its text.
paragraphs = page_content.find_all("p")[i].get_text()

In [13]:
# Display the paragraph text currently held in `paragraphs`.
paragraphs

'34:9 And Joshua the son of Nun was full of the spirit of wisdom; for\r\nMoses had laid his hands upon him: and the children of Israel\r\nhearkened unto him, and did as the LORD commanded Moses.'

In [14]:
# Display the first scraped entry (still carrying its verse number and
# "\r\n" line breaks) to check where the subset starts.
Deuteronomy[0]

'1:1 These be the words which Moses spake unto all Israel on this side\r\nJordan in the wilderness, in the plain over against the Red sea,\r\nbetween Paran, and Tophel, and Laban, and Hazeroth, and Dizahab.'

## We now need to clean up the scraped text a bit by doing the following:
### 1. Delete the verse numbers
### 2. Replace all \r\n occurrences with a space
### 3. Delete verses where the words "LORD" or "God" appear (to reduce blasphemy in our final result)

In [15]:
import re # import regular expressions

In [16]:
def clean(string):
    """Return a cleaned-up verse, or '' if the verse should be dropped.

    Every ``chapter:verse`` number is removed, CR-LF line breaks are
    replaced by single spaces, and surrounding whitespace is stripped.
    A verse containing the substring "lord" or "god" (case-insensitive)
    is replaced by the empty string so the caller can filter it out.

    Args:
        string: Raw verse text as scraped from the page.

    Returns:
        The cleaned verse text, or '' for a verse that must be discarded.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    string = re.sub(r"\d+:\d+", "", string)
    string = string.replace("\r\n", " ")
    # Lower-case once and reuse it for both substring checks.
    lowered = string.lower()
    if "lord" in lowered or "god" in lowered:
        # Empty verses get removed later by the caller.
        return ""
    return string.strip()

# Run every scraped verse through clean(); discarded verses come back as ''.
Deuteronomy = list(map(clean, Deuteronomy))

# Keep only the verses that survived cleaning (the non-empty strings).
Deuteronomy_clean = [text for text in Deuteronomy if text]

In [17]:
# Display the first entry after cleaning: the verse number and the "\r\n"
# line breaks should no longer be present.
Deuteronomy[0]

'These be the words which Moses spake unto all Israel on this side Jordan in the wilderness, in the plain over against the Red sea, between Paran, and Tophel, and Laban, and Hazeroth, and Dizahab.'

In [20]:
# Join the verses with a space so the last word of one verse is not glued to
# the first word of the next. Verses blanked out by clean() are skipped so
# they do not leave doubled spaces behind.
Deuteronomy = " ".join(verse for verse in Deuteronomy if verse)

In [23]:
# Put each sentence on its own line. Spaces that follow a period are consumed
# as well, so the next line does not start with a stray space.
Deuteronomy = re.sub(r"\. *", ".\n", Deuteronomy)

In [24]:
# Write the final text to disk. An explicit encoding keeps the output file
# identical across platforms instead of depending on the locale default.
# `Deuteronomy` is already a single string at this point, so no str() needed.
with open("Deuteronomy.txt", "w", encoding="utf-8") as output:
    output.write(Deuteronomy)
    