In [286]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from pathlib import Path
import re
from unidecode import unidecode
import string
import random
import csv

In [260]:
def find_text(doc):
    #find all the "content" tags
    contents = doc.find_all('span', class_ ="content")
    texts = []
    for idx, line in enumerate(contents):
        line_text = line.get_text()
        #some of them are empty, remove
        if len(line_text) >1:
            texts.append(line_text)
    return texts

In [265]:
def crawl(starting_link, lines_dict, chapters_dict, skip = False):
    
    #checking if we're in a book we're supposed to read
    #stopping otherwise
    tag = starting_link.split("/")[-1]
    print(tag)
    chapter_abrv = tag.split(".")[0]
    if chapter_abrv in ("PSA", "WIS"):
        print("past revelations")
        return lines_dict, chapters_dict

    #reading the document
    result = requests.get(starting_link)
    doc = BeautifulSoup(result.text, "html.parser")

    #checking if we are in a chapter we're supposed to read
    #skipping text, just grabbing link, otherwise
    chapter_num = tag.split(".")[1]
    if "INTRO" not in chapter_num:
        #reading the text of the document
        lines = find_text(doc)
        print(len(lines))
        #book,chapter,version are all at the end of the web address

        lines_dict[tag] = lines
        #combining all the lines of a chapter into a single bit of text
        chapter = " ".join(lines)
        chapters_dict[tag] = chapter
    else:
        print("introduction, skipping text")

    #finding the link to the subsequent chapter
    #contained in the next arrow
    arrow = doc.find("div", class_ = "next-arrow")
    #checks to see if arrow is there, if it has link, if not, stop
    if not arrow:
        print("no arrow")
        return lines_dict, chapters_dict
    elif not arrow.find("a"):
        print("empty arrow")
        return lines_dict, chapters_dict
    elif not arrow.find("a").has_attr("href"):
        print("no link")
        return lines_dict, chapters_dict
    #found the next arrow, has a link to subsequent chapter
    else:
        #finding the web address in the arrow, only has the end of the web address
        link_end = arrow.find("a").get("href")
        #creating the link
        new_link = "https://my.bible.com" + link_end
        print(new_link)
        #pass all checks, recurse!!!
        lines_dict, chapters_dict = crawl(new_link, lines_dict, chapters_dict)

    return lines_dict, chapters_dict

In [266]:
starting_link = "https://my.bible.com/bible/2516/MAT.INTRO1.pcm"
lines_dict = {}
chapters_dict = {}
lines_dict, chapters_dict = crawl(starting_link, lines_dict, chapters_dict)

MAT.INTRO1.pcm
introductionm, skipping text
https://my.bible.com/bible/2516/MAT.1.PCM
MAT.1.PCM
47
https://my.bible.com/bible/2516/MAT.2.PCM
MAT.2.PCM
30
https://my.bible.com/bible/2516/MAT.3.PCM
MAT.3.PCM
19
https://my.bible.com/bible/2516/MAT.4.PCM
MAT.4.PCM
34
https://my.bible.com/bible/2516/MAT.5.PCM
MAT.5.PCM
56
https://my.bible.com/bible/2516/MAT.6.PCM
MAT.6.PCM
40
https://my.bible.com/bible/2516/MAT.7.PCM
MAT.7.PCM
29
https://my.bible.com/bible/2516/MAT.8.PCM
MAT.8.PCM
36
https://my.bible.com/bible/2516/MAT.9.PCM
MAT.9.PCM
39
https://my.bible.com/bible/2516/MAT.10.PCM
MAT.10.PCM
42
https://my.bible.com/bible/2516/MAT.11.PCM
MAT.11.PCM
35
https://my.bible.com/bible/2516/MAT.12.PCM
MAT.12.PCM
58
https://my.bible.com/bible/2516/MAT.13.PCM
MAT.13.PCM
72
https://my.bible.com/bible/2516/MAT.14.PCM
MAT.14.PCM
37
https://my.bible.com/bible/2516/MAT.15.PCM
MAT.15.PCM
42
https://my.bible.com/bible/2516/MAT.16.PCM
MAT.16.PCM
28
https://my.bible.com/bible/2516/MAT.17.PCM
MAT.17.PCM
29
https

In [275]:
print(len(chapters_dict))
print(len(lines_dict))

260
260


Saving lines as a json, whole text as a txt

In [267]:
version = starting_link.split(".")[-1]
json_lines_title = "scraping_outputs/" + version + "_lines_raw.json"
single_text_title = "scraping_outputs/" + version + "_whole_raw.txt"
print(version)

pcm


In [268]:
with open(json_lines_title, 'w') as fp:
    json.dump(lines_dict, fp)

merging the chapters into a single text

In [270]:
single_text = ""
for title, chapter in chapters_dict.items():
    #print(title)
    single_text += " " + chapter
single_text = single_text.strip()

MAT.1.PCM Jesus Christ wey dem born for David and Abraham family, naim story bi dis. Abraham born Isaak, Isaak kon born Jakob. Na Jakob born Judah and en brodas, Judah born Perez and Zera (wey en mama bi Tamar), Perez born Hezron and Hezron born Ram, Ram na Amminadab papa, Amminadab born Nashon, Nashon kon born Salmon, Salmon born Boaz (wey Rahab bi en mama), na Boaz bi Obed papa (wey Rut bi en mama), Obed kon born Jesse, Jesse born David wey bi Israel pipol king. David born Solomon (wey Uraya wife bi en mama), Solomon born Rehoboam, Rehoboam naim bi Abijah papa and Abijah kon born Asa, Asa born Jehoshafat, Jehoshafat kon born Joram, Joram born Uzzaya, Uzzaya born Jotam, Jotam kon born Ahaz, wey born Hezekaya, Hezekaya born Manasseh, Manasseh naim bi Amnon papa and na Amon born Josaya. Josaya born Jekonaya and en brodas for di time wey dem karry Israel pipol go bi slave for Babilon. Afta dem karry Israel pipol go Babilon, Jekonaya kon born Shealtiel, Shealtiel kon bi Zerubabel papa, Ze

In [272]:
with open(single_text_title, "w") as text_file:
    text_file.write(single_text)

Creating and saving cleaned versions

In [386]:
csv_lines_clean_title = "clean_outputs/" + version + "/" + version + "_lines_clean.csv"
print(csv_lines_clean_title)
csv_chapters_clean_title = "clean_outputs/" + version + "/" + version + "_chapters_clean.csv"
print(csv_chapters_clean_title)
csv_sentences_clean_title = "clean_outputs/" + version + "/" + version + "_sentences_clean.csv"
print(csv_sentences_clean_title)
single_text_clean_title = "clean_outputs/" + version + "/" + version + "_whole_clean.txt"
print(single_text_clean_title)

clean_outputs/PCM/PCM_lines_clean.csv
clean_outputs/PCM_chapters_clean.csv
clean_outputs/PCM_sentences_clean.csv
clean_outputs/PCM_whole_clean.txt


In [370]:
def clean_text(text):
    #cleans a text whole, doesn't split

    #removes non unicode charecters
    text = unidecode(text)
    #removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #removing double spaces, ect.
    text = re.sub(' +', ' ', text)
    #lower case
    text = text.lower().strip()

    return text

In [375]:
clean_chapters_dict = {}
for title, chapter in chapters_dict.items():
    clean_chapter = clean_text(chapter)
    clean_chapters_dict[title] = clean_chapter

Creating cleaned chapters csv

In [376]:
chapter_clean_df = pd.DataFrame.from_dict(clean_chapters_dict, orient = "index", columns=["text"])
chapter_clean_df = chapter_clean_df.rename_axis('chapt_code').reset_index()
new = chapter_clean_df['chapt_code'].str.split(".", expand = True)
chapter_clean_df['book'] = new[0]
chapter_clean_df['chapt'] = new[1]
chapter_clean_df['version'] = new[2]
chapter_clean_df.to_csv(csv_chapters_clean_title)

Whole cleaned text csv

In [377]:
cleaned_single_text = clean_text(single_text)
with open(single_text_clean_title, "w") as text_file:
    text_file.write(cleaned_single_text)

In [378]:
cleaned_lines = []
for chapter, lines in lines_dict.items():
    #print(chapter)
    book, chapt, version = chapter.split(".")
    for line in lines:
        cleaned_line = clean_text(line)
        cleaned_lines.append((book, chapt, version, cleaned_line))

In [379]:
cleaned_lines_df = pd.DataFrame(cleaned_lines, columns=['book', 'chapt', 'version', 'text'])
cleaned_lines_df.to_csv(csv_lines_clean_title)

Individual cleaned sentences

In [380]:
def to_clean_sentences(text):
    #splits block text into sentences

    #removes non unicode charecters, quotation marks
    text = text.replace("’", "").replace("‘", "").replace('”', "").replace('“', "").replace("'", "").replace('"', "")
    text = unidecode(text)

    #splitting based on period, ! or ?
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    #r'(?<=[.!?])\s+'
    #from https://stackoverflow.com/questions/25735644/python-regex-for-splitting-text-into-sentences-sentence-tokenizing

    for idx, s in enumerate(sentences):
        #removing punctuation
        s = s.strip()
        s = s.translate(str.maketrans('', '', string.punctuation))
        #removing double spaces, ect.
        s = re.sub(' +', ' ', s)
        sentences[idx] = s.lower()

    return sentences

In [381]:
cleaned_sentences = []
for chapter, text in chapters_dict.items():
    #print(chapter)
    book, chapt, version = chapter.split(".")
    sentences = to_clean_sentences(text)
    for s in sentences:
        cleaned_sentences.append((book, chapt, version, s))

In [382]:
cleaned_sentences_df = pd.DataFrame(cleaned_sentences, columns=['book', 'chapt', 'version', 'text'])
cleaned_sentences_df.to_csv(csv_sentences_clean_title)

Creating NSP data based on sentences and lines