In [803]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from pathlib import Path
import re
from unidecode import unidecode
import string
import random
import csv
import pickle

In [388]:
def find_text(doc):
    #find all the "content" tags
    contents = doc.find_all('span', class_ ="content")
    texts = []
    for idx, line in enumerate(contents):
        line_text = line.get_text()
        #some of them are empty, remove
        if len(line_text) >1:
            texts.append(line_text)
    return texts

In [389]:
def crawl(starting_link, lines_dict, chapters_dict, skip = False):
    
    #checking if we're in a book we're supposed to read
    #stopping otherwise
    tag = starting_link.split("/")[-1]
    print(tag)
    chapter_abrv = tag.split(".")[0]
    if chapter_abrv in ("PSA", "WIS"):
        print("past revelations")
        return lines_dict, chapters_dict

    #reading the document
    result = requests.get(starting_link)
    doc = BeautifulSoup(result.text, "html.parser")

    #checking if we are in a chapter we're supposed to read
    #skipping text, just grabbing link, otherwise
    chapter_num = tag.split(".")[1]
    if "INTRO" not in chapter_num:
        #reading the text of the document
        lines = find_text(doc)
        print(len(lines))
        #book,chapter,version are all at the end of the web address

        lines_dict[tag] = lines
        #combining all the lines of a chapter into a single bit of text
        chapter = " ".join(lines)
        chapters_dict[tag] = chapter
    else:
        print("introduction, skipping text")

    #finding the link to the subsequent chapter
    #contained in the next arrow
    arrow = doc.find("div", class_ = "next-arrow")
    #checks to see if arrow is there, if it has link, if not, stop
    if not arrow:
        print("no arrow")
        return lines_dict, chapters_dict
    elif not arrow.find("a"):
        print("empty arrow")
        return lines_dict, chapters_dict
    elif not arrow.find("a").has_attr("href"):
        print("no link")
        return lines_dict, chapters_dict
    #found the next arrow, has a link to subsequent chapter
    else:
        #finding the web address in the arrow, only has the end of the web address
        link_end = arrow.find("a").get("href")
        #creating the link
        new_link = "https://my.bible.com" + link_end
        print(new_link)
        #pass all checks, recurse!!!
        lines_dict, chapters_dict = crawl(new_link, lines_dict, chapters_dict)

    return lines_dict, chapters_dict

In [709]:
starting_link = "https://my.bible.com/bible/1/MAT.1.KJV"
lines_dict = {}
chapters_dict = {}
lines_dict, chapters_dict = crawl(starting_link, lines_dict, chapters_dict)

MAT.1.KJV
37
https://my.bible.com/bible/1/MAT.2.KJV
MAT.2.KJV
37
https://my.bible.com/bible/1/MAT.3.KJV
MAT.3.KJV
30
https://my.bible.com/bible/1/MAT.4.KJV
MAT.4.KJV
53
https://my.bible.com/bible/1/MAT.5.KJV
MAT.5.KJV
86
https://my.bible.com/bible/1/MAT.6.KJV
MAT.6.KJV
50
https://my.bible.com/bible/1/MAT.7.KJV
MAT.7.KJV
43
https://my.bible.com/bible/1/MAT.8.KJV
MAT.8.KJV
69
https://my.bible.com/bible/1/MAT.9.KJV
MAT.9.KJV
81
https://my.bible.com/bible/1/MAT.10.KJV
MAT.10.KJV
71
https://my.bible.com/bible/1/MAT.11.KJV
MAT.11.KJV
50
https://my.bible.com/bible/1/MAT.12.KJV
MAT.12.KJV
102
https://my.bible.com/bible/1/MAT.13.KJV
MAT.13.KJV
118
https://my.bible.com/bible/1/MAT.14.KJV
MAT.14.KJV
58
https://my.bible.com/bible/1/MAT.15.KJV
MAT.15.KJV
81
https://my.bible.com/bible/1/MAT.16.KJV
MAT.16.KJV
58
https://my.bible.com/bible/1/MAT.17.KJV
MAT.17.KJV
42
https://my.bible.com/bible/1/MAT.18.KJV
MAT.18.KJV
51
https://my.bible.com/bible/1/MAT.19.KJV
MAT.19.KJV
79
https://my.bible.com/bible/1/

In [710]:
print(len(chapters_dict))
print(len(lines_dict))

260
260


Saving lines as a json, whole text as a txt

In [711]:
version = starting_link.split(".")[-1]
#version = "PCM"
json_lines_title = "scraping_outputs/" + version + "_lines_raw.json"
single_text_title = "scraping_outputs/" + version + "_whole_raw.txt"
print(version)

KJV


In [546]:
with open(json_lines_title, 'w') as fp:
    json.dump(lines_dict, fp)

merging the chapters into a single text

In [587]:
single_text = ""
for title, chapter in chapters_dict.items():
    #print(title)
    single_text += " " + chapter
single_text = single_text.strip()

In [548]:
with open(single_text_title, "w") as text_file:
    text_file.write(single_text)

Creating and saving cleaned versions

In [712]:
csv_lines_clean_title = "clean_outputs/" + version + "/" + version + "_lines_clean.csv"
print(csv_lines_clean_title)
csv_chapters_clean_title = "clean_outputs/" + version + "/" + version + "_chapters_clean.csv"
print(csv_chapters_clean_title)
csv_sentences_clean_title = "clean_outputs/" + version + "/" + version + "_sentences_clean.csv"
print(csv_sentences_clean_title)
single_text_clean_title = "clean_outputs/" + version + "/" + version + "_whole_clean.txt"
print(single_text_clean_title)

clean_outputs/KJV/KJV_lines_clean.csv
clean_outputs/KJV/KJV_chapters_clean.csv
clean_outputs/KJV/KJV_sentences_clean.csv
clean_outputs/KJV/KJV_whole_clean.txt


In [589]:
def clean_text(text):
    #cleans a text whole, doesn't split

    #removes non unicode charecters
    text = unidecode(text)
    #removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #removing double spaces, ect.
    text = re.sub(' +', ' ', text)
    #lower case
    text = text.lower().strip()

    return text

Creating cleaned chapters csv

In [590]:
clean_chapters_dict = {}
for title, chapter in chapters_dict.items():
    clean_chapter = clean_text(chapter)
    clean_chapters_dict[title] = clean_chapter

In [551]:
chapter_clean_df = pd.DataFrame.from_dict(clean_chapters_dict, orient = "index", columns=["text"])
#changing index
chapter_clean_df = chapter_clean_df.rename_axis('chapt_code').reset_index()
#creating new labeling columns
new = chapter_clean_df['chapt_code'].str.split(".", expand = True)
chapter_clean_df['book'] = new[0]
chapter_clean_df['chapt'] = new[1]
chapter_clean_df['version'] = new[2]
chapter_clean_df.to_csv(csv_chapters_clean_title)

Whole cleaned text csv

In [552]:
cleaned_single_text = clean_text(single_text)
with open(single_text_clean_title, "w") as text_file:
    text_file.write(cleaned_single_text)

Cleaned lines csv

In [553]:
cleaned_lines = []
for chapter, lines in lines_dict.items():
    #print(chapter)
    book, chapt, version = chapter.split(".")
    for line in lines:
        cleaned_line = clean_text(line)
        cleaned_lines.append((book, chapt, version, cleaned_line))

In [554]:
cleaned_lines_df = pd.DataFrame(cleaned_lines, columns=['book', 'chapt', 'version', 'text'])
cleaned_lines_df.to_csv(csv_lines_clean_title)

Individual cleaned sentences

In [717]:
def to_clean_sentences(text):
    #splits block text into sentences

    #removes non unicode charecters, quotation marks
    text = text.replace("’", "").replace("‘", "").replace('”', "").replace('“', "").replace("'", "").replace('"', "")
    text = unidecode(text)

    #splitting based on period, ! or ?
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    #r'(?<=[.!?])\s+'
    #from https://stackoverflow.com/questions/25735644/python-regex-for-splitting-text-into-sentences-sentence-tokenizing

    for idx, s in enumerate(sentences):
        #removing punctuation
        s = s.strip()
        s = s.translate(str.maketrans('', '', string.punctuation))
        #removing double spaces, ect.
        s = re.sub(' +', ' ', s)
        sentences[idx] = s.lower()

    #removing sentences that are three charecters or fewer
    sentences=[s for s in sentences if len(s)>3]
    #removing sentences that have fewer than 5 words
    sentences=[s for s in sentences if s.count(' ') >4]

    return sentences

In [718]:
cleaned_sentences = []
only_sentences = []
for chapter, text in chapters_dict.items():
    #print(chapter)
    book, chapt, version = chapter.split(".")
    sentences = to_clean_sentences(text)
    only_sentences += sentences
    for s in sentences:
        cleaned_sentences.append((book, chapt, version, s))

In [719]:
[s + s for s in only_sentences]

['the book of the generation of jesus christ the son of david the son of abrahamthe book of the generation of jesus christ the son of david the son of abraham',
 'abraham begat isaac and isaac begat jacob and jacob begat judas and his brethren and judas begat phares and zara of thamar and phares begat esrom and esrom begat aram and aram begat aminadab and aminadab begat naasson and naasson begat salmon and salmon begat booz of rachab and booz begat obed of ruth and obed begat jesse and jesse begat david the king and david the king begat solomon of her that had been the wife of urias and solomon begat roboam and roboam begat abia and abia begat asa and asa begat josaphat and josaphat begat joram and joram begat ozias and ozias begat joatham and joatham begat achaz and achaz begat ezekias and ezekias begat manasses and manasses begat amon and amon begat josias and josias begat jechonias and his brethren about the time they were carried away to babylon and after they were brought to babyl

In [715]:
cleaned_sentences_df = pd.DataFrame(cleaned_sentences, columns=['book', 'chapt', 'version', 'text'])
cleaned_sentences_df.to_csv(csv_sentences_clean_title)

Creating NSP data based on sentences and lines

In [778]:
file = "clean_outputs/USNT/USNT_sentences_clean.csv"
cleaned_sentences_df = pd.read_csv(file, index_col=0)

In [846]:
bits = file.split("/")
version = bits[1]
correct_file = "nsp_data/" + version + "/" + version + "_sentences_correct.csv"
print(correct_file)
incorrect_file = "nsp_data/" + version + "/" + version + "_sentences_incorr.csv"
print(incorrect_file)

nsp_data/USNT/USNT_sentences_correct.csv
nsp_data/USNT/USNT_sentences_incorr.csv
nsp_data/USNT/USNT_sentences_mixed.csv
nsp_data/USNT/USNT_sentences_mixed_train.csv
nsp_data/USNT/USNT_sentences_mixed_test.csv


In [809]:
def make_correct(cleaned_sentences_df):
    correct_sentences_df = cleaned_sentences_df.copy()
    correct_sentences_df['second'] = correct_sentences_df['text'].shift(-1)
    correct_sentences_df['label'] = 0
    correct_sentences_df = correct_sentences_df[:][:-1]
    return correct_sentences_df


In [810]:
correct_sentences_df = make_correct(cleaned_sentences_df)

In [781]:
correct_sentences_df.to_csv(correct_file)

In [815]:
def make_incorrect(cleaned_sentences_df):
    text_col = cleaned_sentences_df['text']
    incorrect = []
    txt_len = len(text_col)
    for idx in range(len(text_col)):
        second_idx = random.randint(0, txt_len - 1)
        if second_idx == idx: 
        #avoid number that is equal to the first sentence
            second_idx += 1
            print("same as first")
        if second_idx == idx + 1:
        #or the one after it
            second_idx += 1
            print("same as correct")
        incorrect.append(text_col[second_idx])

    incorrect_sentences_df = cleaned_sentences_df.copy()
    incorrect_sentences_df['second'] = incorrect
    incorrect_sentences_df['label'] = 1
    return incorrect_sentences_df

In [816]:
incorrect_sentences_df = make_incorrect(cleaned_sentences_df)

same as correct


In [784]:
incorrect_sentences_df.to_csv(incorrect_file)

Mixed files

In [804]:
all_chapters = []
test_chapters = []
train_chapters = []
for tag in chapters_dict.keys():
    bits = tag.split(".")
    chapter = bits[0] + "." + bits[1]
    all_chapters.append(chapter)
    if random.randint(0,4) == 0:
        test_chapters.append(chapter)
    else:
        train_chapters.append(chapter)


In [806]:
with open("train_test_chapters/train_chapters.json", 'w') as fp:
    json.dump(train_chapters, fp)
with open("train_test_chapters/test_chapters.json", 'w') as fp:
    json.dump(test_chapters, fp)
with open("train_test_chapters/all_chapters.json", 'w') as fp:
    json.dump(all_chapters, fp)

In [929]:
file = "clean_outputs/USNT/USNT_sentences_clean.csv"
cleaned_sentences_df = pd.read_csv(file, index_col=0)
bits = file.split("/")
version = bits[1]
mixed_file = "nsp_data/" + version + "/" + version + "_sentences_mixed.csv"
print(mixed_file)
mixed_train_file = "nsp_data/" + version + "/" + version + "_sentences_mixed_train.csv"
print(mixed_train_file)
mixed_test_file = "nsp_data/" + version + "/" + version + "_sentences_mixed_test.csv"
print(mixed_test_file)

nsp_data/USNT/USNT_sentences_mixed.csv
nsp_data/USNT/USNT_sentences_mixed_train.csv
nsp_data/USNT/USNT_sentences_mixed_test.csv


In [916]:
def make_mixed(cleaned_sentences_df):
    text_col = cleaned_sentences_df['text']
    mixed = []
    labels = []
    txt_len = len(text_col)
    for idx in range(len(text_col) - 1):
        if random.randint(0,1) == 0:
            mixed.append(text_col[idx + 1])
            labels.append(0)
        else:
            second_idx = random.randint(0, txt_len - 1)
            if second_idx == idx: 
            #avoid number that is equal to the first sentence
                second_idx += 1
                print("same as first")
            if second_idx == idx + 1:
            #or the one after it
                second_idx += 1
                print("same as correct")
            mixed.append(text_col[second_idx])
            labels.append(1)

    mixed_sentences_df = cleaned_sentences_df.copy()
    mixed_sentences_df = mixed_sentences_df[:][:-1]
    mixed_sentences_df['second'] = mixed
    mixed_sentences_df['label'] = labels
    mixed_sentences_df['identifier'] = mixed_sentences_df.book + "." + mixed_sentences_df.chapt.apply(str)
    return mixed_sentences_df

In [930]:
mixed_sentences_df = make_mixed(cleaned_sentences_df)

In [931]:
mixed_sentences_df.to_csv(mixed_file)

In [932]:
mixed_sentences_df_train = mixed_sentences_df[mixed_sentences_df.identifier.isin(train_chapters)]
mixed_sentences_df_test = mixed_sentences_df[mixed_sentences_df.identifier.isin(test_chapters)]

In [933]:
print(len(mixed_sentences_df_train), len(mixed_sentences_df_test))

3252 1192


In [934]:
mixed_sentences_df_train.to_csv(mixed_train_file)
mixed_sentences_df_test.to_csv(mixed_test_file)

Checking lengths

In [955]:
file = "clean_outputs/KJV/KJV_sentences_clean.csv"
cleaned_sentences_df = pd.read_csv(file, index_col=0)
print(len(cleaned_sentences_df))
sum(cleaned_sentences_df.text.str.count(" ") + 1)/len(cleaned_sentences_df)

7102


25.308223035764573