In [1]:
# Book names of the first group (39 entries), in the order they should be
# displayed. "OT" is presumably short for Old Testament.
OT = [
    "Genesis",
    "Exodus",
    "Leviticus",
    "Numbers",
    "Deuteronomy",
    "Joshua",
    "Judges",
    "Ruth",
    "1 Samuel",
    "2 Samuel",
    "1 Kings",
    "2 Kings",
    "1 Chronicles",
    "2 Chronicles",
    "Ezra",
    "Nehemiah",
    "Esther",
    "Job",
    "Psalms",
    "Proverbs",
    "Ecclesiastes",
    "Song of Solomon",
    "Isaiah",
    "Jeremiah",
    "Lamentations",
    "Ezekiel",
    "Daniel",
    "Hosea",
    "Joel",
    "Amos",
    "Obadiah",
    "Jonah",
    "Micah",
    "Nahum",
    "Habakkuk",
    "Zephaniah",
    "Haggai",
    "Zechariah",
    "Malachi",
]

# Book names of the second group (27 entries), in display order.
# "NT" is presumably short for New Testament.
NT = [
    "Matthew",
    "Mark",
    "Luke",
    "John",
    "Acts",
    "Romans",
    "1 Corinthians",
    "2 Corinthians",
    "Galatians",
    "Ephesians",
    "Philippians",
    "Colossians",
    "1 Thessalonians",
    "2 Thessalonians",
    "1 Timothy",
    "2 Timothy",
    "Titus",
    "Philemon",
    "Hebrews",
    "James",
    "1 Peter",
    "2 Peter",
    "1 John",
    "2 John",
    "3 John",
    "Jude",
    "Revelation",
]

# All 66 book names, OT first then NT. Used as the explicit category order
# for the x axis of the per-book bar charts below.
bible_books_in_order = OT + NT


In [2]:
from collections import defaultdict
import json
from collections import defaultdict, Counter

# Load the corpus: a mapping of word -> list of references. Elsewhere in this
# notebook ref[0] of a reference is used as the book name.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open("corpus.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

# Remove junk
# These entries are quote marks and a possessive suffix rather than words
# (presumably tokenizer artefacts).
junk = ["''","``","'s"]
for item in junk:
    # Passing a default keeps this from raising KeyError when the corpus file
    # does not contain one of the junk tokens.
    corpus.pop(item, None)

In [3]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
from plotly.offline import init_notebook_mode,iplot
# Use plotly's "notebook" renderer as the default for fig.show().
pio.renderers.default = "notebook"
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
init_notebook_mode(connected=True) 

In [4]:
def get_common_words(books):
    """Count how often each corpus word occurs within the given books.

    Args:
        books: Iterable of book names to include. A single book name passed
            as a plain string is treated as a one-element collection.

    Returns:
        collections.Counter mapping every word that occurs at least once in
        ``books`` to its number of occurrences there. Words without any
        occurrence in ``books`` are omitted.
    """
    # A bare string would otherwise be matched by substring, so that "John"
    # references would be counted when asking for "1 John".
    if isinstance(books, str):
        books = [books]
    # Build the lookup once: set membership is constant-time, whereas a list
    # would be scanned again for every single reference in the corpus.
    wanted = set(books)

    counts = Counter()
    for word, refs in corpus.items():
        # ref[0] is the book name of a reference.
        hits = sum(1 for ref in refs if ref[0] in wanted)
        if hits:
            counts[word] = hits
    return counts

In [5]:
# Per-book word tallies: {"Genesis": {"beginning": 30, ...}, "Exodus": {...}, ...}
book_word_variance = defaultdict(lambda: defaultdict(int))
for word, refs in corpus.items():
    for ref in refs:
        # ref[0] is the book this occurrence belongs to.
        book_word_variance[ref[0]][word] += 1

# NOTE: despite its name this holds the TOTAL number of word occurrences per
# book ({"Genesis": total_word_count, ...}), not the number of distinct words.
# plot_word divides by it to turn raw counts into percentages. The name is kept
# because other cells refer to it.
unique_word_count = {book: sum(tally.values()) for book, tally in book_word_variance.items()}

# The header now states what is actually printed: totals, not distinct words.
print("Total word count per book:")
unique_words_per_book = Counter(unique_word_count).most_common()
print(unique_words_per_book)

Number of unique words per book:
[('Psalms', 17900), ('Jeremiah', 16633), ('Genesis', 15744), ('Ezekiel', 15383), ('Isaiah', 15067), ('Numbers', 13152), ('Exodus', 12861), ('2 Chronicles', 10981), ('Deuteronomy', 10594), ('Luke', 10333), ('Acts', 10293), ('1 Kings', 10099), ('1 Samuel', 9954), ('2 Kings', 9770), ('Matthew', 9728), ('Leviticus', 9654), ('1 Chronicles', 9217), ('2 Samuel', 8681), ('John', 7610), ('Job', 7420), ('Joshua', 7391), ('Judges', 7376), ('Proverbs', 6751), ('Mark', 6176), ('Revelation', 5024), ('Daniel', 4688), ('Nehemiah', 4423), ('Romans', 4076), ('1 Corinthians', 3858), ('Hebrews', 3074), ('Ezra', 2996), ('2 Corinthians', 2463), ('Zechariah', 2401), ('Esther', 2382), ('Ecclesiastes', 2226), ('Hosea', 2115), ('Amos', 1741), ('Lamentations', 1431), ('Galatians', 1346), ('Ephesians', 1336), ('Micah', 1249), ('Song of Solomon', 1178), ('1 Timothy', 1107), ('1 Peter', 1064), ('James', 1031), ('Ruth', 1021), ('1 John', 1004), ('Philippians', 920), ('Colossians', 87

In [6]:
gospels = ["Matthew","Mark","Luke","John"]

# (heading, books) pairs, reported in this order: each heading is printed,
# followed by the 50 most frequent words of that selection of books.
for section_title, section_books in (
    ("Most Common words in the bible:", bible_books_in_order),
    ("\nMost Common words in the gospels:", gospels),
    ("\nMost Common words in the OT:", OT),
    ("\nMost Common words in the NT:", NT),
):
    print(section_title)
    print(get_common_words(section_books).most_common(50))

Most Common words in the bible:
[('lord', 7788), ('god', 4160), ('said', 3184), ('king', 2503), ('one', 2478), ('son', 2338), ('people', 2214), ('man', 2209), ('israel', 1858), ('men', 1805), ('like', 1528), ('land', 1451), ('come', 1435), ('us', 1422), ('day', 1419), ('go', 1392), ('jesus', 1274), ('went', 1231), ('father', 1220), ('may', 1219), ('came', 1191), ('made', 1109), ('let', 1071), ('put', 1018), ('david', 1007), ('house', 965), ('say', 916), ('give', 896), ('sons', 865), ('make', 852), ('know', 850), ('take', 847), ('moses', 843), ('hand', 840), ('judah', 834), ('among', 811), ('also', 809), ('see', 804), ('jerusalem', 799), ('away', 797), ('place', 792), ('must', 785), ('time', 768), ('brought', 763), ('name', 762), ('earth', 734), ('says', 727), ('bring', 723), ('city', 710), ('great', 698)]

Most Common words in the gospels:
[('jesus', 912), ('said', 687), ('one', 465), ('man', 433), ('son', 318), ('god', 310), ('father', 293), ('came', 269), ('went', 258), ('come', 246)

In [15]:
# Example: get_common_words accepts any selection of books — here the ten
# most frequent words in Mark alone.
get_common_words(["Mark"]).most_common(10)

[('jesus', 202),
 ('said', 125),
 ('man', 89),
 ('one', 73),
 ('disciples', 59),
 ('came', 58),
 ('went', 58),
 ('god', 52),
 ('asked', 50),
 ('people', 45)]

In [7]:
# Hover text for percentage bars: customdata carries (book, raw count) and the
# bar height (y) is the percentage.
HOVER_TEMPLATE = "Book: %{customdata[0]}<br>Word Count: %{customdata[1]}<br>Percentage: %{y}%"

def plot_word(main_word, filter_books=None, show_percent=False):
    """Build a bar trace of how often ``main_word`` occurs in each book.

    Args:
        main_word: Word to look up in ``corpus``. A word that is not in the
            corpus yields an empty trace, matching how ``chapter_wise`` treats
            unknown words.
        filter_books: Optional collection of book names to restrict the trace
            to. ``None`` or an empty collection means "all books". A single
            name passed as a plain string is treated as one book.
        show_percent: When True the bar height is the word's count divided by
            the book's total in ``unique_word_count``, times 100. When False
            the bar height is the raw count.

    Returns:
        A ``go.Bar`` with one bar per book containing the word, whose
        ``customdata`` holds ``(book, count)`` pairs for the hover text.

    Raises:
        KeyError: if ``show_percent`` is True and a book is missing from
            ``unique_word_count``.
    """
    if isinstance(filter_books, str):
        # Avoid substring matching ("John" inside "1 John") on a bare string.
        filter_books = [filter_books]
    wanted = set(filter_books) if filter_books else None

    stats = defaultdict(int)
    for ref in corpus.get(main_word, []):
        book = ref[0]
        if wanted is None or book in wanted:
            stats[book] += 1

    # customdata always carries the raw (book, count) pairs.
    custom_data = list(stats.items())
    # x and y are built directly rather than via zip(*pairs), which cannot be
    # unpacked into two names when there are no pairs at all.
    x = tuple(stats)
    if show_percent:
        y = tuple(count / unique_word_count[book] * 100 for book, count in stats.items())
        hover = HOVER_TEMPLATE
    else:
        y = tuple(stats.values())
        # y is a raw count here, so it must not be labelled as a percentage.
        hover = "Book: %{customdata[0]}<br>Word Count: %{customdata[1]}"

    bar = go.Bar(x=x, y=y, customdata=custom_data, hovertemplate=hover)
    return bar


def plot_words(words, filter_books=None, show_percent=False):
    """Build a figure with one per-book bar chart per word, stacked vertically.

    Args:
        words: Sequence of words; each gets its own subplot row, titled
            "Word: <word>".
        filter_books: Optional collection of book names passed through to
            ``plot_word``; ``None`` or empty means all books.
        show_percent: Passed through to ``plot_word``; also selects the y-axis
            title ("Percentage" or "Word Count").

    Returns:
        The figure created by ``make_subplots``, with a shared x axis ordered
        by ``bible_books_in_order``.
    """
    titles = ["Word: " + word for word in words]
    fig = make_subplots(rows=len(words), cols=1, shared_xaxes=True, subplot_titles=titles)
    for row, main_word in enumerate(words, start=1):
        fig.add_trace(plot_word(main_word, filter_books, show_percent), row=row, col=1)

    fig.update_layout(
        height=len(words) * 250,
        title_text="Word Frequency",
        showlegend=False,
    )
    # The layout-level yaxis_title / xaxis_title shortcuts only address the
    # first subplot's axes. Label every y axis instead, and put the x title on
    # the bottom row.
    fig.update_yaxes(title_text="Percentage" if show_percent else "Word Count")
    fig.update_xaxes(categoryorder="array", categoryarray=bible_books_in_order)
    fig.update_xaxes(title_text="Books", row=len(words), col=1)
    return fig


In [30]:
# Books to consider and the words whose frequencies are combined.
books = bible_books_in_order

# Alternative: words = [w[0] for w in get_common_words(["Genesis"]).most_common(10)]
words = ["father"]
words_fig = plot_words(words, filter_books=books, show_percent=True)

# Re-extract the per-word traces from the figure.
big_data = words_fig.to_dict()

all_percents = defaultdict(float)
all_counts = defaultdict(int)

# For each word's trace, add its percentages and raw counts together per book.
for word_graph in big_data["data"]:
    # Deliberately not called `books`, so the selection made at the top of
    # this cell is not overwritten by one trace's x values.
    trace_books = word_graph["x"]
    trace_percents = word_graph["y"]
    # customdata entries are (book, count) pairs; keep the count.
    trace_counts = [entry[1] for entry in word_graph["customdata"]]
    for book_name, percent, count in zip(trace_books, trace_percents, trace_counts):
        all_percents[book_name] += percent
        all_counts[book_name] += count

word_books, all_percentages_from_y = zip(*all_percents.items())

# Sanity check: number of books and number of percentages.
print(len(word_books), len(all_percentages_from_y))
# all_counts and all_percents are filled in the same order, so the customdata
# pairs line up with the bars.
bar = go.Bar(
    x=word_books,
    y=all_percentages_from_y,
    hovertemplate=HOVER_TEMPLATE,
    customdata=list(all_counts.items()),
)
fig = go.Figure(data=bar)
fig.update_layout(
    title="Combined frequency for words - " + (", ".join(words)),
    showlegend=False,
    yaxis_title="Percentage",
    xaxis_title="Books",
)
fig.update_xaxes(categoryorder="array", categoryarray=bible_books_in_order)
fig.show()

#words_fig.show()

53 53


In [35]:

# Chapter-wise word count
def chapter_wise(words,books):
    """Tally the combined occurrences of ``words`` per book and chapter.

    Args:
        words: Iterable of words to look up in ``corpus``; words that are not
            in the corpus contribute nothing.
        books: Collection of book names; references to other books are skipped.

    Returns:
        Nested defaultdict ``{book: {chapter: count}}`` where ``book`` is
        ref[0] and ``chapter`` is ref[1] of each matching reference.
    """
    per_book = defaultdict(lambda: defaultdict(int))
    for term in words:
        for occurrence in corpus.get(term, []):
            book_name = occurrence[0]
            if book_name not in books:
                continue
            chapter = occurrence[1]
            per_book[book_name][chapter] += 1
    return per_book

# Picking top 5 books: those with the highest combined count for `words`.
ranked_books = sorted(all_counts.items(), key=lambda item: item[1], reverse=True)
books_with_words = [ranked[0] for ranked in ranked_books[:5]]
# books_with_words = ["Matthew","2 John","Acts","Luke"]
chap_refs = chapter_wise(words, books_with_words)  # {"Genesis":{"1":30,"2":20...},"Exodus":{...}...}

# One row per requested book. Using len(chap_refs) here would give too few
# rows whenever a requested book has no matches at all.
# .get() is used so that reading a missing book does not insert it into the
# defaultdict.
fig = make_subplots(
    rows=len(books_with_words),
    cols=1,
    shared_xaxes=True,
    subplot_titles=[
        f"{book} - {sum(chap_refs.get(book, {}).values())} times" for book in books_with_words
    ],
)
for row, book in enumerate(books_with_words, start=1):
    chapter_counts = chap_refs.get(book, {})
    # Built directly rather than via zip(*items), which cannot be unpacked
    # into two names for a book without matches.
    x = tuple(chapter_counts)
    y = tuple(chapter_counts.values())
    fig.add_trace(
        go.Bar(
            x=x,
            y=y,
            customdata=list(zip(x, y)),
            hovertemplate="Chapter: %{customdata[0]}<br>Word Count: %{customdata[1]}",
        ),
        row=row,
        col=1,
    )

fig.update_layout(
    height=len(books_with_words) * 250,
    title_text="Chapterwise Combined Counts for words - " + (", ".join(words)),
    showlegend=False
)
# Layout-level yaxis_title / xaxis_title only address the first subplot, so
# label every y axis and put the x title on the bottom row.
fig.update_yaxes(title_text="Word Count")

# Hack for now: force chapters into ascending order.
# NOTE(review): assumes chapter values are integers below 200 — confirm; if
# the corpus stores chapters as strings this category array may not match.
fig.update_xaxes(categoryorder="array", categoryarray=tuple(range(200)))
fig.update_xaxes(title_text="Chapters", row=len(books_with_words), col=1)

fig.show()


In [None]:
from tabulate import tabulate

# Words with exactly one reference in the whole corpus, each paired with that
# single reference.
one_words = [(entry, places[0]) for entry, places in corpus.items() if len(places) == 1]
print(tabulate(sorted(one_words)))

# Number of such single-occurrence words per book (ref[0] is the book).
freq = defaultdict(int)
for pair in one_words:
    only_ref = pair[1]
    freq[only_ref[0]] += 1

#print(sorted(list(freq.items()),key=lambda x:x[1],reverse=True))

x, y = zip(*freq.items())
fig = px.bar(
    x=x,
    y=y,
    title="Books with unique words",
    labels={'y': 'Number of unique words', 'x':'books'},
)
fig.show()