In [1]:
# TFIDF CALCULATION AND SUMMARIES OF THE 66 BOOKS OF THE KING JAMES BIBLE

NOTE: The txt-files which will be generated and filled in this notebook are already stored on github, s.t. they are accessable even if the cells in this notebook are for some reason not executable or take too long to run<br>
The file are: 
- words from summaries generated by us using the sentences with the highest overall tf-idf score: <b>words_top_tfidf_score_summaries.txt</b>
- words from the summaries generated by us using the sentences with the most words of high tf-idf-score:
<b>topten_words_summaries.txt</b>
- words from the gold summaries found online 
<b>gold_summaries_words.txt</b>

In [2]:
# READ IN AND CLEAN DATA
#the txt-file was created and filled in "King James Bible (66 books) to correct format.ipynb"
with open('66_bible_books.txt', 'r') as f:
    chapters=f.readlines()
    
# pick first sentence of each book as gold standard summary

summaries_gold = [[] for _ in range(len(chapters))]

for i in range(len(chapters)):
    summaries_gold[i] = chapters[i].split('.')[0]
    

    
# divide into strings by space punctuation
# 'chapters' is list with 'raw' vocabulary per book, includes still stopwords and double words
import re
for i in range(len(chapters)):
    chapters[i] = re.findall(r"[\w']+", chapters[i])


    
# CLEAN VOCABULARY    

# get vocabulary per chapter without stopwords and lowercase
# without_stopwords is a list with 66 sublists, each of which includes the vocabulary of this book 
from nltk.corpus import stopwords
without_stopwords = [[] for _ in range(len(chapters))]
import string

# manually add new stopwords: old fashioned language
stopwords = list(stopwords.words('english'))
newstopwords = ['shalt', 'saith', 'thy','thee', 'thou', 'shall', 'thine', 'ye', 'you', 'whereof', 'didst', 'thereof', 'shouldest', 'unto', 'said', 'upon', 'also', 'abound', 'book64', 'book57', 'us', 'ai', 'book63', 'let', 'hereby', 'hath', 'saying', 'book65', 'doeth', 'say']
stopwords.extend(newstopwords)

for i in range(len(chapters)):
    for j in range(len(chapters[i])):
        chapters[i][j]=chapters[i][j].translate(chapters[i][j].maketrans('', '', string.punctuation)).lower()
        if chapters[i][j] not in stopwords:
            without_stopwords[i].append(chapters[i][j])

            
# look at output
# print first 22 words of clean vocabulary of first book
print(without_stopwords[0][:22])             
         

['book1', 'beginning', 'god', 'created', 'heaven', 'earth', 'earth', 'without', 'form', 'void', 'darkness', 'face', 'deep', 'spirit', 'god', 'moved', 'face', 'waters', 'god', 'light', 'light', 'god']


In [3]:
# TFIDF part 1
# TF: Term Frequency


# raw count occurence of each word per chapter (TF)
# 'counts' is a list with each chapter as sublist (66 elements)
# those sublists include tuples of the vocabulary and its counts inside that document/chapter/book

counts = [[] for _ in range(len(without_stopwords))] # makes list with empty sublists
for i in range(len(without_stopwords)):
        counts[i] = [(without_stopwords[i].count(word), word) for word in without_stopwords[i]] 

# look at output
# print first 22 words of counts of first book
print(counts[0][:22])

[(1, 'book1'), (5, 'beginning'), (230, 'god'), (11, 'created'), (30, 'heaven'), (121, 'earth'), (121, 'earth'), (9, 'without'), (1, 'form'), (1, 'void'), (5, 'darkness'), (49, 'face'), (6, 'deep'), (5, 'spirit'), (230, 'god'), (2, 'moved'), (49, 'face'), (32, 'waters'), (230, 'god'), (11, 'light'), (11, 'light'), (230, 'god')]


In [4]:
# TFIDF part 2
# IDF: Inverse Document Frequency
# measure of how much information the word provides, i.e., if it's common or rare across all documents
# obtained by dividing the total number of documents by the number of documents containing the term
# and then taking the logarithm of that quotient


# first, create vocabulary
# 'vocabulary' is list of the whole vocabulary, doubles still inside
vocabulary = [without_stopwords[i][j] for i in range(len(without_stopwords)) for j in range(len(without_stopwords[i]))]
# for later: take doubles out, so code works faster
voc_total_clean = []
for i in range(len(vocabulary)):
    if vocabulary[i] not in voc_total_clean:
        voc_total_clean.append(vocabulary[i])

        
        
        
        
# ratio of all existing texts and documents of an entire dataset
# and the number of texts that contain the defined keyword
# for each word in vocabulary
# check if it is in document, count those documents where it exists 

# 'word appearings' is long list
# with every word in vocabulary and the number of books it appears in
word_appearings = [0] * len(voc_total_clean)
for j in range(len(voc_total_clean)):
    appear=0
    for i in range(len(without_stopwords)):
        if voc_total_clean[j] in without_stopwords[i]:
            appear += 1
            word_appearings[j] = (voc_total_clean[j], appear)
    
N = len(chapters) # all existing texts and documents          

idf = [0] * len(voc_total_clean)
import math
for i in range(len(voc_total_clean)):
    idf[i]=(math.log10(N/word_appearings[i][1]+1), voc_total_clean[i])

    
# make idf dictionary to make the idf scores easier to access
idf_dict = {}
for (idfscore, word) in idf:
    idf_dict[word] = idfscore 
    
    
# look at output, print idf scores for lord', 'jesus', 'hell', 'heaven', 'sin'
print(idf_dict['lord'])
print(idf_dict['jesus'])
print(idf_dict['hell'])
print(idf_dict['heaven'])
print(idf_dict['sin'])

0.3184738859451898
0.5488144793747373
0.6886291709978
0.36061568564822527
0.42324587393680785


In [5]:
# TF IDF
# part 3: bring together the TF and IDF scores

# multiply tf and idf scores
# output is list with number of chapters (66) sublists
# in every sublist there is a tuple with the vocabulary and tfidf score

# clean counts, remove doubles
counts_clean = [[] for _ in range(len(counts))]
for i in range(len(counts)):
    for j in range(len(counts[i])):
        if counts[i][j] not in counts_clean[i]:
            counts_clean[i].append(counts[i][j])

            
            
            
tfidf = [[] for _ in range(len(counts_clean))]
for i in range(len(counts_clean)):
    for (count, word) in counts_clean[i]:
        tfidf[i] = [(count * idf_dict[word], word) for (count, word) in counts_clean[i]]


# make tfidf dictionary 
# need this for function later
tfidf_dicts=[{} for _ in chapters]
for i in range(len(tfidf)):
    for j in range(len(tfidf[i])):
        tfidf_dicts[i][tfidf[i][j][1]] = tfidf[i][j][0]
        
        
# the tfidf part is written by nelli

In [6]:
# first, we are looking at the words with the highest tf-idf scores for every chapter/book

# take the top top words of every book (words with highest tfidf scores)
# 'top ten words' is list with 66 sublists, each of which including 10 tuples with the word and tfidf score

top_ten_words = [sorted(tfidf[i],key=lambda x: x[0], reverse = True)[0:10] for i in range(len(tfidf))]

# make new list where tfidf scores are removed, just the words with the highest tfidf scores stay
# 'topten_justwords' is a list with 66 sublists each of which including the 10 'most important words'
topten_justwords = [[] for _ in range(len(top_ten_words))]
for i in range(len(top_ten_words)):
    topten_justwords[i] = [top_ten_words[i][j][1] for j in range(len(top_ten_words[i]))]
    
# read in the data again to get full sentences for every chapter
# bring it to right format for following function
# 's_per_chapter' is list with 66 sublists
# each sublist holds the sentences of that book as strings
with open('66_bible_books.txt', 'r') as f:
    chapters_whole=f.readlines()
    
for i in range(len(chapters_whole)):
    chapters_whole[i] = chapters_whole[i].strip('\n')
        
s_per_chapter = [[] for _ in range(len(chapters_whole))]
for i in range(len(chapters_whole)):
    s_per_chapter[i] = chapters_whole[i].split('.')[1:]
    
    
# function that takes sublists of 's_per_chapter' and 'topten_justwords' as inputs
# and returns the 3 sentences with the highest occurrence of 'highly important' words
# we count how many times each of the topten words of that chapter is in every sentence
def get_3_sent(chapter, topten):
    score = [[] for i in range(len(chapter))]
    for i in range(len(chapter)):
        occ = []
        for word in topten:
            occ.append(chapter[i].count(word))
            add = sum(occ)
            score[i] = add
    import numpy as np    
    index = np.array(score).argsort()[-3:][::-1]

    chosen_sentences = []
    for ind in index: 
        chosen_sentences.append(chapter[ind]+'.')
        
    return chosen_sentences

# test the function
# remember that the list 's_per_chapter' holds 66 elements that include the single sentences as sublists
# and topten_justwords are the 10 words with the highest tfidf scores per chapter
# for the third book of bible 
get_3_sent(s_per_chapter[2], topten_justwords[2])

# see summaries of all chapters according to this method below in this notebook 

["And the priest that maketh him clean shall present the man that is to be made clean, and those things, before the LORD, at the door of the tabernacle of the congregation:And the priest shall take one he lamb, and offer him for a trespass offering, and the log of oil, and wave them for a wave offering before the LORD:And he shall slay the lamb in the place where he shall kill the sin offering and the burnt offering, in the holy place: for as the sin offering is the priest's, so is the trespass offering: it is most holy:And the priest shall take some of the blood of the trespass offering, and the priest shall put it upon the tip of the right ear of him that is to be cleansed, and upon the thumb of his right hand, and upon the great toe of his right foot:And the priest shall take some of the log of oil, and pour it into the palm of his own left hand:And the priest shall dip his right finger in the oil that is in his left hand, and shall sprinkle of the oil with his finger seven times be

In [7]:
# second, we are looking which sentences in each book have the highest tfidf average



# new approach
# take average tfidf score of each sentence as criterioin to pick sentence


        
# clean up sentences, make voc for every sentence
voc_s_per_chapter = [[] for i in s_per_chapter]
for i in range(len(s_per_chapter)):
    for j in range(len(s_per_chapter[i])):
        voc_s_per_chapter[i].append(s_per_chapter[i][j].split())
for i in range(len(voc_s_per_chapter)):
    for j in range(len(voc_s_per_chapter[i])):
        for k in range(len(voc_s_per_chapter[i][j])):
            voc_s_per_chapter[i][j][k] = voc_s_per_chapter[i][j][k].lower()
            
            
# still cleaning, take only the voc for each sentence that we have in "voc_total_clean"
# otherwise we will try to look up words in the tfidf dict that are not there
# which will result in an error          
# also take stopwords out take
# this part takes pretty long because it's going through whole voc again
# 'voc_s_per_chapter_clean' is list with 66 sublists
# each sublist has a sublist for each sentence of that chapter
# with the splitted and cleaned voc of that sentence
voc_s_per_chapter_clean = [[] for i in range(len(voc_s_per_chapter))]
for i in range(len(voc_s_per_chapter_clean)):
    for j in range(len(voc_s_per_chapter[i])):
        list=[]
        for k in range(len(voc_s_per_chapter[i][j])):
            if voc_s_per_chapter[i][j][k] not in stopwords:
                if voc_s_per_chapter[i][j][k] in voc_total_clean:
                    list.append(voc_s_per_chapter[i][j][k])
        voc_s_per_chapter_clean[i].append(list)         
    
# make new (matching) list so we can save the tf-idf score of each word
# and calculate the average tfidf score of each sentence below
import copy
scores = copy.deepcopy(voc_s_per_chapter_clean)
for i in range(len(voc_s_per_chapter_clean)):
    for j in range(len(voc_s_per_chapter_clean[i])):
        for k in range(len(voc_s_per_chapter_clean[i][j])):
            #if voc_s_per_chapter_clean[i][]
            scores[i][j][k] = tfidf_dicts[i][voc_s_per_chapter_clean[i][j][k]]

# get the average tfidf score of each sentence in each chapter
averages = [[] for i in range(len(scores))]
for i in range(len(scores)):
    averages[i] = [[] for n in range(len(scores[i]))]
    for j in range(len(scores[i])):
        
        try:
            averages[i][j] = sum(scores[i][j])/len(scores[i][j])
        except ZeroDivisionError:
            pass
        
# clean averages, replace empty ones with 0
for i in range(len(averages)):
    for j in range(len(averages[i])):
        if averages[i][j] == []:
            averages[i][j] = 0       
        

# save for each of the 66 books the indexes of the 3 sentences with the highest average tfidf score
indexes = [[] for _ in range(len(averages))]
import numpy as np 
for i in range(len(averages)):
    indexes[i].append(np.array(averages[i]).argsort()[-3:][::-1])
# make np arrays to lists to make them accessible
indexes_list=[i.tolist() for n in range(len(indexes)) for i in indexes[n]]

    
    

# make new list to append the top 3 sentence per chapter
summaries_highest_tfidf_avg = [[] for k in range(len(indexes))]
# loop through each book and indexes
# and append to the list
for i in range(len(indexes)):
    for j in indexes_list[i]:
        summaries_highest_tfidf_avg[i].append(s_per_chapter[i][j])
        
#for i in range(len(summaries_highest_tfidf_avg)):
#    for j in range(len(summaries_highest_tfidf_avg[i])):
        
# look at output
# print the summaries acoording to this method
print(summaries_highest_tfidf_avg)        


[['And he said unto him, What is thy name? And he said, Jacob', ' And Joseph went out over all the land of Egypt', 'So Jacob came to Luz, which is in the land of Canaan, that is, Bethel, he and all the people that were with him'], ['And Moses and Aaron did as the LORD commanded them, so did they', 'And Moses went out from Pharaoh, and entreated the LORD', 'The LORD is a man of war: the LORD is his name'], ['And if his offering be a goat, then he shall offer it before the LORD', 'Blind, or broken, or maimed, or having a wen, or scurvy, or scabbed, ye shall not offer these unto the LORD, nor make an offering by fire of them upon the altar unto the LORD', 'And if the burnt sacrifice for his offering to the LORD be of fowls, then he shall bring his offering of turtledoves, or of young pigeons'], ['And Moses did so: as the LORD commanded him, so did he', 'And Moses told the children of Israel according to all that the LORD commanded Moses', 'And Moses and Eleazar the priest did as the LORD 

In [8]:
# print summaries according to first method
# get_3_sent function run through all chapters

summaries = []
for i in range(len(s_per_chapter)):
    summaries.append(get_3_sent(s_per_chapter[i], topten_justwords[i]))
    
n =1
for s in summaries:
    print('SUMMARY OF CHAPTER '+str(n))
    #print('\n')
    print(s)
    print('\n')
    
    n+=1

SUMMARY OF CHAPTER 1
['When that year was ended, they came unto him the second year, and said unto him, We will not hide it from my lord, how that our money is spent; my lord also hath our herds of cattle; there is not ought left in the sight of my lord, but our bodies, and our lands:Wherefore shall we die before thine eyes, both we and our land? buy us and our land for bread, and we and our land will be servants unto Pharaoh: and give us seed, that we may live, and not die, that the land be not desolate.', " Now the sons of Jacob were twelve:The sons of Leah; Reuben, Jacob's firstborn, and Simeon, and Levi, and Judah, and Issachar, and Zebulun:The sons of Rachel; Joseph, and Benjamin:And the sons of Bilhah, Rachel's handmaid; Dan, and Naphtali:And the sons of Zilpah, Leah's handmaid: Gad, and Asher: these are the sons of Jacob, which were born to him in Padanaram.", "And they took their cattle, and their goods, which they had gotten in the land of Canaan, and came into Egypt, Jacob, a

In [9]:
# even more interesting and describing ('ausschlaggebend') are the top ten words for each chapter
# they give a pretty good insight in the chapter
topten_justwords

[['jacob',
  'joseph',
  'laban',
  'land',
  'abram',
  'god',
  'lord',
  'abraham',
  'sons',
  'came'],
 ['moses',
  'lord',
  'aaron',
  'pharaoh',
  'sockets',
  'israel',
  'people',
  'egypt',
  'tabernacle',
  'land'],
 ['offering',
  'lord',
  'priest',
  'unclean',
  'atonement',
  'burnt',
  'plague',
  'aaron',
  'altar',
  'skin'],
 ['lord',
  'offering',
  'moses',
  'children',
  'israel',
  'congregation',
  'aaron',
  'tabernacle',
  'one',
  'families'],
 ['lord',
  'god',
  'land',
  'go',
  'people',
  'day',
  'possess',
  'israel',
  'man',
  'egypt'],
 ['joshua',
  'lord',
  'children',
  'israel',
  'suburbs',
  'tribe',
  'jordan',
  'cities',
  'land',
  'king'],
 ['gideon',
  'israel',
  'lord',
  'samson',
  'abimelech',
  'children',
  'jephthah',
  'went',
  'men',
  'came'],
 ['naomi',
  'boaz',
  'ruth',
  'kinsman',
  'glean',
  'reapers',
  'moabitess',
  'gleaned',
  'law',
  'elimelech'],
 ['saul',
  'david',
  'samuel',
  'lord',
  'philistines',
 

In [10]:
# THE END

<div class="alert alert-block alert-info">now we bring the list we obtained above into txt files, so that we can use them in different notebooks<div>

In [61]:
#split the summaries obtained by taking the sentences with the best overall tfidf score 
top_tfidf_score_summaries = []    
summaries_highest_tfidf_avg[0][0]
for i in range(len(summaries_highest_tfidf_avg)):
    concat_s = ''
    for summary in summaries_highest_tfidf_avg[i]:
        #clean each sentence in each summary from unwanted chars
        s = re.sub('!','',re.sub('\?','',re.sub(':','',re.sub(',','',summary))))
        concat_s = concat_s+s
        
    #make sure there will be no double elemets by creating a set
    concat_list = set(concat_s.split(' '))  
    split_concat = []
    for elem in concat_list:
        if elem not in stopwords:
            split_concat.append(elem)
            
    top_tfidf_score_summaries.append(split_concat)

#make sure the list has the correct length    
len(top_tfidf_score_summaries)


66

In [62]:
#and write into a new txt file:
with open('words_top_tfidf_score_summaries.txt', 'w') as f:
    for item in top_tfidf_score_summaries:
        f.write("%s\n" % item)

In [72]:
#REMOVE UNWANTED CAHRS FROM GOLD STANDARD SENTENCES

In [73]:

cleaned_gold_summaries = []
import re
for summary in summaries_gold:
        remove_bracket = re.sub('\[\\\'','',summary)
        remove_backslash = re.sub('\\\'','',remove_bracket)
        new_entry = remove_backslash.split('\"')
        cleaned_gold_summaries.append([new_entry[1]])
cleaned_gold_summaries[0]

['In the beginning God created the heaven and the earth']

In [74]:
#remove the stopwords from the gold standard summaries obtained by copying the first sentence
gold_summaries_from_first_sentence = []
for summary in cleaned_gold_summaries:
    clean_summary_per_book = []
    single_words  = summary[0].split(' ')
    for word in single_words: 
        if word not in stopwords:
            clean_summary_per_book.append(word)

    gold_summaries_from_first_sentence.append(clean_summary_per_book)
len(gold_summaries_from_first_sentence)
print(gold_summaries_from_first_sentence[0])


['In', 'beginning', 'God', 'created', 'heaven', 'earth']


In [75]:
#write the words from the gold summaries found online into a txt
#the gold standard summaries stored in this txt file are obtained by taking the first sentence of each book.
with open('gold_summaries_words.txt', 'w') as f:
    for item in gold_summaries_from_first_sentence:
        f.write("%s\n" % item)
#write the words from the summaries generated by us using the sentences with the most words of high tf-idf-score.        
with open('topten_words_summaries.txt', 'w') as f:
    for item in topten_justwords:
        f.write("%s\n" % item)