<div class="alert alert-block alert-info">In this notebook create summaries for the 66 bible books with german text and prepare goldstandard summaries in german. 
<br>
The german bible text is available at <a href="https://www.sermon-online.de/search.pl?lang=de&id=6068">https://www.sermon-online.de/search.pl?lang=de&id=6068</a>.
The plain text can be downloaded or copied from <a href="https://info2.sermon-online.com/german/MartinLuther-1912/Martin_Luther_Uebersetzung_1912.txt">https://info2.sermon-online.com/german/MartinLuther-1912/Martin_Luther_Uebersetzung_1912.txt</a> 

NOTE: The txt-files which will be generated and filled in this notebook are allready stored on github, s.t. they are accessable even if the cells in this notebook are for some reason not executable or take too long to run<br>
The file names are: 
- Topten tf-idf score words in summaries generated by us: <b>words_topten_german_summaries.txt</b>
- words in gold standard summaries found online: <b>words_german_goldstandard_summaries.txt</b>


In [129]:
with open('bible_luther_1912.txt', 'rb') as f:
    chapters=f.read().decode(errors='replace')
    
lines = chapters.split('\r\n')

lines = lines[:31173] # delete the unnecessary information 

In [2]:
# bring into right form, separate by book, so we obtain list with 66 books of bible 


# make list with booknames: 'Gen', 'Exo', etc
booknames=[[] for _ in range(len(lines))]
for i in range(len(lines)):
    booknames[i] = str(lines[i][:3])
books=[]
for i in range(len(booknames)):
    if booknames[i] not in books:
        books.append(booknames[i])
        
# make chapters. chapters is a list with 66 elements, each of which has sublists with single sentences
chapters = [[] for _ in range(len(books))]
for i in range(len(lines)):
    for j in range(len(books)):
        if lines[i][:3] == books[j]:
            chapters[j].append(lines[i])

In [3]:
# get vocabulary for each chapter without stopwords, punctuation etc

voc_unclean = [[] for _ in range(len(chapters))]
for i in range(len(chapters)):
    for j in range(len(chapters[i])):
        voc_unclean[i].append(chapters[i][j].split())
        
        
        
from nltk.corpus import stopwords
voc_clean = [[] for _ in range(len(chapters))]
import string

stopwords = list(stopwords.words('german'))
newstopwords = [i.lower() for i in books]
stopwords.extend(newstopwords)

for i in range(len(voc_unclean)):
    for j in range(len(voc_unclean[i])):
        for k in range(len(voc_unclean[i][j])):
            variable = voc_unclean[i][j][k].translate(voc_unclean[i][j][k].maketrans('', '', string.punctuation)).lower()
            if variable not in stopwords:
                voc_clean[i].append(variable)

In [5]:
# raw count occurence of each word per chapter (TF)
# 'counts' is a list with each chapter as sublist
# those sublists include tuples of the vocabulary and its counts inside that document/chapter

counts = [[] for _ in range(len(voc_clean))] # makes list with empty sublists
for i in range(len(voc_clean)):
        counts[i] = [(voc_clean[i].count(word), word) for word in voc_clean[i]] 


# 'counts' is now a list with 66 elements

# look at output
print(counts[0][:22])

[(1, '11'), (2, 'anfang'), (8, 'schuf'), (196, 'gott'), (30, 'himmel'), (56, 'erde'), (1, '12'), (56, 'erde'), (2, 'w�st'), (3, 'leer'), (2, 'finster'), (4, 'tiefe'), (6, 'geist'), (21, 'gottes'), (1, 'schwebte'), (38, 'wasser'), (1, '13'), (196, 'gott'), (418, 'sprach'), (9, 'licht'), (113, 'ward'), (9, 'licht')]


In [6]:
vocabulary = [voc_clean[i][j] for i in range(len(voc_clean)) for j in range(len(voc_clean[i]))]

<div class="alert alert-block alert-info">Now that we have a clean vocabulary, we an apply TF-IDF calculation:

In [7]:
## IDF

word_appearings = [0] * len(vocabulary)

for j in range(len(vocabulary)):
    appear=0
    for i in range(len(voc_clean)):
        if vocabulary[j] in voc_clean[i]:
            appear += 1
            word_appearings[j] = (vocabulary[j], appear)


N = len(chapters) # all existing texts and documents
# should be 66 in final version            

idf = [0] * len(vocabulary)
import math

for i in range(len(vocabulary)):
    idf[i]=(math.log10(N/word_appearings[i][1]+1), vocabulary[i])

    
# make idf dictionary to make the idf scores easier to access
idf_dict = {}
for (idfscore, word) in idf:
    idf_dict[word] = idfscore    

In [8]:
# TF IDF
# multiply tf and idf scores
# output ist liste mit der anzahl von chaptern als sublists
# und in jeder sublist ist ein tuple mit dem vocabulary und dem tf idf score


# clean counts, remove doubles
counts_clean = [[] for _ in range(len(counts))]
for i in range(len(counts)):
    for j in range(len(counts[i])):
        if counts[i][j] not in counts_clean[i]:
            counts_clean[i].append(counts[i][j])

            
            
            
tfidf = [[] for _ in range(len(counts_clean))]
for i in range(len(counts_clean)):
    for (count, word) in counts_clean[i]:
        tfidf[i] = [(count * idf_dict[word], word) for (count, word) in counts_clean[i]]


In [9]:
top_ten_words = [sorted(tfidf[i],key=lambda x: x[0], reverse = True)[0:10] for i in range(len(tfidf))]
# also make new list where tfidf scores are removed, just the words with the highest tfidf scores stay
topten_justwords = [[] for _ in range(len(top_ten_words))]
for i in range(len(top_ten_words)):
    topten_justwords[i] = [top_ten_words[i][j][1] for j in range(len(top_ten_words[i]))]

In [131]:
import numpy as np

#just like for english language, the get_3_sent creates a summary by taking the three sentences with the most topten tf-idf-score-word-occurences
def get_3_sent(chapter, topten):
    score = [[] for i in range(len(chapter))]
    for i in range(len(chapter)):
        occ = []
        for word in topten:
            occ.append(chapter[i].count(word))
            add = sum(occ)
            score[i] = add
        
    index = np.array(score).argsort()[-3:][::-1]

    chosen_sentences = []
    for ind in index: 
        chosen_sentences.append(chapter[ind])
        
    return chosen_sentences

In [11]:
#test function
get_3_sent(chapters[2], topten_justwords[2])

['Lev 20:25 dass ihr auch absondern sollt das reine Vieh vom unreinen und unreine V�gel von den reinen, und eure Seelen nicht verunreinigt am Vieh, an V�geln und an allem, was auf Erden kriecht, das ich euch abgesondert habe, dass es unrein sei.',
 'Lev 23:36 Sieben Tage sollt ihr dem HErrn opfern. Der achte Tag soll auch heilig hei�en, dass ihr zusammenkommt, und sollt eure Opfer dem HErrn tun; denn es ist der Tag der Versammlung; keine Dienstarbeit sollt ihr tun.',
 'Lev 18:24 Ihr sollt euch in dieser keinem verunreinigen; denn in diesem allem haben sich verunreinigt die Heiden, die ich vor euch her will aussto�en,']

<div class="alert alert-block alert-info">As we can see, there are some unreadable chars in the text. In the following cells, these words containing such cahrs will be removed and then the data will be stored in a new txt-file. </div>

<div class="alert alert-block alert-danger">CAUTION! Execute the following cell not ore than once! otherwise the unwanted char might be shiftet and signs that are actually wanted will be removed</div>

In [132]:
topten_justwords
#print(topten_justwords[2])
#storing the unwanted char in a new variable to compare with the characters later. 
unwanted_char = topten_justwords[2][4][1]
unwanted_char


'e'

In [51]:
#remove unreadable char: 
for words in topten_justwords:
    for char in words: 
        if unwanted_char in char:
            words.remove(char)
#check if it worked:
topten_justwords[2]            

['unrein', 'herrn', 'sollt', 'mose', 'herr', 'fett']

In [128]:
# write the topten_jsutwords from the summaries generated by us into file for later usage in a different notebook
#and write into a new txt file:
with open('words_topten_german_summaries.txt', 'w') as f:
    for item in topten_justwords:
        f.write("%s\n" % item)

<div class="alert alert-block alert-info">Now, we only need to clean the goldstandard summaries and store them in a txt-file</div>

In [13]:
just_sent=[[] for i in range(len(chapters))]
for i in range(len(chapters)):
    for j in range(len(chapters[i])):
        variable=chapters[i][j].split()
        just_sent[i].append(" ".join(variable[2:]))



import numpy as np
summaries = []
for i in range(len(just_sent)):
    summaries.append(get_3_sent(just_sent[i], topten_justwords[i]))
    
n =1
for s in summaries:
    print('SUMMARY OF CHAPTER '+str(n))
    print(s)
    print('\n')
    
    n+=1

SUMMARY OF CHAPTER 1
['Da sprach der HErr zu Kain: Wo ist dein Bruder Abel? Er sprach: Ich wei� nicht; soll ich meines Bruders H�ter sein?', 'Und Gott segnete sie und sprach zu ihnen: Seid fruchtbar und mehret euch und f�llet die Erde und machet sie euch untertan und herrschet �ber die Fische im Meer und �ber die V�gel unter dem Himmel und �ber alles Getier, das auf Erden kriecht.', 'und sprach zu dem Knecht: Wer ist der Mann, der uns entgegenkommt auf dem Felde? Der Knecht sprach: Das ist mein Herr. Da nahm sie den Mantel und verh�llte sich.']


SUMMARY OF CHAPTER 2
['Gott sprach zu Mose: ICH WERDE SEIN, DER ICH SEIN WERDE. Und sprach: Also sollst du den Kindern Israel sagen: ICH WERDE SEIN hat mich zu euch gesandt.', 'Und Mose stieg hinauf zu Gott. Und der HErr rief ihm vom Berge und sprach: So sollst du sagen zu dem Hause Jakob und verk�ndigen den Kindern Israel:', "Der HErr sprach zu ihm: Was ist's, was du in deiner Hand hast? Er sprach: Ein Stab."]


SUMMARY OF CHAPTER 3
['dass ih

In [22]:
#check the length of our summary list:
len(summaries)

66

<div class="alert alert-block alert-info">In the following cells, we prepare goldstandard summaries in german language. <br> They can be found at <a href="https://www.die-bibel.de/bibeln/bibelkenntnis/inhalt-und-aufbau-der-bibel/altes-testament/gesetzesbuecher/">https://www.die-bibel.de/bibeln/bibelkenntnis/inhalt-und-aufbau-der-bibel/altes-testament/gesetzesbuecher/</a></div>

In [14]:
with open('bibel_zusammenfassungen.txt', 'rb') as f:
    zus=f.read().decode(errors='replace')
    
buecher = zus.split('\n\n')


# clean up 

worte=[[] for _ in buecher]
from nltk.corpus import stopwords
stopwords = list(stopwords.words('german'))

for i in range(len(buecher)):
    worte[i] = buecher[i].split()
    
    
# remove stopwords and make lowercase
worte_wo=[[] for _ in worte]
for i in range(len(worte)):
    for j in range(len(worte[i])):
        if worte[i][j].lower() not in stopwords:
            worte_wo[i].append(worte[i][j].lower())

# remove punctuation from string
import re
for i in range(len(worte_wo)):
    for j in range(len(worte_wo[i])):
        worte_wo[i][j] = re.sub(r'[^\w\s]','',worte_wo[i][j])

In [15]:
worte_final = [[] for _ in worte]
for i in range(len(worte_wo)):
    for j in range(len(worte_wo[i])):
        if worte_wo[i][j] != '':
            worte_final[i].append(worte_wo[i][j])
            

In [63]:
#worte_final[0]
print(len(set(worte_final[0])))
(len(worte_final[0]))


#now we now, that there still double entries in our worte_final

55


63

In [109]:
#remove words that occure more than once by transorming the "words" list into a set and into a list again
german_summary_words = []
for words in worte_final:
    german_summary_words.append(list(set(words)))


In [137]:
#now the length of the first entry should be 55
len(german_summary_words[0])


55

In [123]:
#remove unwanted chars from the words in the gold standard summaries      
#!!! above for the topten words in our summaries that would not work, 
#because there all german Umlaute (and presumably also 'ß') are encoded to the same unreadable char
cleaned_gold_summary_german = []
for summary in german_summary_words:
    #print(summary)
    cleaned_words = []
    for words in summary:
        cleaned_word = re.sub('ß','ss',re.sub('ä','ae',re.sub('ü','ue',re.sub('ö','oe',words))))
        cleaned_words.append(cleaned_word)
    cleaned_gold_summary_german.append(cleaned_words)

print(len(cleaned_gold_summary_german[0]))
print(len(set(cleaned_gold_summary_german[0])))

55
55


In [125]:
# write the from the german gold standard summaries into a txt file for later usage in a different notebook
with open('words_german_goldstandard_summaries.txt', 'w', encoding = 'utf-8') as f:
    for item in cleaned_gold_summary_german:
        f.write("%s\n" % item)