In [26]:
import spacy
from string import punctuation
from heapq import nlargest
from spacy.lang.en.stop_words import STOP_WORDS
import math
from transformers import pipeline

In [27]:
nlp = spacy.load('en_core_web_md')
summurizer = pipeline('summarization', framework='pt')
stop_words = list(STOP_WORDS)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [28]:
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches

In [29]:
from spacytextblob.spacytextblob import SpacyTextBlob
# The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective. 
nlp.add_pipe('spacytextblob')


<spacytextblob.spacytextblob.SpacyTextBlob at 0x2df020c70>

In [30]:

class Paragraph:

    def __init__(self, content, heading, heading_style, corpus):
        self.content = content
        self.heading = heading
        self.heading_style = heading_style
        self.corpus = corpus
        self.summary = self.summarize(ratio=0.5)
        self.polarity = self.get_polarity()
        self.subjectivity = self.get_subjectivity()

    def get_keywords(self, **kwargs):
        doc = nlp(self.content)
        tags = ['PROPN', 'NOUN'] # alternative ['PROPN', 'ADJ', 'NOUN']
        keywords = {}

        for token in doc:
             if token.pos_ in tags and token.lemma_ not in punctuation:
                tf_idf = self.get_tf_idf(token.lemma_)
                if token.lemma_ not in keywords.keys():
                    keywords[token.lemma_] = tf_idf
        
        return sorted(keywords, key=keywords.get)[:kwargs.get('n')]
    
    def summarize(self, **kwargs):

        if kwargs.get('strategy') == 'abstract':
            try:
                return summurizer(self.content, min_length=100, max_length= 200)[0]['summary_text']
            except:
                return self.summary

        doc = nlp(self.content)

        # Create dictionary with tokens as keys and their frequency as the value
        token_frequencies = {}
        for token in doc:
            if token.text not in stop_words:
                if token.text not in token_frequencies.keys():
                    token_frequencies[token.text] = 1
                else:
                    token_frequencies[token.text] += 1
        
        # Divide each token by the maximum frequency to get the waited frequency of each token
        for token in token_frequencies:
            token_frequencies[token] = token_frequencies[token] / max(token_frequencies.values())

        # Create dictionary with sentence as key and scentence score as value
        scentence_list = [s for s in doc.sents]
        scentence_score = {}

        # sentence score is equal to sum of token scores for each token in the scentence
        for sentence in scentence_list:
            for token in sentence:
                if token.text.lower() in token_frequencies.keys():
                    if len(sentence.text.split(' ')) < 30:
                        if sentence not in scentence_score.keys():
                            scentence_score[sentence] = token_frequencies[token.text.lower()]
                        else:
                            scentence_score[sentence] += token_frequencies[token.text.lower()]

        # Summurize documents to n-sentences
        n = math.floor(kwargs.get('ratio') * len(scentence_list))

        summurized_sentences = nlargest(n, scentence_score, key=scentence_score.get)
        summary = [s for s in scentence_score.keys() if s in summurized_sentences]
        new_text = ' '.join([s.text for s in summary])
        
        #print(len(self.content), '-', len(new_text),'=',f'{len(self.content) - len(new_text)} words saved')

        return new_text
    
    # a float within the range [-1.0, 1.0].
    def get_polarity(self):
        polarity_num = nlp(self.content)._.polarity
        polarity_txt = ''
        
        if polarity_num >= 0.6:
            polarity_txt = 'Very Positive'
        elif polarity_num >= 0.2:
            polarity_txt = 'Positive'
        elif polarity_num >= -0.2:
            polarity_txt = 'Neutral'
        elif polarity_num >= -0.6:
            polarity_txt = 'Negative'
        elif polarity_num < -0.6:
            polarity_txt = 'Very Negative'

        return (polarity_txt, polarity_num)
    # a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective. 
    def get_subjectivity(self):
        return nlp(self.content)._.subjectivity

    def get_tf_idf(self, token):
        count_tf = 0
        ### TF
        for word in nlp(self.content):
            if word.text == token:
                count_tf += 1

        tf = count_tf/len(self.content.split())

        ### IDF
        n_documents = len(self.corpus)
        count_idf = 0
        for paragraph in self.corpus:
            if token in paragraph.text:
                count_idf += 1
        
        idf = math.log(n_documents / count_idf, 2) if count_idf != 0 else 0

        return tf * idf   


In [35]:
bachelor_thesis = Document('./../Lukas-Linss_ba.docx')
paragraphs = bachelor_thesis.paragraphs

In [36]:
paragraph_objs = list()
for i, p in enumerate(paragraphs):
    if len(p.text) > 100:
        paragraph_objs.append(Paragraph(p.text, paragraphs[i - 1].text, int(paragraphs[i - 1].style.name[-1]), paragraphs))

In [37]:
document = Document()
title = document.add_heading(bachelor_thesis.core_properties.title, 0)
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
document.add_page_break()

for p in paragraph_objs:
    document.add_heading(p.heading, p.heading_style)
    paragraph = document.add_paragraph(p.summarize(ratio=0.5)) # p.summarize(strategy='abstract') fuer die abstrakte Methode
    paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    document.add_paragraph(f'Keywords: { p.get_keywords(n=4) }')
    document.add_paragraph(f'Polarity: {p.polarity[0]} ({round(p.polarity[1], 2)})')
    document.add_paragraph(f'Subjectivity: {round(p.subjectivity, 2)} / 1.0')

    document.add_page_break()

document.save(f'ba summary {bachelor_thesis.core_properties.author}.docx')