In [208]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from spacy.lang.en.stop_words import STOP_WORDS
import math

In [212]:
nlp = spacy.load('en_core_web_md')
stop_words = list(STOP_WORDS)

In [84]:
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches

In [277]:
from spacytextblob.spacytextblob import SpacyTextBlob

# The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x176fddcd0>

In [316]:

class Paragraph:

    def __init__(self, content, heading, heading_style) -> None:
        self.content = content
        self.heading = heading
        self.heading_style = heading_style
        self.keywords = self.get_keywords(n=4)
        self.summary = self.summurize(ratio=0.5)
        self.polarity = self.get_polarity()
        self.subjectivity = self.get_subjectivity()
    
    def get_keywords(self, **kwargs):
        doc = nlp(self.content)
        tags = ['PROPN', 'ADJ', 'NOUN']
        keywords = {}

        for token in doc:
            if token.text not in stop_words and token.text not in punctuation:
                if token.pos_ in tags:
                    if token.text in keywords.keys():
                        keywords[token.text] += 1
                    else:
                        keywords[token.text] = 1
            
        return nlargest(kwargs.get('n'), keywords, key=keywords.get)
    
    def summurize(self, **kwargs):
        doc = nlp(self.content)

        # Create dictionary with tokens as keys and their frequency as the value
        token_frequencies = {}
        for token in doc:
            if token.text not in stop_words:
                if token.text not in token_frequencies.keys():
                    token_frequencies[token.text] = 1
                else:
                    token_frequencies[token.text] += 1
        
        # Divide each token by the maximum frequency to get the waited frequency of each token
        for token in token_frequencies:
            token_frequencies[token] = token_frequencies[token] / max(token_frequencies.values())

        # Create dictionary with sentence as key and scentence score as value
        scentence_list = [s for s in doc.sents]
        scentence_score = {}

        # sentence score is equal to sum of token scores for each token in the scentence
        for sentence in scentence_list:
            for token in sentence:
                if token.text.lower() in token_frequencies.keys():
                    if len(sentence.text.split(' ')) < 30:
                        if sentence not in scentence_score.keys():
                            scentence_score[sentence] = token_frequencies[token.text.lower()]
                        else:
                            scentence_score[sentence] += token_frequencies[token.text.lower()]

        # Summurize documents to n-sentences
        n = math.floor(kwargs.get('ratio') * len(scentence_list))

        summurized_sentences = nlargest(n, scentence_score, key=scentence_score.get)
        summary = [s for s in scentence_score.keys() if s in summurized_sentences]
        new_text = ' '.join([s.text for s in summary])
        
        print(len(self.content), '-', len(new_text),'=',f'{len(self.content) - len(new_text)} words saved')

        return new_text
    
    # a float within the range [-1.0, 1.0].
    def get_polarity(self):
        polarity_num = nlp(self.content)._.polarity
        polarity_txt = ''
        
        if polarity_num >= 0.6:
            polarity_txt = 'Very Positive'
        elif polarity_num >= 0.2:
            polarity_txt = 'Positive'
        elif polarity_num >= -0.2:
            polarity_txt = 'Neutral'
        elif polarity_num >= -0.6:
            polarity_txt = 'Negative'
        elif polarity_num < -0.6:
            polarity_txt = 'Very Negative'

        return (polarity_txt, polarity_num)
    # a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective. 
    def get_subjectivity(self):
        return nlp(self.content)._.subjectivity        


In [238]:
bachelor_thesis = Document('./../Linss-Lukas-ba.docx')
paragraphs = bachelor_thesis.paragraphs
paragraph_dict = {}
for i, p in enumerate(paragraphs):
    if len(p.text) > 100:
        paragraph_dict[(paragraphs[i - 1].text, int(paragraphs[i - 1].style.name[-1]))] = p.text

In [317]:
paragraph_objs = list()
for i, p in enumerate(paragraphs):
    if len(p.text) > 100:
        paragraph_objs.append(Paragraph(p.text, paragraphs[i - 1].text, int(paragraphs[i - 1].style.name[-1])))

2012 - 793 = 1219 words saved
2709 - 917 = 1792 words saved
1719 - 789 = 930 words saved
767 - 303 = 464 words saved
3801 - 786 = 3015 words saved
4893 - 1592 = 3301 words saved
829 - 315 = 514 words saved
2930 - 1177 = 1753 words saved
7969 - 3855 = 4114 words saved
3340 - 1565 = 1775 words saved
994 - 327 = 667 words saved
3778 - 1723 = 2055 words saved
5181 - 2568 = 2613 words saved
731 - 227 = 504 words saved
1744 - 781 = 963 words saved
8290 - 3787 = 4503 words saved
4007 - 1754 = 2253 words saved
484 - 168 = 316 words saved
5009 - 2265 = 2744 words saved
3466 - 1403 = 2063 words saved
853 - 311 = 542 words saved
2491 - 1136 = 1355 words saved
4466 - 2186 = 2280 words saved
4256 - 1820 = 2436 words saved
534 - 260 = 274 words saved
2534 - 1011 = 1523 words saved
5460 - 2556 = 2904 words saved
506 - 254 = 252 words saved
523 - 274 = 249 words saved
624 - 223 = 401 words saved
967 - 388 = 579 words saved
1261 - 483 = 778 words saved
987 - 453 = 534 words saved
714 - 286 = 428 words 

In [319]:
#paragraph_dict.keys()
paragraph_objs[1].polarity
#paragraph_objs[0].summurize(ratio=0.5)
#print(paragraph_dict)

('Neutral', 0.057314213564213574)

In [128]:
def summurizer(**kwargs):
    document = kwargs.get('doc')
    doc = nlp(document)

    # Create dictionary with tokens as keys and their frequency as the value
    token_frequencies = {}
    for token in doc:
        if token.text not in stop_words:
            if token.text not in token_frequencies.keys():
                token_frequencies[token.text] = 1
            else:
                token_frequencies[token.text] += 1
    
    # Divide each token by the maximum frequency to get the waited frequency of each token
    for token in token_frequencies:
        token_frequencies[token] = token_frequencies[token] / max(token_frequencies.values())

    # Create dictionary with sentence as key and scentence score as value
    scentence_list = [s for s in doc.sents]
    scentence_score = {}

    # sentence score is equal to sum of token scores for each token in the scentence
    for sentence in scentence_list:
        for token in sentence:
            if token.text.lower() in token_frequencies.keys():
                if len(sentence.text.split(' ')) < 30:
                    if sentence not in scentence_score.keys():
                        scentence_score[sentence] = token_frequencies[token.text.lower()]
                    else:
                        scentence_score[sentence] += token_frequencies[token.text.lower()]

    # Summurize documents to n-sentences
    n = math.floor(kwargs.get('ratio') * len(scentence_list))

    summurized_sentences = nlargest(n, scentence_score, key=scentence_score.get)
    summary = [s for s in scentence_score.keys() if s in summurized_sentences]
    new_text = ' '.join([s.text for s in summary])
    
    print(len(document), '-', len(new_text),'=',f'{len(document) - len(new_text)} words saved')

    return new_text

In [57]:
def extractKeywords(**kwargs):
    doc = nlp(kwargs.get('doc'))
    tags = ['PROPN', 'ADJ', 'NOUN']
    keywords = {}

    for token in doc:
        if token.text not in stop_words and token.text not in punctuation:
            if token.pos_ in tags:
                if token.text in keywords.keys():
                    keywords[token.text] += 1
                else:
                    keywords[token.text] = 1
        
    return nlargest(kwargs.get('n'), keywords, key=keywords.get)

In [278]:
def sentimentAnalysis(**kwargs):
    doc = nlp(kwargs.get('doc'))
    return doc._.assessments

In [120]:
extractKeywords(doc=list(paragraph_dict.values())[0], n=5)

['virtual', 'metaverse', 'store', 'channel', 'shopping']

In [None]:
summurizer(doc=list(paragraph_dict.values())[0], ratio=0.5)

In [323]:
document = Document()
wc_picture = document.add_picture('./../WordCloud/word_cloud.png', width=Inches(4), height=Inches(4))
pic_paragraph = document.paragraphs[-1] 
pic_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

title = document.add_heading('Shopping Experiences in the Metaverse', 0)
sub_title = document.add_heading('Implementation of a white lable Metaverse Solution', 1)
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
sub_title.alignment = WD_ALIGN_PARAGRAPH.CENTER
document.add_page_break()


for p in paragraph_objs:
    document.add_heading(p.heading, p.heading_style)
    paragraph = document.add_paragraph(p.summary)
    paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    document.add_paragraph(f'Keywords: { p.keywords }')
    document.add_paragraph(f'Polarity: {p.polarity[0]} ({round(p.polarity[1], 2)})')
    document.add_paragraph(f'Subjectivity: {round(p.subjectivity, 2)} / 1.0')

    document.add_page_break()

document.save('ba_summary.docx')

