In [4]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from spacy.lang.en.stop_words import STOP_WORDS
import math
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
nlp = spacy.load('en_core_web_md')
summurizer = pipeline('summarization')
stop_words = list(STOP_WORDS)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [6]:
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches

In [7]:
from spacytextblob.spacytextblob import SpacyTextBlob

# The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x157612640>

In [16]:

class Paragraph:

    def __init__(self, content, heading, heading_style) -> None:
        self.content = content
        self.heading = heading
        self.heading_style = heading_style
        self.keywords = self.get_keywords(n=4)
        self.summary = self.summurize(ratio=0.5)
        self.summary_abs = self.summurize(strategy='abstract')
        self.polarity = self.get_polarity()
        self.subjectivity = self.get_subjectivity()
    
    def get_keywords(self, **kwargs):
        doc = nlp(self.content)
        tags = ['PROPN', 'ADJ', 'NOUN']
        keywords = {}

        for token in doc:
            if token.text not in stop_words and token.text not in punctuation:
                if token.pos_ in tags:
                    if token.text in keywords.keys():
                        keywords[token.text] += 1
                    else:
                        keywords[token.text] = 1
            
        return nlargest(kwargs.get('n'), keywords, key=keywords.get)
    
    def summurize(self, **kwargs):

        if kwargs.get('strategy') == 'abstract':
            try:
                return [v for k, v  in summurizer('summarize:' + self.content)[0].items()][0]
            except:
                return self.summary

        doc = nlp(self.content)

        # Create dictionary with tokens as keys and their frequency as the value
        token_frequencies = {}
        for token in doc:
            if token.text not in stop_words:
                if token.text not in token_frequencies.keys():
                    token_frequencies[token.text] = 1
                else:
                    token_frequencies[token.text] += 1
        
        # Divide each token by the maximum frequency to get the waited frequency of each token
        for token in token_frequencies:
            token_frequencies[token] = token_frequencies[token] / max(token_frequencies.values())

        # Create dictionary with sentence as key and scentence score as value
        scentence_list = [s for s in doc.sents]
        scentence_score = {}

        # sentence score is equal to sum of token scores for each token in the scentence
        for sentence in scentence_list:
            for token in sentence:
                if token.text.lower() in token_frequencies.keys():
                    if len(sentence.text.split(' ')) < 30:
                        if sentence not in scentence_score.keys():
                            scentence_score[sentence] = token_frequencies[token.text.lower()]
                        else:
                            scentence_score[sentence] += token_frequencies[token.text.lower()]

        # Summurize documents to n-sentences
        n = math.floor(kwargs.get('ratio') * len(scentence_list))

        summurized_sentences = nlargest(n, scentence_score, key=scentence_score.get)
        summary = [s for s in scentence_score.keys() if s in summurized_sentences]
        new_text = ' '.join([s.text for s in summary])
        
        print(len(self.content), '-', len(new_text),'=',f'{len(self.content) - len(new_text)} words saved')

        return new_text
    
    # a float within the range [-1.0, 1.0].
    def get_polarity(self):
        polarity_num = nlp(self.content)._.polarity
        polarity_txt = ''
        
        if polarity_num >= 0.6:
            polarity_txt = 'Very Positive'
        elif polarity_num >= 0.2:
            polarity_txt = 'Positive'
        elif polarity_num >= -0.2:
            polarity_txt = 'Neutral'
        elif polarity_num >= -0.6:
            polarity_txt = 'Negative'
        elif polarity_num < -0.6:
            polarity_txt = 'Very Negative'

        return (polarity_txt, polarity_num)
    # a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective. 
    def get_subjectivity(self):
        return nlp(self.content)._.subjectivity        


In [9]:
bachelor_thesis = Document('./../Linss-Lukas-ba.docx')
paragraphs = bachelor_thesis.paragraphs
paragraph_dict = {}
for i, p in enumerate(paragraphs):
    if len(p.text) > 100:
        paragraph_dict[(paragraphs[i - 1].text, int(paragraphs[i - 1].style.name[-1]))] = p.text

In [17]:
paragraph_objs = list()
for i, p in enumerate(paragraphs):
    if len(p.text) > 100:
        paragraph_objs.append(Paragraph(p.text, paragraphs[i - 1].text, int(paragraphs[i - 1].style.name[-1])))

2012 - 793 = 1219 words saved
2709 - 917 = 1792 words saved
1719 - 789 = 930 words saved
767 - 303 = 464 words saved
3801 - 786 = 3015 words saved
4893 - 1592 = 3301 words saved
829 - 315 = 514 words saved
2930 - 1177 = 1753 words saved
7969 - 3855 = 4114 words saved
3340 - 1565 = 1775 words saved
994 - 327 = 667 words saved
3778 - 1723 = 2055 words saved
5181 - 2568 = 2613 words saved


Your max_length is set to 142, but you input_length is only 127. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)


731 - 227 = 504 words saved
1744 - 781 = 963 words saved
8290 - 3787 = 4503 words saved
4007 - 1754 = 2253 words saved


Your max_length is set to 142, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


484 - 168 = 316 words saved
5009 - 2265 = 2744 words saved
3466 - 1403 = 2063 words saved
853 - 311 = 542 words saved
2491 - 1136 = 1355 words saved
4466 - 2186 = 2280 words saved
4256 - 1820 = 2436 words saved


Your max_length is set to 142, but you input_length is only 97. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


534 - 260 = 274 words saved
2534 - 1011 = 1523 words saved


Your max_length is set to 142, but you input_length is only 97. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


5460 - 2556 = 2904 words saved
506 - 254 = 252 words saved


Your max_length is set to 142, but you input_length is only 101. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


523 - 274 = 249 words saved


Your max_length is set to 142, but you input_length is only 127. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)


624 - 223 = 401 words saved
967 - 388 = 579 words saved
1261 - 483 = 778 words saved
987 - 453 = 534 words saved


Your max_length is set to 142, but you input_length is only 141. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=70)


714 - 286 = 428 words saved


Your max_length is set to 142, but you input_length is only 121. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


675 - 304 = 371 words saved


Your max_length is set to 142, but you input_length is only 125. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)


655 - 308 = 347 words saved


Your max_length is set to 142, but you input_length is only 111. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


603 - 295 = 308 words saved
1901 - 1078 = 823 words saved


Your max_length is set to 142, but you input_length is only 70. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


386 - 194 = 192 words saved
1569 - 627 = 942 words saved
1076 - 510 = 566 words saved
1321 - 634 = 687 words saved
1882 - 757 = 1125 words saved
5261 - 2666 = 2595 words saved
2449 - 1098 = 1351 words saved
1026 - 431 = 595 words saved
1324 - 625 = 699 words saved
1743 - 1009 = 734 words saved


Your max_length is set to 142, but you input_length is only 113. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


560 - 224 = 336 words saved
934 - 504 = 430 words saved
2082 - 948 = 1134 words saved
1193 - 547 = 646 words saved
4253 - 1921 = 2332 words saved
3053 - 1250 = 1803 words saved
999 - 385 = 614 words saved


In [365]:
#paragraph_dict.keys()
paragraph_objs[1].summury_abs
#paragraph_objs[0].summurize(ratio=0.5)
#print(paragraph_dict)

" The term “metaverse” has reached mainstream audiences and has had a polarising effect on consumers and businesses alike . While consumers seem to be confused by the concept, others interpret current metaverse developments as a mere fad at best and see it as the newest marketing buzzword . While looking at various use cases, this thesis will mainly explore the metaverses' potential as a platform for creating immersive shopping and advertising experiences . This thesis will analyse the current state of this field of study and show how metaverse shopping experiences can be implemented ."

In [329]:
summurizer(paragraph_objs[0].content, min_length=100, max_length=300)

[{'summary_text': ' With the use of virtual worlds becoming a growing trend, the metaverse is becoming a promising channel for Brands to offer their customers immersive shopping experiences . While many brands are interested in utilising this new channel, creating them comes with several challenges and uncertainties as many companies lack experience within this field . This thesis tries to shed light on the process associated with building a virtual shopping experience by following the process of developing a white-label shopping mall solution . The resulting solution was successfully launched and was able to attract eight customers who leased a store .'}]

In [128]:
def summurizer(**kwargs):
    document = kwargs.get('doc')
    doc = nlp(document)

    # Create dictionary with tokens as keys and their frequency as the value
    token_frequencies = {}
    for token in doc:
        if token.text not in stop_words:
            if token.text not in token_frequencies.keys():
                token_frequencies[token.text] = 1
            else:
                token_frequencies[token.text] += 1
    
    # Divide each token by the maximum frequency to get the waited frequency of each token
    for token in token_frequencies:
        token_frequencies[token] = token_frequencies[token] / max(token_frequencies.values())

    # Create dictionary with sentence as key and scentence score as value
    scentence_list = [s for s in doc.sents]
    scentence_score = {}

    # sentence score is equal to sum of token scores for each token in the scentence
    for sentence in scentence_list:
        for token in sentence:
            if token.text.lower() in token_frequencies.keys():
                if len(sentence.text.split(' ')) < 30:
                    if sentence not in scentence_score.keys():
                        scentence_score[sentence] = token_frequencies[token.text.lower()]
                    else:
                        scentence_score[sentence] += token_frequencies[token.text.lower()]

    # Summurize documents to n-sentences
    n = math.floor(kwargs.get('ratio') * len(scentence_list))

    summurized_sentences = nlargest(n, scentence_score, key=scentence_score.get)
    summary = [s for s in scentence_score.keys() if s in summurized_sentences]
    new_text = ' '.join([s.text for s in summary])
    
    print(len(document), '-', len(new_text),'=',f'{len(document) - len(new_text)} words saved')

    return new_text

In [57]:
def extractKeywords(**kwargs):
    doc = nlp(kwargs.get('doc'))
    tags = ['PROPN', 'ADJ', 'NOUN']
    keywords = {}

    for token in doc:
        if token.text not in stop_words and token.text not in punctuation:
            if token.pos_ in tags:
                if token.text in keywords.keys():
                    keywords[token.text] += 1
                else:
                    keywords[token.text] = 1
        
    return nlargest(kwargs.get('n'), keywords, key=keywords.get)

In [278]:
def sentimentAnalysis(**kwargs):
    doc = nlp(kwargs.get('doc'))
    return doc._.assessments

In [120]:
extractKeywords(doc=list(paragraph_dict.values())[0], n=5)

['virtual', 'metaverse', 'store', 'channel', 'shopping']

In [None]:
summurizer(doc=list(paragraph_dict.values())[0], ratio=0.5)

In [15]:
document = Document()
wc_picture = document.add_picture('./../WordCloud/word_cloud.png', width=Inches(4), height=Inches(4))
pic_paragraph = document.paragraphs[-1] 
pic_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

title = document.add_heading('Shopping Experiences in the Metaverse', 0)
sub_title = document.add_heading('Implementation of a white lable Metaverse Solution', 1)
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
sub_title.alignment = WD_ALIGN_PARAGRAPH.CENTER
document.add_page_break()


for p in paragraph_objs:
    document.add_heading(p.heading, p.heading_style)
    paragraph = document.add_paragraph(p.summary_abs)
    paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    document.add_paragraph(f'Keywords: { p.keywords }')
    document.add_paragraph(f'Polarity: {p.polarity[0]} ({round(p.polarity[1], 2)})')
    document.add_paragraph(f'Subjectivity: {round(p.subjectivity, 2)} / 1.0')

    document.add_page_break()

document.save('ba_summary.docx')

