In [1]:
import spacy
import pandas
import statistics
from spacy.language import Language
from spacy.symbols import nsubj, acomp
from collections import Counter
from heapq import nlargest
# Shared spaCy pipeline used by every cell below (custom sentence splitter,
# stop-word overrides and the summarizers all operate on this one object).
nlp = spacy.load('en_core_web_sm')

In [2]:
# Review table; the summarizers below read its 'name', 'doRecommend' and
# 'text' columns.
rd = pandas.read_csv('preprossecedData.csv')
# Distinct values of the 'name' column; the demo cells index this positionally.
products = rd['name'].unique()

In [3]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Force a sentence break after every listed punctuation token.

    For each token except the last one, if its text is one of
    , . ! ? : - the following token is flagged as a sentence start.

    Args:
        doc: The spaCy Doc being processed by the pipeline.

    Returns:
        The same Doc, with `is_sent_start` set on the affected tokens.
    """
    boundary_marks = {",", ".", "!", "?", ":", "-"}
    # The last token is skipped because there is no following token to flag.
    for token in doc[:-1]:
        if token.text in boundary_marks:
            doc[token.i + 1].is_sent_start = True
    return doc

# add_pipe raises if a component with this name is already in the pipeline,
# so only register it once; this keeps the cell safe to re-run.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

The aim of this notebook is to improve the summarization functions from summarization.ipynb by changing the sentence weighting to favor descriptive sentences that contain noun-adjective pairs. The steps below are based on the explanations at https://achyutjoshi.github.io/aspect_extraction/aspectextraction

In [4]:
def hasPair(sent):
    """Tell whether a sentence holds a subject / adjectival-complement pair.

    A pair is present when some token has the `nsubj` dependency and at
    least one child of that token's head has the `acomp` dependency.

    Args:
        sent: Iterable of spaCy tokens (a sentence span).

    Returns:
        True if such a pair is found, otherwise False.
    """
    return any(
        token.dep == nsubj
        and any(child.dep == acomp for child in token.head.children)
        for token in sent
    )

In [5]:
# Clear the stop-word flag on these four words in the shared vocabulary.
for kept_word in ("it", "this", "they", "these"):
    nlp.vocab[kept_word].is_stop = False

In [6]:
#Positive summary
def summarizeP(product):
    """Build a short extractive summary from the recommending reviews of a product.

    Rows of the module-level `rd` table whose `name` equals `product` and
    whose `doRecommend` is True are concatenated and parsed with the shared
    `nlp` pipeline. Each sentence is scored as the sum of the normalised
    frequencies of its noun/adjective lemmas, divided by the sentence's
    character length; sentences for which `hasPair` is true gain 1.0 on top.

    Args:
        product: Value of the `name` column identifying the product.

    Returns:
        The (up to) three highest-scoring sentences joined by single spaces.
        An empty string is returned when the product has no recommending
        reviews or the reviews yield no sentences.
    """
    # Element-wise comparison on the column is intentional (`is True` would not filter).
    df = rd[(rd['name'] == product) & (rd['doRecommend'] == True)]
    if df.empty:
        # Nothing to summarise; previously this path ended in an IndexError.
        return ''
    reviewText = df['text'].str.cat(sep='. ')
    # Put a space after periods/commas and drop the doubled separators this creates.
    reviewText = reviewText.replace(".. ", ". ")
    reviewText = reviewText.replace(".", ". ")
    reviewText = reviewText.replace(",", ", ")
    reviewText = reviewText.replace("  ", " ")
    reviewDoc = nlp(reviewText)
    pos_tag = ('NOUN', 'ADJ')
    words = [
        token.lemma_
        for token in reviewDoc
        if not token.is_stop and not token.is_punct and token.pos_ in pos_tag
    ]
    counts = Counter(words)
    # default=0 keeps this safe when no token qualified; the comprehension
    # below is then empty, so no division by zero happens.
    max_freq = max(counts.values(), default=0)
    freq_word = {lemma: count / max_freq for lemma, count in counts.items()}
    sent_strength = {}
    for sent in reviewDoc.sents:
        length = len(sent.text)
        score = 0
        for word in sent:
            score += freq_word.get(word.lemma_, 0)
        if hasPair(sent):
            # Adding the length before dividing by it yields a flat +1.0 bonus.
            score += length
        sent_strength[sent] = score / length
    important_sents = nlargest(3, sent_strength, key=sent_strength.get)
    return ' '.join(sent.text for sent in important_sents)

In [7]:
#Negative summary
def summarizeN(product):
    """Build a short extractive summary from the non-recommending reviews of a product.

    Rows of the module-level `rd` table whose `name` equals `product` and
    whose `doRecommend` is False are concatenated and parsed with the shared
    `nlp` pipeline. Each sentence is scored as the sum of the normalised
    frequencies of its noun/adjective lemmas, divided by the sentence's
    character length; sentences for which `hasPair` is true gain 1.0 on top.

    Args:
        product: Value of the `name` column identifying the product.

    Returns:
        The (up to) three highest-scoring sentences joined by single spaces.
        An empty string is returned when the product has no non-recommending
        reviews or the reviews yield no sentences.
    """
    # Element-wise comparison on the column is intentional (`is False` would not filter).
    df = rd[(rd['name'] == product) & (rd['doRecommend'] == False)]
    if df.empty:
        # Nothing to summarise; previously this path ended in an IndexError.
        return ''
    reviewText = df['text'].str.cat(sep='. ')
    # Put a space after periods/commas and drop the doubled separators this creates.
    reviewText = reviewText.replace(".. ", ". ")
    reviewText = reviewText.replace(".", ". ")
    reviewText = reviewText.replace(",", ", ")
    reviewText = reviewText.replace("  ", " ")
    reviewDoc = nlp(reviewText)
    pos_tag = ('NOUN', 'ADJ')
    words = [
        token.lemma_
        for token in reviewDoc
        if not token.is_stop and not token.is_punct and token.pos_ in pos_tag
    ]
    counts = Counter(words)
    # default=0 keeps this safe when no token qualified; the comprehension
    # below is then empty, so no division by zero happens.
    max_freq = max(counts.values(), default=0)
    freq_word = {lemma: count / max_freq for lemma, count in counts.items()}
    sent_strength = {}
    for sent in reviewDoc.sents:
        length = len(sent.text)
        score = 0
        for word in sent:
            score += freq_word.get(word.lemma_, 0)
        if hasPair(sent):
            # Adding the length before dividing by it yields a flat +1.0 bonus.
            score += length
        sent_strength[sent] = score / length
    important_sents = nlargest(3, sent_strength, key=sent_strength.get)
    return ' '.join(sent.text for sent in important_sents)

In [8]:
# Header followed by one blank line, then the positive summary for product 0.
print("PSummarizing", products[0], end="\n\n")
print(summarizeP(products[0]))

PSummarizing AmazonBasics AAA Performance Alkaline Batteries (36 Count)

Batteries are great. Batteries are good, The battery is great,


In [9]:
# Header followed by one blank line, then the negative summary for product 0.
print("NSummarizing", products[0], end="\n\n")
print(summarizeN(products[0]))

NSummarizing AmazonBasics AAA Performance Alkaline Batteries (36 Count)

Batteries are great, Batteries are fine. batteries aren't great,


In [10]:
# Header followed by one blank line, then the positive summary for product 1.
print("PSummarizing", products[1], end="\n\n")
print(summarizeP(products[1]))

PSummarizing AmazonBasics AA Performance Alkaline Batteries (48 Count) - Packaging May Vary

batteries are great. Batteries are good, The batteries are great!


In [11]:
# Header followed by one blank line, then the negative summary for product 1.
print("NSummarizing", products[1], end="\n\n")
print(summarizeN(products[1]))

NSummarizing AmazonBasics AA Performance Alkaline Batteries (48 Count) - Packaging May Vary

Batteries are fine. batteries aren't great, Price per battery is great.


In [12]:
# Header followed by one blank line, then the positive summary for product 2.
print("PSummarizing", products[2], end="\n\n")
print(summarizeP(products[2]))

PSummarizing AmazonBasics Backpack for Laptops up to 17-inches

ouch for water container is small. The equipment was heavy and prone to mildew if you weren't careful, Lots of pockets,


In [13]:
# Header followed by one blank line, then the negative summary for product 2.
print("NSummarizing", products[2], end="\n\n")
print(summarizeN(products[2]))

NSummarizing AmazonBasics Backpack for Laptops up to 17-inches

The padding for my back and the laptop pocket is nice and thick. The backpack is a bit much for everyday use (just in bulk and number of pockets, but it's great.


In [14]:
# Header followed by one blank line, then the positive summary for product 3.
print("PSummarizing", products[3], end="\n\n")
print(summarizeP(products[3]))

PSummarizing AmazonBasics 15.6-Inch Laptop and Tablet Bag

he price was so low that I expected Everything was great type of bag but this is perfect.


In [15]:
# Header followed by one blank line, then the negative summary for product 3.
print("NSummarizing", products[3], end="\n\n")
print(summarizeN(products[3]))

NSummarizing AmazonBasics 15.6-Inch Laptop and Tablet Bag

I am a Verizon Technician and this laptop case is perfect for what I do! Does it's job fine, Inside the laptop slips in the middle then your are able to put a ipad and maybe a diary either side.
