In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF,  LatentDirichletAllocation
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from nltk import sent_tokenize
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np

import string
import spacy
import gzip
import simplejson as json
import nltk
import en_core_web_sm
nlp = en_core_web_sm.load()

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import pos_tag
import csv

In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')


class CleanTextTransformer(TransformerMixin):
   
    def transform(self, X, **transform_params):
        #return [cleanText(text) for text in X]
        return [text for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
    

def tokenizeText(sample):
    "This function tokenizes text and does other preprocessing steps like Lemmatization and Stemming."

    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    stop_words = set(stopwords.words('english')) 
    lemmatizer = WordNetLemmatizer()
    
    #tokenize
    tokens = tokenizer.tokenize(sample)
    # lemmatize
    lemmas = []
    for word in tokens:
        if word.isalnum() and not word in stop_words:
            word = word.lower()
            word = lemmatizer.lemmatize(word, pos = 'v')
            lemmas.append(word)
    tokens = lemmas
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens


def return_topics(vectorizer, clf, W, df, n_top_words, n_top_documents):
    topics, reviews = [], []
    features = vectorizer.get_feature_names()
    sentiment_analyser = SentimentIntensityAnalyzer()

    for topic_id, topic in enumerate(clf.components_):

        # grab the list of words describing the topic
        topic_word_list = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            topic_word_list.append(features[i])

        # split words in case there are some bigrams
        split_topic_word_list = []
        for word in topic_word_list:
            for splitted in word.split():
                split_topic_word_list.append(splitted)
        topic_words = list(set(split_topic_word_list))

        # append topic words as a single string
        topics.append(' '.join([word for word in topic_words]))

        # iterate for reviews for each topic
        topic_doc_indices = np.argsort(W[:, topic_id])[::-1][0:n_top_documents]

        for doc_ind in topic_doc_indices:
            review = df['reviewText'].iloc[doc_ind]

            # check if the review contains any of the topic words
            if any(word in review.lower() for word in topic_words):
                # analyse sentiment
                vader = sentiment_analyser.polarity_scores(review)
                # form the review - topic_id and sentiment data structure
                reviews.append(df.iloc[doc_ind].to_dict())
                reviews[-1]['topic'] = topic_id
                reviews[-1]['sentiment'] = vader['compound']

    return topics, reviews


def summarize_reviews(topics, reviews):
    # returns reviews with the following new fields
    #  'summary': sentences from review w/ topic words

    analyser = SentimentIntensityAnalyzer()
    summary_all_review = []
    for ii, review in enumerate(reviews):
        summary = []
        sentences = sent_tokenize(review['reviewText'])
        topic_words = topics[review['topic']].split()

        for sentence in sentences:
            for word in topic_words:
                if word in sentence.lower():
                    summary.append(sentence)
                    break

        reviews[ii]['summary'] = ' '.join([sentence for sentence in summary])
        vader = analyser.polarity_scores(reviews[ii]['summary'])
        reviews[ii]['summary_sentiment'] = vader['compound']
        
        summary_all_review.append(reviews[ii]['summary'])

    return reviews, summary_all_review

def print_topics(test_asin):

    test_df = reviews_df[reviews_df['asin'] == test_asin].dropna()
    n_features, n_top_words, n_topics, n_top_documents = 1000, 3, 13, 3

    vectorizer = TfidfVectorizer(max_features=n_features,
                                 tokenizer=tokenizeText,
                                 stop_words='english',
                                 ngram_range=(1,2))

    clf = NMF(n_components=n_topics, random_state=1, solver='mu', beta_loss='frobenius')
   
    #clf = LatentDirichletAllocation(n_components = 5, max_iter = 5, learning_method ='online',learning_offset = 50.,random_state = 0)

    pipe = Pipeline([('cleanText', CleanTextTransformer()),('vectorizer', vectorizer), ('nmf', clf)])

    pipe.fit(test_df['reviewText'])
    transform = pipe.fit_transform(test_df['reviewText'])
    
    #topic identification
    topics, reviews = return_topics(vectorizer, clf, transform, test_df, n_top_words, n_top_documents)
    # review summarization
    reviews , summary = summarize_reviews(topics, reviews)
    print("Topics:", len(topics))
    
    return topics, reviews

In [3]:
#reviews_df = getDF('Video_Games_5.json.gz')
reviews_df = getDF('Electronics_5.json.gz')
print(reviews_df.head(4))

       reviewerID        asin              reviewerName   helpful  \
0   AO94DHGC771SJ  0528881469                   amazdnu    [0, 0]   
1   AMO214LNFCEI4  0528881469           Amazon Customer  [12, 15]   
2  A3N7T0DY83Y4IG  0528881469             C. A. Freeman  [43, 45]   
3  A1H8PY3QHMQQA0  0528881469  Dave M. Shaw "mack dave"   [9, 10]   

                                          reviewText  overall  \
0  We got this GPS for my husband who is an (OTR)...      5.0   
1  I'm a professional OTR truck driver, and I bou...      1.0   
2  Well, what can I say.  I've had this unit in m...      3.0   
3  Not going to write a long review, even thought...      2.0   

                   summary  unixReviewTime   reviewTime  
0          Gotta have GPS!      1370131200   06 2, 2013  
1        Very Disappointed      1290643200  11 25, 2010  
2           1st impression      1283990400   09 9, 2010  
3  Great grafics, POOR GPS      1290556800  11 24, 2010  


In [4]:
def max_entropy(n):
    return -np.log(1/n)

def unique(sequence):
    '''get unique elements of list and keep the same order'''
    
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

def redundancy(string):
    entropy, string_list = 0, string.split()
    string_set = unique(string_list)
    for word in string_set:
        p = string_list.count(word)/len(string_list)
        entropy -= p*np.log(p)        
    return 1 - entropy/max_entropy(len(string_list))

def lemmatize(string):
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    tokens = parser(string)
    lemmas = []
    for token in tokens:
        lemmas.append(stemmer.stem(token.lemma_.lower().strip()))
        
    return ' '.join(lemma for lemma in lemmas)

def adjective_count(lemmas):
    string = nltk.word_tokenize(lemmas)
    pos_string = nltk.pos_tag(string)
    count = 0
    for i in pos_string:
        if i[1] == 'JJ' or i[1] == 'JJR' or i[1] == 'JJS' or i[1] == 'RB' or i[1] == 'RBR' or i[1] == 'RBS':
            count += 1
    return count 

In [5]:
string = 'speaker system sound quality headphone jack satellite speakers sound card highly recommend thx certified midrange bang for the buck living room home theater altec lansing'
lemmas = tokenizeText(string) 
lemmas = ' '.join(lemma for lemma in lemmas)
print(lemmas)
print(redundancy(lemmas))

topics, reviews = print_topics('B0002SQ2P2')
print(topics)
lemmas = ' '.join(lemma for lemma in topics)
print(lemmas)
print(redundancy(lemmas))

speaker system sound quality headphone jack satellite speakers sound card highly recommend thx certify midrange bang buck live room home theater altec lansing
0.019223019952826492


  'stop_words.' % sorted(inconsistent))


Topics: 13
['bass loud volume', 'z 2300', 'sound great', 'speakers computer', 'cable remote satellite', 'good sound', 'stop work great', 'speaker best bose', 'review beat price', 'play sound card', 'music movies watch', 'use easy quality', 'buy worth excellent']
bass loud volume z 2300 sound great speakers computer cable remote satellite good sound stop work great speaker best bose review beat price play sound card music movies watch use easy quality buy worth excellent
0.03762646458355279


In [6]:
string = 'earbuds sound quality couple months every 6 months sound was good great sound bass volume ears earbud earphones hear inexpensive jack model broke cord'
lemmas = tokenizeText(string) 
lemmas = ' '.join(lemma for lemma in lemmas)
print(lemmas)
print(redundancy(lemmas))



topics, review = print_topics('B0002D03ZW')
print(topics)
lemmas = ' '.join(lemma for lemma in topics)
print(lemmas)
print(redundancy(lemmas))

earbuds sound quality couple months every 6 months sound good great sound bass volume ears earbud earphones hear inexpensive jack model break cord
0.06492466861659307


  'stop_words.' % sorted(inconsistent))


Topics: 13
['music sound headphones', 'studio years home', 'ears head wear', 'sound comfortable great', 'review headphones say', 'response frequency price', 'work wear hours', 'fit large amp', 'bass monitor bite', 'audio technica', 'love far im', 'overall good better', 'hop record edit']
music sound headphones studio years home ears head wear sound comfortable great review headphones say response frequency price work wear hours fit large amp bass monitor bite audio technica love far im overall good better hop record edit
0.030087065120016687


In [7]:
from sklearn.model_selection import GridSearchCV

def grid_search_topic_model(test_asin):
    search_params = {'n_components': [3,4,5,6,7,8,9,10,11,12]}
    test_df = reviews_df[reviews_df['asin'] == test_asin].dropna()
    n_features, n_top_words, n_topics, n_top_documents = 1000, 3, 6, 3
    vectorizer = TfidfVectorizer(max_features=n_features,
                                 tokenizer=tokenizeText,
                                 stop_words='english',
                                 ngram_range=(1,2))

    clf = NMF()
    clf = LatentDirichletAllocation()

    pipe = Pipeline([('cleanText', CleanTextTransformer()),('vectorizer', vectorizer)])

    # pipe.fit(test_df['reviewText'])
    data_vectorized = pipe.fit_transform(test_df['reviewText'])

    model = GridSearchCV(clf, param_grid=search_params)

    model.fit(data_vectorized)
    best_topic_model = model.best_estimator_
    print("Best Model's Params: ", model.best_params_)

In [8]:
grid_search_topic_model('B0002SQ2P2')

  'stop_words.' % sorted(inconsistent))


Best Model's Params:  {'n_components': 3}


In [9]:
def print_topics_lda(test_asin):

    test_df = reviews_df[reviews_df['asin'] == test_asin].dropna()
    n_features, n_top_words, n_topics, n_top_documents = 1000, 3, 13, 3

    vectorizer = TfidfVectorizer(max_features=n_features,
                                 tokenizer=tokenizeText,
                                 stop_words='english',
                                 ngram_range=(1,2))

    clf = NMF(n_components=n_topics, random_state=1, solver='mu', beta_loss='frobenius')
   
    clf = LatentDirichletAllocation(n_components = 3, max_iter = 5, learning_method ='online',learning_offset = 50.,random_state = 0)

    pipe = Pipeline([('cleanText', CleanTextTransformer()),('vectorizer', vectorizer), ('nmf', clf)])

    pipe.fit(test_df['reviewText'])
    transform = pipe.fit_transform(test_df['reviewText'])
    
    #topic identification
    topics, reviews = return_topics(vectorizer, clf, transform, test_df, n_top_words, n_top_documents)
    # review summarization
    summary = summarize_reviews(topics, reviews)
    #print("Summary :\n", summary)
    print("Topics:")
    
    return topics, reviews

In [10]:
string = 'speaker system sound quality headphone jack satellite speakers sound card highly recommend thx certified midrange bang for the buck living room home theater altec lansing'
lemmas = tokenizeText(string) 
lemmas = ' '.join(lemma for lemma in lemmas)
print(lemmas)
print(redundancy(lemmas))



topics, review = print_topics_lda('B0002SQ2P2')
print(topics)
lemmas = ' '.join(lemma for lemma in topics)
print(lemmas)
print(redundancy(lemmas))

speaker system sound quality headphone jack satellite speakers sound card highly recommend thx certify midrange bang buck live room home theater altec lansing
0.019223019952826492


  'stop_words.' % sorted(inconsistent))


Topics:
['speakers like better', 'speakers sound great', 'speakers sound bass']
speakers like better speakers sound great speakers sound bass
0.23676997261905086


In [11]:
test_amazon_asins = ['B0002SQ2P2', 'B0002KVQBA', 'B00029MTMQ','B00020S7XK','B00063E2HS','B0002D03ZW','B0002WPSBC','B00006JN3G','B0002CZHN6','B00004T8R2','B00004ZCJJ',
'B00007M1TZ','B0002UPGOI','B000204SWE','B0002EMY9Y','B00006IAKJ','B000629GES','B00017LSPI','B0002UM0JW',
'B0000C3GWU','B0001NNLHK','B0000BYDKO','B00008MOPJ','B00066HP7Y','B0000AQIFW','B00066EK2W','B00005NIMJ',
'B00009WQS1','B0000DJEK7','B00028D778','B00030CHRQ','B0002IOIMQ','B0001EMA80','B00006JILE','B0002Y5WXO',
'B00062UW5A','B00007GQLU','B00004TS16','B00005QXWI','B00018MSNI']

test_elecronics5_asins = reviews_df['asin']
print(list(test_elecronics5_asins[:5]))

def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))

print(intersection(test_amazon_asins, test_elecronics5_asins)) 

valid_asins = intersection(test_amazon_asins, test_elecronics5_asins)

print(valid_asins)

['0528881469', '0528881469', '0528881469', '0528881469', '0528881469']
['B00066EK2W', 'B0002D03ZW', 'B00004TS16', 'B0001NNLHK', 'B0002IOIMQ', 'B0000DJEK7', 'B0002UPGOI', 'B00028D778', 'B00005QXWI', 'B0002CZHN6', 'B0000AQIFW', 'B0002Y5WXO', 'B00005NIMJ', 'B00007GQLU', 'B00062UW5A', 'B00017LSPI', 'B00066HP7Y', 'B000629GES', 'B00009WQS1', 'B0002KVQBA', 'B00030CHRQ', 'B00020S7XK', 'B0000BYDKO', 'B00007M1TZ', 'B00018MSNI', 'B0002UM0JW', 'B0002SQ2P2', 'B00004T8R2', 'B0000C3GWU', 'B000204SWE', 'B00006JN3G', 'B0002WPSBC']
['B00066EK2W', 'B0002D03ZW', 'B00004TS16', 'B0001NNLHK', 'B0002IOIMQ', 'B0000DJEK7', 'B0002UPGOI', 'B00028D778', 'B00005QXWI', 'B0002CZHN6', 'B0000AQIFW', 'B0002Y5WXO', 'B00005NIMJ', 'B00007GQLU', 'B00062UW5A', 'B00017LSPI', 'B00066HP7Y', 'B000629GES', 'B00009WQS1', 'B0002KVQBA', 'B00030CHRQ', 'B00020S7XK', 'B0000BYDKO', 'B00007M1TZ', 'B00018MSNI', 'B0002UM0JW', 'B0002SQ2P2', 'B00004T8R2', 'B0000C3GWU', 'B000204SWE', 'B00006JN3G', 'B0002WPSBC']


In [12]:
red_nmf = 0 
adj_count_lda, adj_count_nmf = [], []

redundancy_arr = ()

for asin in valid_asins:
    topics, review = print_topics(asin)
    print(asin,topics)
    lemmas = ' '.join(lemma for lemma in topics)
    temp_red = redundancy(lemmas)
    red_nmf += temp_red
    redundancy_arr = list(redundancy_arr)
    redundancy_arr.append((asin,temp_red))
    redundancy_arr = tuple(redundancy_arr)
    
    count = 0
    count = adjective_count(lemmas) / len(topics)
    adj_count_nmf.append(count)
    
    
    
    
red_lda = 0 
for asin in valid_asins:
    topics, review = print_topics_lda(asin)
    print(asin, topics)
    lemmas = ' '.join(lemma for lemma in topics)
    red_lda += redundancy(lemmas)
    
    count = 0
    count = adjective_count(lemmas) / len(topics)
    adj_count_lda.append(count)

  'stop_words.' % sorted(inconsistent))


Topics: 13
B00066EK2W ['player sandisk good', 'use easy', 'drive hard', 'memory expandable mp3', 'file crackle wma', 'amaze sweat years', 'include usb mp3', 'button buy small', 'music classical solve volume', 'operation gift basic recommend product', 'point good review product', 'small ear easy v', 'audio track record']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002D03ZW ['music sound headphones', 'studio years home', 'ears head wear', 'sound comfortable great', 'review headphones say', 'response frequency price', 'work wear hours', 'fit large amp', 'bass monitor bite', 'audio technica', 'love far im', 'overall good better', 'hop record edit']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00004TS16 ['shoot shots picture', 'thing like memory card', 'paper machine quality', 'photography camera snorkel', 'manual nice mode', 'think overall camera great', 's100 fancy friends', 'thing usb easy', 'battery backup photoshop', 'bother camera 2mp', 'tft picture deal', 'battery lithium charger', 'reader small pocket goodbye']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0001NNLHK ['sound good sleeves', 'ear bud e3c', 'ears cord headphones', 'ipod listen earphones', 'break wire cover', 'love fit sound supply', 'star price general', 'canal ear send', 'player e3cs purchase', 'tip foam earplugs', 'bass koss 10', 'hear long flight', 'noise sound block']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002IOIMQ ['batteries charge charger', 'display lcd', 'use years work', 'battery life', 'super quick', 'batteries days sony', 'sony cells kill', 'nice charge batteries', 'battery know let charge 4', 'fast charger', 'camera digital', 'batteries faster great', 'outlet plug wall']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0000DJEK7 ['use unit roadmate', 'warranty extend', 'update ask customer', 'battery internal', 'location save device', 'loose fiancee door', 'disappoint view excellent', 'use freeway wife', 'newer sync gps', '2004 speak model', 'destination lexus cities', 'minutes time months 10 5', 'north interface america']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002UPGOI ['speaker use wire', 'need install amp', 'cheap work great quality', 'watts rms', 'ship fast price amp', 'thing watt price great', 'fine diameter work', 'lead grind truck power', 'make clean easy amp', 'plenty wish wire', 'product maybe u', 'wont work say', 'help ice build']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00028D778 ['radar detector', 'cord unit passport', 'time cop drive', 'service year heat', '40 display cost', 'expensive know protection', 'mount suction windshield', 'better years old', 'escort product website', 'relatively false positives car', 'item recommend seller', 'x50 band units', 'far trust totally']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00005QXWI ['turn cd mp3', 'screw plastic work', 'good really sony', 'use prevent trouble', 'player mp3 rangeit far function', '1 expensive 3', 'player qualities purpose', 'listen walk book', '250 100 90', 'use optional disk accessory', 'riovolt provide feature', 'skip intuitive protection quite impressive', 'hope good qualitycons rat']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002CZHN6 ['cable hdmi dvi', 'cable price work', 'laptop computer monitor', 'good quality', 'work great', 'pc tv connect', 'xbox 360', 'perfect work monitor', 'audio carry', 'use ps3 problems', 'review work product', 'video card', 'buy long quality']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0000AQIFW ['player mp3 songs', 'use easy stop product', 'year hours pretty', 'player sound headphones', 'ergonomic better software device', '40 perfect expandable', 'know sonic product blue', 'practical unit version 256meg', 'music gb hours cali', 'useless love send', 'rip cd', 'mp3s computer car hell', 'upgrader rio save']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002Y5WXO ['lens kit', 'light stabilization zoom', 'print mm lense', 'mm tamron canon', 'repair canon problem', 'lens mm', 'shift return amazon', 'heavy picture big', '20d nice eos work great', 'lens money really', 'purpose general', 'price need worth f4l', 'lens good crisp']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00005NIMJ ['use regular mouse', 'trackman marble wheel', 'version wire wireless', 'track ball', 'button easy trackball', 'surface use', '5 years logitech', 'button thumb finger', 'love home game', 'laptop use work great', 'like feel model', 'tunnel carpal great', 'cord cordless device']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00007GQLU ['lens use shoot', 'f fringe l', 'f1 8', 'lens love', 'lens l purchase', 'nice image crisp', 'light low', 'lens portraits great', 'crop sensor', 'lens lenses make', 'sharp best fast', '85mm ef canon', 'lens like recommend']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00062UW5A ['box jewel case', 'paper sleeves', 'need storage look', 'store really easy organize', 'easy assemble', 'fold need store', 'box snap lid', 'box dvd purchase', 'love use apart', 'snap stay', 'handy price product quality', 'box work', 'collection cd']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00017LSPI ['dust blow', 'air compress', 'work great', 'bag camera', 'clean sensor', 'air rocket blaster', 'blower nice quality', 'lens dust remove', 'use remove dust', 'air blast', 'buy good product', 'large work size', 'like look']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00066HP7Y ['buy pc car', 'thing drum trick', 'thing ipod brain say', 'sound great', 'complaints brain electronic', 'sure shield bite static work', 'cable audio headphones', 'cable audio quality', 'quality remote great', 'sound distortion good', 'cable sound defective', 'suppose computer tv flat', 'quite far bite static']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B000629GES ['cancel noise', 'sony headphones great', 'good bose', 'ears uncomfortable flight', 'advertise work fragile', 'return good pair', 'reduce buy product', 'noise cancellation sound', 'think dollar loud', 'pad ear', 'recommend hiss background headphones', 'ones power flight', 'order sony radio']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00009WQS1 ['drive hard', 'support version work', 'external usb drive', 'use ghost enclosure', 'right box', 'hdd computer dell', 'laptop upgrade ez', 'way install connect', 'ssd minutes drive', 'like champ', 'latitude time dell', 'recovery space partition', 'recommend pc highly products']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002KVQBA ['bass sound good', '10 psw', 'better sub want', 'sound great', 'subwoofer input rca', 'live room', 'theater home', 'price product quality', 'port noise', 'speakers bookshelf polk', 'music movies want', 'nice unit power', 'buy woofer surround']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00030CHRQ ['cord ipod headphones', 'buy sound earbuds', 'shure headphones quality', 'coat wire expose', 'bass buy sony', 'long extension size', 'use price', 'cable ear leave', 'house comfortably ear', 'ears 5 completely', 'bag music hear', 'run say great', 'player mp3']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00020S7XK ['fm station radio', 'price little great', 'fine work', 'batteries aa use', 'sound good quality', 'mono headphones stereo', 'good radio little', 'control tune volume', 'baseball listen game', 'battery life', 'sony radio pocket', 'buy love clear', 'emergency case power']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0000BYDKO ['cord reel', 'cable weight reel', 'hole end plug', 'tie half fold', 'hose use reel', 'buy product great', 'heavy cord gauge', 'use cord time', 'foot cord 100', 'cord extension 50', 'cord hold extention', 'like really look', 'chord thrower bite']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00007M1TZ ['use headsets headset', 'phone cordless', 'sound quality', 'people hear clearly', 'price work great', 'control volume', 'fit head set', 'wear comfortable', 'free hand', 'buy mm jack', 'work headset', 'use home phone', 'break expect months']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00018MSNI ['like hd650 headphones', 'hd 650', 'sound hear headphones', 'headphone need sound', 'tube hd600 amp', 'ago years', 'hd650 sound grado', 'watch tv', '650s hd', 'twice phone phenomenal', 'sound amplifier heck', 'dac end high', 'm100 wear design']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002UM0JW ['use mouse optical', 'mx1000 surface mx700', 'button forward', 'months close click', 'ergonomics w good', 'mx revolution', 'mac easy set', 'logitech mouse great', 'years mouse time', 'jump time long', 'charge anymore usage', 'office home', 'use original actually']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002SQ2P2 ['bass loud volume', 'z 2300', 'sound great', 'speakers computer', 'cable remote satellite', 'good sound', 'stop work great', 'speaker best bose', 'review beat price', 'play sound card', 'music movies watch', 'use easy quality', 'buy worth excellent']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00004T8R2 ['sound pair headphones', 'good sound', 'price work great', 'cord use long', 'recommend highly', 'fit head phone', 'price low volume', 'buy pair', 'port lightweight xbs', 'use kid want', 'player mp3', 'ear bud', 'light weight']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0000C3GWU ['speakers sound quality', 'control turn volume', 'work great', 'use sound great', 'book mac pro sound', 'save feature second', 'bass recommend highly', 'amaze pay dollars', 'harman kardon', 'happy purchase', 'phone cell', 'speakers sound set', 'player cd']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B000204SWE ['player dvd', 'play divx file', 'unit turn region', 'dvds play', 'philips warranty months', 'price work great', 'button hold stop close', 'dvp 642', 'picture quality', 'use composite component', 'mpeg 4', 'player remote work media', 'play good model']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B00006JN3G ['lens pen', 'bag camera', 'lenses clean', 'dust brush end', 'use easy carry', 'work product great', 'lens clean', 'good job', 'use black time', 'lens dust filter', 'cheap clean tool', 'kit clean camera', 'handy buy friend']


  'stop_words.' % sorted(inconsistent))


Topics: 13
B0002WPSBC ['speakers use speaker', 'z 5500', 'speakers price set', 'sound great', 'day years ago', 'theater home', 'break customer send', 'sound card', 'sound loud quality', 'amaze buy clear', 'promedia klipsch', 'music listen game', 'recommend control highly panel']


  'stop_words.' % sorted(inconsistent))


Topics:
B00066EK2W ['player use mp3', 'battery issue rechargeable radio', 'player use mp3']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002D03ZW ['use sound headphones', 'headphone better record', 'break sound warn']


  'stop_words.' % sorted(inconsistent))


Topics:
B00004TS16 ['battery camera picture', 'recommend realize size', 'fantastically camera produce zoom']


  'stop_words.' % sorted(inconsistent))


Topics:
B0001NNLHK ['listen sound hear', 'shure rubber music', 'shure earphones headphones']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002IOIMQ ['batteries time problem', 'batteries charge charger', 'batteries charge charger']


  'stop_words.' % sorted(inconsistent))


Topics:
B0000DJEK7 ['unit gps roadmate', 'right probably unit car', 'use model great']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002UPGOI ['price work inexpensive solution', 'box safely speaker wire way', 'need cut excellent say']


  'stop_words.' % sorted(inconsistent))


Topics:
B00028D778 ['radar item detector', 'right x50 claim smart', 'unit need years']


  'stop_words.' % sorted(inconsistent))


Topics:
B00005QXWI ['player year button', 'recharge leather read manuals', 'player make produce screw']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002CZHN6 ['cable hdmi dvi', 'cable work great', 'connect work product']


  'stop_words.' % sorted(inconsistent))


Topics:
B0000AQIFW ['iriver color device', 'player good mp3', 'sonic look product']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002Y5WXO ['lens include lense', 'mm shoot need lense', 'lens good canon']


  'stop_words.' % sorted(inconsistent))


Topics:
B00005NIMJ ['trackman use mouse', 'use delivery mouse', 'use mouse ball']


  'stop_words.' % sorted(inconsistent))


Topics:
B00007GQLU ['lens prime price great', 'lens reasonable great', 'lens f great']


  'stop_words.' % sorted(inconsistent))


Topics:
B00062UW5A ['box cd great', 'box snap use', 'use storage case']


  'stop_words.' % sorted(inconsistent))


Topics:
B00017LSPI ['squeeze work blow', 'manual pfff bring', 'dust air use']


  'stop_words.' % sorted(inconsistent))


Topics:
B00066HP7Y ['cable flat screen', 'garbage need piece leave', 'far pc car stream']


  'stop_words.' % sorted(inconsistent))


Topics:
B000629GES ['earphones work noise', 'really headphones bose', 'know good headphones ok']


  'stop_words.' % sorted(inconsistent))


Topics:
B00009WQS1 ['use hd drive', 'use drive hard', 'clone drive hard']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002KVQBA ['subwoofer sub sound', 'repair sub room', 'huummm subwoofer']


  'stop_words.' % sorted(inconsistent))


Topics:
B00030CHRQ ['cord sound sony', 'cord buy use', 'cord ear design']


  'stop_words.' % sorted(inconsistent))


Topics:
B00020S7XK ['whistle work radio', 'today stop radio order', 'good radio great']


  'stop_words.' % sorted(inconsistent))


Topics:
B0000BYDKO ['cord use reel', 'cord use reel', 'thing cord junk']


  'stop_words.' % sorted(inconsistent))


Topics:
B00007M1TZ ['cable hear say', 'use hear clearly', 'use phone headset']


  'stop_words.' % sorted(inconsistent))


Topics:
B00018MSNI ['headphone sound headphones', 'replace close headphones', 'like sound headphones']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002UM0JW ['mouse hours months', 'button use mouse', 'charge good mouse']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002SQ2P2 ['speakers like better', 'speakers sound great', 'speakers sound bass']


  'stop_words.' % sorted(inconsistent))


Topics:
B00004T8R2 ['use sound headphones', 'bargain wear comfortable note', 'sound price headphones']


  'stop_words.' % sorted(inconsistent))


Topics:
B0000C3GWU ['speakers sound price', 'speakers compact', 'speakers imac work']


  'stop_words.' % sorted(inconsistent))


Topics:
B000204SWE ['play europe dvds', 'player play dvd', 'player gift philips']


  'stop_words.' % sorted(inconsistent))


Topics:
B00006JN3G ['lens use clean', 'lenses clean carry', 'lenspen brush make wear']


  'stop_words.' % sorted(inconsistent))


Topics:
B0002WPSBC ['tigerdirect price sound', 'speakers use sound', 'speakers sound great']


In [13]:
print(redundancy_arr)

(('B00066EK2W', 0.044154529384612284), ('B0002D03ZW', 0.030087065120016687), ('B00004TS16', 0.04748769962424759), ('B0001NNLHK', 0.03173139218403054), ('B0002IOIMQ', 0.1007691788830547), ('B0000DJEK7', 0.009702582539327653), ('B0002UPGOI', 0.06647158925456631), ('B00028D778', 1.1102230246251565e-15), ('B00005QXWI', 0.03237118230576741), ('B0002CZHN6', 0.0809371426841029), ('B0000AQIFW', 0.024977625124585345), ('B0002Y5WXO', 0.07158025692411119), ('B00005NIMJ', 0.04390147839933556), ('B00007GQLU', 0.09753396024068128), ('B00062UW5A', 0.08596729049656449), ('B00017LSPI', 0.08443609377704353), ('B00066HP7Y', 0.10075788379386086), ('B000629GES', 0.05188073513690761), ('B00009WQS1', 0.03504485293793502), ('B0002KVQBA', 0.02312489790974337), ('B00030CHRQ', 0.03112844108214441), ('B00020S7XK', 0.045420999965316566), ('B0000BYDKO', 0.11164639178466218), ('B00007M1TZ', 0.04805814864740876), ('B00018MSNI', 0.07798360875751442), ('B0002UM0JW', 0.04703962232270176), ('B0002SQ2P2', 0.03762646458355

In [14]:
asins_count = len(valid_asins)

avd_red_nmf = red_nmf/asins_count
avg_red_lda = red_lda/asins_count

print('Avg reduncdancy NMF:', avd_red_nmf)
print('Avg reduncdancy LDA:', avg_red_lda)


avg_adj_count_nmf = sum(adj_count_nmf) / asins_count
avg_adj_count_lda = sum(adj_count_lda) / asins_count

print('Avg Adjective and Adverb Count NMF:', avg_adj_count_nmf)
print('Avg Adjective and Adverb Count LDA:', avg_adj_count_lda)

Avg reduncdancy NMF: 0.055306708041789096
Avg reduncdancy LDA: 0.14086402562112305
Avg Adjective and Adverb Count NMF: 0.7740384615384617
Avg Adjective and Adverb Count LDA: 0.5937500000000001


In [15]:
scraped_tags_df = pd.read_csv('amazon_scraped_tags.csv',keep_default_na=False)

In [16]:
print(scraped_tags_df.columns)

tags_df = scraped_tags_df.iloc[: , 2 : 20]
tags_df.head(5)

tags_df['CombinedTags'] = tags_df[tags_df.columns[1:]].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1
)
asin_tags_df = tags_df.loc[ : , ['ASIN','CombinedTags'] ]
asin_tags_df= asin_tags_df[asin_tags_df.ASIN.isin(valid_asins)]
asin_tags_df.to_csv('valid_asin_scraped_tags_df.csv')

asin_tags_df.head(5)

Index(['Product', '# of Reviews', 'ASIN', 'Tags', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19'],
      dtype='object')


Unnamed: 0,ASIN,CombinedTags
0,B0002SQ2P2,speaker system sound quality headphone jack sa...
1,B0002KVQBA,polk audio living room sounds great surround s...
3,B00020S7XK,great little pocket radio battery life sound q...
5,B0002D03ZW,earbuds sound quality couple months every 6 mo...
6,B0002WPSBC,home theater surround sound speaker system sou...


In [17]:
valid_asins = list(asin_tags_df['ASIN'])
print(valid_asins)
review_sentences = list(asin_tags_df['CombinedTags'])
redundancy_arr = ()

red_amazon_scraped_tags, adj_count = 0, 0 
for i in range(len(review_sentences)):
    temp_red = redundancy(review_sentences[i])
    red_amazon_scraped_tags += temp_red
    redundancy_arr = list(redundancy_arr)
    redundancy_arr.append((valid_asins[i],temp_red))
    redundancy_arr = tuple(redundancy_arr)
    adj_count += (adjective_count(review_sentences[i])/13)
    
asins_count = len(valid_asins)

avd_red_amazon_scraped_tags = red_amazon_scraped_tags/asins_count

print('Avg reduncdancy NMF:', avd_red_amazon_scraped_tags)

['B0002SQ2P2', 'B0002KVQBA', 'B00020S7XK', 'B0002D03ZW', 'B0002WPSBC', 'B00006JN3G', 'B0002CZHN6', 'B00004T8R2', 'B00007M1TZ', 'B0002UPGOI', 'B000204SWE', 'B000629GES', 'B00017LSPI', 'B0002UM0JW', 'B0000C3GWU', 'B0001NNLHK', 'B0000BYDKO', 'B00066HP7Y', 'B0000AQIFW', 'B00066EK2W', 'B00005NIMJ', 'B00009WQS1', 'B0000DJEK7', 'B00028D778', 'B00030CHRQ', 'B0002IOIMQ', 'B0002Y5WXO', 'B00062UW5A', 'B00007GQLU', 'B00004TS16', 'B00005QXWI', 'B00018MSNI']
Avg reduncdancy NMF: 0.06690723690841899


In [18]:
print(asins_count)

32


In [19]:
print(redundancy_arr)

(('B0002SQ2P2', 0.01722706232293625), ('B0002KVQBA', 0.07407407407407396), ('B00020S7XK', 0.05678533309253431), ('B0002D03ZW', 0.06138624817088656), ('B0002WPSBC', 0.12073246487342737), ('B00006JN3G', 0.11009096747637015), ('B0002CZHN6', 0.044574556644966234), ('B00004T8R2', 0.0837725743386264), ('B00007M1TZ', 0.032730162085132464), ('B0002UPGOI', 0.05000000000000038), ('B000204SWE', 0.014196333273133965), ('B000629GES', 0.15912321167269505), ('B00017LSPI', 0.132598753895238), ('B0002UM0JW', 0.12080961137566348), ('B0000C3GWU', 0.06546032417026526), ('B0001NNLHK', 0.03273016208513235), ('B0000BYDKO', 0.042588999819400786), ('B00066HP7Y', 0.09693609377704371), ('B0000AQIFW', 0.03273016208513235), ('B00066EK2W', 0.04075900941810162), ('B00005NIMJ', 0.029716371096644156), ('B00009WQS1', 0.11381876958331827), ('B0000DJEK7', 6.661338147750939e-16), ('B00028D778', 0.01722706232293625), ('B00030CHRQ', 0.11886548438657696), ('B0002IOIMQ', 0.09263755863796663), ('B0002Y5WXO', 0.0681940619047632

In [20]:
print("Average adjective and Adverb count in Scraped Tags", adj_count/asins_count)

Average adjective and Adverb count in Scraped Tags 0.5697115384615384


In [21]:
#add emojis to topics
import emoji
for asin in valid_asins:
    topics, reviews = print_topics(asin)

    emoji_topics = []
    for topic_id, topic in enumerate(topics):

        # grab the average sentiment
        product_reviews_df = pd.DataFrame(reviews)

        product_reviews_df = product_reviews_df[product_reviews_df['topic'] == topic_id]
        polarity   = product_reviews_df['summary_sentiment'].mean()

        # append emojis to topic name based on range of sentiment
        if polarity <= -0.5:
            emoji_topics.append("😕 - "+topic)
        elif polarity > -0.5 and polarity < 0.5:
            emoji_topics.append("😐 - "+topic)
        else:
            emoji_topics.append("🙂 - "+topic)

    for emoji_topic in emoji_topics:
        print(emoji.emojize(emoji_topic))



  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - bass loud volume
🙂 - z 2300
🙂 - sound great
🙂 - speakers computer
🙂 - cable remote satellite
🙂 - good sound
😐 - stop work great
🙂 - speaker best bose
😐 - review beat price
🙂 - play sound card
🙂 - music movies watch
🙂 - use easy quality
😐 - buy worth excellent


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - bass sound good
😐 - 10 psw
😐 - better sub want
🙂 - sound great
🙂 - subwoofer input rca
🙂 - live room
🙂 - theater home
😐 - price product quality
😐 - port noise
🙂 - speakers bookshelf polk
😐 - music movies want
🙂 - nice unit power
🙂 - buy woofer surround


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - fm station radio
🙂 - price little great
🙂 - fine work
🙂 - batteries aa use
🙂 - sound good quality
😐 - mono headphones stereo
😐 - good radio little
😐 - control tune volume
😐 - baseball listen game
😐 - battery life
😐 - sony radio pocket
🙂 - buy love clear
😐 - emergency case power


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - music sound headphones
🙂 - studio years home
🙂 - ears head wear
🙂 - sound comfortable great
🙂 - review headphones say
🙂 - response frequency price
🙂 - work wear hours
🙂 - fit large amp
😐 - bass monitor bite
🙂 - audio technica
😐 - love far im
🙂 - overall good better
🙂 - hop record edit


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - speakers use speaker
🙂 - z 5500
🙂 - speakers price set
🙂 - sound great
😐 - day years ago
🙂 - theater home
🙂 - break customer send
😐 - sound card
🙂 - sound loud quality
😐 - amaze buy clear
😐 - promedia klipsch
😐 - music listen game
🙂 - recommend control highly panel


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - lens pen
😐 - bag camera
🙂 - lenses clean
🙂 - dust brush end
🙂 - use easy carry
😐 - work product great
🙂 - lens clean
🙂 - good job
😐 - use black time
🙂 - lens dust filter
😐 - cheap clean tool
🙂 - kit clean camera
😐 - handy buy friend


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - cable hdmi dvi
😐 - cable price work
😐 - laptop computer monitor
😐 - good quality
🙂 - work great
😐 - pc tv connect
😐 - xbox 360
🙂 - perfect work monitor
😐 - audio carry
😐 - use ps3 problems
😐 - review work product
😐 - video card
😐 - buy long quality


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - sound pair headphones
😐 - good sound
🙂 - price work great
🙂 - cord use long
😐 - recommend highly
😐 - fit head phone
🙂 - price low volume
😐 - buy pair
😐 - port lightweight xbs
🙂 - use kid want
😐 - player mp3
😐 - ear bud
😐 - light weight


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - use headsets headset
😐 - phone cordless
🙂 - sound quality
😐 - people hear clearly
🙂 - price work great
😐 - control volume
😐 - fit head set
🙂 - wear comfortable
🙂 - free hand
😐 - buy mm jack
🙂 - work headset
🙂 - use home phone
😐 - break expect months


  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - speaker use wire
😐 - need install amp
🙂 - cheap work great quality
🙂 - watts rms
🙂 - ship fast price amp
😐 - thing watt price great
🙂 - fine diameter work
😐 - lead grind truck power
🙂 - make clean easy amp
🙂 - plenty wish wire
😐 - product maybe u
🙂 - wont work say
😐 - help ice build


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - player dvd
😐 - play divx file
🙂 - unit turn region
🙂 - dvds play
😕 - philips warranty months
🙂 - price work great
😕 - button hold stop close
😐 - dvp 642
🙂 - picture quality
😐 - use composite component
😐 - mpeg 4
😐 - player remote work media
🙂 - play good model


  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - cancel noise
🙂 - sony headphones great
😐 - good bose
😐 - ears uncomfortable flight
😐 - advertise work fragile
😐 - return good pair
😐 - reduce buy product
😐 - noise cancellation sound
🙂 - think dollar loud
😐 - pad ear
😐 - recommend hiss background headphones
😐 - ones power flight
😐 - order sony radio


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - dust blow
😐 - air compress
🙂 - work great
😐 - bag camera
🙂 - clean sensor
😐 - air rocket blaster
🙂 - blower nice quality
😐 - lens dust remove
😐 - use remove dust
🙂 - air blast
🙂 - buy good product
🙂 - large work size
🙂 - like look


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - use mouse optical
🙂 - mx1000 surface mx700
😐 - button forward
😐 - months close click
🙂 - ergonomics w good
😐 - mx revolution
🙂 - mac easy set
😐 - logitech mouse great
😐 - years mouse time
😐 - jump time long
😐 - charge anymore usage
😐 - office home
😐 - use original actually


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - speakers sound quality
😐 - control turn volume
🙂 - work great
🙂 - use sound great
🙂 - book mac pro sound
😐 - save feature second
🙂 - bass recommend highly
😐 - amaze pay dollars
😐 - harman kardon
🙂 - happy purchase
😐 - phone cell
🙂 - speakers sound set
🙂 - player cd


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - sound good sleeves
🙂 - ear bud e3c
🙂 - ears cord headphones
😐 - ipod listen earphones
😐 - break wire cover
🙂 - love fit sound supply
😐 - star price general
🙂 - canal ear send
🙂 - player e3cs purchase
😐 - tip foam earplugs
😐 - bass koss 10
🙂 - hear long flight
😐 - noise sound block


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - cord reel
😐 - cable weight reel
🙂 - hole end plug
😐 - tie half fold
😐 - hose use reel
🙂 - buy product great
😐 - heavy cord gauge
😐 - use cord time
😐 - foot cord 100
😐 - cord extension 50
😐 - cord hold extention
😐 - like really look
😐 - chord thrower bite


  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - buy pc car
😐 - thing drum trick
😐 - thing ipod brain say
😐 - sound great
😐 - complaints brain electronic
😐 - sure shield bite static work
😐 - cable audio headphones
😐 - cable audio quality
😐 - quality remote great
😐 - sound distortion good
😐 - cable sound defective
😐 - suppose computer tv flat
😐 - quite far bite static


  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - player mp3 songs
🙂 - use easy stop product
🙂 - year hours pretty
🙂 - player sound headphones
🙂 - ergonomic better software device
🙂 - 40 perfect expandable
😐 - know sonic product blue
😐 - practical unit version 256meg
🙂 - music gb hours cali
🙂 - useless love send
😐 - rip cd
😐 - mp3s computer car hell
😐 - upgrader rio save


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - player sandisk good
🙂 - use easy
😐 - drive hard
🙂 - memory expandable mp3
🙂 - file crackle wma
🙂 - amaze sweat years
🙂 - include usb mp3
🙂 - button buy small
😐 - music classical solve volume
🙂 - operation gift basic recommend product
🙂 - point good review product
🙂 - small ear easy v
🙂 - audio track record


  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - use regular mouse
😐 - trackman marble wheel
🙂 - version wire wireless
😐 - track ball
🙂 - button easy trackball
🙂 - surface use
🙂 - 5 years logitech
🙂 - button thumb finger
🙂 - love home game
🙂 - laptop use work great
🙂 - like feel model
😐 - tunnel carpal great
😐 - cord cordless device


  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - drive hard
🙂 - support version work
🙂 - external usb drive
😐 - use ghost enclosure
😐 - right box
😐 - hdd computer dell
🙂 - laptop upgrade ez
😐 - way install connect
🙂 - ssd minutes drive
🙂 - like champ
😐 - latitude time dell
😐 - recovery space partition
🙂 - recommend pc highly products


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - use unit roadmate
😐 - warranty extend
😐 - update ask customer
😐 - battery internal
😐 - location save device
😐 - loose fiancee door
😐 - disappoint view excellent
😐 - use freeway wife
🙂 - newer sync gps
😐 - 2004 speak model
😐 - destination lexus cities
😐 - minutes time months 10 5
😐 - north interface america


  'stop_words.' % sorted(inconsistent))


Topics: 13
😐 - radar detector
🙂 - cord unit passport
😐 - time cop drive
😕 - service year heat
😐 - 40 display cost
😐 - expensive know protection
😐 - mount suction windshield
🙂 - better years old
😐 - escort product website
🙂 - relatively false positives car
😐 - item recommend seller
🙂 - x50 band units
🙂 - far trust totally


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - cord ipod headphones
🙂 - buy sound earbuds
🙂 - shure headphones quality
😐 - coat wire expose
😐 - bass buy sony
😐 - long extension size
😐 - use price
🙂 - cable ear leave
🙂 - house comfortably ear
😐 - ears 5 completely
😐 - bag music hear
😐 - run say great
😐 - player mp3


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - batteries charge charger
😐 - display lcd
😐 - use years work
😐 - battery life
😐 - super quick
😐 - batteries days sony
😐 - sony cells kill
😐 - nice charge batteries
😐 - battery know let charge 4
😐 - fast charger
😐 - camera digital
🙂 - batteries faster great
😐 - outlet plug wall


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - lens kit
😐 - light stabilization zoom
🙂 - print mm lense
😐 - mm tamron canon
😐 - repair canon problem
🙂 - lens mm
🙂 - shift return amazon
🙂 - heavy picture big
🙂 - 20d nice eos work great
🙂 - lens money really
🙂 - purpose general
🙂 - price need worth f4l
🙂 - lens good crisp


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - box jewel case
🙂 - paper sleeves
🙂 - need storage look
🙂 - store really easy organize
🙂 - easy assemble
😐 - fold need store
😐 - box snap lid
🙂 - box dvd purchase
🙂 - love use apart
😐 - snap stay
😐 - handy price product quality
🙂 - box work
🙂 - collection cd


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - lens use shoot
😐 - f fringe l
🙂 - f1 8
🙂 - lens love
🙂 - lens l purchase
🙂 - nice image crisp
😐 - light low
🙂 - lens portraits great
😐 - crop sensor
🙂 - lens lenses make
🙂 - sharp best fast
😐 - 85mm ef canon
🙂 - lens like recommend


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - shoot shots picture
🙂 - thing like memory card
😐 - paper machine quality
😐 - photography camera snorkel
🙂 - manual nice mode
🙂 - think overall camera great
🙂 - s100 fancy friends
🙂 - thing usb easy
😐 - battery backup photoshop
🙂 - bother camera 2mp
😐 - tft picture deal
😐 - battery lithium charger
😐 - reader small pocket goodbye


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - turn cd mp3
😐 - screw plastic work
🙂 - good really sony
😐 - use prevent trouble
😐 - player mp3 rangeit far function
😐 - 1 expensive 3
😐 - player qualities purpose
🙂 - listen walk book
🙂 - 250 100 90
🙂 - use optional disk accessory
😐 - riovolt provide feature
😕 - skip intuitive protection quite impressive
🙂 - hope good qualitycons rat


  'stop_words.' % sorted(inconsistent))


Topics: 13
🙂 - like hd650 headphones
🙂 - hd 650
🙂 - sound hear headphones
🙂 - headphone need sound
😐 - tube hd600 amp
😐 - ago years
😐 - hd650 sound grado
🙂 - watch tv
🙂 - 650s hd
🙂 - twice phone phenomenal
🙂 - sound amplifier heck
🙂 - dac end high
🙂 - m100 wear design


In [22]:
#string = 'focal length wide open full frame image quality depth of field great lens highly recommend autofocus purple fringing chromatic aberration build quality shallow depth background blur'
string = 'sound quality ear canal bass response soudn isolation make sure noise cancelling much better every penny yellow foam worth every sony mdr-ex noise reduction ultimate ears'
lemmas = tokenizeText(string) 
lemmas = ' '.join(lemma for lemma in lemmas)
#print(lemmas)
print(redundancy(lemmas))
print("adjective and adverb count for Amazon scraped tags", adjective_count(lemmas))

topics, review = print_topics('B0001NNLHK')
#print(topics)
lemmas = ' '.join(lemma for lemma in topics)
#print(lemmas)
print(redundancy(lemmas))
print("adjective and adverb count for NMF", adjective_count(lemmas))

topics, review = print_topics_lda('B0001NNLHK')
#print(topics)
lemmas = ' '.join(lemma for lemma in topics)
#print(lemmas)
print(redundancy(lemmas))
print("adjective and adverb count for LDA", adjective_count(lemmas))

0.03445412464587205
adjective and adverb count for Amazon scraped tags 6


  'stop_words.' % sorted(inconsistent))


Topics: 13
0.03173139218403054
adjective and adverb count for NMF 10


  'stop_words.' % sorted(inconsistent))


Topics:
0.07010330595238412
adjective and adverb count for LDA 3
