# DATA-641 Lab 5
Max Calzada

**Topic Modeling on Amazon Reviews**

In this lab, we will work on how to use different methods for topic modeling in order to get insights from Amazon reviews.

In [35]:
import numpy as np
import pandas as pd

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#matplotlib imports are used to plot confusion matrices for the classifiers
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 

#import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import _stop_words
#pre-processing of text
import string
import re

#import classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

#import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn import metrics

#import time function from time module to track the training duration
from time import time

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.decomposition import NMF, PCA, TruncatedSVD, FastICA

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## (a) 
Load the reviews data from the .csv file which is provided on Canvas. Filter the first 20k rows and then remove the null values from the data set. Include those words that appear in less than 80% of the reviews and appear in at least two reviews.

In [7]:
# Load the reviews data from the .csv file which is provided on Canvas.
Reviews = pd.read_csv('Reviews_7.csv')

  Reviews = pd.read_csv('Reviews_7.csv')


In [8]:
# Filter the first 20k rows
Reviews = Reviews[:20000]

# and then remove the null values from the data set
  # Max: I'm removing entire rows if there is one NA value in that row.
Reviews.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Reviews.dropna(inplace=True)


In [None]:
# Include those words that appear in less than 80% of the reviews 
# and appear in at least two reviews.

In [None]:
# Reviews

In [None]:
# !pip install lime

In [9]:
# Classification_Word2Vec.ipynb

def clean_text(str_list, lemmatize = False):
    clean_list = []
    
    for text in str_list:
        # to drop pound sign from hash tags
        text = re.sub(r'#', '', text)
        words = word_tokenize(text)
        clean_words = []
        
        for word in words:            
            # drop words with fewer than 2 characters; drop any punctuation "words"
            if (len(word) > 1) and (re.match(r'^\w+$', word)):

                if lemmatize:
                    lemmatizer = WordNetLemmatizer()
            
                clean_words.append(word)
        clean_text = ' '.join(clean_words)
        clean_list.append(clean_text)
    
    return clean_list

In [10]:
Reviews['clean_Text'] = clean_text(Reviews['Text'])

In [11]:
# https://stackoverflow.com/questions/46786211/counting-the-frequency-of-words-in-a-pandas-data-frame

Reviews['clean_Text'].str.split(expand=True).stack().value_counts()

the          56257
and          43213
to           34356
it           29134
of           27539
             ...  
Morse            1
emerges          1
Tying            1
Guayusa          1
depressor        1
Length: 33179, dtype: int64

In [12]:
# https://stackoverflow.com/questions/64435552/count-the-number-of-rows-that-each-word-appears-in

def vocab_dict(data):
    lines_count = {}
    for line in data:
        for word in set(line.split()):
            old_count = lines_count.get(word, 0)
            lines_count[word] = old_count + 1
    return lines_count

In [13]:
# vocab_dict(Reviews['clean_Text'])

In [14]:
Reviews_dict = vocab_dict(Reviews['clean_Text'])

In [15]:
# Code based off of: https://learnpython.com/blog/filter-dictionary-in-python/

def at_least_2(pair):
    key, value = pair
    if value >= 2:
        return True  # keep pair in the filtered dictionary
    else:
        return False  # filter pair out of the dictionary

In [16]:
# Code based off of: https://learnpython.com/blog/filter-dictionary-in-python/
filter_1 = dict(filter(at_least_2, Reviews_dict.items()))
# filter_1

# Code based off of: https://blog.finxter.com/how-to-get-the-key-with-minimum-value-in-a-python-dictionary/#:~:text=To%20find%20the%20key%20with,to%20get%20their%20associated%20values.
print(min(filter_1.values()))

2


In [17]:
rev_len = len(Reviews)
print(rev_len * 0.8, rev_len)

16000.0 20000


In [18]:
# Code based off of: https://learnpython.com/blog/filter-dictionary-in-python/

def less_than_80pc(pair):
    key, value = pair
    if value < 0.8*rev_len:
        return True  # keep pair in the filtered dictionary
    else:
        return False  # filter pair out of the dictionary

In [19]:
filter_dict = dict(filter(less_than_80pc, Reviews_dict.items()))
print(len(filter_dict), max(filter_dict.values()) )

33179 15909


In [46]:
filter_dict

{'this': 10374,
 'smells': 187,
 'looks': 250,
 'stew': 20,
 'it': 11666,
 'have': 7685,
 'is': 12298,
 'Vitality': 3,
 'found': 1659,
 'of': 11776,
 'more': 3546,
 'than': 3355,
 'dog': 1158,
 'and': 15775,
 'My': 2904,
 'better': 2153,
 'like': 5979,
 'them': 4495,
 'all': 3879,
 'food': 1879,
 'to': 13516,
 'Labrador': 3,
 'product': 3637,
 'the': 15909,
 'good': 5091,
 'processed': 100,
 'several': 649,
 'quality': 988,
 'be': 4871,
 'most': 1208,
 'finicky': 48,
 'she': 879,
 'meat': 289,
 'appreciates': 6,
 'The': 5646,
 'canned': 218,
 'products': 839,
 'bought': 1468,
 'or': 4412,
 'sure': 963,
 'was': 6258,
 'sized': 114,
 'vendor': 63,
 'intended': 34,
 'arrived': 547,
 'an': 2527,
 'labeled': 54,
 'actually': 781,
 'Jumbo': 3,
 'were': 2284,
 'if': 3026,
 'error': 32,
 'Salted': 10,
 'as': 5270,
 'represent': 4,
 'peanuts': 125,
 'small': 993,
 'Not': 827,
 'unsalted': 15,
 'Peanuts': 16,
 'Product': 87,
 'recommend': 1472,
 'around': 656,
 'his': 644,
 'confection': 9,
 'fe

## (b) 
Apply truncated SVD using five components. Print the ten strongest words for each of the topics.

In [22]:
# features
tfidf = TfidfVectorizer(max_df = 0.8, min_df = 2, stop_words = "english")
revs_tfidf = tfidf.fit_transform(Reviews['clean_Text'])

In [28]:
svdT = TruncatedSVD(n_components = 5)
svdTFit = svdT.fit_transform(revs_tfidf)

In [29]:
for i, topic in enumerate(svdT.components_):

    first_topic_svd = svdT.components_[i]
    top_topic_words_svd = first_topic_svd.argsort()[-10:]

    print("Top 10 words for topic #{}".format(i + 1))
    print([tfidf.get_feature_names_out()[i] for i in top_topic_words_svd])
    print("\n")

Top 10 words for topic #1
['product', 'tea', 'flavor', 'just', 'taste', 'great', 'good', 'like', 'coffee', 'br']


Top 10 words for topic #2
['orange', 'switch', 'ingredients', 'soda', 'dogs', 'juice', 'food', 'treats', 'dog', 'br']


Top 10 words for topic #3
['bitter', 'smooth', 'strong', 'keurig', 'blend', 'roast', 'bold', 'cup', 'br', 'coffee']


Top 10 words for topic #4
['black', 'br', 'iced', 'drink', 'loose', 'earl', 'grey', 'teas', 'green', 'tea']


Top 10 words for topic #5
['newman', 'cat', 'treat', 'loves', 'coffee', 'tea', 'dogs', 'food', 'treats', 'dog']




## (c) 
Apply non-negative matrix factorization using five components. Print the ten strongest words for each of the topics.

In [32]:
# NMF
nmf = NMF(n_components = 5)
nmf_fit = nmf.fit_transform(revs_tfidf)

# strongest words for 10 topics
for i, topic in enumerate(nmf.components_):

    first_topic_nmf = nmf.components_[i]
    top_topic_words_nmf = first_topic_nmf.argsort()[-10:]

    print("Top 10 words for topic #{}".format(i + 1))
    print([tfidf.get_feature_names_out()[i] for i in top_topic_words_nmf])
    print("\n")

Top 10 words for topic #1
['chocolate', 'really', 'love', 'flavor', 'just', 'product', 'taste', 'good', 'great', 'like']


Top 10 words for topic #2
['bitter', 'like', 'keurig', 'roast', 'flavor', 'blend', 'bold', 'strong', 'cup', 'coffee']


Top 10 words for topic #3
['100', 'water', 'orange', 'http', 'switch', 'soda', 'drink', 'sugar', 'juice', 'br']


Top 10 words for topic #4
['bags', 'flavor', 'drink', 'iced', 'earl', 'loose', 'grey', 'teas', 'green', 'tea']


Top 10 words for topic #5
['old', 'love', 'cat', 'eat', 'treat', 'loves', 'dogs', 'food', 'treats', 'dog']




## (d) 
Apply Latent Dirichlet Allocation (LDA) using five topics. Print the ten strongest words for each of the topics.


In [43]:
# Code based off of: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data = Reviews.clean_Text.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1][0][:30])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['have', 'bought', 'several', 'of', 'the', 'vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', 'the', 'product', 'looks', 'more', 'like', 'stew', 'than', 'processed', 'meat', 'and']


In [44]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [45]:
# Code based off of: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

from pprint import pprint
import gensim
# number of topics
num_topics = 5
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.042*"the" + 0.035*"and" + 0.025*"is" + 0.024*"to" + 0.024*"it" + '
  '0.021*"this" + 0.015*"of" + 0.012*"br" + 0.012*"have" + 0.012*"with"'),
 (1,
  '0.051*"the" + 0.028*"and" + 0.026*"br" + 0.025*"to" + 0.020*"of" + '
  '0.015*"it" + 0.014*"in" + 0.014*"they" + 0.013*"for" + 0.012*"my"'),
 (2,
  '0.031*"it" + 0.030*"coffee" + 0.029*"this" + 0.027*"is" + 0.027*"the" + '
  '0.022*"and" + 0.021*"tea" + 0.019*"to" + 0.014*"of" + 0.014*"not"'),
 (3,
  '0.042*"the" + 0.038*"it" + 0.034*"and" + 0.024*"to" + 0.023*"is" + '
  '0.023*"of" + 0.018*"this" + 0.015*"in" + 0.014*"for" + 0.011*"that"'),
 (4,
  '0.043*"the" + 0.027*"and" + 0.024*"to" + 0.022*"it" + 0.020*"for" + '
  '0.019*"this" + 0.016*"of" + 0.013*"is" + 0.013*"you" + 0.012*"my"')]
