<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [1]:
import pandas as pd
df = pd.read_csv('./data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [6]:
# clean up brands
df['brand'] = df['brand'].apply(lambda x: x.lower())
df['brand'].value_counts()

amazon          16153
amazonbasics    12179
Name: brand, dtype: int64

In [17]:
df.columns

Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews.date', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs'],
      dtype='object')

In [22]:
# don't need a lot of these columns, keep review.text, primaryCategories and id:
df = df[['id', 'brand', 'primaryCategories', 'reviews.text']]

In [26]:
# add stop words to take out of tokens:
STOP_WORDS = nlp.Defaults.stop_words.union(['batteries','I', 'amazon', 'i', 'Amazon', 'it', "it's", 'it.', 'the', 'this',])

In [27]:
# tokenize using spacy:
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_lg")

In [28]:
tokenizer = Tokenizer(nlp.vocab)

In [29]:
tokens = []

for doc in tokenizer.pipe(df['reviews.text'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())
   
    tokens.append(doc_tokens)
    
df['tokens'] = tokens

In [30]:
df.head()

Unnamed: 0,id,brand,primaryCategories,reviews.text,tokens
0,AVpgNzjwLJeJML43Kpxn,amazonbasics,Health & Beauty,I order 3 of them and one of the item is bad q...,"[order, 3, item, bad, quality., missing, backu..."
1,AVpgNzjwLJeJML43Kpxn,amazonbasics,Health & Beauty,Bulk is always the less expensive way to go fo...,"[bulk, expensive, way, products, like]"
2,AVpgNzjwLJeJML43Kpxn,amazonbasics,Health & Beauty,Well they are not Duracell but for the price i...,"[duracell, price, happy.]"
3,AVpgNzjwLJeJML43Kpxn,amazonbasics,Health & Beauty,Seem to work as well as name brand batteries a...,"[work, brand, better, price]"
4,AVpgNzjwLJeJML43Kpxn,amazonbasics,Health & Beauty,These batteries are very long lasting the pric...,"[long, lasting, price, great.]"


In [40]:
# imports for LDA:
import gensim
import re
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

In [32]:
# A Dictionary Representation of all the words in our corpus
id2word = corpora.Dictionary(df['tokens'])

In [33]:
# Let's remove extreme values from the dataset
id2word.filter_extremes(no_below=5, no_above=0.95)

In [47]:
len(id2word.keys())

5560

In [34]:
# a bag of words(bow) representation of our corpus
# Note: we haven't actually read any text into memory here
# Although abstracted away - tokenization IS happening in the doc_stream f(x)
corpus = [id2word.doc2bow(text) for text in df['tokens']]

In [37]:
# lda model
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   random_state=723812,
                   num_topics = 10,
                   passes=10,
                   workers=8
                  )

In [38]:
# print topics
lda.print_topics()

[(0,
  '0.066*"great" + 0.034*"good" + 0.033*"long" + 0.025*"price" + 0.019*"batteries." + 0.017*"works" + 0.016*"brand" + 0.016*"buy" + 0.014*"product" + 0.010*"price."'),
 (1,
  '0.046*"good" + 0.044*"great" + 0.022*"work" + 0.021*"battery" + 0.018*"tablet" + 0.016*"price." + 0.010*"case" + 0.010*"buy" + 0.009*"best" + 0.009*"long"'),
 (2,
  '0.017*"don\'t" + 0.014*"tablet" + 0.011*"kindle" + 0.008*"great" + 0.008*"like" + 0.008*"time" + 0.008*"need" + 0.007*"use" + 0.007*"buy" + 0.006*"little"'),
 (3,
  '0.036*"price" + 0.027*"good" + 0.019*"great" + 0.013*"tablet" + 0.012*"love" + 0.010*"apps" + 0.010*"battery" + 0.010*"use" + 0.010*"google" + 0.009*"tablet."'),
 (4,
  '0.031*"great" + 0.022*"can\'t" + 0.017*"best" + 0.017*"value" + 0.015*"-" + 0.013*"beat" + 0.011*"good" + 0.009*"tablet" + 0.008*"buy" + 0.008*"use"'),
 (5,
  '0.039*"kindle" + 0.028*"fire" + 0.018*"great" + 0.014*"tablet" + 0.013*"better" + 0.011*"bought" + 0.010*"good" + 0.008*"use" + 0.007*"purchased" + 0.007*"bo

In [42]:
# make it more readable:
words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]
topics = [' '.join(t[0:5]) for t in words]

for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
great good long price batteries.

------ Topic 1 ------
good great work battery tablet

------ Topic 2 ------
don't tablet kindle great like

------ Topic 3 ------
price good great tablet love

------ Topic 4 ------
great can't best value -

------ Topic 5 ------
kindle fire great tablet better

------ Topic 6 ------
tablet year old bought play

------ Topic 7 ------
love loves tablet easy great

------ Topic 8 ------
use i'm easy bought box

------ Topic 9 ------
bought kindle use easy purchased



In [43]:
# visualize--takes a long time to run
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

pyLDAvis.gensim.prepare(lda, corpus, id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [44]:
from gensim.models.coherencemodel import CoherenceModel

def compute_coherence_values(dictionary, corpus, limit, start=2, step=3, passes=5):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Max num of topics
    passes: the number of times the entire lda model & coherence values are calculated

    Returns:
    -------
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    
    coherence_values = []
    
    for iter_ in range(passes):
        for num_topics in range(start, limit, step):
            model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, workers=4)
            coherencemodel = CoherenceModel(model=model,dictionary=dictionary,corpus=corpus, coherence='u_mass')
            coherence_values.append({'pass': iter_, 
                                     'num_topics': num_topics, 
                                     'coherence_score': coherencemodel.get_coherence()
                                    })

    return coherence_values

In [46]:
# Can take a long time to run. can lower passes
import warnings
warnings.filterwarnings('ignore')
coherence_values = compute_coherence_values(dictionary=id2word, 
                                                        corpus=corpus,
                                                        start=2, 
                                                        limit=40, 
                                                        step=2,
                                                        passes=5)

Process ForkPoolWorker-869:
Traceback (most recent call last):
  File "/Users/maggie/opt/anaconda3/envs/U4-S1-NLP/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/maggie/opt/anaconda3/envs/U4-S1-NLP/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/maggie/opt/anaconda3/envs/U4-S1-NLP/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/Users/maggie/opt/anaconda3/envs/U4-S1-NLP/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 289, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/Users/maggie/opt/anaconda3/envs/U4-S1-NLP/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 533, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/Users/maggie/opt/anaconda3/envs/U4-S1-NLP/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 498, in inf

KeyboardInterrupt: 

In [None]:
topic_coherence = pd.DataFrame.from_records(coherence_values)

In [None]:
topic_coherence.head()

In [None]:
import seaborn as sns

ax = sns.lineplot(x="num_topics", y="coherence_score", data=topic_coher

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling