<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select an appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [24]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import spacy

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

import re
import pyLDAvis.gensim

In [8]:
# Load the Amazon consumer-reviews CSV from the local data directory.
df = pd.read_csv(
    './data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv'
)

In [9]:
# Preview the first two reviews, transposed so each column reads as a row.
df.head(2).transpose()

Unnamed: 0,0,1
id,AVpgNzjwLJeJML43Kpxn,AVpgNzjwLJeJML43Kpxn
dateAdded,2015-10-30T08:59:32Z,2015-10-30T08:59:32Z
dateUpdated,2019-04-25T09:08:16Z,2019-04-25T09:08:16Z
name,AmazonBasics AAA Performance Alkaline Batterie...,AmazonBasics AAA Performance Alkaline Batterie...
asins,"B00QWO9P0O,B00LH3DMUO","B00QWO9P0O,B00LH3DMUO"
brand,Amazonbasics,Amazonbasics
categories,"AA,AAA,Health,Electronics,Health & Household,C...","AA,AAA,Health,Electronics,Health & Household,C..."
primaryCategories,Health & Beauty,Health & Beauty
imageURLs,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...
keys,"amazonbasics/hl002619,amazonbasicsaaaperforman...","amazonbasics/hl002619,amazonbasicsaaaperforman..."


### Get Tokens

In [10]:
# Load the large English spaCy pipeline; it is used below to tokenise and
# lemmatise the review text.
nlp = spacy.load(name='en_core_web_lg')

In [12]:
# Stream every review through the spaCy pipeline in batches of 500 and keep
# the lower-cased lemma of each token that is neither a stop word nor
# punctuation. The result is stored per review in a new `tokens` column.
df['tokens'] = [
    [tok.lemma_.lower() for tok in parsed if not (tok.is_stop or tok.is_punct)]
    for parsed in nlp.pipe(df['reviews.text'], batch_size=500)
]

In [13]:
# Build the gensim token <-> id mapping from the tokenised reviews and report
# how many distinct tokens it holds before any filtering.
id2word = Dictionary(documents=df['tokens'])
print(len(id2word))

8902


In [14]:
# Prune the vocabulary in place: drop tokens that appear in fewer than 5
# documents (no_below=5) or in more than 98% of all documents (no_above=0.98),
# then report the remaining vocabulary size.
id2word.filter_extremes(no_below=5, no_above=0.98)
print(len(id2word))

2957


## Create Corpus Object

In [15]:
# Convert each review's token list into a bag-of-words vector of
# (token_id, count) pairs using the filtered dictionary.
corpus = list(map(id2word.doc2bow, df['tokens']))

In [16]:
# Fit a 10-topic LDA model on the bag-of-words corpus (10 passes, 8 workers).
# random_state seeds the model's random initialisation so that a fresh
# Restart & Run All produces comparable topics instead of a new set each time.
# NOTE: gensim documents that multicore training can still vary slightly
# between runs because of worker scheduling.
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   num_topics=10,
                   passes=10,
                   workers=8,
                   random_state=42
                  )

In [17]:
# Show the formatted "weight*word" summary of the fitted topics
# (up to 20 topics, 10 words each -- the library defaults, spelled out).
lda.print_topics(num_topics=20, num_words=10)

[(0,
  '0.062*"love" + 0.038*"old" + 0.033*"year" + 0.027*"buy" + 0.026*"tablet" + 0.025*"game" + 0.023*"kid" + 0.021*"play" + 0.021*"use" + 0.019*"app"'),
 (1,
  '0.052*"tablet" + 0.047*"love" + 0.045*"buy" + 0.034*"gift" + 0.028*"purchase" + 0.026*"great" + 0.025*"christmas" + 0.022*"good" + 0.018*"get" + 0.017*"use"'),
 (2,
  '0.040*"tablet" + 0.038*"fire" + 0.029*"kindle" + 0.024*"amazon" + 0.024*"great" + 0.019*"screen" + 0.015*"use" + 0.014*"book" + 0.014*"love" + 0.013*"read"'),
 (3,
  '0.107*"price" + 0.094*"good" + 0.088*"great" + 0.036*"product" + 0.030*"tablet" + 0.029*"work" + 0.028*"battery" + 0.019*"amazon" + 0.017*"use" + 0.016*"buy"'),
 (4,
  '0.063*"tablet" + 0.043*"great" + 0.039*"kid" + 0.034*"good" + 0.034*"use" + 0.026*"easy" + 0.024*"love" + 0.024*"child" + 0.018*"app" + 0.017*"product"'),
 (5,
  '0.026*"read" + 0.025*"kindle" + 0.025*"easy" + 0.024*"light" + 0.018*"use" + 0.014*"like" + 0.013*"book" + 0.012*"size" + 0.012*"small" + 0.011*"reader"'),
 (6,
  '0.027

In [21]:
# Collect the top-10 words of every topic straight from the model rather than
# regex-parsing the formatted strings from print_topics(), which is limited to
# 20 topics by default and depends on the exact string format.
# show_topics(formatted=False) yields (topic_id, [(word, weight), ...]) pairs;
# num_topics=-1 requests all topics.
words = [
    [word for word, _weight in term_weights]
    for _topic_id, term_weights in lda.show_topics(num_topics=-1,
                                                   num_words=10,
                                                   formatted=False)
]

In [22]:
# Collapse each topic's word list (first ten entries) into one
# space-separated string for display.
topics = [' '.join(topic_words[:10]) for topic_words in words]

In [23]:
# Print each topic's top words under a numbered banner. The loop index is
# named `topic_id` rather than `id` so the built-in id() is not shadowed for
# the rest of the notebook session.
for topic_id, topic_words in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(topic_words)

------ Topic 0 ------
love old year buy tablet game kid play use app
------ Topic 1 ------
tablet love buy gift purchase great christmas good get use
------ Topic 2 ------
tablet fire kindle amazon great screen use book love read
------ Topic 3 ------
price good great product tablet work battery amazon use buy
------ Topic 4 ------
tablet great kid good use easy love child app product
------ Topic 5 ------
read kindle easy light use like book size small reader
------ Topic 6 ------
charge long buy battery amazon time like price kindle use
------ Topic 7 ------
battery work brand long buy amazon good great time use
------ Topic 8 ------
great price good value battery quality long buy love tablet
------ Topic 9 ------
work tablet good kid buy amazon fine device case time


In [25]:
# Enable inline rendering, then build the interactive topic visualisation.
# The prepared object is the cell's last expression so the notebook shows it.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
lda_vis

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling