<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select an appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [1]:
#Start Here
#Fit a Gensim LDA topic model on Amazon Reviews

In [8]:
import numpy as np
import gensim
import os
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pandas as pd

In [9]:
# Location of the Amazon reviews CSV, relative to the notebook.
REVIEWS_CSV = "./data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv"
data = pd.read_csv(REVIEWS_CSV)

In [10]:
# Peek at the first two rows to check the columns loaded as expected.
data.head(n=2)

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.didPurchase,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht..."
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht..."


In [11]:
#test preprocessing
# 1) Plain Python - ''.split command
# 2) Spacy - just the lemmas from the document
# 3) Gensim - simple_preprocess

# Extend gensim's stop-word list with a few extra function words.
STOPWORDS = set(STOPWORDS).union(set(['and', 'if', 'for']))

def tokenize(text):
    """Split a review into lowercase tokens with stop words removed.

    Parameters
    ----------
    text : str or bytes
        Raw review text. Any other value (e.g. the float NaN pandas uses for
        a missing cell, or None) is treated as an empty document.

    Returns
    -------
    list of str
        Tokens produced by gensim's ``simple_preprocess`` that are not in
        ``STOPWORDS``; an empty list for non-text input.
    """
    # simple_preprocess raises on non-text values, so missing cells are
    # mapped to an empty document instead of aborting the whole run.
    if not isinstance(text, (str, bytes)):
        return []
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

In [12]:
import os

def gather_data(column_name):
    """Tokenize every entry in one column of the global ``data`` frame.

    Returns a list holding one token list per row, in row order.
    """
    return [tokenize(entry) for entry in data[column_name]]

In [13]:
# One token list per review, built from the review body text.
tokens = gather_data(column_name='reviews.text')


In [22]:
# First ten tokens of the first review, as a sanity check.
tokens[0][:10]

['order',
 'item',
 'bad',
 'quality',
 'missing',
 'backup',
 'spring',
 'pcs',
 'aluminum',
 'battery']

In [24]:
def doc_stream():
    """Yield each tokenized document from the global ``tokens`` list, in order."""
    yield from tokens

In [25]:
# A Dictionary Representation of all the words in our corpus
id2word = corpora.Dictionary(documents=doc_stream())

In [27]:
# Vocabulary size before filtering.
len(id2word)

9621

In [28]:
# Let's remove extreme values from the dataset
# Prune the vocabulary in place using document-frequency thresholds.
id2word.filter_extremes(no_above=0.95, no_below=5)

In [29]:
# Vocabulary size after filtering.
len(id2word)

3581

In [30]:
# Bag-of-words representation of every document, in stream order.
corpus = list(map(id2word.doc2bow, doc_stream()))

In [31]:
# Spot-check one document's bag of words (at most ten entries).
corpus[345][0:10]

[(17, 1), (18, 1), (21, 1), (34, 1)]

In [32]:
# Model hyperparameters kept together so they are easy to find and tune.
lda_params = {
    "num_topics": 15,
    "passes": 10,
    "workers": 4,
    "random_state": 42,
}
lda = LdaMulticore(corpus=corpus, id2word=id2word, **lda_params)

In [33]:
# Display each topic as a weighted combination of its top words.
lda.print_topics()

[(0,
  '0.059*"good" + 0.059*"buy" + 0.056*"best" + 0.046*"amazon" + 0.023*"quality" + 0.018*"like" + 0.018*"product" + 0.017*"price" + 0.015*"prime" + 0.014*"store"'),
 (1,
  '0.116*"price" + 0.105*"great" + 0.051*"good" + 0.040*"product" + 0.019*"nice" + 0.016*"case" + 0.016*"screen" + 0.015*"black" + 0.014*"beat" + 0.014*"love"'),
 (2,
  '0.032*"tablet" + 0.023*"kids" + 0.019*"card" + 0.017*"gb" + 0.015*"like" + 0.015*"purchase" + 0.013*"sd" + 0.012*"storage" + 0.012*"good" + 0.011*"great"'),
 (3,
  '0.051*"batteries" + 0.022*"amazon" + 0.015*"box" + 0.014*"use" + 0.011*"like" + 0.010*"price" + 0.010*"haven" + 0.010*"ok" + 0.008*"ve" + 0.008*"basics"'),
 (4,
  '0.044*"kindle" + 0.021*"charge" + 0.016*"new" + 0.016*"like" + 0.013*"light" + 0.012*"reading" + 0.011*"charging" + 0.010*"screen" + 0.009*"time" + 0.009*"device"'),
 (5,
  '0.142*"great" + 0.068*"tablet" + 0.042*"works" + 0.022*"use" + 0.019*"price" + 0.019*"good" + 0.018*"product" + 0.018*"value" + 0.017*"amazon" + 0.016*"r

In [34]:
# Top-10 words for every topic, in topic-id order. Read directly from the
# model instead of regex-parsing print_topics() strings: print_topics() shows
# at most 20 topics by default, so a model with more topics would silently
# lose some of them here.
words = [
    [word for word, _weight in lda.show_topic(topic_id, topn=10)]
    for topic_id in range(lda.num_topics)
]

In [35]:
# Collapse each topic's word list (capped at ten words) into one string.
topics = [" ".join(topic_words[:10]) for topic_words in words]

In [36]:
# Print a readable summary per topic. The loop variable is named `topic_id`
# rather than `id` so the builtin id() is not shadowed in the kernel namespace.
for topic_id, topic_text in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(topic_text, end="\n\n")

------ Topic 0 ------
good buy best amazon quality like product price prime store

------ Topic 1 ------
price great good product nice case screen black beat love

------ Topic 2 ------
tablet kids card gb like purchase sd storage good great

------ Topic 3 ------
batteries amazon box use like price haven ok ve basics

------ Topic 4 ------
kindle charge new like light reading charging screen time device

------ Topic 5 ------
great tablet works use price good product value amazon recommend

------ Topic 6 ------
amazon tablet screen apps google device play store app use

------ Topic 7 ------
batteries use amazon work battery remote months time brand worked

------ Topic 8 ------
work great grandson deal kids fine good item bought little

------ Topic 9 ------
batteries good long price great battery brand life brands work

------ Topic 10 ------
tablet easy use perfect size kindle love nice good screen

------ Topic 11 ------
love gift bought christmas loved great kindle tablets kids 

In [37]:
# NOTE(review): newer pyLDAvis releases (3.x) appear to have renamed this
# module to `pyLDAvis.gensim_models` — confirm against the installed version.
import pyLDAvis.gensim

# Configure pyLDAvis for display inside the notebook.
pyLDAvis.enable_notebook()

In [38]:
# Build the topic visualization; the bare name on the last line displays it.
lda_vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
lda_vis

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling