#### For reference use the following two blogs:
https://gist.github.com/kevindavenport/d704e37e9f9d175303b1eb083f8e749e#file-topic_modeling_amazon_reviews-ipynb (main)

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/


In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
import pandas as pd
import gzip

In [25]:
from gensim import corpora, models
import gensim

import pyLDAvis

In [26]:
#spacy
import spacy
from spacy import displacy

#load basic en model (without embeddings)
nlp = spacy.load('en_core_web_sm')

In [27]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

### Helper Methods for reading file

In [28]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)
    
def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [29]:
## this is the dataset downloaded from AMZN directly.
mobile_electronics_review = 'amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz'

In [30]:
# skipping bad lines.
df = pd.read_table(mobile_electronics_review, error_bad_lines=False)

# let's see some data
df.head()


b'Skipping line 35246: expected 15 fields, saw 22\n'
b'Skipping line 87073: expected 15 fields, saw 22\n'


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,20422322,R8MEA6IGAHO0B,B00MC4CED8,217304173,BlackVue DR600GW-PMP,Mobile_Electronics,5.0,0.0,0.0,N,Y,Very Happy!,"As advertised. Everything works perfectly, I'm...",2015-08-31
1,US,40835037,R31LOQ8JGLPRLK,B00OQMFG1Q,137313254,GENSSI GSM / GPS Two Way Smart Phone Car Alarm...,Mobile_Electronics,5.0,0.0,1.0,N,Y,five star,it's great,2015-08-31
2,US,51469641,R2Y0MM9YE6OP3P,B00QERR5CY,82850235,iXCC Multi pack Lightning cable,Mobile_Electronics,5.0,0.0,0.0,N,Y,great cables,These work great and fit my life proof case fo...,2015-08-31
3,US,4332923,RRB9C05HDOD4O,B00QUFTPV4,221169481,abcGoodefg® FBI Covert Acoustic Tube Earpiece ...,Mobile_Electronics,4.0,0.0,0.0,N,Y,Work very well but couldn't get used to not he...,Work very well but couldn't get used to not he...,2015-08-31
4,US,44855305,R26I2RI1GFV8QG,B0067XVNTG,563475445,Generic Car Dashboard Video Camera Vehicle Vid...,Mobile_Electronics,2.0,0.0,0.0,N,Y,Cameras has battery issues,"Be careful with these products, I have bought ...",2015-08-31


In [31]:
# sort by product_id to see if there are multiple reviews for a given product, and display few items.
df.sort_values(by=['product_id'])[1:20]

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
1841,US,10384722,R1KY2J6YU46AVG,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,5.0,0.0,0.0,N,Y,Love it,My husband is a long haul trucker I bot him th...,2015-08-04
4135,US,15928644,RF8AVYHJ16UIB,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,1.0,0.0,0.0,N,Y,this could be good. Don't like the map screen,Guess if you are use to gps setups and screens...,2015-06-27
4442,US,876091,R219O8P5O58NFL,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,1.0,0.0,0.0,N,Y,Was very disappointed with my purchase cause t...,Was very disappointed with my purchase cause t...,2015-06-22
3705,US,2215538,R1SMRD0ZFB9JXC,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,5.0,0.0,0.0,N,Y,Great gps a must have !!,On spot !!!!! Takes you right where you need t...,2015-07-04
4395,US,47365122,R1W75TSXGZSHXG,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,5.0,0.0,0.0,N,Y,Five Stars,All good,2015-06-23
2743,US,640640,R1GHGB22CEAYOY,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,1.0,0.0,0.0,N,Y,do not buy!!!!,Absolutely a total waste of money!!! Pos has ...,2015-07-19
944,US,23746886,R1YTJAG0C7P2AZ,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,5.0,0.0,0.0,N,N,this product the item tha demaged and need rep...,this product the item tha demaged and need rep...,2015-08-17
4971,US,21985645,R3URC7RPIHM117,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,3.0,0.0,1.0,N,Y,Three Stars,Maps need updated. Hard to save current update...,2015-06-14
4333,US,13855441,R2K05HKEOOILBC,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,4.0,0.0,0.0,N,Y,Four Stars,Its great,2015-06-23
340,US,52779789,R2IJUP8JD5W160,528007262,513854777,Rand McNally TND 720 LM IntelliRoute Truck GPS...,Mobile_Electronics,2.0,0.0,0.0,N,Y,Sorry I wasted my money,"Not user friendly. Dorky touch screen. Click, ...",2015-08-26


In [32]:
%%time

#let's try for a single product first
df_sample = df[df.product_id == "0528007262"]
num_reviews = df_sample.shape[0]

doc_set = df_sample.review_body #[df_sample.review_body[i] for i in range(num_reviews)]

texts = []

for doc in doc_set:
    # print(doc)
    
    # putting our three steps together
    
    #1. Tokenize
    doc_sp = nlp(doc)
    tokens = [token.text.lower() for token in doc_sp]
    
    #2. remove stop words
    stopped_tokens = [token for token in tokens if not token in spacy_stopwords]
    
    #3. lemmetize
    lemmed_tokens = []
    for stopped_token in stopped_tokens:
        lemmed_nlp = nlp(stopped_token)
        lemmed_token = lemmed_nlp[0].lemma_
        lemmed_tokens.append(lemmed_token)
    
    
    # add tokens to list, let's start with stopped_tokens, lemmitization is messing up.
    texts.append(stopped_tokens)

CPU times: user 4.04 s, sys: 795 ms, total: 4.84 s
Wall time: 1.45 s


In [33]:
# take a look at sample text
texts[1]

['product',
 'item',
 'tha',
 'demaged',
 'need',
 'replacement',
 'product',
 'device',
 'thas',
 'erase',
 'missin',
 'device']

## Transform tokenized documents into an id-term dictionary

In [34]:
# Gensim's Dictionary encapsulates the mapping between normalized words and their integer ids.
texts_dict = corpora.Dictionary(texts)
texts_dict.save('mobile_electronics_review.dict') # lets save to disk for later use
# Examine each token’s unique id
print(texts_dict)

Dictionary(136 unique tokens: [',', '.', 'accidentally', 'click', 'dorky']...)


In [35]:
# see mapping between words and their Ids

import operator
print("IDs 1 through 10: {}".format(sorted(texts_dict.token2id.items(), key=operator.itemgetter(1), reverse = False)[:10]))

IDs 1 through 10: [(',', 0), ('.', 1), ('accidentally', 2), ('click', 3), ('dorky', 4), ('friendly', 5), ('garmin', 6), ('money', 7), ('motorhome', 8), ('packaging', 9)]


In [36]:
## We have a lot of unique tokens, let's see what happens if we ignore tokens that appear in less than 30 documents or more than 15% documents. Granted this is arbitrary but a quick search shows tons of methods for reducing noise.

In [37]:
texts_dict.filter_extremes(no_below=1, no_above=0.15) # inlace filter
print(texts_dict)
print("top terms:")
print(sorted(texts_dict.token2id.items(), key=operator.itemgetter(1), reverse = False)[:10])

Dictionary(119 unique tokens: ['accidentally', 'click', 'dorky', 'friendly', 'garmin']...)
top terms:
[('accidentally', 0), ('click', 1), ('dorky', 2), ('friendly', 3), ('garmin', 4), ('motorhome', 5), ('packaging', 6), ('planned', 7), ('prefer', 8), ('receipt', 9)]


### Creating a bag of words

In [38]:
corpus = [texts_dict.doc2bow(text) for text in texts]
len(corpus)

13

#### Dump into a file

In [39]:
%%time 
# Matrix Market format https://radimrehurek.com/gensim/corpora/mmcorpus.html, why exactly? I don't know
gensim.corpora.MmCorpus.serialize('amzn_mob_review.mm', corpus)

CPU times: user 1.14 ms, sys: 1.4 ms, total: 2.54 ms
Wall time: 1.77 ms


### Train LDA model

In [40]:
%%time 
lda_model = gensim.models.LdaModel(corpus,alpha='auto', num_topics=5,id2word=texts_dict, passes=200)

CPU times: user 726 ms, sys: 4.49 ms, total: 730 ms
Wall time: 734 ms


### Infer topics

In [41]:
lda_model.show_topics(num_topics=10,num_words=5)

[(0,
  '0.061*"click" + 0.042*"updated" + 0.023*"handiest" + 0.023*"gadget" + 0.023*"current"'),
 (1,
  '0.025*"pos" + 0.025*"waste" + 0.025*"quit" + 0.025*"total" + 0.025*"absolutely"'),
 (2,
  '0.047*"value" + 0.047*"excellent" + 0.008*"google" + 0.008*"like" + 0.008*"gps"'),
 (3, '0.038*"\'s" + 0.026*"poi" + 0.026*"find" + 0.026*"easy" + 0.026*"use"'),
 (4,
  '0.031*"updates" + 0.031*"order" + 0.031*"complete" + 0.031*"cause" + 0.031*"freezing"')]

### Visualize topic model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [42]:
pyLDAvis.enable_notebook()
from pyLDAvis import gensim
vis = pyLDAvis.gensim.prepare(lda_model, corpus, texts_dict)
vis