In [1]:
from gensim import corpora, models, similarities
import pandas as pd

# Load the review data from the JSON file (path is relative to the working directory).
reviews = pd.read_json('Reviews20160219.json')

In [2]:
# (rows, columns) of the loaded review table.
reviews.shape

(27080, 5)

In [3]:
# Preview the first rows to see which columns are available.
reviews.head()

Unnamed: 0,ReviewerExperience,date,name,rating,review
0,382,2012-12-10,Tian Tian Hainanese Chicken Rice,5,I don't even like 海南雞飯 based on past attempts....
1,44,2014-06-18,Tian Tian Hainanese Chicken Rice,5,"Pros: chicken rice is delicious Cons: Hot, no ..."
2,285,2013-06-18,Tian Tian Hainanese Chicken Rice,4,The rice is freakin incredible!!! Wow I was su...
3,112,2013-02-04,Tian Tian Hainanese Chicken Rice,2,DUDE! WHAT HAPPENED TO THIS PLACE?! I kept hea...
4,5,2014-05-31,Tian Tian Hainanese Chicken Rice,4,The Chicken Rice is Singapore's national dish ...


In [4]:
# Remove rows that are duplicated across all columns, then keep only the
# review text; the last line displays how many reviews remain.
rev_text_norm = pd.Series(
    reviews.drop_duplicates()["review"]
)
len(rev_text_norm)

20334

In [5]:
import re, unicodedata
from nltk.corpus import stopwords
# English stop words from NLTK, held in a set for fast membership tests.
stops = set(stopwords.words("english"))

In [6]:
def remove_non_ascii(string):
    """Return ``string`` reduced to plain ASCII text.

    The input is NFKD-normalised first, so a character that decomposes into
    an ASCII base letter plus combining marks keeps its base letter (an "e"
    with an acute accent becomes "e"). Anything that still cannot be encoded
    as ASCII afterwards is dropped.

    Args:
        string: Unicode text to clean.

    Returns:
        A text string (not bytes) that contains only ASCII characters.
    """
    ascii_bytes = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore')
    # encode() yields bytes; decode back to text so that str-pattern regexes
    # and other text operations keep working on the result under Python 3.
    return ascii_bytes.decode('ascii')

def review_to_words(raw_review, stop_words=None):
    """Turn one raw review into a list of lowercase, stop-word-free words.

    Steps: strip non-ASCII characters, replace every non-letter with a space,
    lowercase, split on whitespace, and drop stop words.

    Args:
        raw_review: Unicode text of a single review.
        stop_words: Optional collection of words to discard. Defaults to the
            notebook-level ``stops`` set (NLTK English stop words), so the
            list is not reloaded from NLTK for every review.

    Returns:
        List of the remaining words, in their original order.
    """
    if stop_words is None:
        stop_words = stops
    ascii_only = remove_non_ascii(raw_review)
    if isinstance(ascii_only, bytes):
        # The helper may hand back bytes; re.sub with a text pattern needs text.
        ascii_only = ascii_only.decode('ascii')
    letters_only = re.sub("[^a-zA-Z]", " ", ascii_only)
    return [w for w in letters_only.lower().split() if w not in stop_words]

In [7]:
# Tokenise every review; `texts` is a list with one list of words per review.
texts = [review_to_words(x) for x in rev_text_norm]

In [8]:
# Peek at the word lists of the first five reviews.
texts[0:5]

[['even',
  'like',
  'based',
  'past',
  'attempts',
  'thought',
  'made',
  'obligatory',
  'visit',
  'tian',
  'tian',
  'expecting',
  'lone',
  'dissenter',
  'anthony',
  'bourdain',
  'photo',
  'plastered',
  'stall',
  'check',
  'huge',
  'queue',
  'anything',
  'resembling',
  'lunch',
  'hour',
  'check',
  'serving',
  'ware',
  'looks',
  'like',
  'seen',
  'minor',
  'battle',
  'long',
  'forgotten',
  'war',
  'check',
  'good',
  'really',
  'good',
  'especially',
  'chicken',
  'stock',
  'combination',
  'chicken',
  'rice',
  'light',
  'still',
  'meaty',
  'enough',
  'reaching',
  'spice',
  'best',
  'ever',
  'yes',
  'overrated',
  'probably',
  'since',
  'cannot',
  'cure',
  'cancer',
  'world',
  'hunger',
  'simultaneously',
  'still',
  'absolutely',
  'come',
  'peak',
  'visiting',
  'singapore',
  'absolutely'],
 ['pros',
  'chicken',
  'rice',
  'delicious',
  'cons',
  'hot',
  'c',
  'waiting',
  'line'],
 ['rice',
  'freakin',
  'incredible

In [9]:
# Count how often each token occurs across all reviews, so that tokens
# appearing only once can be removed afterwards.
from collections import defaultdict

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

In [10]:
# Inspect the loop variable left over from the counting loop in the previous
# cell, i.e. the last token that was visited.
token

'couples'

In [11]:
# Discard every token whose corpus-wide count is 1, keeping the
# one-list-per-review structure intact.
texts = [
    [word for word in document if frequency[word] > 1]
    for document in texts
]

In [12]:
# Build the gensim dictionary (token <-> integer id) from the filtered word lists.
dictionary = corpora.Dictionary(texts)

In [13]:
# Short summary of the dictionary: number of unique tokens plus a few examples.
print(dictionary)

Dictionary(22901 unique tokens: [u'gai', u'woods', u'clotted', u'hanging', u'woody']...)


In [14]:
# Print the dictionary's token -> id mapping.
# NOTE(review): this prints one entry per unique token, so the output is very
# large for a full vocabulary; consider printing only a small sample.
print(dictionary.token2id)



In [15]:
# test bag of words
new_vec = dictionary.doc2bow(review_to_words(u"Chicken rice tastes so damn good so"))
print(new_vec) 

[(12, 1), (36, 1), (52, 1), (1522, 1), (1609, 1)]


In [16]:
# Convert every tokenised review into its bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in texts]
#>>> corpora.MmCorpus.serialize('deerwester.mm', corpus) # store to disk, for later use
#>>> print(corpus)

In [17]:
# Fit a tf-idf model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [18]:
# Apply the fitted tf-idf model to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]
#for doc in corpus_tfidf:
#    print(doc)

In [20]:
# Fit a 50-topic LSI (latent semantic indexing) model on the tf-idf corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
# Double wrapper over the original corpus: bow -> tf-idf -> fold-in LSI,
# i.e. every review expressed in the 50-dimensional topic space.
corpus_lsi = lsi[corpus_tfidf]
# Display ten of the fitted topics with their highest-weighted words.
lsi.print_topics(10)

[(0,
  u'0.134*"chicken" + 0.115*"food" + 0.114*"rice" + 0.112*"good" + 0.108*"place" + 0.100*"really" + 0.096*"like" + 0.095*"great" + 0.093*"one" + 0.091*"service"'),
 (1,
  u'0.340*"chicken" + 0.331*"rice" + 0.298*"ramen" + -0.191*"coffee" + 0.189*"soup" + -0.174*"pizza" + 0.167*"noodles" + 0.154*"fried" + 0.126*"pork" + 0.102*"stall"'),
 (2,
  u'-0.798*"ramen" + 0.280*"chicken" + 0.234*"rice" + -0.133*"broth" + 0.122*"crab" + 0.110*"thai" + -0.103*"noodles" + -0.089*"ippudo" + 0.082*"fried" + -0.082*"japanese"'),
 (3,
  u'-0.707*"crab" + 0.285*"chicken" + -0.249*"chili" + 0.199*"rice" + -0.199*"pepper" + -0.146*"crabs" + -0.110*"chilli" + 0.110*"coffee" + -0.108*"seafood" + -0.100*"black"'),
 (4,
  u'-0.666*"sushi" + 0.242*"coffee" + -0.217*"sashimi" + -0.208*"japanese" + -0.130*"salmon" + 0.109*"pizza" + 0.103*"ramen" + -0.102*"fish" + 0.101*"chicken" + -0.100*"quality"'),
 (5,
  u'0.703*"pizza" + -0.342*"coffee" + 0.144*"pizzas" + 0.139*"chicken" + 0.129*"pasta" + 0.126*"crust" +

In [21]:
# Fit a 50-topic LDA model on the tf-idf weighted corpus.
# LDA training is randomised; random_state fixes the seed for reproducibility.
# NOTE(review): the model is trained on tf-idf weights rather than raw counts,
# while the pyLDAvis cell is given the raw-count `corpus` -- confirm which
# representation is intended.
lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=50, random_state=42)

# Display ten of the fitted topics with their highest-weighted words.
lda_model.print_topics(10)

[(24,
  u'0.016*meatballs + 0.008*hainanese + 0.008*ivy + 0.008*lava + 0.007*nam + 0.006*keeping + 0.006*malaysian + 0.006*lobby + 0.006*combos + 0.006*writing'),
 (45,
  u'0.017*otak + 0.012*ban + 0.009*overlooking + 0.008*fashioned + 0.008*jewel + 0.008*sua + 0.007*apologetic + 0.007*oriole + 0.007*waved + 0.007*conveniently'),
 (0,
  u'0.022*terminal + 0.020*changi + 0.015*rendang + 0.014*elevator + 0.013*hk + 0.012*pisang + 0.012*airport + 0.011*superb + 0.010*scones + 0.010*attracted'),
 (7,
  u'0.013*ramen + 0.011*hoon + 0.011*bee + 0.010*stall + 0.009*kee + 0.008*kopitiam + 0.008*court + 0.008*sells + 0.007*chang + 0.007*dimsum'),
 (41,
  u'0.015*hokkien + 0.011*mee + 0.010*espresso + 0.007*coffees + 0.007*italian + 0.006*chee + 0.005*natural + 0.005*grey + 0.005*dough + 0.005*hawkers'),
 (19,
  u'0.053*ta + 0.030*junction + 0.013*ignored + 0.013*imperial + 0.013*yu + 0.012*limp + 0.009*renovations + 0.009*kill + 0.008*auntie + 0.008*liu'),
 (28,
  u'0.021*rainbow + 0.015*maison

In [22]:
import pyLDAvis.gensim
# Switch pyLDAvis to notebook mode before preparing the visualisation.
pyLDAvis.enable_notebook()
# Prepare the topic visualisation from the LDA model, the bag-of-words corpus
# and the dictionary; as the last expression, its result is the cell output.
# NOTE(review): lda_model was fitted on corpus_tfidf, but the raw-count
# `corpus` is passed here -- confirm this mismatch is intended.
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

In [40]:
# Print the tf-idf vectors of the first 50 reviews, one list of
# (token_id, weight) pairs per review.
for doc in corpus_tfidf[0:50]:
    print(doc)

[(0, 0.08336083604652836), (1, 0.07627094834223797), (2, 0.04617449205619288), (3, 0.20926294063264536), (4, 0.1475290943352146), (5, 0.18306243590789284), (6, 0.18245069651012585), (7, 0.09754330338236454), (8, 0.16616639466254768), (9, 0.18245069651012585), (10, 0.1508556521134043), (11, 0.0843473557273314), (12, 0.16870073567130628), (13, 0.09958069477271554), (14, 0.0801068740519404), (15, 0.1011766349398468), (16, 0.25008250813958505), (17, 0.045350701155222904), (18, 0.03355069895421433), (19, 0.04563976580954829), (20, 0.07545128130144359), (21, 0.09202331833862858), (22, 0.10721015115433935), (23, 0.06063704062787484), (24, 0.0687789768682017), (25, 0.05719426873872109), (26, 0.20185656177835165), (27, 0.053644366758667385), (28, 0.059939229707935124), (29, 0.12968401077731265), (30, 0.18935715648085055), (31, 0.06461232598448433), (32, 0.12393917702441148), (33, 0.07160158478372249), (34, 0.17958719102674603), (35, 0.15317659332736142), (36, 0.11494449639992793), (37, 0.081434