In [1]:
from gensim import corpora, models, similarities
import pandas as pd

reviews = pd.read_json('Reviews20160219.json')

In [2]:
reviews.shape

(27080, 5)

In [3]:
reviews.head()

Unnamed: 0,ReviewerExperience,date,name,rating,review
0,382,2012-12-10,Tian Tian Hainanese Chicken Rice,5,I don't even like 海南雞飯 based on past attempts....
1,44,2014-06-18,Tian Tian Hainanese Chicken Rice,5,"Pros: chicken rice is delicious Cons: Hot, no ..."
2,285,2013-06-18,Tian Tian Hainanese Chicken Rice,4,The rice is freakin incredible!!! Wow I was su...
3,112,2013-02-04,Tian Tian Hainanese Chicken Rice,2,DUDE! WHAT HAPPENED TO THIS PLACE?! I kept hea...
4,5,2014-05-31,Tian Tian Hainanese Chicken Rice,4,The Chicken Rice is Singapore's national dish ...


In [4]:
reviews.drop_duplicates().groupby('name').size().sort_values(ascending=False)

name
Din Tai Fung                                   222
Jumbo Seafood                                  162
Tian Tian Hainanese Chicken Rice               151
Wild Honey                                      94
Ippudo                                          88
Paradise Dynasty                                78
Song Fa Bak Kut Teh                             76
Tim Ho Wan                                      69
Saveur                                          68
PS.Cafe                                         65
Itacho Sushi                                    61
Tiong Bahru Bakery                              61
Nam Nam Noodle Bar                              60
Boon Tong Kee                                   60
Maison Ikkoku Cocktail Bar                      55
CÉ LA VI                                        53
Food For Thought                                52
Omakase Burger                                  48
Kith Cafe                                       47
Pizzeria Mozza            

In [5]:
filtered_reviews = reviews[(reviews.name != "Din Tai Fung")&\
        (reviews.name != "Tian Tian Hainanese Chicken Rice")&\
        (reviews.name != "Jumbo Seafood")]

In [6]:
rev_text_norm = pd.Series(filtered_reviews.drop_duplicates().review)
rev_text_norm.size

19799

In [7]:
import re, unicodedata
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [8]:
def remove_non_ascii(string):
    ### remove unreadable unicode characters
    return unicodedata.normalize('NFKD', string).encode('ascii','ignore')
    #return string.decode('unicode_escape').encode('ascii','ignore')

def review_to_words(raw_review):
    ascii_only = remove_non_ascii(raw_review)
    letters_only = re.sub("[^a-zA-Z]", " ", ascii_only)
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    #return( " ".join( meaningful_words ))
    return meaningful_words

In [9]:
texts = [review_to_words(x) for x in rev_text_norm]

In [10]:
texts[0:5]

[['us',
  'would',
  'read',
  'papers',
  'mini',
  'chicken',
  'rice',
  'war',
  'maxwell',
  'really',
  'care',
  'less',
  'curiousity',
  'got',
  'better',
  'decided',
  'head',
  'check',
  'ah',
  'tai',
  'chicken',
  'rice',
  'presentation',
  'standard',
  'expected',
  'hawker',
  'stalls',
  'meat',
  'tender',
  'light',
  'soy',
  'dressing',
  'refreshing',
  'clincher',
  'rice',
  'whichi',
  'glistening',
  'chicken',
  'fat',
  'full',
  'flavours',
  'help',
  'even',
  'though',
  'total',
  'stalls',
  'selling',
  'chicken',
  'rice',
  'queues',
  'ah',
  'tai',
  'always',
  'long',
  'sure',
  'get',
  'early',
  'avoid',
  'disappointment',
  'closes',
  'everything',
  'sold'],
 ['ex',
  'chef',
  'tian',
  'tian',
  'brings',
  'us',
  'ah',
  'tai',
  'couple',
  'stalls',
  'famous',
  'tian',
  'tian',
  'stall',
  'maxwell',
  'food',
  'centre',
  'fan',
  'tian',
  'tian',
  'knew',
  'must',
  'try',
  'ah',
  'tai',
  'happy',
  'find',
  'ove

In [11]:
>>> # remove words that appear only once
>>> from collections import defaultdict
>>> frequency = defaultdict(int)
>>> for text in texts:
>>>     for token in text:
>>>         frequency[token] += 1

In [12]:
token

'couples'

In [13]:
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]
#from pprint import pprint   # pretty-printer
#pprint(texts)

In [14]:
dictionary = corpora.Dictionary(texts)

In [15]:
print(dictionary)

Dictionary(22771 unique tokens: [u'gai', u'woods', u'clotted', u'hanging', u'woody']...)


In [16]:
print(dictionary.token2id)



In [17]:
new_vec = dictionary.doc2bow(review_to_words(u"Chicken rice tastes so damn good so tastes"))
print(new_vec) 

[(9, 1), (26, 1), (59, 1), (245, 2), (511, 1)]


In [18]:
>>> corpus = [dictionary.doc2bow(text) for text in texts] # all sentences converted into bag of words
#>>> corpora.MmCorpus.serialize('deerwester.mm', corpus) # store to disk, for later use
#>>> print(corpus)

In [19]:
tfidf = models.TfidfModel(corpus)

In [20]:
corpus_tfidf = tfidf[corpus]
#for doc in corpus_tfidf:
#    print(doc)

In [21]:
>>> #### SERIALIZATION OF TRANSFORMATIONS to save memory space
>>> lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50) # initialize an LSI transformation
>>> corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
>>> # transformed our tf-ldf corpus via latent semantic indexing into 2-D space (num_topics = 2)
>>> lsi.print_topics(10)

[(0,
  u'0.129*"chicken" + 0.114*"food" + 0.112*"good" + 0.108*"place" + 0.107*"rice" + 0.100*"really" + 0.096*"like" + 0.095*"great" + 0.093*"one" + 0.091*"service"'),
 (1,
  u'-0.423*"ramen" + -0.258*"rice" + -0.256*"chicken" + -0.211*"soup" + -0.192*"noodles" + 0.189*"coffee" + 0.180*"pizza" + -0.146*"fried" + -0.139*"pork" + -0.119*"broth"'),
 (2,
  u'0.757*"ramen" + -0.294*"chicken" + -0.252*"rice" + -0.158*"thai" + -0.123*"fried" + -0.117*"curry" + 0.114*"broth" + -0.098*"fish" + 0.087*"ippudo" + -0.076*"crab"'),
 (3,
  u'-0.680*"sushi" + 0.222*"coffee" + -0.218*"sashimi" + -0.212*"japanese" + -0.129*"salmon" + -0.099*"quality" + 0.097*"pizza" + -0.090*"fish" + 0.088*"burger" + 0.084*"eggs"'),
 (4,
  u'0.650*"pizza" + -0.381*"coffee" + 0.132*"pizzas" + 0.130*"pasta" + -0.128*"breakfast" + -0.123*"thai" + 0.121*"crust" + 0.118*"crab" + -0.104*"cafe" + -0.101*"toast"'),
 (5,
  u'0.358*"thai" + 0.358*"pizza" + -0.234*"crab" + 0.200*"chicken" + 0.144*"curry" + -0.142*"burger" + 0.133

In [22]:
#### SERIALIZATION OF TRANSFORMATIONS to save memory space
lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=50) # initialize an LSI transformation

lda_model.print_topics(10)



[(17,
  u'0.013*dumpling + 0.008*wan + 0.008*tim + 0.007*maki + 0.007*liberty + 0.006*szechuan + 0.005*laptop + 0.005*hostess + 0.005*requests + 0.005*therapy'),
 (10,
  u'0.014*glutinous + 0.012*sheng + 0.010*kampong + 0.009*shisha + 0.008*chestnut + 0.007*mackerel + 0.007*shortcake + 0.006*ny + 0.006*tradition + 0.006*macaron'),
 (49,
  u'0.009*chatterbox + 0.008*blueberry + 0.008*lattes + 0.008*ikan + 0.007*bilis + 0.006*chopsticks + 0.005*touches + 0.005*americano + 0.005*breko + 0.004*smiles'),
 (13,
  u'0.012*centres + 0.012*tei + 0.011*payoh + 0.011*toa + 0.010*hangout + 0.008*spending + 0.008*imperial + 0.008*btw + 0.007*heated + 0.007*reuben'),
 (28,
  u'0.005*udon + 0.004*mocha + 0.003*japanese + 0.003*vintage + 0.003*stars + 0.003*sashimi + 0.003*meet + 0.003*japan + 0.003*tempura + 0.003*industrial'),
 (39,
  u'0.011*quiche + 0.011*scones + 0.009*ginseng + 0.009*scone + 0.008*hardware + 0.007*church + 0.006*hospital + 0.006*indie + 0.006*soooo + 0.006*huat'),
 (7,
  u'0.013

In [23]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)