In [1]:
import pickle
import gensim
import pyLDAvis
import pyLDAvis.gensim
import spacy
import pandas as pd
import nltk; nltk.download('stopwords')
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
import warnings

  from .optimizers import Adam, SGD, linear_decay
  from collections import defaultdict, Sequence, Sized, Iterable, Callable
  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Trigram, Stop-word Removal, Lemmatize

In [2]:
# Load the reviews object from the local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this cell against a file from a trusted source.
with open('reviews_df.pkl', 'rb') as f:
    reviews = pickle.load(f)

In [3]:
# Preview the first two rows to check the available columns.
reviews.head(2)

Unnamed: 0,business_id,review_range,attributes,categories,city,name,biz_stars,state,date,review_stars,text,useful,real_counts
3187,-0DET7VdEQOJVJ_v6klEug,527 days 13:59:43,"{'GoodForKids': 'True', 'WiFi': ''no'', 'Resta...","Asian Fusion, Restaurants",Markham,Flaming Kitchen,3.0,ON,2017-05-02 00:33:10,4.0,This used to be where Aka Teppan was. All the...,11,106
3188,-0DET7VdEQOJVJ_v6klEug,527 days 13:59:43,"{'GoodForKids': 'True', 'WiFi': ''no'', 'Resta...","Asian Fusion, Restaurants",Markham,Flaming Kitchen,3.0,ON,2017-05-04 21:05:16,4.0,I like this place a lot better than Aka teppan...,1,106


In [4]:
# English stop-word list from NLTK (the 'stopwords' corpus is downloaded in the imports cell).
# stop_words is read as a module-level name by remove_stopwords().
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#### Remove newlines

In [5]:
# Inspect the raw text of the first review; the output shows embedded "\n" characters.
reviews.text[:1].values

array(["This used  to be where Aka Teppan was. All the tables, chairs, cutlery, cups, and hot plates are the same. The menu however is much better. There's a lot of small snacks, drinks, dessert and food variety ! The menu has like 15 pages ! The food was also much tastier than it used to be. They even have a $40 wagu steak on the menu. I'm not sure how many people will be getting that, but I'm sure it's amazing. \n\nThe service wasn't bad either. The staff seem to be the same as the old restaurant. The service speed was not bad for a Monday and they were quite busy for soft opening. \n\nThe salty spicy fried squid balls and wasabi shake shake fries are a must try! This will def become one of my regular spots again."],
      dtype=object)

In [6]:
def strip_newline(series):
    """Return the reviews in ``series`` as a list with newlines replaced by spaces.

    A space is substituted (rather than deleting the newline outright) so that
    the words on either side of a line break are not fused into one token,
    e.g. ``"good\\nfood"`` becomes ``"good food"`` instead of ``"goodfood"``.

    Args:
        series: iterable of review strings.

    Returns:
        list of str: one cleaned string per input review, in input order.
    """
    return [review.replace('\n', ' ') for review in series]

In [7]:
# Overwrite the text column with the newline-free version, then spot-check one review.
# NOTE(review): this mutates `reviews` in place, so the raw text is no longer available afterwards.
reviews['text'] = strip_newline(reviews.text)
reviews.text[2:3].values

array(["If you've been to Aka Teppan (the predecessor at the same location) then you're in for a similar experience. The menu has been greatly expanded with newly enlarged selections of appetizers, drinks, and desserts. They also allow you to construct your own salad or sizzling plate. We started with Spicy Shake Shake Fries as an appetizer. It's a paper bag with fries and a shaker of spicy powder for you to combine. On the plus side, you can control exactly how spicy you want your fries. I had the Black Curry Beef Udon for my main. It arrived on a sizzling hot plate exactly as advertised. Make sure to flip your beef or it will stick to the plate. The noodles don't suffer from the same problem because they're covered in the curry sauce. I found the mild curry to be pleasant, but not memorable. I did manage to also try the Flaming Chicken Spicy Noodle which definitely lived up to its name."],
      dtype=object)

#### Tokenize and remove punctuation

In [8]:
def sent_to_words(sentences):
    """Lazily tokenize every document in ``sentences``.

    Each item is converted to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (accent marks are removed).

    Args:
        sentences: iterable of documents (anything ``str()`` accepts).

    Yields:
        list of str: the tokens of one document, in input order.
    """
    for text in map(str, sentences):
        tokens = gensim.utils.simple_preprocess(text, deacc=True)
        yield tokens

In [9]:
# Materialize the generator into a list so the token lists can be indexed
# and iterated more than once (the phrase models below pass over them twice).
words = list(sent_to_words(reviews.text))

In [10]:
# Spot-check: first five tokens of one tokenized review.
words[1111][:5]

['we', 'came', 'on', 'friday', 'night']

In [11]:
def bigrams_trigrams(words, bi_min=5, tri_min=1, thresh=50):
    """Fit bigram and trigram phrase detectors and return their frozen forms.

    The bigram model is fitted on ``words``; the trigram model is fitted on
    the bigram-transformed ``words``. ``words`` is therefore traversed twice
    and must be re-iterable (e.g. a list, not a one-shot generator).

    Args:
        words: re-iterable collection of token lists, one per document.
        bi_min: ``min_count`` passed to the bigram ``Phrases`` model.
        tri_min: ``min_count`` passed to the trigram ``Phrases`` model.
        thresh: ``threshold`` passed to both ``Phrases`` models.

    Returns:
        tuple: ``(bigram_mod, trigram_mod)``, each a ``Phraser`` wrapping the
        corresponding fitted ``Phrases`` model.
    """
    pair_detector = gensim.models.Phrases(words, min_count=bi_min, threshold=thresh)
    triple_detector = gensim.models.Phrases(
        pair_detector[words],
        min_count=tri_min,
        threshold=thresh,
    )
    freeze = gensim.models.phrases.Phraser
    return freeze(pair_detector), freeze(triple_detector)

In [12]:
# Fit the phrase models on the tokenized reviews using the default count/threshold settings.
bigram_mod, trigram_mod = bigrams_trigrams(words)



In [15]:
# Confirm the type of the returned bigram model.
type(bigram_mod)

gensim.models.phrases.Phraser

#### Seems to be doing a good job at picking up related food items: 'torched sashimi', 'roasted bone marrow'

In [16]:
# Apply the bigram model, then the trigram model, to one review and show its first four tokens.
print(trigram_mod[bigram_mod[words[1111]]][:4])

['we', 'came', 'on', 'friday_night']


#### Remove stopwords and lemmatize

In [17]:
def remove_stopwords(texts):
    """Tokenize each document and drop the tokens listed in ``stop_words``.

    Each item of ``texts`` is converted to ``str`` and tokenized with
    ``simple_preprocess``; tokens present in the module-level ``stop_words``
    collection are filtered out.

    NOTE(review): tokenization here uses ``simple_preprocess`` without
    ``deacc=True``, whereas ``sent_to_words`` (used to fit the phrase models)
    passes ``deacc=True`` — confirm the difference is intended.

    Args:
        texts: iterable of documents (anything ``str()`` accepts).

    Returns:
        list of list of str: the remaining tokens per document, in input order.
    """
    # Build the lookup set once per call: membership in a set is constant-time,
    # while testing against the original list scans it for every single token.
    stops = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stops] for doc in texts]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is joined with single spaces and run through the
    module-level spaCy pipeline ``nlp`` (which must be defined before this
    function is called). Only tokens whose ``pos_`` is in ``allowed_postags``
    are kept, and each kept token is replaced by its ``lemma_``.

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: collection of part-of-speech tags to keep. The default
            is an immutable tuple so it cannot be modified between calls.

    Returns:
        list of list of str: the lemmas kept for each document, in input order.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [18]:
# Tokenize the review text again and drop stop words.
no_stops = remove_stopwords(reviews.text)

In [19]:
# Merge detected phrases in each stop-word-free review (bigram model first, then trigram model).
trigrams = [trigram_mod[bigram_mod[review]] for review in no_stops]

In [20]:
# Load the spaCy English pipeline with the parser and NER components disabled,
# then lemmatize every review. lemmatization() reads `nlp` as a module-level name,
# so it must be assigned before the call.
# NOTE(review): the 'en' shortcut name is not accepted by newer spaCy releases — confirm the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])
lemma = lemmatization(trigrams)

#### Note the difference from the earlier output, which was neither lemmatized nor stripped of stop words

In [21]:
# Spot-check the same review after stop-word removal, phrase merging, and lemmatization.
lemma[1111][:2]

['come', 'friday_night']

## LDA

#### Dictionary and Corpus creation for LDA

In [23]:
# Build the token <-> integer id mapping from the lemmatized reviews,
# then encode every review as a bag of words via doc2bow.
id2word = gensim.corpora.Dictionary(lemma)
corpus = [id2word.doc2bow(text) for text in lemma]

In [24]:
# First two (token id, count) pairs of one encoded review.
corpus[1111][:2]

[(1, 1), (2, 1)]

In [25]:
# Same review with token ids translated back to their tokens.
[(id2word[token_id], count) for token_id, count in corpus[1111]]

[('also', 1),
 ('amazing', 1),
 ('bad', 1),
 ('even', 1),
 ('fry', 1),
 ('get', 1),
 ('hot', 2),
 ('lot', 1),
 ('menu', 1),
 ('plate', 3),
 ('table', 1),
 ('try', 1),
 ('experience', 1),
 ('location', 1),
 ('sauce', 1),
 ('side', 1),
 ('want', 1),
 ('finish', 1),
 ('go', 2),
 ('order', 1),
 ('come', 6),
 ('flavour', 1),
 ('great', 1),
 ('nice', 1),
 ('quickly', 1),
 ('touch', 1),
 ('fully', 1),
 ('meat', 2),
 ('quality', 2),
 ('back', 1),
 ('everything', 4),
 ('dish', 2),
 ('keep', 1),
 ('serve', 1),
 ('take', 1),
 ('well', 1),
 ('could', 1),
 ('little', 2),
 ('warm', 1),
 ('look', 1),
 ('overall', 1),
 ('star', 1),
 ('sort', 1),
 ('thing', 1),
 ('like', 1),
 ('think', 1),
 ('strange', 1),
 ('cool', 1),
 ('salmon', 2),
 ('wish', 1),
 ('favourite', 2),
 ('cold', 2),
 ('fresh', 1),
 ('weird', 1),
 ('ready', 1),
 ('appetitizer', 2),
 ('barely', 2),
 ('bread', 1),
 ('perfect', 3),
 ('show', 1),
 ('parking', 1),
 ('together', 2),
 ('second', 1),
 ('disappoint', 1),
 ('kind', 1),
 ('whole', 

#### LDA Modeling

In [137]:
# Train a 50-topic LDA model on the bag-of-words corpus and save it to disk.
# Warnings raised inside the `with` block are suppressed for this cell only.
# random_state fixes the seed so a re-run starts from the same initialization.
# NOTE(review): with several workers, scheduling may still cause small
# run-to-run differences — confirm if exact reproducibility is required.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda50 = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                           num_topics=50,
                           id2word=id2word,
                           workers=7, # Num. Processing Cores - 1
                           passes=2,
                           random_state=42) # fixed seed for reproducible topics
    lda50.save('lda50.model')

In [26]:
# Reload the saved model from disk, so later cells can use it without retraining.
lda50 = gensim.models.ldamulticore.LdaMulticore.load('lda50.model')