In [72]:
import pickle
import gensim
import pyLDAvis
import pyLDAvis.gensim
import spacy
import pandas as pd
import nltk; nltk.download('stopwords')
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
import warnings
from pprint import pprint

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Note this notebook runs an HDP model to find the best number of topics. It is not strictly part of the flow and can be considered optional. It also uses a smaller sample of data to run faster.

In [294]:
with open('rev_2016_df.pkl', 'rb') as f:
    rev_train = pickle.load(f)
with open('rev_2017.pkl', 'rb') as f:
    rev_test = pickle.load(f)

In [296]:
rev_train = rev_train.sample(frac=.3, random_state=1).reset_index(drop=True)
rev_test = rev_test.sample(frac=.3, random_state=1).reset_index(drop=True)

In [327]:
len(rev_train)

155924

In [298]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [301]:
stop_words.extend(['come','order','try','go','get','make','drink','plate','dish','restaurant','place'])
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

#### Remove new lines

In [302]:
def strip_newline(series):
    return [review.replace('\n','') for review in series]

In [395]:
rev_train['text'] = strip_newline(rev_train.text)
rev_test['text'] = strip_newline(rev_test.text)
rev_train.text[21:22].values

array(['This place has the best Chicken Parmesan I have ever had. And this is my favorite meal ever and I have tried this at hundreds of restaurants! Wow, this blew me away. It was served as a huge portion in a square shape. Very thin, and creative the way it was cooked and presented. I am planning a trip back to Las Vegas in the coming months just to have this again!'],
      dtype=object)

#### Tokenize and remove punctuation

In [304]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [305]:
words_tr = list(sent_to_words(rev_train.text))
words_te = list(sent_to_words(rev_test.text))

In [306]:
words_tr[21][:10]

['this',
 'place',
 'has',
 'the',
 'best',
 'chicken',
 'parmesan',
 'have',
 'ever',
 'had']

In [307]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [308]:
words_tr = remove_stopwords(words_tr)

In [309]:
def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    trigram = gensim.models.Phrases(bigram[words], min_count = tri_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod

In [310]:
bigram_tr, trigram_tr = bigrams(words_tr)



#### Check some items

In [311]:
print(trigram_tr[bigram_tr[words[16345]]][:200])

['my', 'wife', 'and', 'have', 'been', 'going', 'to', 'the', 'desert_ridge', 'location', 'every_week', 'for', 'many_years', 'and', 'have', 'enjoyed', 'the', 'service', 'and', 'quality', 'of', 'the', 'food', 'we', 'have', 'become', 'part', 'of', 'the', 'keg', 'family', 'over', 'these', 'many_years', 'the', 'bar', 'staff', 'has', 'always', 'greeted_us', 'with', 'big', 'smile', 'highly_recommend', 'the', 'prime_rib', 'and', 'filets', 'the', 'french_onion_soup', 'is', 'the', 'best', 'in', 'town', 'jim', 'and', 'alice']


#### Remove stopwords and lemmatize

In [312]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [313]:
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

#### Run test through trained model - will later run test data through trained model here

In [314]:
trigrams_tr = [trigram_tr[bigram_tr[review]] for review in words_tr]

In [319]:
lemma_lg = lemmatization(trigrams_tr)

In [325]:
with open('lemma_lg.pkl', 'wb') as f:
    pickle.dump(lemma_lg, f)

#### Note difference to un-lemmatized un-stop-worded above

In [324]:
lemma_lg[8811][:20]

['hubby',
 'eat',
 'area',
 'service',
 'really',
 'good',
 'manager',
 'actually',
 'come',
 'bit',
 'used',
 'manager',
 'north_york',
 'location',
 'really',
 'nice',
 'food',
 'good',
 'mushroom',
 'burger']

## LDA

#### Dictionary and Corpus creation for LDA

In [368]:
id2word_lg = gensim.corpora.Dictionary(lemma_lg)
id2word_lg.filter_extremes(no_below=10, no_above=0.35)
id2word_lg.compactify()
id2word_lg.save('train_dict_lg')
corpus_lg = [id2word_lg.doc2bow(text) for text in lemma_lg]

In [369]:
with open('corpus_lg.pkl', 'wb') as f:
    pickle.dump(corpus_lg, f)

In [397]:
corpus_lg[21][:2]

[(25, 1), (65, 1)]

In [396]:
[(id2word_lg[id], freq) for id, freq in corpus_lg[21]]

[('cook', 1),
 ('meal', 1),
 ('back', 1),
 ('restaurant', 1),
 ('serve', 1),
 ('try', 1),
 ('month', 1),
 ('las_vegas', 1),
 ('blew_away', 1),
 ('chicken_parmesan', 1),
 ('coming', 1),
 ('creative', 1),
 ('ever', 2),
 ('favorite', 1),
 ('huge_portion', 1),
 ('hundred', 1),
 ('present', 1),
 ('shape', 1),
 ('square', 1),
 ('thin', 1),
 ('way', 1)]

## HDP Model - auto-dinals the best number of topics

In [399]:
from gensim.models import HdpModel
hdp = HdpModel(corpus_lg, id2word_lg, chunksize=10000)

In [373]:
len(hdp.print_topics())

20

In [376]:
hdp.print_topics(num_topics=20)

[(0,
  '0.004*great + 0.003*service + 0.002*order + 0.002*come + 0.002*get + 0.002*really + 0.002*also + 0.002*would + 0.002*love + 0.002*go'),
 (1,
  '0.001*great + 0.001*order + 0.001*service + 0.001*go + 0.001*love + 0.001*also + 0.001*come + 0.001*get + 0.001*would + 0.001*pizza'),
 (2,
  '0.001*great + 0.001*server + 0.000*service + 0.000*come + 0.000*uptown_charlotte + 0.000*single_item + 0.000*greatly_disappoint + 0.000*downtown_lv + 0.000*read_reviews + 0.000*consequently'),
 (3,
  '0.001*really + 0.000*missing_something + 0.000*coup + 0.000*great + 0.000*uncooked + 0.000*lake + 0.000*bare_bone + 0.000*racism + 0.000*hard_boiled_egg + 0.000*simplify'),
 (4,
  '0.000*great + 0.000*coldest + 0.000*rice_pilaf + 0.000*ummm + 0.000*science + 0.000*compared_other + 0.000*lives_hype + 0.000*nacho_cheese + 0.000*groove + 0.000*sunny_side'),
 (5,
  '0.001*fahrenheit + 0.000*watch + 0.000*blueberry_danish + 0.000*chef + 0.000*later_night + 0.000*bound + 0.000*sampler + 0.000*whiz + 0.000