In [72]:
import pickle
import gensim
import pyLDAvis
import pyLDAvis.gensim
import spacy
import pandas as pd
import nltk; nltk.download('stopwords')
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
import warnings
from pprint import pprint

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Trigram, Stop-word Removal, Lemmatize

In [294]:
with open('rev_2016_df.pkl', 'rb') as f:
    rev_train = pickle.load(f)
with open('rev_2017.pkl', 'rb') as f:
    rev_test = pickle.load(f)

In [296]:
rev_train = rev_train.sample(frac=.3, random_state=1).reset_index(drop=True)
rev_test = rev_test.sample(frac=.3, random_state=1).reset_index(drop=True)

In [327]:
len(rev_train)

155924

In [298]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [301]:
stop_words.extend(['come','order','try','go','get','make','drink','plate','dish','restaurant','place'])
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

#### Remove new lines

In [302]:
def strip_newline(series):
    return [review.replace('\n','') for review in series]

In [395]:
rev_train['text'] = strip_newline(rev_train.text)
rev_test['text'] = strip_newline(rev_test.text)
rev_train.text[21:22].values

array(['This place has the best Chicken Parmesan I have ever had. And this is my favorite meal ever and I have tried this at hundreds of restaurants! Wow, this blew me away. It was served as a huge portion in a square shape. Very thin, and creative the way it was cooked and presented. I am planning a trip back to Las Vegas in the coming months just to have this again!'],
      dtype=object)

#### Tokenize and remove punctuation

In [304]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [305]:
words_tr = list(sent_to_words(rev_train.text))
words_te = list(sent_to_words(rev_test.text))

In [306]:
words_tr[21][:10]

['this',
 'place',
 'has',
 'the',
 'best',
 'chicken',
 'parmesan',
 'have',
 'ever',
 'had']

In [307]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [308]:
words_tr = remove_stopwords(words_tr)

In [309]:
def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    trigram = gensim.models.Phrases(bigram[words], min_count = tri_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod

In [310]:
bigram_tr, trigram_tr = bigrams(words_tr)



#### Check some items

In [311]:
print(trigram_tr[bigram_tr[words[16345]]][:200])

['my', 'wife', 'and', 'have', 'been', 'going', 'to', 'the', 'desert_ridge', 'location', 'every_week', 'for', 'many_years', 'and', 'have', 'enjoyed', 'the', 'service', 'and', 'quality', 'of', 'the', 'food', 'we', 'have', 'become', 'part', 'of', 'the', 'keg', 'family', 'over', 'these', 'many_years', 'the', 'bar', 'staff', 'has', 'always', 'greeted_us', 'with', 'big', 'smile', 'highly_recommend', 'the', 'prime_rib', 'and', 'filets', 'the', 'french_onion_soup', 'is', 'the', 'best', 'in', 'town', 'jim', 'and', 'alice']


#### Remove stopwords and lemmatize

In [312]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [313]:
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

#### Run test through trained model - will later run test data through trained model here

In [314]:
trigrams_tr = [trigram_tr[bigram_tr[review]] for review in words_tr]

In [319]:
lemma_lg = lemmatization(trigrams_tr)

In [325]:
with open('lemma_lg.pkl', 'wb') as f:
    pickle.dump(lemma_lg, f)

#### Note difference to un-lemmatized un-stop-worded above

In [324]:
lemma_lg[8811][:20]

['hubby',
 'eat',
 'area',
 'service',
 'really',
 'good',
 'manager',
 'actually',
 'come',
 'bit',
 'used',
 'manager',
 'north_york',
 'location',
 'really',
 'nice',
 'food',
 'good',
 'mushroom',
 'burger']

## LDA

#### Dictionary and Corpus creation for LDA

In [368]:
id2word_lg = gensim.corpora.Dictionary(lemma_lg)
id2word_lg.filter_extremes(no_below=10, no_above=0.35)
id2word_lg.compactify()
id2word_lg.save('train_dict_lg')
corpus_lg = [id2word_lg.doc2bow(text) for text in lemma_lg]

In [369]:
with open('corpus_lg.pkl', 'wb') as f:
    pickle.dump(corpus_lg, f)

In [397]:
corpus_lg[21][:2]

[(25, 1), (65, 1)]

In [396]:
[(id2word_lg[id], freq) for id, freq in corpus_lg[21]]

[('cook', 1),
 ('meal', 1),
 ('back', 1),
 ('restaurant', 1),
 ('serve', 1),
 ('try', 1),
 ('month', 1),
 ('las_vegas', 1),
 ('blew_away', 1),
 ('chicken_parmesan', 1),
 ('coming', 1),
 ('creative', 1),
 ('ever', 2),
 ('favorite', 1),
 ('huge_portion', 1),
 ('hundred', 1),
 ('present', 1),
 ('shape', 1),
 ('square', 1),
 ('thin', 1),
 ('way', 1)]

## LDA Modeling

In [119]:
# from gensim.models.wrappers import LdaMallet

# mallet_path = 'home//ubuntu//mallet-2.0.8//bin//mallet'
# lda20_mallet = LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word, workers=7)
# lda20_mallet.save('lda20_mallet.model')

#### Tune the number of topics for best coherence score range to then tune further

In [280]:
coherence = []
for topics in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = gensim.models.ldamulticore.LdaMulticore(
                           corpus=corpus,
                           num_topics=topics,
                           id2word=id2word,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           per_word_topics=True)
        coherence_model_lda = CoherenceModel(model=lda, texts=lemma, dictionary=id2word, coherence='c_v')
        coherence.append(coherence_model_lda.get_coherence())

In [282]:
coherence

[0.4569360497920987,
 0.45102773337845636,
 0.4769243195803472,
 0.45451839384076187,
 0.4586513901322973,
 0.44899760814164985,
 0.458999619307086,
 0.4222138879353656,
 0.4286275314022464]

In [281]:
with open('coherence.pkl', 'wb') as f:
    pickle.dump(coherence, f)

#### The actual LDA model. Note the large chunksize, this runs much faster than doing it in a big batch like above, but the Coherence score I get with these parameters matches the batched version. Now if I want to change things above with total data or lemmatization, I can test this much faster.
gensim.models.ldamulticore.LdaMulticore(
                           corpus=corpus_lg,
                           num_topics=30,
                           id2word=id2word,
                           chunksize=25000,
                           workers=7,
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)

In [336]:
# can we tune large data model to beat .4769
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda30_lg = gensim.models.ldamulticore.LdaMulticore(
                           corpus=corpus_lg,
                           num_topics=30,
                           id2word=id2word_lg,
                           chunksize=25000,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
    lda30_lg.save('lda30_lg.model')

In [391]:
lda30_lg.print_topics(30,num_words=100)

[(0,
  '0.025*"room" + 0.017*"hotel" + 0.016*"stay" + 0.007*"check" + 0.007*"night" + 0.007*"nice" + 0.007*"vegas" + 0.007*"casino" + 0.006*"day" + 0.006*"get" + 0.006*"also" + 0.005*"strip" + 0.005*"would" + 0.005*"clean" + 0.005*"play" + 0.005*"bathroom" + 0.005*"pool" + 0.005*"great" + 0.005*"go" + 0.005*"time" + 0.004*"see" + 0.004*"walk" + 0.004*"really" + 0.004*"even" + 0.004*"area" + 0.004*"need" + 0.004*"show" + 0.003*"people" + 0.003*"free" + 0.003*"want" + 0.003*"find" + 0.003*"fun" + 0.003*"lot" + 0.003*"parking" + 0.003*"use" + 0.003*"club" + 0.003*"bar" + 0.003*"take" + 0.003*"work" + 0.003*"thing" + 0.003*"look" + 0.003*"kid" + 0.003*"staff" + 0.003*"hour" + 0.003*"floor" + 0.003*"book" + 0.003*"way" + 0.003*"experience" + 0.003*"pay" + 0.002*"think" + 0.002*"close" + 0.002*"bed" + 0.002*"old" + 0.002*"well" + 0.002*"game" + 0.002*"back" + 0.002*"still" + 0.002*"could" + 0.002*"much" + 0.002*"little" + 0.002*"coffee" + 0.002*"park" + 0.002*"view" + 0.002*"leave" + 0.002*"

#### Print Coherence Score of Single Model Above

In [340]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda30_lg, texts=lemma_lg, dictionary=id2word_lg, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
coherence_lda

0.46441805321405266

#### Use minimum_probability below to get full distribution for topic!

In [382]:
top_topics = lda30_lg.get_document_topics(corpus_lg[0], minimum_probability=0.0)
top_topics

[(0, 0.16438176),
 (1, 0.0013888889),
 (2, 0.0013888889),
 (3, 0.0013888889),
 (4, 0.15381467),
 (5, 0.0013888889),
 (6, 0.0013888889),
 (7, 0.07722635),
 (8, 0.0013888889),
 (9, 0.0013888889),
 (10, 0.0013888889),
 (11, 0.0013888889),
 (12, 0.0013888889),
 (13, 0.0013888889),
 (14, 0.0013888889),
 (15, 0.0013888889),
 (16, 0.0013888889),
 (17, 0.0013888889),
 (18, 0.0013888889),
 (19, 0.24763249),
 (20, 0.1371998),
 (21, 0.0013888889),
 (22, 0.0013888889),
 (23, 0.0013888889),
 (24, 0.0013888889),
 (25, 0.0013888889),
 (26, 0.0013888889),
 (27, 0.18641163),
 (28, 0.0013888889),
 (29, 0.0013888889)]

#### Get Review Topic Vector

In [352]:
[top_topics[i][1] for i in range(30)]

[0.16491123, 0.15403374, 0.07722656, 0.24698643, 0.13697343, 0.1865353]

#### HDP

In [399]:
from gensim.models import HdpModel
hdp = HdpModel(corpus_lg, id2word_lg, chunksize=10000, K=20, T=150)

In [None]:
coherence_model_hdp = CoherenceModel(model=hdp, texts=lemma_lg, dictionary=id2word, coherence='c_v')
coherence_hdp = coherence_model_hdp.get_coherence()
coherence_hdp

In [373]:
len(hdp.print_topics())

20

In [376]:
hdp.print_topics(num_topics=20)

[(0,
  '0.004*great + 0.003*service + 0.002*order + 0.002*come + 0.002*get + 0.002*really + 0.002*also + 0.002*would + 0.002*love + 0.002*go'),
 (1,
  '0.001*great + 0.001*order + 0.001*service + 0.001*go + 0.001*love + 0.001*also + 0.001*come + 0.001*get + 0.001*would + 0.001*pizza'),
 (2,
  '0.001*great + 0.001*server + 0.000*service + 0.000*come + 0.000*uptown_charlotte + 0.000*single_item + 0.000*greatly_disappoint + 0.000*downtown_lv + 0.000*read_reviews + 0.000*consequently'),
 (3,
  '0.001*really + 0.000*missing_something + 0.000*coup + 0.000*great + 0.000*uncooked + 0.000*lake + 0.000*bare_bone + 0.000*racism + 0.000*hard_boiled_egg + 0.000*simplify'),
 (4,
  '0.000*great + 0.000*coldest + 0.000*rice_pilaf + 0.000*ummm + 0.000*science + 0.000*compared_other + 0.000*lives_hype + 0.000*nacho_cheese + 0.000*groove + 0.000*sunny_side'),
 (5,
  '0.001*fahrenheit + 0.000*watch + 0.000*blueberry_danish + 0.000*chef + 0.000*later_night + 0.000*bound + 0.000*sampler + 0.000*whiz + 0.000

In [278]:
hdp[corpus[44]]

[(0, 0.9598830339695105)]