# Topic Modeling

In [48]:
from nltk.stem import *
import numpy as np
import textmining
import re
import lda
import lda.datasets
import json
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from nltk.corpus import nps_chat
from scipy.sparse import coo_matrix
stopWords = set(stopwords.words('english'))

stemmer = SnowballStemmer("english")
stopWords |= set(nps_chat.words())
stopWords |= set(opinion_lexicon.negative())
stopWords |= set(opinion_lexicon.positive())

In [49]:
reviewDocs = dict()
with open("../input/review.json") as json_file:
    line_count = 0
    for line in json_file:
        review = json.loads(line)
        text = review['text']
        text = re.sub('[^A-Za-z ]+', '', text)
        text = [word.strip() for word in text.lower().split() if word.strip() not in stopWords]
#         text = ' '.join([stemmer.stem(word) for word in text.split() if stemmer.stem(word) not in stopWords])
        reviewDocs[review['review_id']] = text
    print("Processed all reviews!")

Processed all reviews!


In [50]:
#Create Sparse Document-Term Matrix for LDA

n_nonzero = 0
vocab = set()
for terms in reviewDocs.values():
    unique_terms = set(terms)    # all unique terms of this doc
    vocab |= unique_terms           # set union: add unique terms of this doc
    n_nonzero += len(unique_terms)  # add count of unique terms in this doc

# The ReviewId vector
docnames = np.array(list(reviewDocs.keys()))
# Create the vocab vector
vocab = np.array(list(vocab)) 
# indices that sort vocab
vocab_sorter = np.argsort(vocab)    

ndocs = len(docnames)
nvocab = len(vocab)
print(ndocs,nvocab,n_nonzero)
data = np.empty(n_nonzero, dtype=np.intc)     # unique terms in the combined corpus of all the document
rows = np.empty(n_nonzero, dtype=np.intc)     # document index where the term is present
cols = np.empty(n_nonzero, dtype=np.intc)     # position of the term in the document
print(len(rows),len(cols),len(data))

4736897 2449413 83108547
83108547 83108547 83108547


In [None]:
# current index in the sparse matrix data
ind = 0
# go through all reviews with their terms
for docname, terms in reviewDocs.items():
    # find indices into  such that, if the corresponding elements in were
    # inserted before the indices, the order of  would be preserved
    # -> array of indices of  in 
    term_indices = vocab_sorter[np.searchsorted(vocab, terms, sorter=vocab_sorter)]

    # count the unique terms of the document and get their vocabulary indices
    uniq_indices, counts = np.unique(term_indices, return_counts=True)
    n_vals = len(uniq_indices)  # = number of unique terms
    ind_end = ind + n_vals  #  to  is the slice that we will fill with data

    data[ind:ind_end] = counts                  # save the counts (term frequencies)
    cols[ind:ind_end] = uniq_indices            # save the column index: index in 
    doc_idx = np.where(docnames == docname)     # get the document index for the document name
    rows[ind:ind_end] = np.repeat(doc_idx, n_vals)  # save it as repeated value

    ind = ind_end  # resume with next document -> add data to the end
    
dtm = coo_matrix((data, (rows, cols)), shape=(ndocs, nvocab), dtype=np.intc)
print("type(X): {}".format(type(dtm)))
print("shape: {}".format(dtm.shape))


In [44]:
model = lda.LDA(n_topics=50, n_iter=2000, random_state=1)
model.fit(dtm)
doc_topic = model.doc_topic_

# print doc_topic probability distribution
# for i, doc_dist in enumerate(doc_topic):
#     print("Doc ", i)
#     print(doc_dist)
#     # for j,topic in enumerate(doc_dist):
#     #     print "Topic ",j," = ",topic

topic_word = model.topic_word_
print("Topic Word Distribution")

# print topic_word ditribution
n_top_words = 10
# for i, word_dist in enumerate(topic_word):
#     topic_words = np.array(vocab)[np.argsort(word_dist)][:-(n_top_words + 1):-1]
#     word_dist_sorted = sorted(word_dist,reverse=True)
#     print("Topic ", i)
#     for j,words in enumerate(topic_words):
#         print(words," = ",word_dist_sorted[j])
#     # print "Topic ", i," ", word_dist,"\n"
    
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 6177
INFO:lda:n_words: 19258
INFO:lda:n_topics: 50
INFO:lda:n_iter: 2000
INFO:lda:<0> log likelihood: -276401
INFO:lda:<10> log likelihood: -184618
INFO:lda:<20> log likelihood: -179267
INFO:lda:<30> log likelihood: -176935
INFO:lda:<40> log likelihood: -175910
INFO:lda:<50> log likelihood: -174468
INFO:lda:<60> log likelihood: -174546
INFO:lda:<70> log likelihood: -173975
INFO:lda:<80> log likelihood: -173571
INFO:lda:<90> log likelihood: -173818
INFO:lda:<100> log likelihood: -173444
INFO:lda:<110> log likelihood: -173535
INFO:lda:<120> log likelihood: -173356
INFO:lda:<130> log likelihood: -172975
INFO:lda:<140> log likelihood: -173182
INFO:lda:<150> log likelihood: -172815
INFO:lda:<160> log likelihood: -172623
INFO:lda:<170> log likelihood: -172681
INFO:lda:<180> log likelihood: -172795
INFO:lda:<190> log likelihood: -172471
INFO:lda:<200> log likelihood: -172721
INFO:lda:<210> log likelihood: -172860
INFO:lda:<220> log likelihood: -

Topic Word Distribution
Topic 0: seemed crowd relaxing places restaurants machine spent environment similar separate
Topic 1: reviews seated immediately parking waited absolutely pulled opened friday north
Topic 2: sunday dish review tasting group decor staff etc friday stars
Topic 3: audi dealership manager prius visits continued toyota appointment repairs maintenance
Topic 4: burrito tacos taco chips asada carne rice meal vegas tasty
Topic 5: breakfast sandwich sandwiches salad lounge panini hidden airport phoenix salads
Topic 6: sunday stars wooden easily dhote corkage presentation fees towards frites
Topic 7: dosa indian sambar dosas buffet masala bhavan quality thali rava
Topic 8: building margaritas vibe rare tons setting flavored entry filling public
Topic 9: und das ist der fr zimmer aber mile sich auch
Topic 10: including multiple grab consider patio tight establishment greet venue learning
Topic 11: sushi restaurant places crab madison employees tuna avocado restaurants oakvi