In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import lda
lemmatizer=WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# !pip install lda
# !pip install nltk


[nltk_data] Downloading package stopwords to /Users/m.yin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/m.yin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/m.yin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
file  = '/content/drive/MyDrive/MAR6669-data/select_reviews.json'
select_reviews = pd.read_json(file)

In [3]:
select_reviews.shape

(500, 9)

In [4]:
review_df = select_reviews['text']
txt = list(review_df)
rating = list(select_reviews['stars'])

In [5]:
txt[:10]

['I LOVE Mexican food.  Unfortunately, good Mexican food in New Orleans is rather hard to find.  I have tried seemingly ALL the Mexican places in New Orleans proper: Taqueria Corona, Felipe\'s Taqueria, Juan\'s Flying Burrito.   In my opinion, none of these are as good as Superior Grill.  \n\nMy friends and I went during Happy Hour, which is from 4:30-6:30, seven days a week, when the margaritas are HALF PRICE!  Let me sidestep and talk about the margaritas for a second.  They are amazing.  Far too often, a margarita tastes far too much like pure tequila, and equally far too often, they are simply too weak.  Superior manages to strike the balance perfectly, resulting in an amazing margarita.  \n\nI think the food is quite tasty.  I got the crawfish enchiladas, which were quite good.  No, it is not "authentic" mexican food, but then again. it tastes good, and by Tex-Mex standards, it is excellent.  Also, they bring you unlimited baskets of chips and salsa for free.  I don\'t take this f

## Preprocessing and text represenation

In [6]:
english_stopwords = stopwords.words('english')
english_stopwords = set(english_stopwords)
english_stopwords.update(["food", 'one'])
txt_clean = []
for i in range(len(txt)):    
    txt[i] = re.sub(r'\d+', '', txt[i]) # remove numbers
    txt[i] = txt[i].lower()     # lower case
    txt[i] = txt[i].translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    txt[i] = txt[i].strip() # remove white space
    tokens = word_tokenize(txt[i]) # tokenize
    remove_stop = [lemmatizer.lemmatize(i) for i in tokens if not i in english_stopwords] # remove stop words and lemmatize
    txt_clean.append(' '.join(remove_stop))

In [13]:
# bag-of-words representation

from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the texts
bag_of_words = vectorizer.fit_transform(txt_clean)

# Convert to array for easy viewing
bag_of_words_array = bag_of_words.toarray()
vocab = vectorizer.get_feature_names_out() # array of words 



In [9]:
bag_of_words_array.shape #  Number of reviews x  Number of unique words

(500, 5278)

## Model fitting by MLE

In [10]:

model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(bag_of_words_array)

INFO:lda:n_documents: 500
INFO:lda:vocab_size: 5278
INFO:lda:n_words: 26341
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -335527
INFO:lda:<10> log likelihood: -238615
INFO:lda:<20> log likelihood: -233871
INFO:lda:<30> log likelihood: -231660
INFO:lda:<40> log likelihood: -230113
INFO:lda:<50> log likelihood: -228948
INFO:lda:<60> log likelihood: -228009
INFO:lda:<70> log likelihood: -227197
INFO:lda:<80> log likelihood: -227324
INFO:lda:<90> log likelihood: -227056
INFO:lda:<100> log likelihood: -226489
INFO:lda:<110> log likelihood: -226371
INFO:lda:<120> log likelihood: -226415
INFO:lda:<130> log likelihood: -225605
INFO:lda:<140> log likelihood: -225360
INFO:lda:<150> log likelihood: -225325
INFO:lda:<160> log likelihood: -225291
INFO:lda:<170> log likelihood: -225251
INFO:lda:<180> log likelihood: -224964
INFO:lda:<190> log likelihood: -225149
INFO:lda:<200> log likelihood: -224917
INFO:lda:<210> log likelihood: -224910
INFO:lda:<220> log likelihood: -22

<lda.lda.LDA at 0x17fa6a590>

## Interpretation

In [18]:
topic_word = model.topic_word_  # word distribution for each topic
topic_word.shape

(20, 5278)

In [19]:
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: service delicious menu everything dish little restaurant amazing
Topic 1: bar party place menu great option new friendly
Topic 2: job guy company even part high tree spent
Topic 3: good breakfast make try nice coffee staff want
Topic 4: place didnt dont sure made want would like
Topic 5: like get know say menu take italian trieste
Topic 6: good great best taco get place love restaurant
Topic 7: order time minute phone pick two card le
Topic 8: used know away right could visit dr feel
Topic 9: good ive really like time first better back
Topic 10: store im owner location great day business super
Topic 11: year week day friendly office keep experience time
Topic 12: time get back service table went order took
Topic 13: great hair cut always kid people work patient
Topic 14: place pizza back way everything night would wont
Topic 15: really got definitely go great special well pretty
Topic 16: would told car said customer call manager called
Topic 17: room large take thing stay nev

In [21]:
doc_topic = model.doc_topic_
doc_topic.shape

(500, 20)

In [22]:
for i in range(10):
    print("Doc {} (top topic: {})".format(i, doc_topic[i].argmax()))

Doc 0 (top topic: 6)
Doc 1 (top topic: 9)
Doc 2 (top topic: 18)
Doc 3 (top topic: 13)
Doc 4 (top topic: 9)
Doc 5 (top topic: 11)
Doc 6 (top topic: 18)
Doc 7 (top topic: 11)
Doc 8 (top topic: 5)
Doc 9 (top topic: 8)
