In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import lda
lemmatizer=WordNetLemmatizer()

# !pip install lda
# !pip install nltk


In [24]:
file  = '/content/drive/MyDrive/MAR6669-data/select_reviews.json'
select_reviews = pd.read_json(file)

In [25]:
select_reviews.shape

(500, 9)

In [26]:
review_df = select_reviews['text']
txt = list(review_df)
rating = list(select_reviews['stars'])

In [27]:
english_stopwords = stopwords.words('english')
english_stopwords = set(english_stopwords)
english_stopwords.update(["food", 'one'])
txt_clean = []
for i in range(len(txt)):    
    txt[i] = re.sub(r'\d+', '', txt[i]) # remove numbers
    txt[i] = txt[i].lower()     # lower case
    txt[i] = txt[i].translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    txt[i] = txt[i].strip() # remove white space
    tokens = word_tokenize(txt[i]) # tokenize
    remove_stop = [lemmatizer.lemmatize(i) for i in tokens if not i in english_stopwords] # remove stop words and lemmatize
    txt_clean.append(' '.join(remove_stop))

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the texts
bag_of_words = vectorizer.fit_transform(txt_clean)

# Convert to array for easy viewing
bag_of_words_array = bag_of_words.toarray()

# Create a DataFrame for better visualization
vocab = vectorizer.get_feature_names_out()



In [30]:
bag_of_words_array.shape

(500, 5273)

In [38]:

model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(bag_of_words_array)

INFO:lda:n_documents: 500
INFO:lda:vocab_size: 5273
INFO:lda:n_words: 25752
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500


INFO:lda:<0> log likelihood: -328978
INFO:lda:<10> log likelihood: -233389
INFO:lda:<20> log likelihood: -228689
INFO:lda:<30> log likelihood: -226283
INFO:lda:<40> log likelihood: -224927
INFO:lda:<50> log likelihood: -223979
INFO:lda:<60> log likelihood: -223355
INFO:lda:<70> log likelihood: -222981
INFO:lda:<80> log likelihood: -222585
INFO:lda:<90> log likelihood: -222185
INFO:lda:<100> log likelihood: -221762
INFO:lda:<110> log likelihood: -221762
INFO:lda:<120> log likelihood: -221313
INFO:lda:<130> log likelihood: -221358
INFO:lda:<140> log likelihood: -221346
INFO:lda:<150> log likelihood: -221100
INFO:lda:<160> log likelihood: -220862
INFO:lda:<170> log likelihood: -221027
INFO:lda:<180> log likelihood: -220915
INFO:lda:<190> log likelihood: -220778
INFO:lda:<200> log likelihood: -220803
INFO:lda:<210> log likelihood: -220388
INFO:lda:<220> log likelihood: -220386
INFO:lda:<230> log likelihood: -220219
INFO:lda:<240> log likelihood: -219899
INFO:lda:<250> log likelihood: -2198

In [35]:
topic_word = model.topic_word_ 
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: place best service love fish favorite definitely restaurant
Topic 1: great hair cut kid highly dog thank see
Topic 2: coffee first place make wanted want made im
Topic 3: car store owner new star service better employee
Topic 4: good nice got sandwich really breakfast enjoyed well
Topic 5: time back great go place really good taco
Topic 6: tree people guy car mechanic hill tell station
Topic 7: time staff visit experience month year office issue
Topic 8: place good nice eat menu wont better pizza
Topic 9: even back around will said going getting though
Topic 10: say never cant got know review take name
Topic 11: place best great make perfect staff will burger
Topic 12: great bar beer drink friendly night good service
Topic 13: room party front told find stay problem guest
Topic 14: salad thought im wing staff well side although
Topic 15: dont pizza think little lot place really two
Topic 16: called day phone thing now already service delivery
Topic 17: time order minute table 

In [42]:
doc_topic = model.doc_topic_
for i in range(10):
    print("Doc {} (top topic: {})".format(i, doc_topic[i].argmax()))

Doc 0 (top topic: 0)
Doc 1 (top topic: 19)
Doc 2 (top topic: 18)
Doc 3 (top topic: 1)
Doc 4 (top topic: 19)
Doc 5 (top topic: 5)
Doc 6 (top topic: 19)
Doc 7 (top topic: 5)
Doc 8 (top topic: 10)
Doc 9 (top topic: 7)
