In [1]:
#%pip install pyLDAvis --user
#%pip install gensim

In [2]:
import numpy as np 
import pandas as pd 

import nltk
import string

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.utils import simple_preprocess

import pyLDAvis.gensim_models
import pyLDAvis

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

import gensim.corpora as corpora  
from pprint import pprint


In [3]:
reviews = pd.read_csv('yelp_labelled.csv')
reviews = reviews.astype({'Text':'string'})
text = reviews['Text']
text1 = []
for review in text:
    #print(sentence)
    #remove punctuation
    review = review.translate(str.maketrans('', '', string.punctuation))  
    # remove digits/numbers
    review = review.translate(str.maketrans('', '', string.digits))
    #change to lowercase
    review = review.lower()
    #print(sentence)
    text1.append(review)
text1 = pd.Series(text1)
stop_words = set(stopwords.words('english'))

text1 = text1.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))
def get_wordnet_pos(word):

    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


lemmatizer = WordNetLemmatizer()
text3 = text1.apply(lambda x:' '.join(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in x.split()))
label = reviews['Sentiment']
reviews1 = list(zip(text3, label))

reviewsP = pd.DataFrame (reviews1, columns = ['Review', 'Sentiment'])
reviewsP.head()

Unnamed: 0,Review,Sentiment
0,wow love place,1
1,crust good,0
2,tasty texture nasty,0
3,stop late may bank holiday rick steve recommen...,1
4,selection menu great price,1


In [4]:
data = reviewsP.Review.values.tolist()

In [5]:
data

['wow love place',
 'crust good',
 'tasty texture nasty',
 'stop late may bank holiday rick steve recommendation love',
 'selection menu great price',
 'get angry want damn pho',
 'honeslty didnt taste fresh',
 'potato like rubber could tell make ahead time kept warmer',
 'fry great',
 'great touch',
 'service prompt',
 'would go back',
 'cashier care ever say still end wayyy overprice',
 'try cape cod ravoli chickenwith cranberrymmmm',
 'disgust pretty sure human hair',
 'shock sign indicate cash',
 'highly recommend',
 'waitress little slow service',
 'place worth time let alone vega',
 'like',
 'burrittos blah',
 'food amaze',
 'service also cute',
 'could care less interior beautiful',
 'perform',
 'thats rightthe red velvet cakeohhh stuff good',
 'never brought salad ask',
 'hole wall great mexican street taco friendly staff',
 'take hour get food table restaurant food luke warm sever run around like totally overwhelmed',
 'bad salmon sashimi',
 'also combo like burger fry beer de

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))

words = list(sent_to_words(data))

In [7]:
id2word = corpora.Dictionary(words)                   

# Create Corpus
texts = words                                        

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]    

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1)]


In [8]:
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())

[(0,
  '0.024*"food" + 0.014*"good" + 0.011*"well" + 0.010*"one" + 0.010*"get" + '
  '0.010*"go" + 0.009*"sandwich" + 0.008*"salad" + 0.008*"say" + '
  '0.008*"service"'),
 (1,
  '0.028*"place" + 0.020*"like" + 0.018*"service" + 0.015*"good" + 0.014*"bad" '
  '+ 0.012*"go" + 0.012*"taste" + 0.011*"say" + 0.009*"also" + 0.009*"ever"'),
 (2,
  '0.022*"service" + 0.018*"great" + 0.017*"place" + 0.015*"food" + '
  '0.010*"disappointed" + 0.010*"bad" + 0.010*"good" + 0.009*"like" + '
  '0.009*"fry" + 0.008*"time"'),
 (3,
  '0.021*"back" + 0.019*"good" + 0.018*"go" + 0.017*"food" + 0.014*"great" + '
  '0.010*"menu" + 0.010*"best" + 0.008*"place" + 0.008*"really" + '
  '0.007*"nice"'),
 (4,
  '0.019*"come" + 0.019*"good" + 0.014*"back" + 0.014*"dont" + 0.014*"food" + '
  '0.012*"service" + 0.010*"meal" + 0.010*"great" + 0.010*"amaze" + '
  '0.009*"friendly"'),
 (5,
  '0.019*"service" + 0.015*"food" + 0.013*"good" + 0.012*"dish" + 0.012*"time" '
  '+ 0.011*"order" + 0.009*"delicious" + 0.008*"

In [9]:
pyLDAvis.enable_notebook()

LDAvis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

LDAvis

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF



# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(reviewsP['Review'])

# Define the number of topics
num_topics = 5

# Apply NMF
nmf = NMF(n_components=num_topics, random_state=42)
nmf.fit(tfidf)

# Display the topics and their top words
def display_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])

n_top_words = 5  # Number of top words per topic
print("Topics and their top words:")
display_topics(nmf, tfidf_vectorizer.get_feature_names_out(), n_top_words)



Topics and their top words:
Topic 1:
['good', 'price', 'selection', 'really', 'pizza']
Topic 2:
['place', 'love', 'recommend', 'like', 'eat']
Topic 3:
['food', 'delicious', 'bad', 'terrible', 'amaze']
Topic 4:
['service', 'friendly', 'slow', 'bad', 'fantastic']
Topic 5:
['great', 'time', 'experience', 'eat', 'staff']
