In [13]:
# Import libraries
import pandas as pd
import string
import spacy
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models # Create the corpora of the words...
import pyLDAvis.gensim_models as gensimvis

# Load NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mukul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mukul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Read Yelp review dataset
yelp_review = pd.read_csv('train.csv')

In [51]:
yelp_review = yelp_review.sample(10000)

In [52]:
yelp_review.to_csv("Yelp.csv", index = False)

In [32]:
yelp_review.head()

Unnamed: 0,Topic,Text
142951,1,I've not been back to Las Vegas since 2002 in ...
147779,1,Bad services!!!!!!!
404031,2,Our family enjoys eating here. They have a fri...
41430,2,Very courteous and helpful. Made mailing my UP...
388517,2,Actual Date of Visit: 10/9/2011\n\nWhen I firs...


In [33]:
# Clean text data
def clean_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    return ' '.join([word for word in text.split() if not word.isdigit() and len(word) > 3])

yelp_review['clean_text'] = yelp_review['Text'].apply(clean_text)

In [34]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
yelp_review['clean_text'] = yelp_review['clean_text'].\
apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [35]:
# Lemmatization
def lemmatize(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

yelp_review['lemmatized_text'] = yelp_review['clean_text'].apply(lemmatize)

In [36]:
# Create dictionary and document-term matrix
dictionary = corpora.Dictionary(yelp_review['lemmatized_text'])
doc_term_matrix = [dictionary.doc2bow(text) for text in yelp_review['lemmatized_text']]

In [37]:
# Build LDA model
lda_model = models.LdaModel(corpus=doc_term_matrix, 
                            id2word=dictionary, 
                            num_topics=10, passes=50, 
                            iterations=100)

In [38]:
# Print topics
lda_model.print_topics()

[(0,
  '0.037*"not" + 0.020*"do" + 0.014*"order" + 0.014*"time" + 0.013*"food" + 0.013*"place" + 0.012*"come" + 0.011*"go" + 0.011*"would" + 0.011*"wait"'),
 (1,
  '0.017*"call" + 0.016*"not" + 0.015*"would" + 0.015*"tell" + 0.012*"do" + 0.011*"say" + 0.011*"service" + 0.011*"customer" + 0.010*"back" + 0.010*"time"'),
 (2,
  '0.116*"coffee" + 0.024*"starbuck" + 0.023*"class" + 0.013*"carne" + 0.013*"asada" + 0.010*"studio" + 0.009*"yoga" + 0.009*"latte" + 0.008*"espresso" + 0.008*"joe"'),
 (3,
  '0.053*"room" + 0.029*"hotel" + 0.027*"stay" + 0.015*"vegas" + 0.012*"club" + 0.011*"night" + 0.011*"floor" + 0.010*"pool" + 0.010*"casino" + 0.010*"strip"'),
 (4,
  '0.014*"good" + 0.014*"order" + 0.013*"like" + 0.012*"not" + 0.011*"taste" + 0.011*"chicken" + 0.010*"fry" + 0.010*"sauce" + 0.010*"burger" + 0.009*"cheese"'),
 (5,
  '0.148*"pizza" + 0.018*"crust" + 0.010*"slice" + 0.009*"raman" + 0.007*"delivery" + 0.006*"thin" + 0.006*"pepperoni" + 0.006*"topping" + 0.006*"cater" + 0.005*"mein"'

Metrics for the Model

1. Perplexity - Perplexity is a measure of **how well a probability distribution or probability model predicts a sample.** In the context of topic modeling, perplexity measures how well a topic model predicts a held-out or unseen set of documents.

**Interpretation:**

* A **lower perplexity value indicates that the model is better** at predicting unseen data, suggesting that it has learned meaningful topics from the corpus.
* **Higher perplexity values suggest that the model is less effective** at predicting unseen data, indicating that it may have overfit or failed to capture meaningful patterns in the data.

2. Coherence - It measures the **interpretability or semantic consistency of the topics** generated by a topic model. It evaluates how closely related and meaningful the top words in each topic are.

**Interpretation:**

* **Higher coherence values indicate that the topics are more coherent and interpretable,** as the top words within each topic are more semantically related.
* **Lower coherence values suggest that the topics are less coherent** and may contain unrelated or noisy words, making them less interpretable.

### Gauging model performance:

* Perplexity and coherence are complementary metrics used together to evaluate the quality of topic models.
* A good topic model should have both low perplexity and high coherence values.

However, there may be cases where optimizing one metric may adversely affect the other. Therefore, it's important to strike a balance between perplexity and coherence while fine-tuning the topic model.

In [45]:
# calculate perplexity and coherence

# Compute Coherence Score
from gensim.models import CoherenceModel

print('\Perplexity: ', lda_model.log_perplexity(doc_term_matrix))  
 
# calculate coherence
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=doc_term_matrix, dictionary=dictionary , 
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence: ', coherence_lda)

  print('\Perplexity: ', lda_model.log_perplexity(doc_term_matrix))


\Perplexity:  -8.396597517921837


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Coherence:  nan
