Basic tutorial - https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing#scrollTo=S9qDqEHddgKq

Best practices - https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing

In [2]:
import time
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import emoji
import Preprocessing as preproc
import nltk
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
import pickle

with open('data_hyderabad/data_preprocessed_classification.pkl', 'rb') as file:
    reviews = pickle.load(file)

In [3]:
reviews.head()

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[(ambience, NN), (good, JJ), (food, NN), (quit...","Chinese, Continental, Kebab, European, South I...",[]
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[(ambience, NN), (good, JJ), (pleasant, JJ), (...","Chinese, Continental, Kebab, European, South I...",[]
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","Chinese, Continental, Kebab, European, South I...",[Penne Alfredo Pasta]
3,Beyond Flavours,Soumen das and Arun was a great guy. Only beca...,"[(soumen, NNP), (das, NNS), (arun, NNP), (grea...","Chinese, Continental, Kebab, European, South I...",[]
4,Beyond Flavours,Food is good. we ordered Kodi drumsticks and b...,"[(food, NN), (good, JJ), (ordered, VBD), (kodi...","Chinese, Continental, Kebab, European, South I...","[Kodi drumsticks, basket mutton biryani]"


In [3]:
reviews_raw = pd.read_csv(r"data_hyderabad/10k_reviews.csv")

In [4]:
reviews_data = reviews_raw[reviews_raw["Rating"].notna() & reviews_raw["Review"].notna()]
reviews_data.isna().sum()

Restaurant    0
Reviewer      0
Review        0
Rating        0
Metadata      0
Time          0
Pictures      0
dtype: int64

In [5]:
# split UPPERCASE WORDS 
def splitting_words_process(word):
    # only upper case letters
    if word.isupper():
        return word
    
    # more than one upper case letter inside
    elif re.search(r'[A-Z][a-z]*[A-Z]', word):
        split_word = re.findall(r'[A-Z][a-z]*', word)
        return ' '.join(split_word)
    
    # <2 upper case letters
    else:
        return word

reviews_data['Review'] = reviews_data['Review'].apply(lambda x: ' '.join([splitting_words_process(word) for word in x.split()]))

In [6]:
# Function to replace 'gud', 'goo', 'gd' with the appropriate 'good'
def replace_gud_with_good(text):
    if isinstance(text, str):
        # Define the regex pattern to match 'gud', 'goo', 'gd' in various capitalizations
        pattern = re.compile(r'\b([Gg][Uu][Dd]|[Gg][Oo][Oo]|[Gg][Dd])\b')

        # Replacement function to check the case of the first letter
        def replacement(match):
            word = match.group()
            # Check if the first letter is uppercase, then return 'Good', else 'good'
            if word[0].isupper():
                return 'Good'
            else:
                return 'good'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'good'
reviews_data['Review'] = reviews_data['Review'].apply(replace_gud_with_good)

In [7]:
# Function to replace 'kk', 'Oke', 'k', 'Ok' with 'ok'
def replace_to_ok(text):
    if isinstance(text, str):
        # Define the regex pattern to match the variants of 'ok'
        pattern = re.compile(r'\b(k|kk|Ok|Oke)\b', re.IGNORECASE)

        # Replacement function to return 'ok' for all matched words
        def replacement(match):
            return 'ok'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'ok'
reviews_data['Review'] = reviews_data['Review'].apply(replace_to_ok)

In [8]:
# add space after ! | " | # | $ | % | & | ( | ) | * | + | , | . | : | ; followed immediately by a word
def add_space_after_punctuation(df):

    df['Review'] = df['Review'].apply(lambda text: re.sub(r'([\u0021-\u0026\u0028-\u002C\u002E\u003A-\u003F]+(?=\w))', r'\1 ', text) if isinstance(text, str) else text)
    return df

# Example usage:
reviews_data = add_space_after_punctuation(reviews_data)

In [9]:
# remove gibberish words like "ggggggggggd", "eshjdgue"
def remove_gibberish(text):
    cleaned_text = re.sub(r'\b\w{15,}\b', '', text)  # removes 15+ words
    cleaned_text = re.sub(r'\b\w*(\w)\1{2,}\w*\b', '', cleaned_text)  # removes words that contain 3+ repeating letters

    return cleaned_text

reviews_data['Review'] = reviews_data['Review'].apply(remove_gibberish)

In [10]:
reviews_preproc = reviews_data['Review'].apply(lambda x: preproc.main_pipeline(
    x, 
    print_output=False, 
    no_stopwords=False,
    custom_stopwords=[],
    convert_diacritics=True, 
    no_punctuation=False,
    remove_contractions = True,
    lowercase=False,
    lemmatized=False,
    stemmed=False, 
    tokenized_output=False
))

In [None]:
from bertopic import BERTopic

In [None]:
model_base = BERTopic(language="english", calculate_probabilities=True)
topics, probs = model_base.fit_transform(reviews_preproc)

In [None]:
freq = model_base.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4185,-1_food_place_good_service,"[food, place, good, service, ambience, great, ...",[nice place ambience different food order tast...
1,0,522,0_biryani_biriyani_mutton_chicken,"[biryani, biriyani, mutton, chicken, piece, or...","[nice biryani, good biryani, biryani good]"
2,1,385,1_good_test_boy_happy,"[good, test, boy, happy, getfry, bhehave, verr...","[good, good, good]"
3,2,376,2_manager_rude_bad_customer,"[manager, rude, bad, customer, ask, bill, tabl...",[hear lot place get chance finally visit react...
4,3,199,3_buffet_lunch_spread_course,"[buffet, lunch, spread, course, main, starter,...",[one best place look quiet calm place dinner l...


In [None]:
model_base.visualize_topics()

In [13]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews_preproc.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/312 [00:00<?, ?it/s]

In [15]:
from umap import UMAP

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [22]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=25,metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [27]:
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer()

In [79]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model
}

In [80]:
from bertopic import BERTopic

custom_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,
  ctfidf_model=ctfidf_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics, probs = custom_model.fit_transform(reviews_preproc, embeddings)

2024-12-11 00:56:18,684 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-11 00:56:40,699 - BERTopic - Dimensionality - Completed ✓
2024-12-11 00:56:40,700 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-11 00:56:41,199 - BERTopic - Cluster - Completed ✓
2024-12-11 00:56:41,206 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-11 00:57:06,932 - BERTopic - Representation - Completed ✓


In [82]:
custom_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,3387,-1_food_chicken_good_place,"[food, chicken, good, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, restaurant, order, rice, panee...","[food, chicken, good, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,"[place, good, ambience, service, food, great, ...","[great place, visit place, best place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Great place to hangout . Good food + Good dri...
2,1,561,1_biryani_chicken_chicken biryani_taste,"[biryani, chicken, chicken biryani, taste, bir...","[chicken biryani, biryani chicken, biryani tas...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...","[biryani is good, Just the name of Biryani and..."
3,2,365,2_manager_service_asked_table,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, lunch, ...","[manager, table, rude, staff, waiter, customer...","[manager, service, table, worst, rude, staff, ...","[The worst place in the city, I was here for a..."
4,3,309,3_good good_good_nice good_good combo,"[good good, good, nice good, good combo, combo...","[good, good good, nice good, good nice, nice, ...","[good good, good, nice good, good combo, combo...","[good, combo, nice, , , , , , , ]","[good, good, good good]"
5,4,233,4_hyderabad_best_place_food,"[hyderabad, best, place, food, place hyderabad...","[hyderabad food, restaurant hyderabad, food hy...","[hyderabad, place hyderabad, visit, places, hy...","[hyderabad, best, place, food, service, great,...",[I am a huge fan of regional cuisines . The di...
6,5,175,5_buffet_spread_lunch_starters,"[buffet, spread, lunch, starters, main, main c...","[buffet food, buffet lunch, dinner buffet, lun...","[buffet, lunch, main course, lunch buffet, des...","[buffet, spread, lunch, starters, main, main c...",[Very good place to have buffet . Buffet sprea...
7,6,169,6_cake_cakes_brownies_cupcakes,"[cake, cakes, brownies, cupcakes, chocolate, r...","[cup cakes, cup cake, cakes, cake chocolate, c...","[cakes, brownies, cupcakes, red velvet, bakery...","[cake, cakes, brownies, cupcakes, chocolate, r...",[My boyfriend had gotten a Cheese cake for my ...
8,7,131,7_gachibowli_place_place gachibowli_food,"[gachibowli, place, place gachibowli, food, go...","[gachibowli place, visit gachibowli, gachibowl...","[gachibowli, place gachibowli, food, visit, pl...","[gachibowli, place, food, good, ambience, best...",[I love this place in Gachibowli . I have been...
9,8,131,8_quantity_packing_food_food food,"[quantity, packing, food, food food, cold, tas...","[food quality, quality food, taste quality, qu...","[food food, food good, food cold, quantity foo...","[quantity, packing, food, cold, taste, good qu...",[Food is not that good . Quantity is more but ...


In [83]:
custom_model.visualize_topics()

In [84]:
custom_model_MRR = custom_model

In [85]:
# assign labels
keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in custom_model.topic_aspects_["KeyBERT"].items()}
custom_model.set_topic_labels(keybert_topic_labels)

In [None]:
mrr_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in custom_model_MRR.topic_aspects_["MMR"].items()}
custom_model_MRR.set_topic_labels(mrr_topic_labels)

In [95]:
from transformers import pipeline

# Summarization pipeline
summarizer = pipeline("summarization", model="t5-small")

topic_labels = {}
topic_info = custom_model.get_topic_info()

for index, row in topic_info.iterrows():
    topic = row['Topic']
    representative_docs = row['Representation']
    text = " ".join(representative_docs)
    summary = summarizer(text, max_length=2, min_length=1, do_sample=False)
    topic_labels[topic] = summary[0]["summary_text"]

print(topic_labels)


Device set to use cpu


{-1: 'food', 0: 'place', 1: 'bi', 2: 'manager', 3: 'good', 4: '', 5: 'buffet', 6: 'cake', 7: '', 8: 'quantity', 9: 'authentic', 10: '', 11: 'chicken', 12: '', 13: 'noodles', 14: 'super', 15: 'quantity', 16: 'delivery', 17: '', 18: 'delivery', 19: '', 20: 'pizza', 21: 'pan', 22: 'para', 23: 'chicken', 24: 'order', 25: 'good', 26: 'in', 27: 'mango', 28: 'superb', 29: 'mom', 30: 'pun', 31: 'man', 32: 'taste', 33: 'good', 34: 'bread', 35: 'shake', 36: 'coffee', 37: 'fast', 38: 'ma', 39: 'awesome', 40: 'service', 41: '', 42: 'tasty', 43: 'spicy', 44: 'nice', 45: 'excellent', 46: 'pi', 47: 'wa', 48: 'and', 49: 'wings', 50: '', 51: 'si', 52: 'wrap', 53: 'awesome'}


In [87]:
custom_model_MRR.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,3387,-1_food_chicken_good_place,food | chicken | restaurant,"[food, chicken, good, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, restaurant, order, rice, panee...","[food, chicken, good, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, visit place, best place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Great place to hangout . Good food + Good dri...
2,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, bir...","[chicken biryani, biryani chicken, biryani tas...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...","[biryani is good, Just the name of Biryani and..."
3,2,365,2_manager_service_asked_table,manager | table | rude,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, lunch, ...","[manager, table, rude, staff, waiter, customer...","[manager, service, table, worst, rude, staff, ...","[The worst place in the city, I was here for a..."
4,3,309,3_good good_good_nice good_good combo,good good | good | nice good,"[good good, good, nice good, good combo, combo...","[good, good good, nice good, good nice, nice, ...","[good good, good, nice good, good combo, combo...","[good, combo, nice, , , , , , , ]","[good, good, good good]"


In [88]:
custom_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,3387,-1_food_chicken_good_place,food | chicken | restaurant,"[food, chicken, good, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, restaurant, order, rice, panee...","[food, chicken, good, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, visit place, best place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Great place to hangout . Good food + Good dri...
2,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, bir...","[chicken biryani, biryani chicken, biryani tas...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...","[biryani is good, Just the name of Biryani and..."
3,2,365,2_manager_service_asked_table,manager | table | rude,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, lunch, ...","[manager, table, rude, staff, waiter, customer...","[manager, service, table, worst, rude, staff, ...","[The worst place in the city, I was here for a..."
4,3,309,3_good good_good_nice good_good combo,good good | good | nice good,"[good good, good, nice good, good combo, combo...","[good, good good, nice good, good nice, nice, ...","[good good, good, nice good, good combo, combo...","[good, combo, nice, , , , , , , ]","[good, good, good good]"


In [48]:
custom_model_df = pd.DataFrame(custom_model.get_topic_info())
custom_model_mrr_df = pd.DataFrame(custom_model_MRR.get_topic_info())

In [61]:
merged_df = custom_model_df.join(custom_model_mrr_df, lsuffix='_keybert', rsuffix='_mrr')

In [62]:
merged_df.head()

Unnamed: 0,Topic_keybert,Count_keybert,Name_keybert,CustomName_keybert,Representation_keybert,KeyBERT_keybert,MMR_keybert,POS_keybert,Representative_Docs_keybert,Topic_mrr,Count_mrr,Name_mrr,CustomName_mrr,Representation_mrr,KeyBERT_mrr,MMR_mrr,POS_mrr,Representative_Docs_mrr
0,-1,3387,-1_food_good_chicken_place,food | chicken | ambience,"[food, good, chicken, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, ambience, restaurant, order, r...","[food, good, chicken, place, taste, service, g...",[When you love food and especially north India...,-1,3387,-1_food_good_chicken_place,food | chicken | ambience,"[food, good, chicken, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, ambience, restaurant, order, r...","[food, good, chicken, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, best place, visit place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Komatose is such wonderful place to chilled o...,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, best place, visit place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Komatose is such wonderful place to chilled o...
2,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, ord...","[ordered chicken biryani, chicken biryani, bir...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...",[One of the new tastes of biryanis in hyd we o...,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, ord...","[ordered chicken biryani, chicken biryani, bir...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...",[One of the new tastes of biryanis in hyd we o...
3,2,365,2_manager_service_asked_table,manager | table | worst,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, serve, ...","[manager, table, worst, rude, staff, waiter, c...","[manager, service, table, worst, rude, staff, ...","[Visited this place for my team lunch, and the...",2,365,2_manager_service_asked_table,manager | table | worst,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, serve, ...","[manager, table, worst, rude, staff, waiter, c...","[manager, service, table, worst, rude, staff, ...","[Visited this place for my team lunch, and the..."
4,3,309,3_good good good_good good_good_nice good good,good good good | good good | nice good good,"[good good good, good good, good, nice good go...","[good, good good, nice good, good good good, g...","[good good good, good good, nice good good, ni...","[good, combo, nice, , , , , , , ]","[good, good, good good]",3,309,3_good good good_good good_good_nice good good,good good good | good good | nice good good,"[good good good, good good, good, nice good go...","[good, good good, nice good, good good good, g...","[good good good, good good, nice good good, ni...","[good, combo, nice, , , , , , , ]","[good, good, good good]"


In [65]:
merged_df = merged_df[['Topic_keybert', 'Count_keybert', 'Name_keybert', 'CustomName_keybert', 'CustomName_mrr','Representation_keybert', 'KeyBERT_keybert','MMR_keybert', 'POS_keybert', 'Representative_Docs_keybert']]