Basic tutorial - https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing#scrollTo=S9qDqEHddgKq

Best practices - https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing

In [1]:
import time
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import emoji
import Preprocessing as preproc
import nltk
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
reviews_raw = pd.read_csv(r"data_hyderabad/10k_reviews.csv")

In [3]:
reviews_data = reviews_raw[reviews_raw["Rating"].notna() & reviews_raw["Review"].notna()]
reviews_data.isna().sum()

Restaurant    0
Reviewer      0
Review        0
Rating        0
Metadata      0
Time          0
Pictures      0
dtype: int64

In [4]:
# split UPPERCASE WORDS 
def splitting_words_process(word):
    # only upper case letters
    if word.isupper():
        return word
    
    # more than one upper case letter inside
    elif re.search(r'[A-Z][a-z]*[A-Z]', word):
        split_word = re.findall(r'[A-Z][a-z]*', word)
        return ' '.join(split_word)
    
    # <2 upper case letters
    else:
        return word

reviews_data['Review'] = reviews_data['Review'].apply(lambda x: ' '.join([splitting_words_process(word) for word in x.split()]))

In [5]:
# Function to replace 'gud', 'goo', 'gd' with the appropriate 'good'
def replace_gud_with_good(text):
    if isinstance(text, str):
        # Define the regex pattern to match 'gud', 'goo', 'gd' in various capitalizations
        pattern = re.compile(r'\b([Gg][Uu][Dd]|[Gg][Oo][Oo]|[Gg][Dd])\b')

        # Replacement function to check the case of the first letter
        def replacement(match):
            word = match.group()
            # Check if the first letter is uppercase, then return 'Good', else 'good'
            if word[0].isupper():
                return 'Good'
            else:
                return 'good'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'good'
reviews_data['Review'] = reviews_data['Review'].apply(replace_gud_with_good)

In [6]:
# Function to replace 'kk', 'Oke', 'k', 'Ok' with 'ok'
def replace_to_ok(text):
    if isinstance(text, str):
        # Define the regex pattern to match the variants of 'ok'
        pattern = re.compile(r'\b(k|kk|Ok|Oke)\b', re.IGNORECASE)

        # Replacement function to return 'ok' for all matched words
        def replacement(match):
            return 'ok'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'ok'
reviews_data['Review'] = reviews_data['Review'].apply(replace_to_ok)

In [7]:
# add space after ! | " | # | $ | % | & | ( | ) | * | + | , | . | : | ; followed immediately by a word
def add_space_after_punctuation(df):

    df['Review'] = df['Review'].apply(lambda text: re.sub(r'([\u0021-\u0026\u0028-\u002C\u002E\u003A-\u003F]+(?=\w))', r'\1 ', text) if isinstance(text, str) else text)
    return df

# Example usage:
reviews_data = add_space_after_punctuation(reviews_data)

In [8]:
# remove gibberish words like "ggggggggggd", "eshjdgue"
def remove_gibberish(text):
    cleaned_text = re.sub(r'\b\w{15,}\b', '', text)  # removes 15+ words
    cleaned_text = re.sub(r'\b\w*(\w)\1{2,}\w*\b', '', cleaned_text)  # removes words that contain 3+ repeating letters

    return cleaned_text

reviews_data['Review'] = reviews_data['Review'].apply(remove_gibberish)

In [9]:
def remove_space_before_punctuation(text):
    if isinstance(text, str):
        # Define the regex pattern to match spaces before punctuation
        pattern = re.compile(r'\s+([?.!,;:])')
        
        # Use re.sub to remove spaces before punctuation
        return pattern.sub(r'\1', text)
    
    return text

# Apply the function to the 'Review' column to remove spaces before punctuation
reviews_data['Review'] = reviews_data['Review'].apply(remove_space_before_punctuation)

In [10]:
reviews_preproc = reviews_data['Review'].apply(lambda x: preproc.main_pipeline(
    x, 
    print_output=False, 
    no_stopwords=False,
    custom_stopwords=[],
    convert_diacritics=True, 
    no_punctuation=False,
    remove_contractions = True,
    lowercase=False,
    lemmatized=False,
    stemmed=False, 
    tokenized_output=False
))

In [11]:
def remove_space_before_punctuation(text):
    if isinstance(text, str):
        # Define the regex pattern to match spaces before punctuation
        pattern = re.compile(r'\s+([?.!,;:])')
        
        # Use re.sub to remove spaces before punctuation
        return pattern.sub(r'\1', text)
    
    return text

# Apply the function to the 'Review' column to remove spaces before punctuation
reviews_preproc = reviews_preproc.apply(remove_space_before_punctuation)

In [12]:
reviews_preproc = reviews_preproc[reviews_preproc.apply(lambda x: len(x) >= 3 if isinstance(x, str) else False)]

In [267]:
len(reviews_preproc)

9911

In [35]:
from spellchecker import SpellChecker

# Initialize the spell checker
spell = SpellChecker()

def correct_spelling(text):
    if isinstance(text, str):
        # Split the text into words
        words = text.split()
        
        # Correct the spelling of each word
        corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in words]
        
        # Join the corrected words back into a single string
        corrected_text = ' '.join(corrected_words)
        
        return corrected_text
    
    return text

# Apply the function to the 'Review' column to correct spelling mistakes
reviews_preproc_spell =reviews_preproc.apply(correct_spelling)

KeyboardInterrupt: 

In [36]:
from transformers import pipeline

fix_spelling = pipeline("text2text-generation",model="oliverguhr/spelling-correction-english-base")

print(fix_spelling("lets do a comparsion",max_length=2048))

config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


[{'generated_text': "Let's do a comparison."}]


In [37]:
reviews_preproc = reviews_preproc.apply(lambda x: fix_spelling(x, max_length=2048)[0]['generated_text'])

KeyboardInterrupt: 

In [None]:
reviews_preproc_spell.diff(reviews_preproc)

In [None]:
print(reviews_preproc)

0       The ambience was good, food was quite good. ha...
1       Ambience is too good for a pleasant evening. S...
2       A must try.. great food great ambience. Thnx f...
3       Soumen das and Arun was a great guy. Only beca...
4       Food is good. we ordered Kodi drumsticks and b...
                              ...                        
9995    Madhumathi Mahajan Well to start with nice cou...
9996    This place has never disappointed us.. The foo...
9997    Bad rating is mainly because of "Chicken Bone ...
9998    I personally love and prefer Chinese Food. Had...
9999    Checked in here to try some delicious chinese ...
Name: Review, Length: 9910, dtype: object

In [13]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN

In [14]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews_preproc.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/310 [00:00<?, ?it/s]

In [30]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 1))
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=30,metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [31]:
model_base = BERTopic(
    language="english", 
    calculate_probabilities=True,
    embedding_model= embedding_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    umap_model= umap_model,
    hdbscan_model=hdbscan_model,
    representation_model={"MMR": MaximalMarginalRelevance(diversity=0.3),
                          "KeyBert": KeyBERTInspired(),
                          "Pos": PartOfSpeech()},
    verbose = True
)
topics_base, probs_base = model_base.fit_transform(reviews_preproc,embeddings)

2024-12-15 17:37:37,604 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-15 17:38:00,261 - BERTopic - Dimensionality - Completed ✓
2024-12-15 17:38:00,263 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-15 17:38:02,669 - BERTopic - Cluster - Completed ✓
2024-12-15 17:38:02,676 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-15 17:38:26,225 - BERTopic - Representation - Completed ✓


In [28]:
model_base.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,MMR,KeyBert,Pos,Representative_Docs
0,-1,4202,-1_pizza_friendly_ambience_great,"[pizza, friendly, ambience, great, awesome, ov...","[pizza, friendly, prawns, tikka, menu, lunch, ...","[restaurant, buffet, dishes, dish, dinner, lun...","[pizza, friendly, ambience, great, awesome, ov...",[This place has been a bookmark for me since v...
1,0,584,0_biryani_biriyani_biryanis_mutton,"[biryani, biriyani, biryanis, mutton, dum, par...","[biryani, biriyani, mutton, hyderabadi, quanti...","[biryani, biiyani, biryanis, biriyani, birayan...","[biryani, biriyani, biryanis, mutton, paradise...",[Well this is an absolute honest review from a...
2,1,426,1_manager_rude_asked_waiter,"[manager, rude, asked, waiter, table, pathetic...","[rude, waiter, mins, waiters, leave, tables, b...","[waiters, waiter, serve, customers, booking, r...","[manager, rude, waiter, table, pathetic, mins,...",[I went to club rouge since the day it started...
3,2,328,2_dj_dance_floor_music,"[dj, dance, floor, music, hangout, crowd, roof...","[dj, dance, hangout, rooftop, party, pub, week...","[pubs, hangout, partying, terrace, pub, place,...","[dj, dance, floor, music, hangout, crowd, roof...","[This place has amazing setup, separated in mu..."
4,3,308,3_good_bhehave_verry_combo,"[good, bhehave, verry, combo, nice, , , , , ]","[good, bhehave, verry, combo, nice, , , , , , ...","[good, nice, , , , , , , , ]","[good, combo, nice, , , , , , , ]","[good, good, good bhehave]"
5,4,259,4_hyderabad_hyderabadi_daawat_sikandari,"[hyderabad, hyderabadi, daawat, sikandari, raa...","[hyderabadi, places, culture, dawaat, beers, c...","[hyderabadi, hyderabad, cuisine, shorba, cuisi...","[hyderabad, hyderabadi, daawat, sikandari, pla...",[Ask anyone what you like about Hyderabad the ...
6,5,240,5_govind_bahadur_excellent_shivam,"[govind, bahadur, excellent, shivam, sarvice, ...","[govind, bahadur, shivam, sabir, krishna, mr, ...","[visit, hospitality, kusal, servicing, santhu,...","[bahadur, excellent, sarvice, abs, suraj, than...",[Excellent food and excellent service providin...
7,6,178,6_zomato_gold_dominos_refund,"[zomato, gold, dominos, refund, support, deliv...","[zomato, dominos, refund, contact, payment, fa...","[refund, customers, orders, restaurant, domino...","[zomato, gold, dominos, refund, support, conta...","[delivery, poor customer support. Both me and ..."
8,7,177,7_buffet_spread_hyatt_buffets,"[buffet, spread, hyatt, buffets, lunch, carte,...","[buffet, hyatt, buffets, lunch, counters, dess...","[buffets, buffet, dishes, lunch, salads, barbe...","[buffet, spread, buffets, lunch, carte, counte...",[I have had the breakfast buffet here quite a ...
9,8,141,8_cake_brownies_cupcakes_cakes,"[cake, brownies, cupcakes, cakes, velvet, labo...","[brownies, cupcakes, cakes, chocolate, cookies...","[bakery, bakeries, cakes, cupcakes, cupcake, b...","[cake, brownies, cupcakes, cakes, velvet, labo...",[Absolutely delighted with our overall experie...


In [29]:
# Create a DataFrame with reviews and their top 3 assigned topics with probabilities
reviews_with_topics = pd.DataFrame({
    'Review': reviews_preproc,
    'Top_3_Topics': [np.argsort(probs_base[i])[-3:][::-1] for i in range(len(reviews_preproc))],
    'Top_3_Probabilities': [np.round(np.sort(probs_base[i])[-3:][::-1] * 100, 2) for i in range(len(reviews_preproc))]
})

# Map the topic IDs to their custom names from topic_aspects_["MMR"]
topic_info_MMR = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_base.topic_aspects_["MMR"].items()}
topic_info_KB = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_base.topic_aspects_["KeyBert"].items()}
topic_info_POS = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_base.topic_aspects_["Pos"].items()}
reviews_with_topics['Topics_MMR'] = reviews_with_topics['Top_3_Topics'].apply(lambda topics: [topic_info_MMR[topic] for topic in topics])
reviews_with_topics['Topics_KB'] = reviews_with_topics['Top_3_Topics'].apply(lambda topics: [topic_info_KB[topic] for topic in topics])
reviews_with_topics['Topics_POS'] = reviews_with_topics['Top_3_Topics'].apply(lambda topics: [topic_info_POS[topic] for topic in topics])

# Display the first few rows of the new DataFrame
reviews_with_topics.head()


Unnamed: 0,Review,Top_3_Topics,Top_3_Probabilities,Topics_MMR,Topics_KB,Topics_POS
0,"The ambience was good, food was quite good. ha...","[26, 11, 5]","[5.15, 4.94, 4.38]","[chese | space | hospitality, ambience | kaful...","[hospitality | visiting | spacious, ambience |...","[chese | family | space, ambience | goreng | a..."
1,Ambience is too good for a pleasant evening. S...,"[11, 26, 2]","[48.85, 3.5, 2.03]","[ambience | kafuli | goreng, chese | space | h...","[ambience | drinks | cocktail, hospitality | v...","[ambience | goreng | average, chese | family |..."
2,A must try.. great food great ambience. Thnx f...,"[11, 26, 27]","[12.61, 5.94, 3.11]","[ambience | kafuli | goreng, chese | space | h...","[ambience | drinks | cocktail, hospitality | v...","[ambience | goreng | average, chese | family |..."
3,Soumen das and Arun was a great guy. Only beca...,"[5, 26, 7]","[56.52, 2.08, 1.44]","[govind | bahadur | shivam, chese | space | ho...","[visit | hospitality | kusal, hospitality | vi...","[bahadur | excellent | sarvice, chese | family..."
4,Food is good. we ordered Kodi drumsticks and b...,"[7, 13, 27]","[1.05, 0.95, 0.89]","[buffet | hyatt | buffets, nawabi | prawns | m...","[buffets | buffet | dishes, soup | soups | spi...","[buffet | spread | buffets, fish | virgin | na..."


In [19]:
hierarchical_topics = model_base.hierarchical_topics(reviews_preproc)


100%|██████████| 52/52 [00:00<00:00, 189.77it/s]


In [20]:
model_base.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [21]:
len(model_base.get_topics())

54

In [22]:
model_reduced = model_base.reduce_topics(reviews_preproc, nr_topics=10)

2024-12-15 17:31:44,605 - BERTopic - Topic reduction - Reducing number of topics
2024-12-15 17:31:50,318 - BERTopic - Topic reduction - Reduced number of topics from 54 to 10


In [23]:
topics_reduced = model_reduced.topics_
probs_reduced = model_reduced.probabilities_

In [24]:
# Create a DataFrame with reviews and their top 3 assigned topics with probabilities
reviews_with_topics_merged = pd.DataFrame({
    'Review': reviews_preproc,
    'Top_3_Topics': [np.argsort(probs_reduced[i])[-3:][::-1] for i in range(len(reviews_preproc))],
    'Top_3_Probabilities': [np.round(np.sort(probs_reduced[i])[-3:][::-1] * 100, 2) for i in range(len(reviews_preproc))]
})

# Map the topic IDs to their custom names from topic_aspects_["MMR"]
topic_info_MMR_merged = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_reduced.topic_aspects_["MMR"].items()}
topic_info_KB_merged = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_reduced.topic_aspects_["KeyBert"].items()}
topic_info_POS_merged = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_reduced.topic_aspects_["Pos"].items()}
reviews_with_topics_merged['Topics_MMR'] = reviews_with_topics_merged['Top_3_Topics'].apply(lambda topics: [topic_info_MMR_merged[topic] for topic in topics])
reviews_with_topics_merged['Topics_KB'] = reviews_with_topics_merged['Top_3_Topics'].apply(lambda topics: [topic_info_KB_merged[topic] for topic in topics])
reviews_with_topics_merged['Topics_POS'] = reviews_with_topics_merged['Top_3_Topics'].apply(lambda topics: [topic_info_POS_merged[topic] for topic in topics])

# Display the first few rows of the new DataFrame
reviews_with_topics_merged.head()

Unnamed: 0,Review,Top_3_Topics,Top_3_Probabilities,Topics_MMR,Topics_KB,Topics_POS
0,"The ambience was good, food was quite good. ha...","[0, 2, 4]","[58.83, 5.82, 2.5]","[food | place | chicken, chocolate | donuts | ...","[restaurant | buffet | lunch, icecream | choco...","[food | place | good, cake | cream | ice, spic..."
1,Ambience is too good for a pleasant evening. S...,"[0, 2, 4]","[77.88, 3.42, 1.36]","[food | place | chicken, chocolate | donuts | ...","[restaurant | buffet | lunch, icecream | choco...","[food | place | good, cake | cream | ice, spic..."
2,A must try.. great food great ambience. Thnx f...,"[0, 2, 4]","[59.79, 5.87, 2.14]","[food | place | chicken, chocolate | donuts | ...","[restaurant | buffet | lunch, icecream | choco...","[food | place | good, cake | cream | ice, spic..."
3,Soumen das and Arun was a great guy. Only beca...,"[0, 2, 7]","[81.65, 3.0, 1.4]","[food | place | chicken, chocolate | donuts | ...","[restaurant | buffet | lunch, icecream | choco...","[food | place | good, cake | cream | ice, sizz..."
4,Food is good. we ordered Kodi drumsticks and b...,"[0, 2, 4]","[15.72, 1.45, 0.73]","[food | place | chicken, chocolate | donuts | ...","[restaurant | buffet | lunch, icecream | choco...","[food | place | good, cake | cream | ice, spic..."


In [334]:
model_custom_merged= BERTopic(
    language="english", 
    calculate_probabilities=True,
    embedding_model= embedding_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    umap_model= umap_model,
    hdbscan_model=hdbscan_model,
    representation_model={"MMR": MaximalMarginalRelevance(diversity=0.3),
                          "KeyBert": KeyBERTInspired(),
                          "Pos": PartOfSpeech()},
    verbose = True
)
topics_custom, probs_custom = model_custom_merged.fit_transform(reviews_preproc,embeddings)

2024-12-14 20:01:26,757 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-14 20:01:43,825 - BERTopic - Dimensionality - Completed ✓
2024-12-14 20:01:43,826 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-14 20:01:45,477 - BERTopic - Cluster - Completed ✓
2024-12-14 20:01:45,482 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-14 20:02:08,945 - BERTopic - Representation - Completed ✓


In [335]:
model_custom_merged.merge_topics(reviews_preproc, topics_to_merge=[3,33])

In [337]:
len(model_custom_merged.get_topic_info())

52

In [338]:
model_custom_merged.merge_topics(reviews_preproc, topics_to_merge=[16,17,25,35,37,42])

In [339]:
len(model_custom_merged.get_topic_info())

47

In [340]:
model_custom_merged.merge_topics(reviews_preproc, topics_to_merge=[31,22])

In [342]:
topics_custom = model_custom_merged.topics_
probs_custom = model_custom_merged.probabilities_

In [352]:
model_custom_merged.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,MMR,KeyBert,Pos,Representative_Docs
0,-1,3547,-1_food_good_place_chicken,"[food, good, place, chicken, service, ordered,...","[food, place, chicken, service, ambience, rest...","[good food, restaurant, food good, food, buffe...","[food, good, place, chicken, service, taste, a...",[Arena eleven sports bar I Pl is here and what...
1,0,1425,0_place_good_food_ambience,"[place, good, food, ambience, service, great, ...","[ambience, service, music, visit, drinks, dj, ...","[great place, good ambience, good place, nice ...","[place, good, food, ambience, service, great, ...",[Great ambience. awesome service by Manoranjan...
2,1,565,1_biryani_chicken_chicken biryani_taste,"[biryani, chicken, chicken biryani, taste, ord...","[biryani, chicken biryani, taste, biriyani, mu...","[biryani good, good biryani, biryani taste, ch...","[biryani, chicken, taste, biriyani, good, mutt...","[good biryani, biryani is good, Mutton biryani..."
3,2,466,2_service_manager_food_asked,"[service, manager, food, asked, staff, place, ...","[service, manager, staff, table, rude, restaur...","[serving, serve, restaurant, waiters, buffet, ...","[service, manager, food, staff, place, table, ...",[We went today for an office team lunch. We we...
4,3,379,3_delivery_maggi_shake_coffee,"[delivery, maggi, shake, coffee, shakes, cafe,...","[maggi, coffee, shakes, cafe, bikes, time deli...","[good delivery, delivery good, excellent deliv...","[delivery, shake, coffee, shakes, cafe, order,...","[good delivery, good delivery, delivery is very]"
5,4,363,4_good good_good_nice good_good nice,"[good good, good, nice good, good nice, good b...","[boy good, rider kind, good tast, bhehave good...","[good, good good, great good, good great, nice...","[good, good boy, nice, boy, good tast, nice ri...","[good, good good, a very good good]"
6,5,230,5_hyderabad_place_best_food,"[hyderabad, place, best, food, visit, service,...","[hyderabad, visit, hyderabadi, place hyderabad...","[hyderabad food, places hyderabad, place hyder...","[hyderabad, place, best, food, service, hydera...",[One of the best buffet experiences we had in ...
7,6,226,6_buffet_spread_lunch_starters,"[buffet, spread, lunch, starters, good, food, ...","[buffet, lunch, food, main course, lunch buffe...","[buffet food, buffet lunch, dinner buffet, lun...","[buffet, spread, lunch, starters, good, food, ...",[Very good place to have buffet. Buffet spread...
8,7,156,7_cake_cakes_brownies_chocolate,"[cake, cakes, brownies, chocolate, cupcakes, r...","[cakes, brownies, chocolate, cupcakes, red vel...","[cakes brownies, cup cakes, cake chocolate, ch...","[cake, cakes, brownies, chocolate, cupcakes, r...",[They make the Best red velvet cake and cup ca...
9,8,142,8_chicken_fish_veg_taste,"[chicken, fish, veg, taste, good, paneer, star...","[chicken, paneer, dish, prawns, tikka, spicy, ...","[restaurant, buffet, dishes, chicken, chilli, ...","[chicken, fish, veg, taste, good, paneer, star...",[Best place Ever Jonathan is Kitchen If you ar...


In [359]:
# Create a DataFrame with reviews and their top 3 assigned topics with probabilities
reviews_with_topics_custom = pd.DataFrame({
    'Review': reviews_preproc,
    'Top_3_Topics': [np.argsort(probs_custom[i])[-3:][::-1] for i in range(len(reviews_preproc))],
    'Top_3_Probabilities': [np.round(np.sort(probs_custom[i])[-3:][::-1] * 100, 2) for i in range(len(reviews_preproc))]
})

# Map the topic IDs to their custom names from topic_aspects_["MMR"]
#topic_info_MMR_custom = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_custom_merged.topic_aspects_["MMR"].items()}
topic_info_KB_custom = {topic: " | ".join(list(zip(*values))[0][:1]) for topic, values in model_custom_merged.topic_aspects_["KeyBert"].items()}
#topic_info_POS_custom = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in model_custom_merged.topic_aspects_["Pos"].items()}
#reviews_with_topics_custom['Topics_MMR'] = reviews_with_topics_custom['Top_3_Topics'].apply(lambda topics: [topic_info_MMR_custom[topic] for topic in topics])
reviews_with_topics_custom['Topics_KB'] = reviews_with_topics_custom['Top_3_Topics'].apply(lambda topics: [topic_info_KB_custom[topic] for topic in topics])
#reviews_with_topics_custom['Topics_POS'] = reviews_with_topics_custom['Top_3_Topics'].apply(lambda topics: [topic_info_POS_custom[topic] for topic in topics])

# Display the first few rows of the new DataFrame
reviews_with_topics_custom.head()

Unnamed: 0,Review,Top_3_Topics,Top_3_Probabilities,Topics_KB
0,"The ambience was good, food was quite good. ha...","[3, 0, 6]","[8.42, 8.05, 4.87]","[good delivery, great place, buffet food]"
1,Ambience is too good for a pleasant evening. S...,"[0, 3, 21]","[100.0, 0.0, 0.0]","[great place, good delivery, pizza ravioli]"
2,A must try.. great food great ambience. Thnx f...,"[0, 3, 21]","[11.36, 9.53, 5.21]","[great place, good delivery, pizza ravioli]"
3,Soumen das and Arun was a great guy. Only beca...,"[0, 3, 43]","[37.61, 5.87, 2.95]","[great place, good delivery, sizzlers restaurant]"
4,Food is good. we ordered Kodi drumsticks and b...,"[3, 21, 6]","[1.2, 0.77, 0.75]","[good delivery, pizza ravioli, buffet food]"


In [52]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(reviews_preproc.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/312 [00:00<?, ?it/s]

In [None]:
from umap import UMAP

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=30,metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [35]:
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer()

In [111]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.2)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model
}

In [202]:
from bertopic import BERTopic

custom_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics, probs = custom_model.fit_transform(reviews_preproc, embeddings)

2024-12-14 17:39:51,547 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-14 17:40:08,317 - BERTopic - Dimensionality - Completed ✓
2024-12-14 17:40:08,318 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-14 17:40:08,762 - BERTopic - Cluster - Completed ✓
2024-12-14 17:40:08,766 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-14 17:40:35,096 - BERTopic - Representation - Completed ✓


In [113]:
mrr_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in custom_model.topic_aspects_["MMR"].items()}
custom_model.set_topic_labels(mrr_topic_labels)

In [204]:
custom_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,4186,-1_ambience_great_amazing_starters,"[ambience, great, amazing, starters, really, n...","[chicken tikka, restaurant, good food, dishes,...","[ambience, starters, pizza, ambiance, lunch, v...","[ambience, great, amazing, starters, nice, fri...","[""Oh! Hyderabad"" brings amazing super exciting..."
1,0,577,0_biryani_chicken biryani_mutton biryani_biriyani,"[biryani, chicken biryani, mutton biryani, bir...","[biryani tasty, biryani food, hyderabadi birya...","[chicken biryani, mutton biryani, biryani good...","[biryani, biriyani, biryanis, mutton, good bir...",[Another place fo delicious Hyderabadi rum bir...
2,1,429,1_manager_rude_asked_table,"[manager, rude, asked, table, pathetic, waiter...","[serving, waiters, waiter, serve, service bad,...","[rude, waiter, customer, worst service, waiter...","[manager, rude, table, pathetic, waiter, worst...",[I visited this place with my wife on 16 March...
3,2,332,2_dj_dance_dance floor_floor,"[dj, dance, dance floor, floor, hangout, music...","[place party, places hangout, place amazing, f...","[dance, dance floor, hangout, place hangout, p...","[dj, dance, floor, hangout, music, rooftop, pa...","[This place has amazing setup, separated in mu..."
4,3,309,3_good good_good_combo good_nice good,"[good good, good, combo good, nice good, verry...","[good, good good, nice good, good nice, nice, ...","[good good, good, combo good, nice good, verry...","[good, combo, nice, , , , , , , ]","[good, good, good good]"
5,4,231,4_hyderabad_place hyderabad_places hyderabad_h...,"[hyderabad, place hyderabad, places hyderabad,...","[hyderabadi cuisine, restaurants hyderabad, hy...","[place hyderabad, places hyderabad, hyderabad ...","[hyderabad, hyderabadi, beer, beers, places, b...",[So this place is located near ikea above the ...
6,5,226,5_govind_excellent service_excellent_abs,"[govind, excellent service, excellent, abs, sh...","[excellent food, food excellent, good hospital...","[suraj, food excellent, visit soon, service go...","[excellent service, excellent, abs, suraj, exc...",[Excellent food and excellent service by bahad...
7,6,213,6_zomato_gold_zomato gold_order,"[zomato, gold, zomato gold, order, dominos, de...","[zomato restaurant, restaurant zomato, food zo...","[zomato, zomato gold, order, dominos, refund, ...","[zomato, gold, order, dominos, support, contac...","[delivery, poor customer support . Both me and..."
8,7,192,7_buffet_spread_hyatt_lunch buffet,"[buffet, spread, hyatt, lunch buffet, buffets,...","[buffet food, buffet menu, dinner buffet, plac...","[buffet, lunch buffet, buffets, buffet spread,...","[buffet, spread, buffets, carte, lunch, good b...",[I have had the breakfast buffet here quite a ...
9,8,166,8_cake_brownies_cupcakes_cakes,"[cake, brownies, cupcakes, cakes, red velvet, ...","[chocolate cake, bakery items, bakery products...","[brownies, cupcakes, cakes, cookies, chocolate...","[cake, brownies, cupcakes, cakes, red velvet, ...","[A true match for Theo' s from Mumbai, one of ..."


In [125]:
# Create a DataFrame with reviews and their assigned topics with probabilities, excluding topic -1
topic_info = custom_model.get_topic_info()
reviews_with_topics = pd.DataFrame({
    'Review': reviews_preproc,
    'Assigned_Topic': [topics[i] if topics[i] != -1 else (np.argsort(probs[i])[-2] if len(probs[i]) > 1 else np.argsort(probs[i])[-1]) for i in range(len(reviews_preproc))],
    'Assigned_Topic_Name': [topic_info.iloc[topics[i]]['CustomName'] if topics[i] != -1 else (topic_info.iloc[np.argsort(probs[i])[-2]]['CustomName'] if len(probs[i]) > 1 else topic_info.iloc[np.argsort(probs[i])[-1]]['CustomName']) for i in range(len(reviews_preproc))],
    'Topic_Probabilities': [np.round(probs[i] * 100, 2) if topics[i] != -1 else (np.round(probs[i][np.argsort(probs[i])[-2]] * 100, 2) if len(probs[i]) > 1 else np.round(probs[i][np.argsort(probs[i])[-1]] * 100, 2)) for i in range(len(reviews_preproc))]
}).dropna()

# Display the first few rows of the new DataFrame
reviews_with_topics.head()


TypeError: object of type 'numpy.float64' has no len()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,3387,-1_food_chicken_good_place,food | chicken | restaurant,"[food, chicken, good, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, restaurant, order, rice, panee...","[food, chicken, good, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, visit place, best place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Great place to hangout . Good food + Good dri...
2,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, bir...","[chicken biryani, biryani chicken, biryani tas...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...","[biryani is good, Just the name of Biryani and..."
3,2,365,2_manager_service_asked_table,manager | table | rude,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, lunch, ...","[manager, table, rude, staff, waiter, customer...","[manager, service, table, worst, rude, staff, ...","[The worst place in the city, I was here for a..."
4,3,309,3_good good_good_nice good_good combo,good good | good | nice good,"[good good, good, nice good, good combo, combo...","[good, good good, nice good, good nice, nice, ...","[good good, good, nice good, good combo, combo...","[good, combo, nice, , , , , , , ]","[good, good, good good]"
5,4,233,4_hyderabad_best_place_food,hyderabad | place hyderabad | visit,"[hyderabad, best, place, food, place hyderabad...","[hyderabad food, restaurant hyderabad, food hy...","[hyderabad, place hyderabad, visit, places, hy...","[hyderabad, best, place, food, service, great,...",[I am a huge fan of regional cuisines . The di...
6,5,175,5_buffet_spread_lunch_starters,buffet | lunch | main course,"[buffet, spread, lunch, starters, main, main c...","[buffet food, buffet lunch, dinner buffet, lun...","[buffet, lunch, main course, lunch buffet, des...","[buffet, spread, lunch, starters, main, main c...",[Very good place to have buffet . Buffet sprea...
7,6,169,6_cake_cakes_brownies_cupcakes,cakes | brownies | cupcakes,"[cake, cakes, brownies, cupcakes, chocolate, r...","[cup cakes, cup cake, cakes, cake chocolate, c...","[cakes, brownies, cupcakes, red velvet, bakery...","[cake, cakes, brownies, cupcakes, chocolate, r...",[My boyfriend had gotten a Cheese cake for my ...
8,7,131,7_gachibowli_place_place gachibowli_food,gachibowli | place gachibowli | food,"[gachibowli, place, place gachibowli, food, go...","[gachibowli place, visit gachibowli, gachibowl...","[gachibowli, place gachibowli, food, visit, pl...","[gachibowli, place, food, good, ambience, best...",[I love this place in Gachibowli . I have been...
9,8,131,8_quantity_packing_food_food food,food food | food good | food cold,"[quantity, packing, food, food food, cold, tas...","[food quality, quality food, taste quality, qu...","[food food, food good, food cold, quantity foo...","[quantity, packing, food, cold, taste, good qu...",[Food is not that good . Quantity is more but ...


In [83]:
custom_model.visualize_topics()

In [84]:
custom_model_MRR = custom_model

In [85]:
# assign labels
keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in custom_model.topic_aspects_["KeyBERT"].items()}
custom_model.set_topic_labels(keybert_topic_labels)

In [95]:
from transformers import pipeline

# Summarization pipeline
summarizer = pipeline("summarization", model="t5-small")

topic_labels = {}
topic_info = custom_model.get_topic_info()

for index, row in topic_info.iterrows():
    topic = row['Topic']
    representative_docs = row['Representation']
    text = " ".join(representative_docs)
    summary = summarizer(text, max_length=2, min_length=1, do_sample=False)
    topic_labels[topic] = summary[0]["summary_text"]

print(topic_labels)


Device set to use cpu


{-1: 'food', 0: 'place', 1: 'bi', 2: 'manager', 3: 'good', 4: '', 5: 'buffet', 6: 'cake', 7: '', 8: 'quantity', 9: 'authentic', 10: '', 11: 'chicken', 12: '', 13: 'noodles', 14: 'super', 15: 'quantity', 16: 'delivery', 17: '', 18: 'delivery', 19: '', 20: 'pizza', 21: 'pan', 22: 'para', 23: 'chicken', 24: 'order', 25: 'good', 26: 'in', 27: 'mango', 28: 'superb', 29: 'mom', 30: 'pun', 31: 'man', 32: 'taste', 33: 'good', 34: 'bread', 35: 'shake', 36: 'coffee', 37: 'fast', 38: 'ma', 39: 'awesome', 40: 'service', 41: '', 42: 'tasty', 43: 'spicy', 44: 'nice', 45: 'excellent', 46: 'pi', 47: 'wa', 48: 'and', 49: 'wings', 50: '', 51: 'si', 52: 'wrap', 53: 'awesome'}


In [87]:
custom_model_MRR.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,3387,-1_food_chicken_good_place,food | chicken | restaurant,"[food, chicken, good, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, restaurant, order, rice, panee...","[food, chicken, good, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, visit place, best place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Great place to hangout . Good food + Good dri...
2,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, bir...","[chicken biryani, biryani chicken, biryani tas...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...","[biryani is good, Just the name of Biryani and..."
3,2,365,2_manager_service_asked_table,manager | table | rude,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, lunch, ...","[manager, table, rude, staff, waiter, customer...","[manager, service, table, worst, rude, staff, ...","[The worst place in the city, I was here for a..."
4,3,309,3_good good_good_nice good_good combo,good good | good | nice good,"[good good, good, nice good, good combo, combo...","[good, good good, nice good, good nice, nice, ...","[good good, good, nice good, good combo, combo...","[good, combo, nice, , , , , , , ]","[good, good, good good]"


In [88]:
custom_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,3387,-1_food_chicken_good_place,food | chicken | restaurant,"[food, chicken, good, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, restaurant, order, rice, panee...","[food, chicken, good, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, visit place, best place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Great place to hangout . Good food + Good dri...
2,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, bir...","[chicken biryani, biryani chicken, biryani tas...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...","[biryani is good, Just the name of Biryani and..."
3,2,365,2_manager_service_asked_table,manager | table | rude,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, lunch, ...","[manager, table, rude, staff, waiter, customer...","[manager, service, table, worst, rude, staff, ...","[The worst place in the city, I was here for a..."
4,3,309,3_good good_good_nice good_good combo,good good | good | nice good,"[good good, good, nice good, good combo, combo...","[good, good good, nice good, good nice, nice, ...","[good good, good, nice good, good combo, combo...","[good, combo, nice, , , , , , , ]","[good, good, good good]"


In [48]:
custom_model_df = pd.DataFrame(custom_model.get_topic_info())
custom_model_mrr_df = pd.DataFrame(custom_model_MRR.get_topic_info())

In [61]:
merged_df = custom_model_df.join(custom_model_mrr_df, lsuffix='_keybert', rsuffix='_mrr')

In [62]:
merged_df.head()

Unnamed: 0,Topic_keybert,Count_keybert,Name_keybert,CustomName_keybert,Representation_keybert,KeyBERT_keybert,MMR_keybert,POS_keybert,Representative_Docs_keybert,Topic_mrr,Count_mrr,Name_mrr,CustomName_mrr,Representation_mrr,KeyBERT_mrr,MMR_mrr,POS_mrr,Representative_Docs_mrr
0,-1,3387,-1_food_good_chicken_place,food | chicken | ambience,"[food, good, chicken, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, ambience, restaurant, order, r...","[food, good, chicken, place, taste, service, g...",[When you love food and especially north India...,-1,3387,-1_food_good_chicken_place,food | chicken | ambience,"[food, good, chicken, place, ordered, taste, s...","[restaurant, buffet, good food, food good, dis...","[food, chicken, ambience, restaurant, order, r...","[food, good, chicken, place, taste, service, g...",[When you love food and especially north India...
1,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, best place, visit place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Komatose is such wonderful place to chilled o...,0,1753,0_place_good_ambience_service,place | ambience | service,"[place, good, ambience, service, food, great, ...","[great place, best place, visit place, good pl...","[place, ambience, service, visit, drinks, frie...","[place, good, ambience, service, food, great, ...",[Komatose is such wonderful place to chilled o...
2,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, ord...","[ordered chicken biryani, chicken biryani, bir...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...",[One of the new tastes of biryanis in hyd we o...,1,561,1_biryani_chicken_chicken biryani_taste,chicken biryani | biriyani | mutton biryani,"[biryani, chicken, chicken biryani, taste, ord...","[ordered chicken biryani, chicken biryani, bir...","[chicken biryani, biriyani, mutton biryani, re...","[biryani, chicken, taste, biriyani, mutton, go...",[One of the new tastes of biryanis in hyd we o...
3,2,365,2_manager_service_asked_table,manager | table | worst,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, serve, ...","[manager, table, worst, rude, staff, waiter, c...","[manager, service, table, worst, rude, staff, ...","[Visited this place for my team lunch, and the...",2,365,2_manager_service_asked_table,manager | table | worst,"[manager, service, asked, table, worst, rude, ...","[restaurant, serving, waiters, waiter, serve, ...","[manager, table, worst, rude, staff, waiter, c...","[manager, service, table, worst, rude, staff, ...","[Visited this place for my team lunch, and the..."
4,3,309,3_good good good_good good_good_nice good good,good good good | good good | nice good good,"[good good good, good good, good, nice good go...","[good, good good, nice good, good good good, g...","[good good good, good good, nice good good, ni...","[good, combo, nice, , , , , , , ]","[good, good, good good]",3,309,3_good good good_good good_good_nice good good,good good good | good good | nice good good,"[good good good, good good, good, nice good go...","[good, good good, nice good, good good good, g...","[good good good, good good, nice good good, ni...","[good, combo, nice, , , , , , , ]","[good, good, good good]"


In [65]:
merged_df = merged_df[['Topic_keybert', 'Count_keybert', 'Name_keybert', 'CustomName_keybert', 'CustomName_mrr','Representation_keybert', 'KeyBERT_keybert','MMR_keybert', 'POS_keybert', 'Representative_Docs_keybert']]