In [17]:
import numpy as np
import pandas as pd
from utils.gibberish_detector import classify_gibberish
from utils.preproc_utils import main_pipeline

In [18]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.decomposition import TruncatedSVD
from gensim.models import LsiModel, LdaModel
from bertopic import BERTopic
from sklearn.decomposition import LatentDirichletAllocation

In [19]:
# Load the reviews DataFrame from disk (presumably the output of an earlier
# preprocessing step, judging by the file name — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
reviews = pd.read_pickle('data/reviews_initial_preproc.pkl')

In [20]:
# Score every review with the gibberish classifier. Values are cast to str
# first so non-string entries (e.g. NaN) do not break the classifier.
# NOTE(review): the score is presumably a 0-100 percentage — confirm against
# utils.gibberish_detector.
reviews["is_giberish"] = reviews["Review"].apply(
    lambda review: classify_gibberish(str(review))
)

# Keep only reviews scoring below 85. The explicit .copy() makes the result an
# independent frame, so columns added to it later are not written to a slice
# of the unfiltered data (which raises SettingWithCopyWarning).
reviews = reviews[reviews["is_giberish"] < 85].copy()

In [21]:
def _preprocess_review(review):
    """Run one raw review through ``main_pipeline`` with the notebook's fixed settings."""
    return main_pipeline(
        review,
        print_output=False,
        no_stopwords=True,
        # Words exempted from stop-word removal.
        exception_stopwords=['no', 'not', 'nor', 'very', 'few', 'all', 'again', 'but'],
        convert_diacritics=True,
        lowercase=True,
        lemmatized=True,
        list_pos=["n", "v", "a", "r", "s"],
        stemmed=False,
        pos_tags_list="no_pos",
        tokenized_output=False,
        word_correction=False,
    )


# Store the processed text for every review in a new column.
reviews['rev_proc'] = reviews['Review'].apply(_preprocess_review)

In [22]:
# Unigram bag-of-words counts. The \w+ token pattern accepts tokens of any
# length, including single characters.
bow_vectorizer = CountVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    ngram_range=(1, 1),
)
reviews_bow_matrix = bow_vectorizer.fit_transform(reviews["rev_proc"]).toarray()
# Keep each review's dense count row in the frame as a plain Python list.
reviews["bow_vector"] = reviews_bow_matrix.tolist()

# Same tokenisation, weighted with TF-IDF instead of raw counts.
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    ngram_range=(1, 1),
)
reviews_tfidf_matrix = tfidf_vectorizer.fit_transform(reviews["rev_proc"]).toarray()
reviews["tfidf_vector"] = reviews_tfidf_matrix.tolist()

In [7]:
# Rebuild the dense document-term matrices from the per-review list columns.
reviews_bow_matrix = np.array(reviews["bow_vector"].tolist())
reviews_tfidf_matrix = np.array(reviews["tfidf_vector"].tolist())

In [8]:
# Latent semantic analysis: project the bag-of-words matrix onto 10 components.
# TruncatedSVD's default solver is randomized, so the seed is pinned to make
# the components (and every table derived from them) reproducible across runs.
lsa = TruncatedSVD(n_components=10, random_state=42)
lsa_result = lsa.fit_transform(reviews_bow_matrix)

In [9]:
# Vocabulary learned by the bag-of-words vectorizer, in column order.
bow_vocab = bow_vectorizer.get_feature_names_out()
# Map each word to its column of the component matrix, i.e. that word's
# weight in every LSA component.
word_topic_dict = {
    word: lsa.components_[:, col] for col, word in enumerate(bow_vocab)
}

In [10]:
# One {word: weight} mapping per LSA component (note: this is a list of dicts).
topic_word_dict = [dict(zip(bow_vocab, weights)) for weights in lsa.components_]

In [11]:
# Tabulate the per-component word weights: one row per LSA component,
# one column per vocabulary word.
topic_df = pd.DataFrame(topic_word_dict)

In [12]:
# Take the component stored under row label 2 and order its words from the
# largest weight to the smallest.
topic_tgt = topic_df.loc[2].sort_values(ascending=False)

In [13]:
# Show the sorted word weights of the selected component.
topic_tgt

good       0.807057
taste      0.089554
chicken    0.058696
biryani    0.044994
veg        0.037702
             ...   
u         -0.094337
one       -0.098426
food      -0.136737
order     -0.155086
place     -0.363915
Name: 2, Length: 16372, dtype: float64

In [14]:
# Inspect the raw component matrix (one row per LSA component, one column per
# vocabulary word).
lsa.components_

array([[ 7.56791149e-04,  4.58704308e-05,  8.75586760e-05, ...,
         4.74477442e-04,  9.47739771e-05,  3.51857416e-07],
       [-2.27654877e-04, -1.68958793e-04,  8.60048037e-05, ...,
         1.04541699e-03, -9.81516327e-05,  1.25170940e-07],
       [-1.98629731e-03, -1.50754049e-04, -2.22456518e-04, ...,
        -6.28448770e-04, -2.89277227e-04, -1.24504341e-06],
       ...,
       [ 1.06503683e-03,  1.22971009e-04, -5.10821754e-05, ...,
        -7.02910914e-04, -3.64056409e-04, -3.42459294e-06],
       [ 1.09766213e-04, -2.73097929e-04,  3.50053761e-04, ...,
        -1.65794905e-04, -2.20006856e-04, -2.38332356e-06],
       [ 9.84960233e-04,  2.26486710e-05, -1.64921399e-05, ...,
        -1.01596441e-03, -2.64878369e-04,  1.40022734e-06]])

In [15]:
def get_top_words(components, feature_names, n_top_words=10):
    """Return the highest-weighted feature names for every topic.

    Parameters
    ----------
    components : iterable of 1-D numpy arrays
        One weight vector per topic (e.g. the rows of ``lsa.components_``).
        Each vector must provide ``.argsort()``.
    feature_names : sequence
        Names indexed by column position, aligned with the weight vectors.
    n_top_words : int, default 10
        Number of names to keep per topic. When it exceeds the number of
        features, every feature is returned.

    Returns
    -------
    list of list
        One list per topic, ordered from the largest weight downwards.
        Ranking uses the signed weight, so strongly negative terms are
        never selected ahead of positive ones.

    Raises
    ------
    ValueError
        If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        # A negative count would make the slice below return almost the whole
        # vocabulary in reverse order instead of failing visibly.
        raise ValueError("n_top_words must be non-negative")

    topics = []
    for topic in components:
        # argsort is ascending, so walk it backwards from the end to pick the
        # indices of the n largest weights.
        ranked = topic.argsort()[:-n_top_words - 1:-1]
        topics.append([feature_names[i] for i in ranked])
    return topics

In [16]:
# Table of the 10 highest-weighted words for each LSA component
# (one row per component, columns ordered from strongest word onwards).
pd.DataFrame(get_top_words(lsa.components_, bow_vocab, 10))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,good,place,food,chicken,taste,order,service,one,try,like
1,chicken,taste,biryani,dish,try,fry,rice,fish,order,veg
2,good,taste,chicken,biryani,veg,ambience,nice,also,overall,ok
3,place,good,visit,n,best,one,try,friend,cake,must
4,chicken,food,place,great,ambience,try,nice,must,amaze,tikka
5,order,chicken,place,biryani,good,zomato,quantity,delivery,bad,deliver
6,taste,food,n,try,best,one,love,order,cake,chocolate
7,n,u,chicken,food,serve,sauce,onion,good,prawn,top
8,service,great,order,n,ambience,really,taste,visit,best,try
9,biryani,taste,service,bad,place,restaurant,serve,even,time,soo


In [23]:
import bertopic

In [24]:
from sklearn.cluster import HDBSCAN

In [25]:
# Clustering model handed to BERTopic through its hdbscan_model argument.
# NOTE(review): min_cluster_size=150 and min_samples=2 look hand-tuned for
# this corpus — confirm before reusing on other data.
hd = HDBSCAN(min_cluster_size=150, min_samples=2)

In [30]:
# Processed review texts, re-indexed from 0 so they line up positionally with
# the embedding rows below.
docs = reviews["rev_proc"].reset_index(drop=True)

# BERTopic configured with the custom clustering model and asked to reduce
# the discovered topics down to 5.
topic_model = BERTopic(
    nr_topics=5,
    hdbscan_model=hd,
    verbose=True,
)

# The stored TF-IDF vectors are supplied as precomputed document embeddings.
topics, probs = topic_model.fit_transform(
    docs,
    embeddings=np.array(reviews["tfidf_vector"].tolist()),
)

# Pair each document with the topic it was assigned.
reviews_topics_df = pd.DataFrame({'topic': topics, 'document': docs})

2024-12-07 12:46:19,881 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-07 12:46:50,457 - BERTopic - Dimensionality - Completed ✓
2024-12-07 12:46:50,473 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-07 12:46:51,328 - BERTopic - Cluster - Completed ✓
2024-12-07 12:46:51,329 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-07 12:46:52,401 - BERTopic - Representation - Completed ✓
2024-12-07 12:46:52,409 - BERTopic - Topic reduction - Reducing number of topics
2024-12-07 12:46:52,953 - BERTopic - Topic reduction - Reduced number of topics from 19 to 5


In [31]:
# Per-topic summary table: topic id, document count, generated name,
# representation terms and representative documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3639,-1_food_good_place_service,"[food, good, place, service, very, but, not, t...",[recently invite food taste session marsala fo...
1,0,5611,0_good_place_food_not,"[good, place, food, not, very, but, chicken, o...",[one best place biryani gachibowli biryani bes...
2,1,218,1_delivery_time_good_boy,"[delivery, time, good, boy, order, quick, food...","[good delivery, delivery good, good delivery t..."
3,2,181,2_quantity_less_good_price,"[quantity, less, good, price, food, very, tast...","[less quantity, taste good but quantity very l..."
4,3,173,3_deliver_order_receive_time,"[deliver, order, receive, time, wrong, not, fo...","[great time order deliver, order not deliver, ..."


In [33]:
# Render BERTopic's topic-level visualisation for the fitted model.
topic_model.visualize_topics()

In [13]:
# Render BERTopic's document-level visualisation for the same documents the
# model was fitted on.
topic_model.visualize_documents(docs)

In [32]:
# Render BERTopic's bar-chart visualisation of the fitted topics.
topic_model.visualize_barchart()