#### This notebook is a record of our topic modeling process. In this notebook, we tried different topic modeling methods, including NMF, LDA, and BERTopic. Since topic modeling requires a lot of hyper-tuning, we kept a 2% sample of our merged dataset to ensure each run of hyper-tuning is within an acceptable time range. In our final notebook, we will use more data to ensure a more accurate and reliable analysis.

In [22]:
import gdown
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

url = 'https://drive.google.com/uc?export=download&id=1hVsTyC7DJH8pplo1DmVm13v8QDkONjwo'
output = 'review_list.json'
gdown.download(url, output, quiet=False)

df_review = pd.read_json(output, lines=True)
df_review = df_review.drop(columns=['images'])
df_review.rename(columns={'title': 'review_title'}, inplace=True)
df_review.rename(columns={'text': 'review_text'}, inplace=True)

df_review

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1hVsTyC7DJH8pplo1DmVm13v8QDkONjwo
From (redirected): https://drive.google.com/uc?export=download&id=1hVsTyC7DJH8pplo1DmVm13v8QDkONjwo&confirm=t&uuid=96127c73-18e7-40d0-bdac-89fa24bfdacb
To: C:\Users\zehui\review_list.json
100%|██████████| 227M/227M [00:16<00:00, 13.8MB/s] 


Unnamed: 0,rating,review_title,review_text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True
2,5,Fantastic,I have been suffering a couple months with hee...,B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True
...,...,...,...,...,...,...,...,...,...
494116,5,Best brush!,"Material is good. Worthy to buy.Firstly, the c...",B07KXT7Y48,B07KXT7Y48,AEQG5UEVYBNLWPXB3E2EODQ3EGSQ,2019-04-12 04:21:04.257,6,True
494117,5,It makes my skin softer.,This brush is a good tool for cleaning and mas...,B07KXT7Y48,B07KXT7Y48,AGTVLNJAFZTKURBCHLUIH6VEOQCQ,2020-07-17 05:01:39.190,0,True
494118,5,This brush is perfect !,"Honestly, the brush totally is 15inch. Maybe y...",B07KXT7Y48,B07KXT7Y48,AGMA5UN3JPLQLQZ2PFYHJYSC4PNA,2019-03-07 22:33:36.968,173,True
494119,5,Easy to use,"Easy to hold, soft and flexible.",B07KXT7Y48,B07KXT7Y48,AE2Q3FXHIVGSSDGTNI4YLXDXMCIA,2019-09-26 12:22:51.244,0,True


In [23]:
url = 'https://drive.google.com/uc?export=download&id=14dVOPZM7-hCEdXJswxlb_c6XDErC7lac'
output = 'meta_list.json'
gdown.download(url, output, quiet=False)

df_meta = pd.read_json(output, lines=True)
df_meta = df_meta.drop(columns=['main_category','features','description','price','images','videos','categories','bought_together'])
df_meta.rename(columns={'title': 'meta_title'}, inplace=True)

df_meta

Downloading...
From (original): https://drive.google.com/uc?export=download&id=14dVOPZM7-hCEdXJswxlb_c6XDErC7lac
From (redirected): https://drive.google.com/uc?export=download&id=14dVOPZM7-hCEdXJswxlb_c6XDErC7lac&confirm=t&uuid=0d962b89-aac6-4ebb-8252-c9cca9b9e840
To: C:\Users\zehui\meta_list.json
100%|██████████| 118M/118M [00:08<00:00, 13.4MB/s] 


Unnamed: 0,meta_title,average_rating,rating_number,store,details,parent_asin
0,Silicone Bath Body Brush Exfoliator Shower Bac...,3.9,7,Rzoeox,{'Package Dimensions': '15 x 3.3 x 1.5 inches;...,B07V346GZH
1,"iPhone 7 Plus 8 Plus Screen Protector, ZHXIN T...",3.8,2,ZHXIN,"{'Brand': 'ZHXIN', 'Compatible Devices': 'Cell...",B075W927RH
2,Zig Zag Rolling Machine 70mm Size With FREE BO...,3.9,7,,{'Package Dimensions': '4.1 x 1.8 x 0.3 inches...,B01FB26VKY
3,Sting-Kill Disposable Wipes 8 Each ( Pack of 5),4.1,6,Sting-kill,"{'Brand': 'Sting-kill', 'Item Form': 'Wipe', '...",B01IAI29RU
4,Heated Eyelash Curler Mini Portable Electric E...,3.3,8,BiBOSS,{'Package Dimensions': '6.1 x 3.1 x 1.9 inches...,B08CMN38RC
...,...,...,...,...,...,...
60288,5th Scooby Doo Party Supplies | Decorations | ...,4.1,5,JUMPHOP,{'Package Dimensions': '14.5 x 10.5 x 1.5 inch...,B08VVZRBBL
60289,"Crystal Hair Eraser for Hair Removal, Crystal ...",2.3,10,PARCO,{},B0BZQH4WGD
60290,Designer Modern Ball Acrylic Chair with Black ...,5.0,2,Designer Modern,{'Package Dimensions': '50 x 45 x 45 inches; 5...,B00A6CANPY
60291,Biotics Research - UltraVir-X 90C (2 Pack),5.0,2,BIOTICS,"{'Is Discontinued By Manufacturer': 'No', 'Dat...",B06W57751V


In [28]:
df_review['datetime'] = pd.to_datetime(df_review['timestamp'], unit='ms')

df_review['review_text_word_count'] = df_review['review_text'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
df_review['review_text_word_count_cutoff'] = df_review['review_text_word_count'].apply(lambda x: x if x <= 300 else 300)
df_review['review_title_word_count'] = df_review['review_title'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

df_meta['meta_title_word_count'] = df_meta['meta_title'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
df_meta['average_rating_threshold'] = np.ceil(df_meta['average_rating'] * 2) / 2

df_review['timestamp'] = pd.to_datetime(df_review['timestamp'])
df_review['year'] = df_review['timestamp'].dt.year
df_review['month'] = df_review['timestamp'].dt.month
df_review['day'] = df_review['timestamp'].dt.day
df_review['weekday'] = df_review['timestamp'].dt.weekday

In [29]:
merged_df = pd.merge(df_review, df_meta, on='parent_asin', how='left')
merged_df = merged_df.sample(frac=0.02, random_state=99)

In [30]:
#Tocknize - Text Handling - Prepare
import re
import nltk
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from contractions import fix
from nltk.corpus import wordnet
from nltk.tokenize.casual import TweetTokenizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
tokenizer = TweetTokenizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zehui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\zehui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zehui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\zehui\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zehui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\zehui\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_pe

In [31]:
#Lemmatizer Function
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

#Stemming Function
from nltk.stem import PorterStemmer

#Clean and tockenize function
def clean_and_tokenize(text):

    # Expanding contractions (e.g., "don't" → "do not")
    text = fix(text)
    # Lowercasing
    text = text.lower()
    # Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Removing punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Removing emojis
    text = emoji.replace_emoji(text, replace='')
    # Removing URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Removing email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Removing stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Removing extra short words (e.g., "a", "an", "it")
    text = ' '.join([word for word in text.split() if len(word) > 1])

    words = tokenizer.tokenize(text)  # Tokenization
    return words

def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.VERB)

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

stemmer = PorterStemmer()

def stem_tokens(tokens):
    return [stemmer.stem(word) for word in tokens]

#merged_df["tokenized_review_title"] = merged_df["review_title"].apply(clean_and_tokenize)
#merged_df["tokenized_review_text"] = merged_df["review_text"].apply(clean_and_tokenize)

#merged_df["tokenized_review_title_lem"] = merged_df["tokenized_review_title"].apply(lemmatize_tokens)
#merged_df["tokenized_review_text_lem"] = merged_df["tokenized_review_text"].apply(lemmatize_tokens)

#merged_df["tokenized_review_title_stem"] = merged_df["tokenized_review_title"].apply(stem_tokens)
#merged_df["tokenized_review_text_stem"] = merged_df["tokenized_review_text"].apply(stem_tokens)

## Everything above was our regular step for data cleaning. Topic modeling starts here. 

In [33]:
!pip install sentence-transformers
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 12.2 MB/s eta 0:00:01
     ----------------- ---------------------- 5.5/12.8 MB 14.0 MB/s eta 0:00:01
     ------------------------ --------------- 7.9/12.8 MB 12.8 MB/s eta 0:00:01
     -------------------------------- ------ 10.7/12.8 MB 13.2 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 12.7 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


#### The cell below is used to prepare merged_df['meta_title'] for topic modeling. The list of words we chose to drop was updated multiple times based on our results later. The main criterion for dropping these words is to retain only those that truly indicate the type of product. For example, if a product name is "fish oil for man", we'd only keep "fish oil" to reduce noise — for subcategory classification, we're only interested in the product itself. Therefore, we removed common stop words, descriptive words like "premium" and color terms, as well as special characters. We also applied lemmatization to return words to their original forms.

In [37]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")

stopwords = {"and", "&", "|", "-", "for", "with", "of"}
common_units = {"mg", "g", "kg", "lb", "oz", "ml", "l", "fl", "inch", "cm", "mm",
                "count", "pcs", "pack", "set", "ct", "bottle", "bag"}
custom_stopwords = {
    "the", "a", "is", "in", "on", "for", "to", "by", "this", "with", "your", 
    "it", "as", "no", "or", "all", "at", "be", "an", "we", "that", "can", "so", "you",
    "one", "from", "us", "more", "other", "will", "not", "any", "than", "some", "get",
    "each", "our", "if", "over", "when", "their", "do", "out", "after", "before",
    "back", "my", "me", "her", "his", "them", "they", "him", "she", "he"}

more_words = {"premium", "professional", "plus", "use", "large", "high", "size", 
              "make", "easy", "white", "black", "grey", "gray", "silver", "gold", "golden", "blue", "green"
              "red", "yellow", "purple", "pink", "party", "kit", "make", "long", "use", "birthday", "woman", "man", "kid"}

all_stopwords = stopwords | common_units | custom_stopwords | more_words

def clean_meta_title(title, do_lemmatize=True):

    title = title.lower().strip()
    title = re.sub(r'[^\w\s]', '', title)
    title = re.sub(r'[^a-zA-Z\s]', '', title)
    title = re.sub(r'\s+', ' ', title).strip()
    words = title.split()
    cleaned_words = [word for word in words if word not in all_stopwords]

    if do_lemmatize:
        doc = nlp(" ".join(cleaned_words))
        lemmatized = []
        for token in doc:
            if len(token.lemma_) > 2:
                lemmatized.append(token.lemma_)
        cleaned_words = lemmatized
    
    cleaned_words = [w for w in cleaned_words if len(w) > 1]

    return " ".join(cleaned_words)

merged_df["cleaned_meta_title"] = merged_df["meta_title"].apply(lambda x: clean_meta_title(x, do_lemmatize=True))

#### From our later analysis, we noticed that some product title keywords like "balloon", "wedding", and pet-related terms appeared unusually frequently. These words are not necessarily related to our "Health and Personal Care" category. We believe these products may have been miscategorized by Amazon sellers. Therefore, we decided to drop these products from our dataset and focus only on those products that are truly relevant to our category.

In [42]:
keywords = {"gift", "decoration", "wedding", "balloon", "cat", "dog", "pet"}

def contains_keywords(text):
    tokens = text.split()
    return any(word in tokens for word in keywords)

merged_df = merged_df[~merged_df["cleaned_meta_title"].apply(contains_keywords)]

#### We applied NMF using both BoW and TF-IDF vectorization to compare the top 20 words in each subtopic by frequency. The printouts from the two approaches revealed that while both methods generally helped NMF classify products into subcategories, TF-IDF produced more coherent and well-defined topics overall. For instance, topics related to orthopedic supports, nutritional supplements, hair styling, and foot care were more focused with TF-IDF. In contrast, the BoW approach tended to include less informative words (e.g., "dead", "come") and mixed signals—such as blending fish oil products with words like "scale". Furthermore, TF-IDF gives greater influence to unique, category-specific terms while reducing noise from common words. 
#### Therefore, we decided to use TF-IDF vectorization for NMF.

In [45]:
#BoW NMF

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

vectorizer = CountVectorizer(
    max_df=0.5, 
    min_df=10, 
    stop_words=list(all_stopwords)
)
bow = vectorizer.fit_transform(merged_df["cleaned_meta_title"])

num_topics = 10 
nmf_model_bow = NMF(
    n_components=num_topics, 
    random_state=99,           
    l1_ratio=0.5, 
    max_iter=500
)

nmf_topics_bow = nmf_model_bow.fit_transform(bow)

feature_names_bow = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model_bow.components_):
    top_words = [feature_names_bow[i] for i in topic.argsort()[:-21:-1]]
    print(f"Topic {topic_idx+1}: {', '.join(top_words)}")

Topic 1: clean, natural, mat, organic, cleaning, safe, gym, towel, gear, restore, yoga, equipment, residue, fitness, aroma, come, lemongrass, wmicrofiber, refreshe, slippery
Topic 2: foot, callus, remover, pedicure, file, shoe, heel, skin, tool, dry, massager, dead, crack, spa, scrubber, massage, pumice, odor, insole, stone
Topic 3: hair, iron, straightener, ceramic, straighten, curler, brush, trimmer, flat, styling, curl, comb, straight, fast, detangle, growth, cordless, shaver, anion, natural
Topic 4: oil, fish, liquid, essential, supplement, lemon, pure, organic, fine, source, omegas, norwegian, carlson, sustainably, wildcaught, very, scale, dropper, peppermint, usda
Topic 5: neck, pain, relief, support, brace, adjustable, traction, shoulder, posture, cervical, device, corrector, provide, stretcher, therapy, upper, muscle, knee, heat, inflatable
Topic 6: brush, electric, facial, head, body, toothbrush, skin, scrubber, sponge, shower, rechargeable, bath, care, massager, shaver, water

### THIS IS OUR FINAL METHOD 

In [46]:
#TF-IDF NMF 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

vectorizer = TfidfVectorizer(
    max_df=0.5, 
    min_df=10, 
    stop_words=list(all_stopwords),
)
tfidf = vectorizer.fit_transform(merged_df["cleaned_meta_title"])


num_topics = 10
nmf_model = NMF(
    n_components=num_topics, 
    random_state=99,
    l1_ratio=0.5, 
    max_iter=500
)
nmf_topics = nmf_model.fit_transform(tfidf)

feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-21:-1]]
    print(f"Topic {topic_idx+1}: {', '.join(top_words)}")

Topic 1: sponge, silicone, makeup, brush, body, shower, facial, bath, scrubber, skin, loofah, exfoliate, massager, scrub, cosmetic, latex, care, handle, soft, lotion
Topic 2: clean, mat, natural, cleaning, towel, restore, mindful, refreshe, deepcleanse, asutra, wmicrofiber, slippery, lemongrass, residue, yoga, fitness, come, aroma, gym, gear
Topic 3: pain, relief, support, neck, brace, posture, shoulder, corrector, adjustable, pad, upper, provide, therapy, knee, clavicle, compression, heating, cervical, muscle, traction
Topic 4: oil, organic, pure, essential, powder, cap, usda, dropper, supplement, certify, peppermint, available, wimprove, variation, fish, natural, capsule, liquid, vitamin, pill
Topic 5: hair, iron, straightener, ceramic, curler, brush, straighten, trimmer, flat, curl, styling, comb, shaver, straight, fast, electric, tool, eyelash, heating, salon
Topic 6: design, cushion, pressure, comfort, chair, grid, gaming, ultimate, usa, reduce, travel, relieve, seat, charcoal, ba

In [48]:
#Distribution of TF-IDF NMF Topics 
import numpy as np
import pandas as pd

dominant_topics = np.argmax(nmf_topics, axis=1)

merged_df["dominant_topic"] = dominant_topics

topic_distribution = merged_df["dominant_topic"].value_counts(normalize=True).sort_index()
print(topic_distribution)

dominant_topic
0    0.068079
1    0.068940
2    0.089159
3    0.213917
4    0.056141
5    0.027963
6    0.105399
7    0.024521
8    0.223381
9    0.122499
Name: proportion, dtype: float64


#### We also tried the LDA method. Below is our preparation for LDA. In addition to the general cleaning described above, we also attempted to merge frequently appearing words (based on our understanding of the domain) to increase the interpretability of our output.

In [52]:
merged_df["tokens"] = merged_df["cleaned_meta_title"].apply(lambda x: x.split())

phrases_to_merge = [
    ("tooth", "brush"),
    ("dental", "care"),
    ("oral", "care"),
    ("electric", "toothbrush"),
    ("tooth", "paste"),
    ("fish", "oil"),
    ("face", "mask"),
    ("face", "cream"),
    ("facial", "cleanser"),
    ("face", "wash"),
    ("facial", "mask"),
    ("skin", "care"),
    ("skin", "moisturizer"),
    ("facial", "serum"),
    ("skin", "toner"),
    ("hair", "brush"),
    ("hair", "dryer"),
    ("hair", "straightener"),
    ("hair", "treatment"),
    ("hair", "removal"),
    ("body", "lotion"),
    ("bath", "bomb"),
    ("shower", "gel"),
    ("body", "wash"),
    ("body", "scrub"),
    ("pain", "relief"),
    ("weight", "loss"),
    ("blood", "pressure"),
    ("vitamin", "supplement"),
    ("supplement", "capsule"),
    ("nail", "polish"),
    ("nail", "file"),
    ("foot", "care"),
    ("massage", "chair"),
    ("sunscreen", "lotion"),
    ("anti", "aging")
]
phrase_set = set(phrases_to_merge)

def merge_phrases(doc, phrase_tuples):
    i = 0
    merged_doc = []
    while i < len(doc):
        if i < len(doc) - 1 and (doc[i], doc[i+1]) in phrase_tuples:
            merged_doc.append(f"{doc[i]}_{doc[i+1]}")
            i += 2 
        else:
            merged_doc.append(doc[i])
            i += 1
    return merged_doc

merged_df["tokens"] = merged_df["tokens"].apply(lambda doc: merge_phrases(doc, phrase_set))

In [54]:
from gensim.corpora import Dictionary

dictionary = Dictionary(merged_df["tokens"])
dictionary.filter_extremes(no_below=5, no_above=0.8)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in merged_df["tokens"]]

#### When performing LDA, we used the LDA Coherence score to assess the performance of our model. A higher score means more semantically consistent, which means cleaner separation. We learned from ChatGPT that a score above 0.4 is generally acceptable. At the end of this notebook, please find a full record of our LDA hyperparameter tuning, which took hours to complete. We looped over alpha = [0.01, 0.1, 1], passes = [10, 20, 30], and tested 10–30 topics for each hyperparameter combination. From the output, we found that the best set of hyperparameters is [alpha = 1.0, passes = 10, num_topics = 28, Coherence = 0.4716]. However, we think 28 topics is too many for our analysis, so we eventually decided to go with num_topics = 11, which generally received a high coherence score across different passes and alpha values.

#### When we were hyper-tuning and interpreting the printout of LDA-separated topics’ high-frequency words, we found there is a general trade-off between coherence score and interpretability. We once reached a score as high as 0.68, but the printed words included uncleaned stopwords and some single alphabets. The final parameters we chose resulted in an LDA score of 0.4152, which just passed the acceptable standard for LDA coherence. We found that the topics it segregated are overall interpretable, though some topics are mixed and overlapping. For example, Topic 6 (toothbrush, head, non, replacement, air, muscle, cap, reduce, technology, odor, gmo, base, pro, uvc, sanitizer, guardian, pluggable, ggpk, petssmokemoldcooke, laundrypack) appears to be a mixture of oral, odor, and germ-related topics.

#### We also compared the coherence score of NMF and LDA. With a score of 0.5447, more interpretable topic segregation, and a much shorter computing time, NMF is clearly the better choice.

In [56]:
from gensim.models import LdaModel

num_topics = 11 
lda_model = LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    iterations=400,
    alpha= 0.01,
    per_word_topics=True
)

In [57]:
for i, topic in lda_model.show_topics(num_topics=num_topics, num_words=20, formatted=False):
    top_words = [word for word, _ in topic]
    print(f"Topic {i}: {', '.join(top_words)}")

Topic 0: brush, facial, body, shower, sponge, bath, glass, clean, scrubber, head, replacement, electric, steel, stainless, face, handle, wipe, woman, beard, clipper
Topic 1: foot, massage, massager, roller, remover, file, callus, spa, skin, pedicure, solution, care, clean, muscle, deep, dry, tool, heat, value, clear
Topic 2: color, light, ultra, red, free, water, medical, nail, filter, bottle, supply, original, paper, probiotic, repair, fit, tablet, patch, baby, cup
Topic 3: hair, ear, iron, noise, sleep, ceramic, flat, machine, plug, air, hair_straightener, portable, sound, snore, straighten, plantar, fasciitis, brush, fast, scale
Topic 4: supplement, powder, capsule, support, organic, vitamin, natural, health, energy, liquid, fish_oil, vegan, formula, hair, strength, flavor, joint, nongmo, extract, free
Topic 5: clean, protein, mat, natural, cleaning, bar, organic, towel, cloth, fitness, bag, safe, restore, equipment, reusable, yoga, aroma, gear, residue, pad
Topic 6: toothbrush, non

In [58]:
from gensim.models.coherencemodel import CoherenceModel

# LDA Score
coherence_model_lda = CoherenceModel(
    model=lda_model, 
    texts=merged_df["tokens"], 
    dictionary=dictionary, 
    coherence='c_v'
)
coherence_score = coherence_model_lda.get_coherence()
print(f"LDA Coherence: {coherence_score:.4f}")

# NMF Score
def get_nmf_topics(nmf_model, feature_names, topn=20):
    topics = []
    for topic_idx, topic in enumerate(nmf_model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-topn-1:-1]]
        topics.append(top_words)
    return topics

nmf_topics_words = get_nmf_topics(nmf_model, feature_names, topn=20)

nmf_coherence_model = CoherenceModel(topics=nmf_topics_words, texts=merged_df["tokens"].tolist(), dictionary=dictionary, coherence='c_v')
nmf_coherence = nmf_coherence_model.get_coherence()
print(f"NMF Coherence: {nmf_coherence:.4f}")

LDA Coherence: 0.4152
NMF Coherence: 0.5447


#### Besides NMF and LDA, we also performed BERTopic, according to ChatGPT's suggestion. However, BERTopic does not allow us to set the number of topics manually, and it produced 297 topics for our data. We tried to reduce the topics to 10, but the reduced topics resulted in an extremely unbalanced distribution—Topic 1 contains about 1,500 products and Topic 2 contains about 7,000 products, while the remaining topics average around 100 products. This shows that BERTopic did not perform well when the number of topics was reduced to a small number.

In [60]:
!pip install bertopic



In [61]:
from bertopic import BERTopic

documents = merged_df["meta_title"].tolist()
topic_model = BERTopic(verbose=True)
topics, probabilities = topic_model.fit_transform(documents)

topic_info = topic_model.get_topic_info()
print(topic_info)

2025-03-01 22:32:12,674 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/291 [00:00<?, ?it/s]

2025-03-01 22:35:12,909 - BERTopic - Embedding - Completed ✓
2025-03-01 22:35:12,911 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-01 22:36:07,276 - BERTopic - Dimensionality - Completed ✓
2025-03-01 22:36:07,279 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-01 22:36:08,248 - BERTopic - Cluster - Completed ✓
2025-03-01 22:36:08,253 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-01 22:36:09,064 - BERTopic - Representation - Completed ✓


     Topic  Count                                        Name  \
0       -1   1692                        -1_cleaner_by_kit_of   
1        0    182            0_iron_straightener_ceramic_hair   
2        1    143           1_capsules_moringa_120_vegetarian   
3        2    136         2_reading_readers_glasses_eyekepper   
4        3    129               3_pill_organizer_box_medicine   
..     ...    ...                                         ...   
283    282     11        282_whitening_teeth_combine_enhances   
284    283     11           283_feets_unbleached_japan_sliver   
285    284     10  284_hyaluronic_acid_lubrication_naturebell   
286    285     10                  285_brushing_dry_boar_skin   
287    286     10                      286_mask_kpop_hole_exo   

                                        Representation  \
0    [cleaner, by, kit, of, energy, the, oz, capsul...   
1    [iron, straightener, ceramic, hair, curling, c...   
2    [capsules, moringa, 120, vegetarian, ext

In [62]:
from bertopic import BERTopic

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents)

topic_model = topic_model.reduce_topics(docs=documents, nr_topics=10)

In [63]:
topic_info = topic_model.get_topic_info()
print(topic_info)

   Topic  Count                                     Name  \
0     -1   1617                     -1_for_and_pack_with   
1      0   6928                      0_for_and_with_pack   
2      1    270          1_scale_digital_weight_bathroom   
3      2    192                  2_ear_plugs_noise_sound   
4      3     93                  3_battery_lithium_3d_3v   
5      4     88              4_backnobber_positive_co_ii   
6      5     47       5_smokebuddy_buddy_smoke_cigarette   
7      6     38            6_nerdwax_slipping_shark_seen   
8      7     13                 7_sq_stretchtite_3000_ft   
9      8     12  8_beard_application_horsehair_handlebar   

                                      Representation  \
0  [for, and, pack, with, of, support, oz, foot, ...   
1  [for, and, with, pack, oz, of, hair, natural, ...   
2  [scale, digital, weight, bathroom, smart, glas...   
3  [ear, plugs, noise, sound, concerts, reduction...   
4  [battery, lithium, 3d, 3v, micropedi, emjoi, t...   
5  

In [83]:
#LDA Hyper-Tunning
"""
#DISPLAY ONLY, DON'T RUN
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

def compute_coherence_values(dictionary, corpus, texts, start, limit, step, alpha_vals, passes_vals):

    results = []
    
    for alpha in alpha_vals:
        for passes in passes_vals:
            for num_topics in range(start, limit + 1, step):
                model = LdaModel(
                    corpus=corpus,
                    id2word=dictionary,
                    num_topics=num_topics,
                    random_state=42,
                    iterations=400,
                    passes=passes,
                    alpha=alpha,
                    eta='auto'
                )
                
                coherence_model = CoherenceModel(
                    model=model, 
                    texts=texts, 
                    dictionary=dictionary, 
                    coherence='c_v'
                )
                coherence_score = coherence_model.get_coherence()
                
                result = {
                    'num_topics': num_topics,
                    'alpha': alpha,
                    'passes': passes,
                    'coherence': coherence_score,
                    'model': model
                }
                results.append(result)
                print(f"alpha={alpha}, passes={passes}, num_topics={num_topics}, Coherence Score = {coherence_score:.4f}")
    
    return results

alpha_values = [0.01, 0.1, 1.0]
passes_values = [10, 20, 30]
start = 10
limit = 30 
step = 1


results = compute_coherence_values(
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=merged_df["tokens"].tolist(),
    start=start, 
    limit=limit, 
    step=step,
    alpha_vals=alpha_values,
    passes_vals=passes_values
)


best_result = max(results, key=lambda x: x['coherence'])
print("\nBest result:")
print(f"alpha={best_result['alpha']}, passes={best_result['passes']}, num_topics={best_result['num_topics']}, Coherence = {best_result['coherence']:.4f}")
"""

alpha=0.01, passes=10, num_topics=10, Coherence Score = 0.4157
alpha=0.01, passes=10, num_topics=11, Coherence Score = 0.4587
alpha=0.01, passes=10, num_topics=12, Coherence Score = 0.4242
alpha=0.01, passes=10, num_topics=13, Coherence Score = 0.4481
alpha=0.01, passes=10, num_topics=14, Coherence Score = 0.4432
alpha=0.01, passes=10, num_topics=15, Coherence Score = 0.3727
alpha=0.01, passes=10, num_topics=16, Coherence Score = 0.4084
alpha=0.01, passes=10, num_topics=17, Coherence Score = 0.3837
alpha=0.01, passes=10, num_topics=18, Coherence Score = 0.4288
alpha=0.01, passes=10, num_topics=19, Coherence Score = 0.4381
alpha=0.01, passes=10, num_topics=20, Coherence Score = 0.4001
alpha=0.01, passes=10, num_topics=21, Coherence Score = 0.4196
alpha=0.01, passes=10, num_topics=22, Coherence Score = 0.4283
alpha=0.01, passes=10, num_topics=23, Coherence Score = 0.4087
alpha=0.01, passes=10, num_topics=24, Coherence Score = 0.4315
alpha=0.01, passes=10, num_topics=25, Coherence Score =