In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt



In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/baonguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/baonguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
data = pd.read_json("/Users/baonguyen/IU/thesis/data/raw_data/renttherunway_final_data.json",lines=True)

# Clean Data

In [6]:
# take just the content of the article, lowercase and remove punctuation
summaries = data['review_text'].str.lower().apply(lambda x: re.sub(r"([^\w\s])", "", x))

In [7]:
# stop word removal
en_stopwords = stopwords.words('english')
summaries = summaries.apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

# embedding

In [8]:
# 5. Generate sentence embeddings using all-MiniLM-L6-v2
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(summaries, show_progress_bar=True)

Batches: 100%|██████████| 6017/6017 [03:49<00:00, 26.21it/s]


# UMAP

In [9]:
# 6. UMAP for dimensionality reduction to 2D
import umap
reducer = umap.UMAP(n_components=16, random_state=42)
umap_embeddings = reducer.fit_transform(embeddings)

  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


# clustering

In [20]:
# 7. HDBSCAN clustering
from sklearn.cluster import DBSCAN
clusterer = DBSCAN(min_samples=3)
labels = clusterer.fit_predict(umap_embeddings)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
data['Topic'] = labels

# bertopic

In [31]:
import numpy as np
from bertopic import BERTopic
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# Fine-tune your topic representations
representation_model = KeyBERTInspired()

# Convert doc_term to a list of strings
docs = []
for doc in doc_term:
    doc_str = ' '.join([dictionary[word_id] for word_id, _ in doc])
    print(doc_str)
    docs.append(doc_str)

# Fit BERTopic model
topic_model = BERTopic(embedding_model='sentence-transformers/all-MiniLM-L6-v2',nr_topics=15,verbose=True,representation_model=representation_model)
topics, probabilities = topic_model.fit_transform(docs)

# Preprocess documents
cleaned_docs = topic_model._preprocess_text(docs)

# Extract vectorizer and tokenizer from BERTopic
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [tokenizer(doc) for doc in cleaned_docs]
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Extract topic words
topic_words = [[words for words, _ in topic_model.get_topic(topic)]
               for topic in range(len(set(topics)) - 1)]

# Evaluate coherence
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

print(f"Coherence Score: {coherence}")

2025-04-14 21:43:02,859 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 6017/6017 [03:27<00:00, 29.04it/s]
2025-04-14 21:46:41,954 - BERTopic - Embedding - Completed ✓
2025-04-14 21:46:41,954 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-14 21:47:55,556 - BERTopic - Dimensionality - Completed ✓
2025-04-14 21:47:55,560 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZER

Coherence Score: 0.517098040582251


In [34]:
# Add topics to the original DataFrame
data['Topic'] = topics

In [28]:
data.to_csv('data/clean_data/data_with_bertopic_column_.csv')