In [21]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt



In [22]:
data = pd.read_json("/Users/baonguyen/IU/thesis/data/raw_data/renttherunway_final_data.json",lines=True)

In [23]:
data

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192539,fit,66386,34dd,2252812,140lbs,10.0,work,Fit like a glove!,hourglass,LOVE IT!!! First Item Im thinking of buying!,jumpsuit,"5' 9""",8,42.0,"May 18, 2016"
192540,fit,118398,32c,682043,100lbs,10.0,work,The pattern contrast on this dress is really s...,petite,LOVE it!,dress,"5' 1""",4,29.0,"September 30, 2016"
192541,fit,47002,36a,683251,135lbs,6.0,everyday,"Like the other DVF wraps, the fit on this is f...",straight & narrow,"Loud patterning, flattering fit",dress,"5' 8""",8,31.0,"March 4, 2016"
192542,fit,961120,36c,126335,165lbs,10.0,wedding,This dress was PERFECTION. it looked incredib...,pear,loved this dress it was comfortable and photog...,dress,"5' 6""",16,31.0,"November 25, 2015"


# Clean Data

In [24]:
# take just the content of the article, lowercase and remove punctuation
summaries = data['text'].str.lower().apply(lambda x: re.sub(r"([^\w\s])", "", x))

In [25]:
# stop word removal
en_stopwords = stopwords.words('english')
summaries = summaries.apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

In [26]:
# tokenize
summaries = summaries.apply(lambda x: word_tokenize(x))

In [27]:
# stemming (done for speed as we have a lot of text)
ps = PorterStemmer()
sumsummaries =summaries.apply(lambda tokens: [ps.stem(token) for token in tokens])

In [28]:
summaries

0         [adorable, romper, belt, zipper, little, hard,...
1         [rented, dress, photo, shoot, theme, hollywood...
2         [hugged, right, places, perfect, dress, event,...
3         [rented, companys, black, tie, awards, banquet...
4         [always, petite, upper, body, extremely, athle...
                                ...                        
192539                                   [fit, like, glove]
192540    [pattern, contrast, dress, really, stunning, u...
192541    [like, dvf, wraps, fit, fantastic, albeit, col...
192542    [dress, perfection, looked, incredible, photos...
192543    [dress, wonderful, originally, planned, wear, ...
Name: review_text, Length: 192544, dtype: object

# Vectorization

In [29]:
# create dictionary of all words
dictionary = corpora.Dictionary(summaries)
print(dictionary)

Dictionary<48569 unique tokens: ['absolutely', 'adorable', 'belt', 'compliments', 'day']...>


In [30]:
# vecotize using bag of words into a document term matrix
doc_term = [dictionary.doc2bow(text) for text in summaries]

# bertopic

In [31]:
import numpy as np
from bertopic import BERTopic
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# Fine-tune your topic representations
representation_model = KeyBERTInspired()

# Convert doc_term to a list of strings
docs = []
for doc in doc_term:
    doc_str = ' '.join([dictionary[word_id] for word_id, _ in doc])
    docs.append(doc_str)

# Fit BERTopic model
topic_model = BERTopic(embedding_model='sentence-transformers/all-MiniLM-L6-v2',nr_topics=15,verbose=True,representation_model=representation_model)
topics, probabilities = topic_model.fit_transform(docs)

# Preprocess documents
cleaned_docs = topic_model._preprocess_text(docs)

# Extract vectorizer and tokenizer from BERTopic
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [tokenizer(doc) for doc in cleaned_docs]
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Extract topic words
topic_words = [[words for words, _ in topic_model.get_topic(topic)]
               for topic in range(len(set(topics)) - 1)]

# Evaluate coherence
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

print(f"Coherence Score: {coherence}")

2025-04-14 21:43:02,859 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 6017/6017 [03:27<00:00, 29.04it/s]
2025-04-14 21:46:41,954 - BERTopic - Embedding - Completed ✓
2025-04-14 21:46:41,954 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-14 21:47:55,556 - BERTopic - Dimensionality - Completed ✓
2025-04-14 21:47:55,560 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZER

Coherence Score: 0.517098040582251


In [32]:
# Convert to a format suitable for BERTopic
# docs = []
# for doc in doc_term:
#     doc_str = ' '.join([dictionary[word_id] for word_id, _ in doc])
#     docs.append(doc_str)

# # Apply BERTopic
# topic_model = bertopic.BERTopic(embedding_model='sentence-transformers/all-MiniLM-L6-v2',nr_topics=15,verbose=True)
# topics, probabilities = topic_model.fit_transform(docs)




In [33]:
topic_model.get_topic_info().to_csv("data/topic_info.csv",index=False)

In [34]:
# Add topics to the original DataFrame
data['Topic'] = topics

In [35]:
data.to_csv('data/clean_data/data_with_bertopic_column.csv')