In [60]:
import pandas as pd
import numpy as np
import requests
import json
from tqdm import tqdm
from langdetect import detect, DetectorFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Translation

In [63]:
DetectorFactory.seed = 0

API_KEY = ""
TRANSLATE_URL = "https://translation.googleapis.com/language/translate/v2"

def detect_language(text):
    try:
        return detect(text) if pd.notna(text) and text.strip() != "" else "en"
    except:
        return "en"

def translate_batch(texts):
    try:
        if not texts:
            return texts
        
        data = {
            "q": texts,
            "target": "en",
            "format": "text"
        }
        
        response = requests.post(
            TRANSLATE_URL,
            params={"key": API_KEY},
            json=data
        )
        response.raise_for_status()
        
        result = response.json()
        return [t["translatedText"] for t in result["data"]["translations"]]
    
    except Exception as e:
        print(f"Error translating batch: {e}")
        return texts

reviews_data = pd.read_csv('reviews.csv')

print("Detecting languages...")
reviews_data["lang"] = reviews_data["comments"].apply(detect_language)

non_english_mask = reviews_data["lang"] != "en"
non_english_comments = reviews_data.loc[non_english_mask, "comments"].tolist()

batch_size = 100
translated_comments = []
for i in tqdm(range(0, len(non_english_comments), batch_size), desc="Translating non-English comments"):
    batch = non_english_comments[i:i + batch_size]
    translated_batch = translate_batch(batch)
    translated_comments.extend(translated_batch)

reviews_data.loc[non_english_mask, "comments"] = translated_comments

reviews_data.to_csv('reviews_cleaned.csv', index=False)
print("Translation complete. Only non-English texts were translated.")

Detecting languages...


Translating non-English comments: 100%|███████| 191/191 [00:34<00:00,  5.61it/s]


Translation complete. Only non-English texts were translated.


# After Translation

In [2]:
output = "reviews_cleaned.csv"
reviews_data = pd.read_csv(output)

## Basic Statistics

In [4]:
# Data First Few Lines
reviews_data.head(20)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,word_count,has_html
0,2595,17857,2009-11-21,50679,Jean,Our three-night stay. We enjoyed the apartment...,124,False
1,2595,19176,2009-12-05,53267,Cate,Great experience.,2,False
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...,90,False
3,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en...",66,False
4,2595,46312,2010-05-25,117113,Alicia,We had a wonderful stay at Jennifer's charming...,24,False
5,2595,1238204,2012-05-07,1783688,Sergey,Hi to everyone!\rWould say our greatest compli...,99,False
6,2595,1293632,2012-05-17,1870771,Loïc,"Jennifer was very friendly and helpful, and he...",37,False
7,2595,2022498,2012-08-18,2124102,Melanie,This apartment is like a real castle old and u...,208,False
8,2595,4682989,2013-05-20,496053,Eric,Jennifer's place was in a great midtown locati...,57,False
9,2595,13193832,2014-05-21,13685934,Gerald,Jennifer is a very nice host. Everything is cl...,25,False


In [6]:
#Change Date Type
reviews_data['date'] = pd.to_datetime(reviews_data['date'])

reviews_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 969486 entries, 0 to 969485
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   listing_id     969486 non-null  int64         
 1   id             969486 non-null  int64         
 2   date           969486 non-null  datetime64[ns]
 3   reviewer_id    969486 non-null  int64         
 4   reviewer_name  969486 non-null  object        
 5   comments       969485 non-null  object        
 6   word_count     969486 non-null  int64         
 7   has_html       969486 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(4), object(2)
memory usage: 52.7+ MB


In [7]:
# Data Description
reviews_data.describe()

Unnamed: 0,listing_id,id,date,reviewer_id,word_count
count,969486.0,969486.0,969486,969486.0,969486.0
mean,1.636331e+17,4.890135e+17,2021-02-20 06:09:18.466651648,161224700.0,44.981082
min,2595.0,3149.0,2009-05-25 00:00:00,1.0,1.0
25%,9841695.0,366479800.0,2019-01-02 00:00:00,31445090.0,15.0
50%,27612880.0,5.209964e+17,2021-12-19 00:00:00,105885300.0,32.0
75%,51709110.0,9.101962e+17,2023-06-09 00:00:00,250688900.0,60.0
max,1.308179e+18,1.325553e+18,2025-01-02 00:00:00,669621300.0,1001.0
std,3.350861e+17,4.734849e+17,,157251700.0,45.958727


In [8]:
#total values per column
reviews_data.count()

listing_id       969486
id               969486
date             969486
reviewer_id      969486
reviewer_name    969486
comments         969485
word_count       969486
has_html         969486
dtype: int64

## Data Cleaning

In [10]:
reviews_data['comments'].fillna("blank", inplace=True)
reviews_data['reviewer_name'].fillna("none", inplace=True)
reviews_data.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reviews_data['comments'].fillna("blank", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reviews_data['reviewer_name'].fillna("none", inplace=True)


listing_id       0
id               0
date             0
reviewer_id      0
reviewer_name    0
comments         0
word_count       0
has_html         0
dtype: int64

## Cleaning & Preprocessing for Text Analysis

In [12]:
reviews = reviews_data.copy()

In [13]:
reviews = pd.DataFrame(reviews_data.comments.str.lower()) #make everything lowercase
reviews = pd.DataFrame(reviews.comments.str.replace('[^\w\s]','', regex=True)) # remove punctuation
reviews = pd.DataFrame(reviews.comments.str.strip()) # removing leading & trailing spaces
reviews.head()

  reviews = pd.DataFrame(reviews.comments.str.replace('[^\w\s]','', regex=True)) # remove punctuation


Unnamed: 0,comments
0,our threenight stay we enjoyed the apartment w...
1,great experience
2,ive stayed with my friend at the midtown castl...
3,weve been staying here for about 9 nights enjo...
4,we had a wonderful stay at jennifers charming ...


# Topic Modeling

## 1. Total Dataset

In [15]:
comments = reviews_data['comments'].dropna().tolist()

comments = list(tqdm(comments, desc="Processing comments for topic modeling"))

vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

topic_model = BERTopic(
    language="english",
    vectorizer_model=vectorizer_model,
    nr_topics=None, 
    verbose=True
)

with tqdm(total=len(comments), desc="Fitting BERTopic model") as pbar:
    topics, probabilities = topic_model.fit_transform(comments)
    pbar.update(len(comments))

topic_model.reduce_topics(comments)

print(topic_model.get_topic_info())
topic_info = topic_model.get_topic_info()

Processing comments for topic modeling: 100%|█| 969486/969486 [00:00<00:00, 8489
Fitting BERTopic model:   0%|                        | 0/969486 [00:00<?, ?it/s]2025-02-28 17:16:59,670 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/30297 [00:00<?, ?it/s]

2025-02-28 17:30:03,119 - BERTopic - Embedding - Completed ✓
2025-02-28 17:30:03,120 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-28 18:57:04,162 - BERTopic - Dimensionality - Completed ✓
2025-02-28 18:57:04,217 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

    Topic   Count                                               Name  \
0      -1  374768                          -1_great_stay_place_clean   
1       0  570453                       0_great_place_stay_apartment   
2       1   15482                       1_good_great_thank_good good   
3       2    3322                                 2_na_yes_na na_ras   
4       3    1568    3_recommended_recommend_highly_recommend highly   
5       4     975               4_value_great value_good value_money   
6       5     720                            5_ok_ok ok_okay_alright   
7       6     669            6_advertised_described_expected_exactly   
8       7     397          7_blank_blank blank_comment_comment blank   
9       8     283                      8_review_thumbs_previous_time   
10      9     258     9_1010_1010 1010_recommend 1010_1010 recommend   
11     10     163    10_needed_exactly_exactly needed_needed exactly   
12     11     116            11_wont_regret_wont regret_disappoi

In [None]:
topic_info.to_csv('total_topic.csv', index=False)

## 2. Post Dataset (Post, Sep/05/2023)

In [5]:
reviews_data['date'] = pd.to_datetime(reviews_data['date'], errors='coerce')
df_post = reviews_data[reviews_data['date'] >= '2023-09-05']

In [21]:
docs = df_post['comments'].tolist()

vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3)
)

hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom'
)

representation_model = {
    "Main": KeyBERTInspired(),
    "MMR": MaximalMarginalRelevance(diversity=0.3)
}

topic_model = BERTopic(
    vectorizer_model=vectorizer,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    nr_topics=10,
    verbose=True
)

print("Starting topic modeling...")
with tqdm(total=len(docs), desc="Processing documents") as pbarzzzzzzz
    topics, probs = topic_model.fit_transform(docs)
    pbar.update(len(docs))

topic_info = topic_model.get_topic_info()

outlier_indices = [i for i, t in enumerate(topics) if t == -1]
if outlier_indices:
    print(f"Reassigning {len(outlier_indices)} outliers to the closest cluster...")
    
    embeddings = topic_model.embedding_model.embedding_model.encode(docs, show_progress_bar=True)
    
    topic_embeddings = {}
    for topic_id in topic_info['Topic']:
        if topic_id != -1:
            topic_docs = [embeddings[i] for i, t in enumerate(topics) if t == topic_id]
            topic_embeddings[topic_id] = np.mean(topic_docs, axis=0)
    
    for i in outlier_indices:
        doc_embedding = embeddings[i].reshape(1, -1)
        similarities = {t: cosine_similarity(doc_embedding, emb.reshape(1, -1))[0, 0] for t, emb in topic_embeddings.items()}
        closest_topic = max(similarities, key=similarities.get)
        topics[i] = closest_topic
    topic_model.update_topics(docs, topics=topics)
topic_info = topic_model.get_topic_info()

representative_docs = {}
for topic in topic_info['Topic']:
    reps = topic_model.get_representative_docs(topic)
    representative_docs[topic] = reps[0] if reps else "No representative document"

topic_info['Representative_Sentence'] = topic_info['Topic'].map(representative_docs)

print("Topic info with meaningful representations and sentences (excluding Topic -1):")
print(topic_info[['Topic', 'Count', 'Name', 'Representation', 'Representative_Sentence']])

Starting topic modeling...


Processing documents:   0%|                          | 0/185779 [00:00<?, ?it/s]2025-03-01 15:21:30,401 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/5806 [00:00<?, ?it/s]

2025-03-01 15:23:50,583 - BERTopic - Embedding - Completed ✓
2025-03-01 15:23:50,583 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-01 15:24:45,282 - BERTopic - Dimensionality - Completed ✓
2025-03-01 15:24:45,289 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Reassigning 82506 outliers to the closest cluster...


Batches:   0%|          | 0/5806 [00:00<?, ?it/s]



Topic info with meaningful representations and sentences (excluding Topic -1):
   Topic   Count                                  Name  \
0      0  166022                      0_the_and_to_was   
1      1    3596           1_good_great_nice_excellent   
2      2    7027                      2_the_was_to_and   
3      3    4029          3_everything_was_thank_great   
4      4    2225                  4_clean_and_very_was   
5      5    1192       5_experience_service_great_very   
6      6     744                6_value_price_for_good   
7      7     439  7_recommend_recommended_highly_would   
8      8     505           8_as_exactly_what_described   

                                      Representation  \
0  [the, and, to, was, is, in, very, stay, for, g...   
1  [good, great, nice, excellent, perfect, thank,...   
2     [the, was, to, and, not, in, it, of, that, we]   
3  [everything, was, thank, great, you, it, perfe...   
4  [clean, and, very, was, the, everything, nice,...   
5  [

In [23]:
topic_info.to_csv('post_topic.csv', index=False)

## 3. Pre Dataset (Pre, Sep/05/2023)

In [27]:
reviews_data['date'] = pd.to_datetime(reviews_data['date'], errors='coerce')
df_pre = reviews_data[reviews_data['date'] < '2023-09-05']

In [29]:
docs = df_pre['comments'].tolist()

vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3)
)

hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom'
)

representation_model = {
    "Main": KeyBERTInspired(),
    "MMR": MaximalMarginalRelevance(diversity=0.3)
}

topic_model = BERTopic(
    vectorizer_model=vectorizer,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    nr_topics=10,
    verbose=True
)

print("Starting topic modeling...")
with tqdm(total=len(docs), desc="Processing documents") as pbar:
    topics, probs = topic_model.fit_transform(docs)
    pbar.update(len(docs))

topic_info = topic_model.get_topic_info()

outlier_indices = [i for i, t in enumerate(topics) if t == -1]
if outlier_indices:
    print(f"Reassigning {len(outlier_indices)} outliers to the closest cluster...")
    
    embeddings = topic_model.embedding_model.embedding_model.encode(docs, show_progress_bar=True)
    
    topic_embeddings = {}
    for topic_id in topic_info['Topic']:
        if topic_id != -1:
            topic_docs = [embeddings[i] for i, t in enumerate(topics) if t == topic_id]
            topic_embeddings[topic_id] = np.mean(topic_docs, axis=0)
    
    for i in outlier_indices:
        doc_embedding = embeddings[i].reshape(1, -1)
        similarities = {t: cosine_similarity(doc_embedding, emb.reshape(1, -1))[0, 0] for t, emb in topic_embeddings.items()}
        closest_topic = max(similarities, key=similarities.get)
        topics[i] = closest_topic
    topic_model.update_topics(docs, topics=topics)
topic_info = topic_model.get_topic_info()

representative_docs = {}
for topic in topic_info['Topic']:
    reps = topic_model.get_representative_docs(topic)
    representative_docs[topic] = reps[0] if reps else "No representative document"

topic_info['Representative_Sentence'] = topic_info['Topic'].map(representative_docs)

print("Topic info with meaningful representations and sentences (excluding Topic -1):")
print(topic_info[['Topic', 'Count', 'Name', 'Representation', 'Representative_Sentence']])

Starting topic modeling...


Processing documents:   0%|                          | 0/781039 [00:00<?, ?it/s]2025-03-02 01:00:24,291 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/24408 [00:00<?, ?it/s]

2025-03-02 01:12:21,868 - BERTopic - Embedding - Completed ✓
2025-03-02 01:12:21,869 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-02 01:41:58,162 - BERTopic - Dimensionality - Completed ✓
2025-03-02 01:41:58,188 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Reassigning 279574 outliers to the closest cluster...


Batches:   0%|          | 0/24408 [00:00<?, ?it/s]



Topic info with meaningful representations and sentences (excluding Topic -1):
   Topic   Count                          Name  \
0      0  732215              0_the_and_to_was   
1      1   22492   1_great_was_everything_very   
2      2    7789   2_good_great_excellent_nice   
3      3   13329  3_clean_very_and_comfortable   
4      4    2454      4_value_price_good_money   
5      5     955            5_ok_all_good_okay   
6      6     472           6_blank_top_no_none   
7      7     439        7_stars_star_five_host   
8      8     894        8_review_as_no_comment   

                                      Representation  \
0  [the, and, to, was, is, in, very, for, we, great]   
1  [great, was, everything, very, and, host, than...   
2  [good, great, excellent, nice, amazing, very, ...   
3  [clean, very, and, comfortable, was, nice, the...   
4  [value, price, good, money, for, great, worth,...   
5  [ok, all, good, okay, thanks, alright, everyth...   
6  [blank, top, no, none, no

In [31]:
topic_info.to_csv('pre_topic.csv', index=False)