# Load Dataset Pickle file

In [None]:
import pandas as pd

# Load the hotel-review dataset from its pickle snapshot.
# NOTE: unpickling executes code embedded in the file, so only load pickle
# files that come from a trusted source.
dataset_path = './dataset/State_Hotel_reviews_v1_240415.01.pkl'
df = pd.read_pickle(dataset_path)

In [None]:
# Version tag embedded in the names of the files written under ./output.
version = "240418.01"

# Friend/Couple Positive (4 or 5 stars)

Cleanliness, Rooms, Location

In [None]:
# Select the reviews of one travel type whose Location, Rooms and Cleanliness
# ratings are all 4 or 5.
travel_type = "Friend/Couple"  # other labels: "Family", "Business/Solo"
positive_ratings = [4, 5]
# Aspects that must be rated positively. "Overall Rating", "Value", "Service"
# and "Sleep Quality" are further rating columns that can be added here.
rated_aspects = ["Location", "Rooms", "Cleanliness"]

# Literal (non-regex) substring match on the label; rows with a missing label
# count as non-matching.
has_travel_type = df['Travel Type Label'].str.contains(travel_type, na=False, regex=False)
df_friend_pos = df[has_travel_type]

# Keep a row only when every listed aspect has a positive rating.
all_aspects_positive = df_friend_pos[rated_aspects].isin(positive_ratings).all(axis=1)
df_friend_pos = df_friend_pos[all_aspects_positive]

print(f'{travel_type} positive: {len(df_friend_pos)}')

In [None]:
# Working set for the cells that follow. This binds a second name to the same
# DataFrame object; it does not make a copy.
df_sample = df_friend_pos

In [None]:
# Build one space-separated string per review from its 'token' entry.
docs = [" ".join(tokens) for tokens in df_sample['token']]
docs[:3]  # preview the first three documents

In [None]:
# Number of reviews (rows) in the working set.
df_sample.shape[0]

In [None]:
%%time

import os

# Both variables are set before the modelling libraries below are imported.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",       # restrict CUDA to device 0
    "TOKENIZERS_PARALLELISM": "true",  # tokenizer parallelism switch
})

from umap import UMAP
from hdbscan import HDBSCAN
#from cuml.cluster import HDBSCAN
#from cuml.manifold import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Seed handed to UMAP below.
seed = 42

# Step 1 - Sentence embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Dimensionality reduction
# (n_neighbors values of 15 and 20 were tried before settling on 30.)
umap_settings = {
    "n_neighbors": 30,
    "n_components": 2,
    "min_dist": 0.05,
    "metric": "cosine",
    "random_state": seed,
}
umap_model = UMAP(**umap_settings)

# Step 3 - Clustering of the reduced embeddings
hdbscan_settings = {
    "min_cluster_size": 300,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
    "prediction_data": True,
}
hdbscan_model = HDBSCAN(**hdbscan_settings)

# Step 4 - Topic tokenisation: 2- and 3-grams, English stop words removed
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(2, 3))

# Step 5 - Class-based TF-IDF topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) fine-tuning of the topic representations with a
# `bertopic.representation` model
representation_model = KeyBERTInspired(top_n_words=100)

# Wire the six components into one BERTopic pipeline.
pipeline_components = {
    "embedding_model": embedding_model,            # Step 1
    "umap_model": umap_model,                      # Step 2
    "hdbscan_model": hdbscan_model,                # Step 3
    "vectorizer_model": vectorizer_model,          # Step 4
    "ctfidf_model": ctfidf_model,                  # Step 5
    "representation_model": representation_model,  # Step 6
}
topic_model = BERTopic(top_n_words=100, **pipeline_components)

# Fit on the review documents; `topics` and `probs` hold the values returned
# by fit_transform for this initial (un-reduced) fit.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Persist the fitted model under ./output. '/' in the travel type is replaced
# with '-' so that it does not introduce an extra path component.
model_name = f"model_{travel_type.replace('/','-')}_pos_{version}"
topic_model.save(f"./output/{model_name}")

In [None]:
# Show the model's topic overview as the cell output.
topic_model.get_topic_info()

In [None]:
# Reduce the number of topics, leaving the target count to BERTopic
# (nr_topics='auto').
topic_model.reduce_topics(docs=docs, nr_topics='auto')

In [None]:
# Reduce the number of topics to a fixed target of 16.
# NOTE(review): if the 'auto' reduction cell was run first, this acts on the
# already-reduced model - confirm that is intended.
topic_model.reduce_topics(docs=docs, nr_topics=16)

In [None]:
%%time

# Plot the model's topics with BERTopic's built-in topic visualisation.
topic_model.visualize_topics()


In [None]:
%%time

# Plot the documents in `docs` with BERTopic's document visualisation.
# NOTE(review): no precomputed embeddings are passed, so they are presumably
# recomputed on every run - confirm if this cell is slow.
topic_model.visualize_documents(docs)


In [None]:
%%time

# Bar charts of the top 30 words for the ten most frequent topics.
# The selection is left to BERTopic (top_n_topics) instead of slicing the
# `topics` list returned by fit_transform: that list predates the
# reduce_topics calls, so it can name topics that no longer exist, and the
# iteration order of set() is arbitrary.
topic_model.visualize_barchart(top_n_topics=10, n_words=30)


In [None]:
# Save the model again. The path is the same as for the earlier save, so the
# previously stored model is overwritten with the current state.
model_name = f"model_{travel_type.replace('/','-')}_pos_{version}"
topic_model.save(f"./output/{model_name}")

In [None]:
# Collect the keyword entries of every topic into a table with one column per
# topic, then store it as pickle and CSV under ./output.
#
# The topic ids are read from the model instead of being assumed to be
# 0..len(topic_sizes_)-2. That assumption only holds when the outlier topic
# (-1) is present; without it the last real topic was silently dropped.
keyword_topic_ids = sorted(t for t in topic_model.topic_sizes_ if t != -1)
df_topickeywords = pd.DataFrame(
    [topic_model.get_topic(t) for t in keyword_topic_ids]
).transpose()
# Label each column with the id of the topic it holds.
df_topickeywords.columns = keyword_topic_ids

# Shared suffix of both output files; '/' in the travel type would otherwise
# be read as a path separator.
keywords_tag = f"{travel_type.replace('/','-')}_pos_{version}"
df_topickeywords.to_pickle(f"./output/df_keywords_{keywords_tag}.pkl")
df_topickeywords.to_csv(f"./output/keywords_{keywords_tag}.csv",
                        encoding='utf-8', index=False)

df_topickeywords

In [None]:
# Print the keywords of every topic as a "### Topic N:" heading followed by a
# fenced code block.
#
# The ids are taken from the model rather than from range(len(...)-1), which
# skipped the last topic whenever the model had no outlier topic (-1).
for topic_id in sorted(t for t in topic_model.topic_sizes_ if t != -1):
    # Each entry returned by get_topic carries the keyword in position 0.
    keywords = [item[0] for item in topic_model.get_topic(topic_id)]
    print(f'### Topic {topic_id}:')
    print('```python!')
    print(f'{keywords}')
    print('```')