# README

### Purpose of this notebook
- Cluster comments with the `BERTopic` library, which uses UMAP and HDBSCAN to cluster the comments.

### Steps
1. Read the raw comment text.
2. Perform BERTopic to form topics (clusters).
3. Visualization.
    - Intertopic distance map
    - Cluster and scatter plot
    - Topic hierarchy
    - Keywords for each topic
    - Topic similarity matrix
    - Term score decline per Topic

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

from importlib import reload

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.articut as A
import utils.bertopic as BT
import utils.data as D
import utils.io as IO
# import utils.visualize_cluster as VC

In [None]:
from bertopic import BERTopic

In [None]:
# Configure matplotlib so Chinese labels render: use the sans-serif family
# backed by the Noto Sans CJK TC font, and turn off the Unicode minus glyph.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Noto Sans CJK TC'],
    'axes.unicode_minus': False,
})

# Switch off Hugging Face tokenizer parallelism through its environment flag.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

## Hyper-parameters

In [None]:
# --- Model identifiers -------------------------------------------------------
# Sentence encoder that is instantiated when no encoder comes with a loaded model.
SBERT_MODEL_NAME = 'ckiplab/bert-base-chinese'
# File name under the model directory used both for loading and saving.
BERTOPIC_MODEL_NAME = "BERTopic_custom_mcs_100_ckip_diversified_low_all"

# --- BERTopic / clustering knobs ---------------------------------------------
DIVERSITY = 0.3          # forwarded to BERTopic(diversity=...)
NR_TOPICS = None         # forwarded to BERTopic(nr_topics=...)
MIN_CLUSTER_SIZE = 100   # forwarded to HDBSCAN(min_cluster_size=...)

# --- Run mode ------------------------------------------------------------------
# Exactly one of the two flags is True; loading is the default.
TRAIN_MODEL = False
LOAD_MODEL = not TRAIN_MODEL

# Data subset selector handed to the utils.data readers.
TRAIN_OR_ALL = 'all'

# Full-width dollar sign (not referenced by the visible cells of this notebook).
SPLITTER = '＄'

# --- Reproducibility -----------------------------------------------------------
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# --- Figure size passed to the BERTopic visualisations -------------------------
VIS_WIDTH, VIS_HEIGHT = 800, 600

## Load Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bring the helper callables from utils.bertopic into the notebook namespace.
# NOTE(review): `_pass` is not referenced by any visible cell — presumably it
# must be resolvable when a saved model is unpickled; confirm before removing.
_pass = BT._pass
topic_doc_tokenizer = BT.topic_doc_tokenizer
# Count vectorizer that tokenizes with the project tokenizer and does not lowercase.
vectorizer = CountVectorizer(tokenizer=topic_doc_tokenizer, lowercase=False)

In [None]:
# Try to restore a previously saved BERTopic model. If loading fails for any
# reason, flip the run-mode flags so the training cells below execute instead.
if LOAD_MODEL:
    model_path = os.path.join(P.FP_COMMENT_CLUSTERING_MODEL_DIR, BERTOPIC_MODEL_NAME)
    try:
        topic_model = BERTopic.load(model_path)
    except Exception as load_error:
        # Catch Exception (not a bare `except:`) so KeyboardInterrupt/SystemExit
        # still propagate, and print the cause: a missing file and a corrupt or
        # incompatible model are otherwise indistinguishable.
        print("BERTopic model could not be loaded from {!r}: {!r}".format(model_path, load_error))
        print("Falling back to training a new model.")
        TRAIN_MODEL = True
        LOAD_MODEL = not TRAIN_MODEL
    else:
        print("Load BERTopic model success.")

## Read comment sentences

In [None]:
# Load the de-duplicated split comments for the subset selected by TRAIN_OR_ALL:
# once as a DataFrame (its `grade` and `split_comment` columns are used below)
# and once as the collection that is embedded and clustered.
# NOTE(review): presumably both readers return rows in the same order — confirm,
# since topics are later paired with df_split_comments['grade'] by position.
df_split_comments = D.read_df_split_comments_no_duplicate(TRAIN_OR_ALL)
split_comments = D.read_split_comments_no_duplicate(TRAIN_OR_ALL)
# Alias of the same DataFrame object (not referenced by the visible cells).
df_tokenization_database = df_split_comments

In [None]:
# Number of comment sentences that will be embedded and clustered.
len(split_comments)

In [None]:
# Rows per grade, expressed as a percentage of all rows in the DataFrame.
df_split_comments.grade.value_counts() / len(df_split_comments) * 100

## Prepare custom models for BERTopic

### Construct a Dimension Reduction Pipeline
- Original: High-dimensional embedding from SBERT (300+)
- Use PCA to reduce to `PCA_COMPONENTS` dimensions (300 in the parameter cell below)
- Use UMAP to reduce to `UMAP_COMPONENTS` dimensions (50 in the parameter cell below)
- Normalize the embeddings

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.preprocessing import Normalizer

### UMAP Parameters

- `n_neighbors`: the size of the local neighborhood UMAP will look at when attempting to learn the manifold structure of the data.
    - Low value: focus on the local structure
    - High value: focus on the global structure
- `min_dist`: the minimum distance apart that points are allowed to be in the low dimensional representation. 
    - Low value: clumpier embeddings, good for clustering
    - High value: focus on the topological structure
- `n_components`: Dimensions of the reduced dimension space.
    - For visualization: 2 or 3
    - For clustering: Larger value is acceptable (10 or 50)
- `metric`: euclidean, minkowski, cosine, etc.

In [None]:
# PCA: number of components kept by the first reduction step.
PCA_COMPONENTS = 300

# UMAP: settings shared by the clustering pipeline and (except for the output
# dimensionality) the 2-D visualisation pipeline.
UMAP_NEIGHBORS = 100      # n_neighbors
UMAP_COMPONENTS = 50      # n_components used for clustering
UMAP_MIN_DIST = 0.01      # min_dist
UMAP_METRIC = 'cosine'    # metric

In [None]:
# Build the three stages separately, then chain them: PCA -> UMAP -> L2 norm.
# The step names ('pca', 'umap', 'norm') are looked up later in the notebook.
pca_step = PCA(n_components=PCA_COMPONENTS)

umap_step = UMAP(
    n_neighbors=UMAP_NEIGHBORS,
    n_components=UMAP_COMPONENTS,
    min_dist=UMAP_MIN_DIST,
    metric=UMAP_METRIC,
    random_state=RANDOM_STATE,
)

norm_step = Normalizer(norm='l2')

dimension_reduction_pipe = Pipeline([
    ('pca', pca_step),
    ('umap', umap_step),
    ('norm', norm_step),
])

### Prepare HDBSCAN model

In [None]:
from hdbscan import HDBSCAN

In [None]:
# HDBSCAN parameters
MIN_SAMPLES = 10  # fixed
CLUSTER_SELECTION_METHOD = 'eom'  # either 'eom' or 'leaf'

In [None]:
# Clustering model handed to BERTopic as `hdbscan_model`.
# `prediction_data=True` is set here; `hdbscan.approximate_predict` is called on
# the fitted model later in the notebook (presumably the reason — confirm).
cluster = HDBSCAN(
    min_samples=MIN_SAMPLES,
    min_cluster_size=MIN_CLUSTER_SIZE,
    cluster_selection_method=CLUSTER_SELECTION_METHOD,
#     metric='minkowski', p=10,
    prediction_data=True,
)

### Generate SBERT embeddings

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Reuse the sentence encoder stored inside the loaded topic model when there is
# one; otherwise build it from SBERT_MODEL_NAME.
try:
    sentence_bert = topic_model.embedding_model.embedding_model
except (NameError, AttributeError):
    # NameError: no topic model was loaded above.
    # AttributeError: the loaded model carries no embedding-model wrapper.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    sentence_bert = None

if sentence_bert is None:
    # Also covers a wrapper whose inner model is None
    # (presumably a model saved without its encoder — confirm).
    sentence_bert = SentenceTransformer(SBERT_MODEL_NAME)

In [None]:
# Encode every comment sentence with the sentence encoder (progress bar off).
split_comments_embeds = sentence_bert.encode(split_comments, show_progress_bar=False)

### Embedding Visualization pipeline

In [None]:
# Single-step pipeline for plotting: same UMAP settings as the clustering
# pipeline, but projecting to 2 components.
visualization_umap = UMAP(
    n_neighbors=UMAP_NEIGHBORS,
    n_components=2,
    min_dist=UMAP_MIN_DIST,
    metric=UMAP_METRIC,
    random_state=RANDOM_STATE,
)

visualization_pipe = Pipeline([('umap', visualization_umap)])

In [None]:
%%time
# Fit the 2-component UMAP on the sentence embeddings; the result is passed as
# `reduced_embeddings` to the document scatter plots below.
split_comments_plot_data = visualization_pipe.fit_transform(split_comments_embeds)

## BERTopic integration

### Parameters
- `calculate_probabilities`: Whether to calculate the probabilities of all topics per document instead of the probability of the assigned topic per document. 

- `diversity`: Whether to use MMR to diversify the top n words inside the topic. The value ranges from 0 to 1.
    - 0: not diverse
    - 1: completely diverse

### Apply BERTopic

In [None]:
# Only when training: assemble a BERTopic instance from the custom components
# prepared above (sentence encoder, PCA/UMAP/normalise pipeline in the UMAP
# slot, HDBSCAN clusterer, count vectorizer). Commented-out keyword arguments
# are left at the library defaults.
if TRAIN_MODEL:
    topic_model = BERTopic(
    #     top_n_words=10,
    #     n_gram_range=(1, 1),
    #     min_topic_size=10,
        nr_topics=NR_TOPICS,
        calculate_probabilities=True,
        diversity=DIVERSITY, 
    #     seed_topic_list=None,
        embedding_model=sentence_bert,
        umap_model=dimension_reduction_pipe,
        hdbscan_model=cluster,
        vectorizer_model=vectorizer,
        verbose=True
    )

In [None]:
%%time
# Only when training: fit the model through the project's custom fit routine,
# which returns the per-document topics and probabilities.
if TRAIN_MODEL:
    topics, probs = BT.custom_fit_transform(topic_model, split_comments)

### Save BERTopic model

In [None]:
# Only when training: turn verbosity off on the model object, then save it
# under the path the load cell reads from.
if TRAIN_MODEL:
    topic_model.verbose = False
    topic_model.save(os.path.join(P.FP_COMMENT_CLUSTERING_MODEL_DIR, BERTOPIC_MODEL_NAME))

## Improve Topic Representation

### Get reduced embeddings

In [None]:
# Only for a loaded model: recover the reduced embeddings of the documents the
# model was fitted on, without re-running the reduction.
if not TRAIN_MODEL:
    # Embedding stored on the fitted 'umap' step of the reduction pipeline.
    umap_embeds = topic_model.umap_model['umap'].embedding_
    # Apply the pipeline's final 'norm' step to get what the clusterer received.
    reduced_split_comments_embeds = topic_model.umap_model['norm'].transform(umap_embeds)
    # A bare expression nested inside `if` is not echoed by IPython (only a
    # top-level last expression is), so print the shape explicitly.
    print(reduced_split_comments_embeds.shape)

### Get the topic labels

In [None]:
import hdbscan

In [None]:
%%time
# Only for a loaded model: rebuild `topics` and `probs` without refitting.
if not TRAIN_MODEL:
    # Keep only the second return value (per-point strengths); the labels
    # returned by approximate_predict are discarded.
    _, probs = hdbscan.approximate_predict(
        topic_model.hdbscan_model, reduced_split_comments_embeds
    )
    # Use the labels stored on the fitted clusterer as the raw topic ids.
    topics = topic_model.hdbscan_model.labels_
    

    # Pass both through BERTopic's mapping helpers.
    # NOTE(review): these are underscore-prefixed (private) methods and may
    # change between BERTopic versions — confirm against the pinned version.
    topics = topic_model._map_predictions(topics)
    probs = topic_model._map_probabilities(probs, original_topics=True)

In [None]:
# %%time
# _ = BT.custom_update_topics(topic_model, split_comments, topics, vectorizer_model=vectorizer)

In [None]:
# Display the topic representations currently held by the model.
topic_model.get_topics()

In [None]:
# Number of real topics. The label -1 marks outliers and is excluded by value:
# subtracting a flat 1 would under-count by one whenever no document was
# labelled as an outlier.
num_topics = len(set(topics) - {-1})
num_topics

## BERTopic Visualization

In [None]:
# Intertopic distance map.
topic_model.visualize_topics()

In [None]:
# Intertopic distance map (same call as the previous cell).
topic_model.visualize_topics()

In [None]:
# Scatter plot of the documents, using the precomputed 2-D UMAP projection.
topic_model.visualize_documents(
    split_comments, reduced_embeddings=split_comments_plot_data,
    width=VIS_WIDTH, height=VIS_HEIGHT
)

In [None]:
# Document scatter plot (same call as the previous cell).
topic_model.visualize_documents(
    split_comments, reduced_embeddings=split_comments_plot_data,
    width=VIS_WIDTH, height=VIS_HEIGHT
)

In [None]:
import scipy.cluster.hierarchy as sch

In [None]:
def linkage_function(x):
    """Return the Ward linkage of ``x`` computed with optimal leaf ordering.

    Passed to both the hierarchical-topic computation and the hierarchy plot,
    so the two use the same linkage.
    """
    return sch.linkage(x, 'ward', optimal_ordering=True)

In [None]:
# Build the topic hierarchy with the project's custom routine and the Ward
# linkage defined above. The result is used below as a DataFrame with
# Parent_ID / Child_Left_ID / Child_Right_ID / Topics columns.
hierarchical_topics = BT.custom_hierarchical_topics(
    topic_model, split_comments, topics, linkage_function=linkage_function
)

In [None]:
# Literal mapping of perspective index -> topic ids; it has the same content as
# the dict assigned to `topic_aggregate_dict` later in this notebook. As a bare
# expression it is only displayed, not stored.
{0: [9, 12, 14, 19, 24, 27, 31, 32, 36, 42],
 1: [0, 1, 3, 6, 8, 11, 13, 15, 17, 18, 22, 23, 25, 28, 33, 35, 38],
 2: [20, 41],
 3: [2, 4, 5, 10, 16, 26, 29, 30, 37, 39],
 4: [7, 21, 34, 40, 43]}

In [None]:
# Topic hierarchy dendrogram, built with the same linkage function as the
# hierarchy itself.
# NOTE(review): color_threshold=1.75 is a hand-picked value — presumably tuned
# for this particular model; confirm when the model changes.
topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics, linkage_function=linkage_function,
    width=VIS_WIDTH, height=VIS_HEIGHT, color_threshold=1.75
)

In [None]:
# Persist the hierarchy as "<model name>_hierarchical_topics.csv" in the topic
# hierarchy directory.
hierarchical_topics.to_csv(os.path.join(
    P.FP_COMMENT_CLUSTERING_TOPIC_HIERARCHY_DIR, "{}_hierarchical_topics.csv".format(BERTOPIC_MODEL_NAME)
))

In [None]:
# Print the text rendering of the topic tree.
print(topic_model.get_topic_tree(hierarchical_topics))

In [None]:
# Document scatter plot driven by the topic hierarchy, reusing the 2-D projection.
topic_model.visualize_hierarchical_documents(
    split_comments, hierarchical_topics, reduced_embeddings=split_comments_plot_data,
    width=VIS_WIDTH, height=VIS_HEIGHT
)

In [None]:
# Keyword bar charts: 5 words per topic, for all `num_topics` topics.
topic_model.visualize_barchart(
    top_n_topics=num_topics, n_words=5, width=VIS_WIDTH, height=VIS_HEIGHT
)

In [None]:
# Topic similarity matrix.
topic_model.visualize_heatmap()

In [None]:
# Term score decline per topic.
topic_model.visualize_term_rank()

In [None]:
%%time
# Topic statistics per class, with each comment's grade as its class. The
# project routine returns two values; only the first is kept.
# NOTE(review): assumes df_split_comments rows align one-to-one with
# split_comments/topics — confirm.
topics_per_class, _ = BT.custom_topics_per_class(
    topic_model, split_comments, topics=topics, classes=df_split_comments['grade']
)

In [None]:
# Display the per-class topic table.
topics_per_class

In [None]:
# Rows per grade as a percentage of all rows (same expression as in the
# "Read comment sentences" section).
df_split_comments.grade.value_counts() / len(df_split_comments) * 100

In [None]:
# List every split comment whose grade equals "P" (outputs the full list).
df_split_comments[df_split_comments['grade'] == "P"]['split_comment'].to_list()

In [None]:
# Display the per-class topic table again (same as two cells above).
topics_per_class

In [None]:
# Plot the per-class topic table for all `num_topics` topics.
topic_model.visualize_topics_per_class(
    topics_per_class, top_n_topics=num_topics , width=VIS_WIDTH, height=VIS_HEIGHT
)

## Representative sentences for each topic

In [None]:
# Number of real topics according to the model's topic table. Rows whose Topic
# id is -1 (outliers) are excluded by value: subtracting a flat 1 would
# under-count by one whenever the table has no outlier row.
topic_info = topic_model.get_topic_info()
num_topics = int((topic_info['Topic'] != -1).sum())

In [None]:
# For each topic id 0 .. num_topics-1, print a divider line followed by the
# model's representative documents for that topic.
for tid in range(num_topics):
    IO.print_dividing_line("Topic: {}".format(tid))
    for rep in topic_model.get_representative_docs(tid):
        print(rep)

In [None]:
# Print the representative documents of every topic (same loop as the previous cell).
for tid in range(num_topics):
    IO.print_dividing_line("Topic: {}".format(tid))
    for rep in topic_model.get_representative_docs(tid):
        print(rep)

## Aggregate topics
- Aggregate the topics together with hierarchical topics (down to five topics)
- Find the mean of the aggregated topics

In [None]:
# Target number of aggregated perspectives, taken from the project constants (var.var).
MAX_NUM_PERSPECTIVE = V.MAX_NUM_PERSPECTIVE

In [None]:
# Peek at the first rows of the hierarchy table.
hierarchical_topics.head()

In [None]:
# Walk the hierarchy rows in table order, replacing each visited parent by its
# two children, until the current set of unsplit nodes reaches the target size.
# NOTE(review): assumes the rows are ordered from the root downwards — confirm
# against BT.custom_hierarchical_topics.
topic_parents_ids = set()
topic_leaf_ids = set()

for _, merge_row in hierarchical_topics.iterrows():
    if len(topic_leaf_ids) == MAX_NUM_PERSPECTIVE:
        break

    split_id = merge_row['Parent_ID']

    # The parent is now an internal node: record it and drop it from the leaves.
    topic_parents_ids.add(split_id)
    topic_leaf_ids.discard(split_id)

    # Its two children become leaves of the truncated tree.
    topic_leaf_ids.update((merge_row['Child_Left_ID'], merge_row['Child_Right_ID']))

In [None]:
# Ids of the hierarchy nodes that were split.
topic_parents_ids

In [None]:
# Ids of the nodes left unsplit (one per aggregated perspective).
topic_leaf_ids

In [None]:
import ast

In [None]:
# Inspect the hierarchy row whose Parent_ID is the string '66'.
# NOTE(review): the id is hard-coded — presumably specific to one fitted model.
hierarchical_topics.query("`Parent_ID` == '66'")

In [None]:
# Map each aggregated perspective (0, 1, 2, ...) to the original topic ids it
# contains, and collect every topic id that is covered.
topic_aggregate_dict = {}
all_topic = set()

# Iterate in sorted order: a set has no stable order, so without sorting the
# perspective indices could differ from one kernel session to the next.
# NOTE(review): assumes node ids are integers or numeric strings — confirm.
for i, tid in enumerate(sorted(topic_leaf_ids, key=int)):
    matched_rows = hierarchical_topics.query("`Parent_ID` == '{}'".format(tid))['Topics'].to_list()

    if matched_rows:
        children_topics = matched_rows[0]
        # After a CSV round trip the list arrives as its string repr.
        if isinstance(children_topics, str):
            children_topics = ast.literal_eval(children_topics)
    else:
        # The node never appears as a parent, i.e. it is an original topic that
        # was not merged with anything below the cut: it forms a perspective on
        # its own instead of raising IndexError on an empty lookup.
        children_topics = [int(tid)]

    topic_aggregate_dict[i] = children_topics
    all_topic.update(children_topics)

len(all_topic)

In [None]:
# topic_aggregate_dict

In [None]:
# Hard-coded perspective -> topic ids mapping. This assignment replaces the
# `topic_aggregate_dict` computed in the cell above.
# NOTE(review): presumably a frozen result from an earlier run for the model
# named in BERTOPIC_MODEL_NAME; it will not follow changes to the model or to
# MAX_NUM_PERSPECTIVE — confirm it is still valid before saving the pickle.
topic_aggregate_dict = {0: [9, 12, 14, 19, 24, 27, 31, 32, 36, 42],
 1: [0, 1, 3, 6, 8, 11, 13, 15, 17, 18, 22, 23, 25, 28, 33, 35, 38],
 2: [20, 41],
 3: [2, 4, 5, 10, 16, 26, 29, 30, 37, 39],
 4: [7, 21, 34, 40, 43]}

## Find the mean of each condensed perspective

In [None]:
# For every aggregated perspective, average the sentence embeddings of all
# documents whose topic belongs to that perspective.
topic_aggregate_embed_mean_dict = {}

for i, pids in topic_aggregate_dict.items():
    # Row positions of the documents assigned to one of this perspective's topics.
    member_rows = [pos for pos in range(len(topics)) if topics[pos] in pids]
    member_embeds = np.take(split_comments_embeds, member_rows, axis=0)

    topic_aggregate_embed_mean_dict[i] = np.mean(member_embeds, axis=0)

In [None]:
# Display the mean embedding of each perspective.
topic_aggregate_embed_mean_dict

## Compute the similarity of each condensed perspective

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# For every aggregated perspective, compute the average cosine similarity over
# all unordered pairs of distinct member documents.
topic_aggregate_intra_similarity_dict = {}

for i, pids in topic_aggregate_dict.items():
    member_rows = [pos for pos in range(len(topics)) if topics[pos] in pids]
    embeds = np.take(split_comments_embeds, member_rows, axis=0)
    num_instance = len(embeds)

    if num_instance < 2:
        # No pair of distinct documents exists, so the average is undefined.
        # Record NaN instead of dividing by zero (one member) or passing an
        # empty array to cosine_similarity (no member).
        topic_aggregate_intra_similarity_dict[i] = float('nan')
        continue

    ## [TODO] calculate the diversity of the condensed perspective
    # NOTE: this builds the full num_instance x num_instance matrix in memory.
    sim_mat = cosine_similarity(embeds, embeds)

    # The matrix sum counts each unordered pair twice plus the diagonal, where
    # every self-similarity is taken to be 1. Remove the diagonal, halve, and
    # divide by the number of pairs.
    num_pairs = num_instance * (num_instance - 1) / 2
    intra_similarity = (np.sum(sim_mat) - num_instance) / 2.0 / num_pairs
    topic_aggregate_intra_similarity_dict[i] = intra_similarity

In [None]:
# Display the intra-perspective similarity of each perspective.
topic_aggregate_intra_similarity_dict

## Save to pickle file

In [None]:
import pickle

In [None]:
# Bundle the three per-perspective results and pickle them next to the
# hierarchy CSV as "<model name>_topic_aggregate_info.pkl".
topic_aggregate_info = {
    "topic_aggregate_dict": topic_aggregate_dict,
    "topic_aggregate_embed_mean_dict": topic_aggregate_embed_mean_dict,
    "topic_aggregate_intra_similarity_dict": topic_aggregate_intra_similarity_dict,
}

fn = os.path.join(
    P.FP_COMMENT_CLUSTERING_TOPIC_HIERARCHY_DIR,
    "{}_topic_aggregate_info.pkl".format(BERTOPIC_MODEL_NAME),
)

with open(fn, "wb") as f:
    pickle.dump(topic_aggregate_info, f)