# Hierarchical Clustering on Target Sentences

To help derive a taxonomy for intents, hierarchical clustering is applied to the target sentences.

In [1568]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re

In [2]:
# Read the "send" intent sentences from the processed parquet file.
SEND_PARQUET_PATH = '../../data/processed/intents/send.parquet'
df = pd.read_parquet(SEND_PARQUET_PATH, engine='fastparquet')

In [3]:
# Replace every CRLF sequence inside a target sentence with a single space.
df['target'] = df['target'].str.replace(r'\r\n', ' ', regex=True)

# Only the first MAX_DOCS sentences are used as input documents.
MAX_DOCS = 10000
docs = df['target'].iloc[:MAX_DOCS].tolist()

## Training the Model

In [4]:
# Logging setup: INFO level on the dedicated "BERTopic" logger
# and INFO level for the root configuration.
logger = logging.getLogger("BERTopic")
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)

In [5]:
# Seed shared by UMAP and NumPy's global random number generator.
# Other seeds sometimes caused negative values in the distance matrix.
random_state = 250

# Set global random seeds
np.random.seed(random_state)

# Components handed to BERTopic: sentence embedder, dimensionality
# reduction and clustering.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    random_state=random_state,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    prediction_data=True,
)

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# Fit the model on the documents and keep the topic assignment of each one.
topics, probs = topic_model.fit_transform(docs)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-10-29 13:30:53,855 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 313/313 [00:22<00:00, 13.76it/s]
2024-10-29 13:31:16,671 - BERTopic - Embedding - Completed ✓
2024-10-29 13:31:16,672 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-10-29 13:31:37,361 - BERTopic - Dimensionality - Completed ✓
2024-10-29 13:31:37,362 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-29 13:31:37,647 - BERTopic - Cluster - Completed ✓
2024-10-29 13:31:37,650 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-29 13:31:37,818 - BERTopic - Representation - Completed ✓


In [6]:
# Compute the topic hierarchy from the fitted model; the result is passed
# to visualize_hierarchy() further down.
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 212/212 [00:00<00:00, 617.67it/s]


In [None]:
#topic_model.save("../../data/bertopic_models/send")

## Inspecting clustering results

In [7]:
#topic_model = BERTopic.load("../../data/bertopic_models/send")

In [8]:
#docs = df['target'].iloc[:10000].tolist()
#docs = df['target'].tolist()

In [7]:
# Plot the previously computed topic hierarchy.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Merging Topics

In [1679]:
# Inspect Topics to Merge
# Prints the top words and the representative sentences of one topic.
topic_id = 0 # Replace with the ID of the topic you're interested in
top_words = topic_model.get_topic(topic_id)
print("Topic: "+str(topic_id)+"\n\nTop Words:")
for word in top_words:
    # Each entry is indexable; position 0 holds the word itself.
    print(word[0], end=", ")
print('\n\nRepresentative Target Sentences: ')
# Look the topic up by its ID rather than by row position in
# get_topic_info(): a positional `iloc[topic_id + 1]` is only correct while
# row 0 is an extra (outlier) row and topic IDs are contiguous, which may no
# longer hold after topics have been merged.
for doc in topic_model.get_representative_docs(topic_id) or []:
    print(doc)

Topic: 0

Top Words:
him, he, her, it, she, me, sent, send, ll, you, 

Representative Target Sentences: 
I'm sending this e-mail to her as well so she can call you.
If you want to call him know, it is ok or you can email me your response and I will send it on.
so I'll alert him that he needs to send you this info.


In [1677]:
# Label the currently inspected topic. The label is prefixed with the topic
# number ("<topic_id>_<name>"); get_label_from_topic_nr() relies on that prefix.
custom_labels = {topic_id: f"{topic_id}_send_signed_copy"}
topic_model.set_topic_labels(custom_labels)

In [1553]:
# All documents assigned to the currently inspected topic.
documents = [doc for doc, topic in zip(docs, topic_model.topics_) if topic == topic_id]
# Print at most 20 of them; slicing (instead of indexing 0..19) also works
# for topics that contain fewer than 20 documents.
for doc in documents[:20]:
    print(doc)

David,  I believe that Sujan will send this to you as soon as the resource assignment gets finalized (after we've completed our internal discussions).
Please send that information to the  documentation team.
I am documenting the example db demo and I will send it to the whole team shortly.
If you have more than one application /demo to do ,please send me the details of what is to be done.
I am putting together the requirements document for the demo and I will send it out to everyone when it is complete.
Hi Team,  When you need an MC to work on Requirements, please remember to send us an e-mail with the following info.:  1.
can you send me the details for the KPMG demo so that I can start working on it.
Attached is a list of key requirements that I agreed to send to you.
Also, if possible please resend docs on RIR that I requested.
Zenki will send out an email tonight with more detail information on what the bank is looking for from the alerts requirements.
Steve,  I have just sent the 

In [1517]:
# Merge the currently inspected topic with topic -1.
# NOTE(review): -1 is presumably BERTopic's outlier topic, so this would
# effectively discard `topic_id` into the outliers — confirm this is intended.
topic_model.merge_topics(docs, [-1,topic_id])

In [1441]:
# Number of entries returned by get_topics() for the current model state.
# NOTE(review): this presumably includes the -1 topic when present — confirm.
len(topic_model.get_topics())

55

In [1680]:
# Write the current model state to disk.
# NOTE(review): no `serialization` argument is passed, so the library default
# is used (presumably pickle) — confirm, and consider an explicit format.
topic_model.save("../../data/bertopic_models/processed/send_final")



In [1681]:
# Plot the hierarchy again, this time with the custom labels enabled.
topic_model.visualize_hierarchy(custom_labels=True)

### Create Dataset 

In [1577]:
def get_label_from_topic_nr(topic_nr, model=None):
    """Return the custom label of a topic without its numeric prefix.

    Labels are expected in the form ``"<topic_nr>_<name>"``; for example the
    label ``"6_send_presentation"`` yields ``"send_presentation"`` for topic 6.

    Args:
        topic_nr: Number of the topic whose label is wanted.
        model: Object exposing ``custom_labels_`` (an iterable of label
            strings, or None). Defaults to the notebook-level ``topic_model``.

    Returns:
        The part of the matching label that follows ``"<topic_nr>_"``.

    Raises:
        ValueError: If no label starts with ``"<topic_nr>_"``.
    """
    if model is None:
        model = topic_model

    # Match the number together with the separator; matching the number alone
    # would let topic 1 pick up the label of topic 10, 11, ...
    prefix = str(topic_nr) + "_"

    # A label list of None is treated as empty, so the lookup fails with the
    # explicit error below instead of a TypeError.
    for label in model.custom_labels_ or []:
        if label.startswith(prefix):
            # Slicing removes exactly the matched prefix (also for "-1_").
            return label[len(prefix):]
    raise ValueError('Provided Topic Number has no label assigned!')

In [1562]:
# Topic whose documents should be listed; change to inspect another topic.
topic_number = 0
# Pair every document with its assigned topic and keep the matching ones.
topic_docs = [
    sentence
    for sentence, assigned_topic in zip(docs, topic_model.topics_)
    if assigned_topic == topic_number
]

In [1563]:
# Preview only the first 20 documents instead of dumping the whole list
# into the notebook output.
topic_docs[:20]

['Being that she told you and I [on the phone] that she had already sent them (???).',
 "see, that's exactly why i sent it to you first.",
 'Will you please send them by tomorrow morning?',
 'no, I sent it to you.',
 'Raymond had sent me a mail saying that he was going to collect the car on Friday.',
 'I just sent it to you.',
 "As soon as I receive them, I'll send them to you.",
 'You sent me only the shortcut.',
 "Gents, I'm sending you this for 2 reasons.",
 'Just send it to me.',
 '[mailto:Mark_Cohen@janus.com] Sent:\tThursday, February 08, 2001 7:08 PM   Brett, Thanks for the insight into what you do.',
 '1. Please send him mail for problem with Clear Case.',
 '(I guess I probably should have checked that out before I sent it, but',
 'Did you send it back?',
 'I would like to send this out today, if possible.',
 'I also tried sending her email.',
 'I called her about 20 minutes ago to inform her of the status that Stephen just sent.  Douga...',
 'Doug Adkins has been sending these

In [1576]:
# Example lookup: custom label of topic 6 with the numeric prefix removed.
get_label_from_topic_nr(6)

'send_presentation'