# Hierarchical Clustering on Target Sentences
To help derive a taxonomy of intents, hierarchical clustering is applied to the unique target sentences.

#### Functions

#### Imports

In [2]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re
import sys
sys.path.append('../../../')
from utility.utility_functions import *

In [3]:
# Load the "meeting" intent sentences from data/processed; the relative path is
# resolved against the notebook's working directory.
df = pd.read_parquet('../../../data/processed/intents/meeting.parquet', engine='fastparquet')

In [4]:
# Collapse Windows line breaks inside a target sentence into a single space.
df['target'] = df['target'].str.replace(r'\r\n', ' ', regex=True)

# De-duplicate while keeping first-occurrence order.
# A plain list(set(...)) orders strings by their per-process randomized hash, so
# `docs` would come out in a different order on every kernel start. That makes the
# UMAP/HDBSCAN result irreproducible despite the fixed seed, and it breaks the
# positional alignment between `docs` and a model reloaded from disk (merge_topics
# and the inspection helpers receive `docs` alongside the fitted model).
docs = df['target'].drop_duplicates().tolist()

In [5]:
# Number of unique target sentences that go into the topic model
len(docs)

10782

## Training the Model

In [6]:
# Make BERTopic's progress messages visible: the "BERTopic" logger is opened up
# to INFO, and the root logger gets a default INFO-level configuration.
logger = logging.getLogger("BERTopic")
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)

In [7]:
# One seed shared by NumPy and UMAP so the dimensionality reduction is repeatable.
# Seed 1 is deliberate: other seeds sometimes caused negative values in the
# distance matrix.
random_state = 1

# Seed NumPy's global random generator.
np.random.seed(random_state)

# Pipeline components handed to BERTopic: sentence embeddings -> UMAP -> HDBSCAN.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    random_state=random_state,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    prediction_data=True,
)

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# Fit on the unique target sentences; `topics` and `probs` are the two values
# returned by fit_transform.
topics, probs = topic_model.fit_transform(docs)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-10 16:22:59,113 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/337 [00:00<?, ?it/s]

2024-11-10 16:23:21,492 - BERTopic - Embedding - Completed ✓
2024-11-10 16:23:21,493 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-10 16:23:34,030 - BERTopic - Dimensionality - Completed ✓
2024-11-10 16:23:34,031 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-10 16:23:34,312 - BERTopic - Cluster - Completed ✓
2024-11-10 16:23:34,315 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-10 16:23:34,452 - BERTopic - Representation - Completed ✓


In [8]:
# Compute the topic hierarchy for the fitted model; the result is passed to
# visualize_hierarchy in the inspection section.
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 95/95 [00:00<00:00, 733.70it/s]


In [9]:
# Persist the freshly trained model before any topics are merged or relabeled.
topic_model.save("../../../data/bertopic_models/intents/05_meeting/meeting_unprocessed")



## Inspecting clustering results

In [None]:
# Optional: reload a previously saved model instead of retraining.
# NOTE(review): the path below looks copied from another notebook (different
# directory depth, intent "send"). This notebook saves its model to
# "../../../data/bertopic_models/intents/05_meeting/meeting_unprocessed" -
# confirm the path before enabling this line.
#topic_model = BERTopic.load("../../data/bertopic_models/send")

In [10]:
# Plot the topic hierarchy computed from the trained model to spot merge candidates.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Merging Topics

In [424]:
# Topic currently under review. The merge and labeling cells below read this
# variable, so change it here before re-running them.
topic_nr = 0
# Helper presumably provided by utility.utility_functions (star import); per the
# recorded output it prints the topic's top words and representative sentences.
inspect_topic(topic_model,topic_nr)

Topic: 0

Top Words:
meeting, person, to, the, have, you, and, with, we, location, 

Representative Target Sentences: 
I have a meeting with [PERSON] for next week .
Please let me know the next time you will be coming to [LOCATION] [LOCATION] in order that we can arrange a meeting with you and some of our Senior Executive Management Team .
[LOCATION] and we 'd like to have you join us .


In [427]:
# Show the custom labels currently attached to the model.
topic_model.custom_labels_

['-1_person_the_to_and',
 '0_meeting_planning',
 '1_meeting_reschedule',
 '2_meeting_confirmation',
 '3_meeting_not_attendable',
 '4_meeting_for_bugs',
 '5_meeting_cancellation',
 '6_meeting_missed']

In [399]:
# Merge the topic under review (`topic_nr`) with topic -1.
# NOTE(review): this cell was used interactively, once per topic to discard; a
# top-to-bottom run merges whatever `topic_nr` currently holds. `docs` must be
# the same list, in the same order, that the model was fitted on.
topic_model.merge_topics(docs, [-1,topic_nr])

In [407]:
# Merge topics 0 and 5 into a single topic.
# NOTE(review): the ids are hard-coded and refer to the numbering at the time
# this cell was executed; topic ids may be renumbered after a merge - verify
# against custom_labels_ / get_topics() before re-running.
topic_model.merge_topics(docs, [0,5])

In [422]:
# Attach the custom label "meeting_missed" to the topic held in `topic_nr`.
# NOTE(review): when the notebook is run top to bottom, `topic_nr` is 0 here,
# while the recorded custom_labels_ output lists "meeting_missed" under topic 6.
# Set `topic_nr` explicitly before re-running this cell.
create_custom_label(topic_model, "meeting_missed", topic_nr)

In [406]:
# Collect the documents of the topic under review; per the recorded output the
# helper also prints the document count and a sample of sentences.
topic_docs = inspect_topic_docs(topic_model,topic_nr,docs,n_docs=20)

Number of Documents: 36

Ok with me ... Set up a meeting so it goes on my calendar .
Let me know what might work best for you and I will put a meeting on the calendar .
For some reason , I do n't have Staff Meeting on 4/17 on my calendar .
otherwise I 'll just set - up the meeting on your calendar .
I 'm hoping that all of you still have this meeting showing up weekly on your calendars .. please let me know if you do not ..
These essential software programs will help you remember important dates , set your own reminders , keep better track of your finances , even get you up in time to make that early morning meeting .
Thanks [PERSON]    I still need you to delete the meeting you have in the calendar for that room or change it to a different room , the system wo n't let me book it until you have deleted it !
We 've had some " twist and turns " on individual calendars here and I wanted   to get back intouch with you regarding our planned meeting .
I do n't see a meeting scheduled on my c

In [43]:
# Number of entries returned by get_topics().
# NOTE(review): the recorded value stems from a much earlier execution (In [43])
# than the merge cells above, so it does not reflect the final model.
len(topic_model.get_topics())

91

In [425]:
# Checkpoint the model after merging and relabeling.
topic_model.save("../../../data/bertopic_models/intents/05_meeting/meeting_processed")



In [408]:
# Re-plot the hierarchy of the model in its current state; custom_labels=False
# is passed, so the custom labels are not used in the plot.
topic_model.visualize_hierarchy(custom_labels=False)

In [426]:
# Store the model state that the dataset-creation section below works with.
topic_model.save("../../../data/bertopic_models/intents/05_meeting/meeting_final")



### Create Dataset for intent 'meeting'

####  Labels

Created custom labels for topics

| Label                      | Description | Extraction Strings | Context needed |
|----------------------------|-------------|--------------------|----------------|
| `0_meeting_planning`       |             |                    |                |
| `1_meeting_reschedule`     |             |                    |                |
| `2_meeting_confirmation`   |             |                    |                |
| `3_meeting_not_attendable` |             |                    |                |
| `4_meeting_for_bugs`       |             |                    |                |
| `5_meeting_cancellation`   |             |                    |                |
| `6_meeting_missed`         |             |                    |                |

#### Inspecting Topics

In [None]:
# Disabled helper: the loop that prints every custom label is wrapped in a string
# literal so it does not execute. As the last expression of the cell, the string
# itself would be echoed as the cell output if the cell were run.
'''
for  label in topic_model.custom_labels_:
   print(label)'''

In [575]:
# Topic whose sentences are turned into labeled examples. Id 5 appears as
# '5_meeting_cancellation' in the custom labels recorded earlier in this notebook.
topic_id = 5

topic_docs = inspect_topic_docs(
    topic_model=topic_model,
    topic_nr=topic_id,
    original_docs=docs,
    n_docs=20,
)

Number of Documents: 28

Can we cancel meeting tomorrow ?
Canceling the meeting sparked much activity .
Hi ,    I 'd like to suggest we cancel Monday " O Staff " meetings .
Let us cancel today 's meeting since many of us are not available .
I have asked him to cancel the meeting so that   all of us can concentrate on interviewing the two candidates who are going   to show up .
As a result , I have cancelled the meeting you scheduled for Tuesday .
Canceling two communications meetings in a row is not   a good practice !
We would like to cancel SP meeting this week because we has already discussed detail of all issues .
Ericsson cancels handset plant meeting to rethink   7 .
Currently , we have licensing release meeting everyday at 4:00pm   I have asked [PERSON] if we can cancel that and have this one instead .
can you cancel the meeting for next monday and suggest some times .
I have to cancel another meeting so I 'll wait to hear from you before canceling .
So I have cancelled the Thur

In [579]:
# Phrase filters chosen after reading the topic's sentences. They are handed to
# filter_topic_documents in the next cell (presumably substring matches: keep on
# a white-list hit, drop on a black-list hit - confirm in utility_functions).
# Commented-out entries are phrases kept from earlier iterations.
white_list = [
    'can ',
    'let us',
    'would like',
    # 'not be attending', 'unable', 'not able', 'not be able', "n't be able", "n't able to attend", "not beable to attend"
    # 'please confirm',
]

black_list = [
    'please',
    'mailto',
    # 'should', 'could', "let's", 'let me know', 'is it possible', "did n't", 'asked', '[PERSON] wrote'
]

In [580]:
# Split the topic's sentences using the phrase filters defined above; the helper
# returns the kept and the discarded sentences as a pair.
sents_to_keep, sents_to_discard = filter_topic_documents(
    topic_docs=topic_docs,
    white_list=white_list,
    black_list=black_list,
)

# Display the sentences that passed the filter.
sents_to_keep

Amount of sentences: 8


["Let us cancel today 's meeting since many of us are not available .",
 'Can we cancel meeting tomorrow ?',
 'can you cancel the meeting for next monday and suggest some times .',
 "We would like to cancell today 's SP meeting .",
 'Currently , we have licensing release meeting everyday at 4:00pm   I have asked [PERSON] if we can cancel that and have this one instead .',
 "I would like to propose cancelling this Friday 's meeting but instead having everyone agree to review materials and provide feedback by next Tuesday .",
 'We would like to cancel SP meeting this week because we has already discussed detail of all issues .',
 'I have asked him to cancel the meeting so that   all of us can concentrate on interviewing the two candidates who are going   to show up .']

In [581]:
# Review what the filter rejected: first the count, then the sentences themselves,
# to check whether the white/black lists need another iteration.
print(len(sents_to_discard))
sents_to_discard

20


['Hi ,    I \'d like to suggest we cancel Monday " O Staff " meetings .',
 'The last thing in the world we want to do is set up meetings and then cancel .',
 "I had to double - book the meeting on the 20th because I could n't modify nor cancel the first meeting I put on there .",
 'The only reason I will cancel this meeting is if I am physically unable to be here .',
 'Unfortunately I have to cancel our tentative meeting next week .',
 'Team   I have canceled the Monday weekly team group meetings .',
 'We had to cancel tomorrows meeting .',
 'So I have cancelled the Thursday meeting .',
 "I have to cancel another meeting so I 'll wait to hear from you before canceling .",
 'Ericsson cancels handset plant meeting to rethink   7 .',
 'So , i have to cancel the weekly meeting today .',
 "I 'm going to cancel today 's meeting and   reschedule .",
 'As a result , I have cancelled the meeting you scheduled for Tuesday .',
 'Canceling the meeting sparked much activity .',
 'Canceling two comm

In [573]:
# Append the kept sentences, all carrying one label, to the labeled dataset file.
# NOTE(review): the label is hard-coded and must be edited for every topic. As the
# cells stand, `sents_to_keep` holds the cancellation sentences of topic_id = 5,
# but they would be written as 'unable_to_attend_meeting' - set the correct label
# before running this cell.
# NOTE(review): this reuses the name `df`, replacing the frame loaded from the
# parquet file at the top of the notebook.
fp = "../../../data/labeled/meeting.tsv"
#df = pd.DataFrame({"text": sents_to_keep, "label": get_label_from_topic_nr(topic_id)})
df = pd.DataFrame({"text": sents_to_keep, "label": 'unable_to_attend_meeting'})
# overwrite=False presumably appends to the existing file, so re-running this
# cell may add the same rows again - confirm in utility_functions.
concat_to_df(df, filepath=fp, overwrite=False)

Unnamed: 0,text,label
0,I could make a meeting before or after that th...,propose_meeting
1,4 ) How pleasant and instructive could your ex...,propose_meeting
2,Maybe you should put this in your request back...,propose_meeting
3,"[PERSON] [PERSON] [PERSON] , and I could take ...",propose_meeting
4,"[PERSON] , If you could approve these vacat...",propose_meeting
...,...,...
787,I will not be attending the meeting .,unable_to_attend_meeting
788,"Om , I may not be able to attend this meeti...",unable_to_attend_meeting
789,"Hi there , Unfortunately , I wo n't be able...",unable_to_attend_meeting
790,I 'll be traveling that week and would n't be ...,unable_to_attend_meeting


In [None]:
# Preview the first rows of `df`; at this point `df` is the labeled frame built in
# the previous cell, not the parquet data loaded at the top.
df.head()