# Hierarchical Clustering on Target Sentences
To help derive a taxonomy of intents, hierarchical clustering is applied to the unique target sentences.

####

In [11]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re
import sys
sys.path.append('../../../')
from utility.utility_functions import *

In [3]:
# Load the processed 'contact' intent data; its 'target' column supplies the sentences clustered below.
df = pd.read_parquet('../../../data/processed/intents/contact.parquet', engine='fastparquet')

In [4]:
# Collapse Windows line breaks into spaces so every target sentence sits on a single line.
df['target'] = df['target'].str.replace(r'\r\n', ' ', regex=True)

# De-duplicate while keeping first-seen order, and drop missing values.
# list(set(...)) yields an ordering that changes between kernel sessions (string hash
# randomization). That would break the positional alignment between `docs` and
# `topic_model.topics_` as soon as a saved model is reloaded in a new session.
docs = df['target'].dropna().drop_duplicates().tolist()

In [5]:
# Number of unique target sentences that will be clustered
len(docs)

80374

## Training the Model

In [5]:
# Show INFO-and-above records in the notebook output, and make sure the logger
# named "BERTopic" does not filter them out itself.
log_level = logging.INFO
logging.basicConfig(level=log_level)

logger = logging.getLogger("BERTopic")
logger.setLevel(log_level)

In [6]:
# Fixed seed for reproducible runs.
# Other seeds sometimes caused negative values in the distance matrix.
random_state = 1

# Seed NumPy's global random number generator with the same value
np.random.seed(random_state)

# Pipeline components: sentence embeddings -> UMAP reduction -> HDBSCAN clustering
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    random_state=random_state,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    prediction_data=True,
)

# Assemble the topic model from the explicit components and fit it on the unique sentences
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)
topics, probs = topic_model.fit_transform(docs)

2024-11-10 22:56:33,847 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2512 [00:00<?, ?it/s]

2024-11-10 22:58:29,809 - BERTopic - Embedding - Completed ✓
2024-11-10 22:58:29,809 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-10 22:59:27,195 - BERTopic - Dimensionality - Completed ✓
2024-11-10 22:59:27,196 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [7]:
# Compute the topic hierarchy from the fitted model; it is passed to visualize_hierarchy below.
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 357/357 [00:00<00:00, 424.22it/s]


In [9]:
# Checkpoint the freshly fitted model before any manual topic merging is applied.
topic_model.save("../../../data/bertopic_models/intents/01_contact/contact_unprocessed")



## Inspecting clustering results

In [10]:
# Plot the topic hierarchy computed above to decide which topics to review.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Merging Topics

In [626]:
# Topic currently under review in the merging step; the cells below read topic_nr.
topic_nr = 80
# Prints the topic's top words and representative sentences (see recorded output).
# inspect_topic is presumably provided by the wildcard import from utility.utility_functions — confirm.
inspect_topic(topic_model,topic_nr)

Topic: 80

Top Words:
hesitate, questions, do, any, contact, not, have, please, if, me, 

Representative Target Sentences: 
Please do not hesitate to contact me if you have any questions .
If you   have any questions , please do not hesitate to contact me .
Please do not hesitate to contact me if you have any   questions .


In [None]:
# Merge the reviewed topic with topic -1 (BERTopic's outlier topic), i.e. discard it as a separate topic.
# NOTE(review): merging changes the model state, so re-running this cell with the same topic_nr may affect a different topic — confirm before re-executing.
topic_model.merge_topics(docs, [-1,topic_nr])

In [89]:
# NOTE(review): template call — the label is an empty string and no topic id is passed,
# unlike the later call create_custom_label(custom_label=..., topic_id=...). Fill in both before running.
create_custom_label("", )

In [22]:
# Collect every document currently assigned to the topic under review.
# topic_model.topics_ is read positionally: entry i belongs to docs[i].
documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_nr]
print('Number of Documents: '+str(len(documents))+'\n')
# Slice rather than index over range(20): a topic with fewer than 20 documents
# previously raised IndexError here.
for doc in documents[:20]:
    print(doc)

Number of Documents: 1023

I 'd like to send and recieve email ( in [LOCATION] ) with my Magic Link .
As soon as I get more information I will send another update .    [LOCATION] .
We prefer to send people within the territory to their shows .
If you need help for [LOCATION] side , please call me .
The Depatment of Managed Health Care has requested that   United Healthcare of [LOCATION] send a letter to enrolled subscribers .
[PERSON] from the headhunters also called me - he said it was a 12 hour flight to [LOCATION] from where [PERSON] was in [LOCATION] !
If you 're OK with it , feel free to send it to [LOCATION] [LOCATION]    [PERSON] .
Please contact me if I can be of help   to you .    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _   Terrorist Attacks on [LOCATION] - How can you help ?
I think we should make sure that we send this to the folks in [LOCATION] also .
[PERSON] , please send the attached energy alert out to all [LOCAT

In [668]:
# Count the entries returned by get_topics() to track how many topics remain after merging.
len(topic_model.get_topics())

238

In [704]:
# Checkpoint the model after the merging pass.
# NOTE(review): this path starts at '../../data' and names a 'send' model, whereas the data load
# and the first save in this notebook use '../../../data' and 'contact' — confirm the destination.
topic_model.save("../../data/bertopic_models/intent/send_new/send_processed")



In [627]:
# Re-plot the hierarchy after merging, with custom labels switched off.
topic_model.visualize_hierarchy(custom_labels=False)

### Keep Intent-like Topics

In [1]:
# Topic currently under review in this section; the cells below read topic_id.
topic_id = 15
# NOTE(review): the recorded NameError comes from an execution with count 1, i.e. apparently
# before the import cell had run in that kernel. inspect_topic is presumably provided by the
# wildcard import from utility.utility_functions — confirm and re-run in order.
inspect_topic(topic_model,topic_id)

NameError: name 'inspect_topic' is not defined

In [1146]:
# Merge topics 6 and 15 into a single topic.
# NOTE(review): the ids are hard-coded rather than derived from topic_id — keep them in sync,
# and do not re-run blindly, since merging changes the model state.
topic_model.merge_topics(docs, [6, 15])

In [1108]:
# Collect every document currently assigned to the topic under review.
# topic_model.topics_ is read positionally: entry i belongs to docs[i].
documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_id]
print('Number of Documents: '+str(len(documents))+'\n')
# Slice rather than index over range(20): a topic with fewer than 20 documents
# previously raised IndexError here.
for doc in documents[:20]:
    print(doc)

Number of Documents: 40

* 	 [PERSON] [PERSON] will help connect me with BEA .
Hi [PERSON]    [PERSON] also asked me to send you some article examples .
[PERSON] , if I 'm wrong , please correct me .
[PERSON] ,    Can we please talk about this we you get in ?    Thanks .
from [PERSON] like we did last time .. however .. that being said .. you can provide me with some bullet points and you
They are all labeled with each persons ' name and size so you can just call off people 's names .
Responding to [PERSON] [PERSON] ( 05:07 PM 8/6/2001 [LOCATION] ):   > To review : in our survey of the philosophy of science thus far we 've   > reached the point where we 're starting to question [PERSON] ( as we   > should ) , and are coming upon the notions of [PERSON] ( which we all intuit   > anyway ) , and maybe someday can handle ( if we 're especially studious )   >
I do n't think we 've dealt with [PERSON] , though [PERSON] or D'Arcy will correct   me if I am wrong .
[PERSON] , Please correct me 

In [1658]:
# Assign the custom label 'send_check' to the topic selected in topic_id.
# create_custom_label is presumably provided by the wildcard import from utility.utility_functions — confirm.
create_custom_label(custom_label='send_check', topic_id=topic_id)

In [1139]:
# Count the entries returned by get_topics() to see how many topics remain at this stage.
len(topic_model.get_topics())

104

In [1147]:
# Re-plot the hierarchy for the reduced topic set, with custom labels switched off.
topic_model.visualize_hierarchy(custom_labels=False)

In [1660]:
# Save the final, manually curated model.
# NOTE(review): the later BERTopic.load call reads '../../data/bertopic_models/processed/send_final',
# which is not this location, and the notebook's input data is the 'contact' intent — confirm the path.
topic_model.save("../../data/bertopic_models/intent/send_new/send_final")



### Create Dataset for intent 'send'

####  Labels

Created custom labels for topics

| Label | Description | Extraction Strings | Context needed |
|-------|-------------|--------------------|----------------|
| send_it | Something is being sent. | Data 2 | |
| 0_send_something | | | |
| 1_send_review | | 'review', 'comment', 'suggestion', 'feedback', 'correction' | |
| 2_send_presentation | | 'powerpoint', 'presentation', 'slide', 'ppt' | |
| 3_send_files | | '' | |
| 4_send_contract | | | |
| 5_send_copy | | copy | |
| 6_send_pqa | | pqa | |
| 7_item_oca_data_osa | | | |
| 8_send_fax | | fax | |
| 9_send_update_changes | | 'update', 'change' | |
| 10_send_pictures | | 'pic', 'jpeg', 'image', 'photo' | |
| 11_send_meeting_request | | 'meeting' | |
| 12_meeting_calendar_request_meetings | | | |
| 13_send_test_results | | | |
| 14_send_details_information | | 'information', 'details' | |
| 15_send_list | | | |
| 16_send_invitation | | | |
| 17_send_resume | | | |
| 18_resume_resumes_my_me | | | |
| 19_send_bug_status | | | |
#### Inspecting Topics

In [1672]:
# Disabled snippet kept as a bare string literal: evaluating this cell only echoes the string
# (see recorded output); it does not print the custom labels.
'''
for  label in topic_model.custom_labels_:
   print(label)'''

'\nfor  label in topic_model.custom_labels_:\n   print(label)'

In [3]:
# Reload a saved model from disk, replacing the in-memory topic_model.
# NOTE(review): the final model is saved earlier in this notebook to
# '../../data/bertopic_models/intent/send_new/send_final', not to this 'processed/send_final'
# location — confirm which path is correct.
topic_model = BERTopic.load("../../data/bertopic_models/processed/send_final")

In [1833]:
# Topic whose sentences are reviewed and turned into labeled examples in the cells below.
topic_nr_to_inspect = 31
# Prints the topic's label and sentences (see recorded output) and returns them as topic_docs,
# which feeds filter_topic_documents below. n_docs=100 presumably caps the number shown — confirm.
topic_docs = inspect_topic_and_its_docs(topic_nr=topic_nr_to_inspect, n_docs=100)

Label: press_release_releases_kits

Number of sentences for label: 67

Sentences:

Gents ,   Thanks for sending this along .
Thanks a lot for sending this .
Thanks for sending this our way .
[PERSON] ,    Thanks for sending the other materials .
Yes .     [PERSON] ,     Do you have one that you can send to me ?
[PERSON] ,   Thanks for sending this .
[PERSON] ,    Thank you for reading the things that I sent you .
Thanks for sending this out .
Send me what you have thus far .    Thanks ,    [PERSON] .
[PERSON] , thanks for sending out the email earlier .
[PERSON]    I meant to send this to you also .
[PERSON] , thanks for sending this along .
[PERSON] ,     Thanks for sending me the itineraries .
[PERSON] , Thanks for sending the calculation over .
[PERSON] ,    Thanks for sending this message .
I thank you for sending a good guy over here .
Thanks for sending these .
Thanks for sending it out last night !     Amit .
Thanks for the sending the request [PERSON] .    Regards ,   Shawn
[ S

In [1817]:
# Keyword lists chosen after reading the topic's sentences; both are handed to
# filter_topic_documents in the next cell.
white_list = ['resume', 'cv']
black_list = []

In [1818]:
# Apply the white/black keyword lists to the inspected sentences; the helper also prints the resulting count.
sentences_to_keep = filter_topic_documents(topic_docs=topic_docs, white_list=white_list, black_list=black_list)
# Preview the first five kept sentences
sentences_to_keep[:5]

Amount of sentences: 121


['Please send me your latest / updated resumes if you are still looking for .',
 "If he is interested , send me his resume and I 'll get [PERSON] to   have the right people look at it .",
 'I thought that you might have sent the wrong copy of your resume .',
 'I want   to get into Business Development and was wondering if you could send my   resume to your contacts .',
 'The other night we talked on the hallway   and you asked me to send you my resume , which I am attaching .']

In [1819]:
# Pair every kept sentence with the label of the inspected topic.
# NOTE(review): this rebinds `df`, replacing the DataFrame loaded from parquet at the top of the
# notebook; re-running earlier cells that read df['target'] afterwards will fail.
df = pd.DataFrame({"text": sentences_to_keep, "label": get_label_from_topic_nr(topic_nr_to_inspect)})
# Hand the new rows to concat_to_df together with the send.tsv path.
# NOTE(review): what overwrite=True does is defined in the helper — confirm it does not discard previously collected rows.
concat_to_df(df, filepath="../../data/labeled/send.tsv", overwrite=True)

Unnamed: 0,text,label
0,"Please send your comments to me ASAP , but n...",send_review
1,Please review and send feedback .,send_review
2,"If you would like ask a question , or if you j...",send_review
3,I am sending it to the customer to review an...,send_review
4,Thanks for capturing all the feedback and send...,send_review
...,...,...
1293,Do you want me to send you my resume ?,send_resume
1294,I am not sure if you have already sent me your...,send_resume
1295,"Per you request , I am sending you my resume .",send_resume
1296,I will look for my resume and send it to you .,send_resume


In [1770]:
# Preview the first rows of the labeled frame built above
df.head()

Unnamed: 0,text,label
