# Hierarchical Clustering on Target Sentences
to help deriving a taxonomy for intents, hierarchical clustering is being leveraged.

#### Functions

In [1145]:
def inspect_topic(topic_nr):
    print('-----------')
    top_words = topic_model.get_topic(topic_nr)
    print("Topic: "+str(topic_nr)+"\n\nTop Words:")
    for word in top_words:
        print(word[0], end=", ")
    print('\n\nRepresentative Target Sentences: ')
    for doc in topic_model.get_topic_info().iloc[topic_nr+1]['Representative_Docs']:
        print(doc)

def create_custom_label(custom_label, topic_id):
    custom_labels = {
        topic_id: str(topic_id)+"_"+custom_label,
    }
    topic_model.set_topic_labels(custom_labels)

# Function to retrieve custom label from topic model
def get_label_from_topic_nr(topic_nr):
    for label in topic_model.custom_labels_:
        if label.startswith(str(topic_nr)):
            label = label
            label = re.sub(r"^\d+_", "", label)
            return label
    raise Exception('Provided Topic Number has no label assigned!')

# Function to inspect a topic by its number and retrieve its documents
'''
def inspect_topic_and_its_docs(topic_nr,n_docs=50):
    topic_docs = list(set([doc for doc, topic in zip(docs, topic_model.topics_) if topic == topic_nr]))
    print("Label: " + get_label_from_topic_nr(topic_nr)+"\n")
    print("Number of sentences for label: "+str(len(topic_docs)))
    print('\nSentences:\n')
    if len(topic_docs) < n_docs:
        n_docs = len(topic_docs)
    for i in range(n_docs):
        print(topic_docs[i])
    return topic_docs
'''

def inspect_topic_docs(topic_id, n_docs=20):
    documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_id]
    print('Number of Documents: '+str(len(documents))+'\n')
    if len(documents) < n_docs:
        n_docs = len(documents)
    for i in range(n_docs):
        print(documents[i])
    return documents

# given a list of strings, returns a list of documents that contain at least one of the strings in the given list
def filter_topic_documents(topic_docs, white_list, black_list=[] ):
    sentences_to_keep = []
    sentences_to_discard = []
    for doc in topic_docs:
        if any(substring in doc for substring in white_list):
            sentences_to_keep.append(doc)
    sentences_to_keep = [
    item for item in sentences_to_keep
        if not any(substring in item for substring in black_list)
    ]
    sentences_to_keep = list(set(sentences_to_keep)) # remove duplicates
    sentences_to_discard = list(set(topic_docs) - set(sentences_to_keep))
    print('Amount of sentences: '+str(len(sentences_to_keep)))
    return sentences_to_keep, sentences_to_discard

# Function to facilitate the concatenation of new DFs
def concat_to_df(new_df,filepath, overwrite=False, ):
    try:
        df = pd.read_csv(filepath, sep='\t')
    except FileNotFoundError:
        df = pd.DataFrame(columns=["text", "label"])
    if new_df.iloc[0]['label'] not in df['label'].values: # Make sure that DF does not already contain entry for label
        new_df = new_df.reset_index(drop=True)
        df = df.reset_index(drop=True)
        df = pd.concat([df, new_df], ignore_index=True)
        df = df.reset_index(drop=True)
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        df.to_csv(filepath, sep='\t')
        return df
    else:
        if overwrite:
            df = df[~df['label'].str.contains(new_df.iloc[0]['label'])]
            df = pd.concat([df, new_df], ignore_index=True)
            df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
            df.to_csv("../../data/labeled/send.tsv", sep='\t', index=False)
            return df
        else:
            print("Label '"+new_df.iloc[0]['label']+"' already in the final DF, so it's being skipped.")


####

In [2]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re

In [3]:
df = pd.read_parquet('../../../data/processed/intents/call.parquet', engine='fastparquet')

In [4]:
df['target'] = df['target'].str.replace(r'\r\n', ' ', regex=True)
docs = list(set(df['target'].tolist()))

In [5]:
# Amount of unique target sentences
len(docs)

23376

## Training the Model

In [6]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("BERTopic")
logger.setLevel(logging.INFO)

In [7]:
# Initialize UMAP with a fixed random state
random_state = 1 # Other seeds sometimes caused negative values in the distanace matrix

# Set global random seeds
np.random.seed(random_state)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_state)
hdbscan_model = HDBSCAN(min_cluster_size=10, prediction_data=True)

topic_model = BERTopic(verbose=True, umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-06 14:35:34,568 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/731 [00:00<?, ?it/s]

2024-11-06 14:36:12,020 - BERTopic - Embedding - Completed ✓
2024-11-06 14:36:12,020 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-11-06 14:36:32,963 - BERTopic - Dimensionality - Completed ✓
2024-11-06 14:36:32,964 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers

In [8]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 196/196 [00:00<00:00, 703.68it/s]


In [9]:
topic_model.save("../../../data/bertopic_models/intents/02_call/call_unprocessed")



## Inspecting clustering results

In [7]:
#topic_model = BERTopic.load("../../data/bertopic_models/send")

In [324]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Merging Topics

In [1134]:
topic_id = 16
inspect_topic(topic_id)

-----------
Topic: 16

Top Words:
vm, left, vmail, leave, tried, you, me, 344, and, unable, 

Representative Target Sentences: 
I left you a VM , please feel free to call me later this afternoon .
He called me with some info to pass onto you regarding your urgent vm to him and his discussions with Broadbeam , since he was unable to get through to you or leave you a message ( apparently , vm is not working this a.m. )   Please give me a call and I 'll fill you in .
Left you a VM -- can you call me ?


In [1120]:
topic_model.custom_labels_

['-1_person_to_the_call',
 '0_contact_me_in_case_of_questions',
 '1_call_me_on_cell_phone',
 '2_set_up_conference_call',
 '3_promise_call',
 '4_jsp_code',
 '5_request_call',
 '6_call_method_code',
 '7_call_at_office',
 '8_have_conference_call',
 '9_call_with_meeting_as_context',
 '10_apologies_for_missing_call',
 '11_call_because_of_changes',
 '12_ask_for_number',
 '13_call_if_urgent',
 '14_call_about_lunch',
 '15_call_java_code',
 '16_left_voicemail',
 '17_vm_left_vmail_leave',
 '18_proposal_will_you_thoughts',
 '19_code_location_it_arguments']

In [1113]:
topic_model.merge_topics(docs, [-1,topic_id])

In [None]:
create_custom_label("call_", topic_id=topic_id)

In [1151]:
inspect_topic_docs(topic_id,n_docs=20)

Number of Documents: 15

[PERSON] ,    I tried calling your number ( x-7965 ) and I was unable to leave a VM .
Hi [PERSON] :    Received your vmail yesterday ... have tried calling you unsuccessfully ( your mailbox was full ) .
He left me a vm this morning ( 7:35am ) and I am calling him back right now ( I   forgot once I checked the message ... )
Please mark it up and fax it back to me at 650 344 - 9607 , or leave vm comments at x8069 , or call me at home , 650 344 - 9157 .
If by chance you are still up call me at 1 617 568 6700 room 721 or my mobile 1 408 205 9958 and we can dicuss your vmail you left , otherwise I will call you first thing Friday am [LOCATION] time .
He called me with some info to pass onto you regarding your urgent vm to him and his discussions with Broadbeam , since he was unable to get through to you or leave you a message ( apparently , vm is not working this a.m. )   Please give me a call and I 'll fill you in .
V will be calling periumma today .
Left you a VM 

['[PERSON] ,    I tried calling your number ( x-7965 ) and I was unable to leave a VM .',
 'Hi [PERSON] :    Received your vmail yesterday ... have tried calling you unsuccessfully ( your mailbox was full ) .',
 'He left me a vm this morning ( 7:35am ) and I am calling him back right now ( I   forgot once I checked the message ... )',
 'Please mark it up and fax it back to me at 650 344 - 9607 , or leave vm comments at x8069 , or call me at home , 650 344 - 9157 .',
 'If by chance you are still up call me at 1 617 568 6700 room 721 or my mobile 1 408 205 9958 and we can dicuss your vmail you left , otherwise I will call you first thing Friday am [LOCATION] time .',
 "He called me with some info to pass onto you regarding your urgent vm to him and his discussions with Broadbeam , since he was unable to get through to you or leave you a message ( apparently , vm is not working this a.m. )   Please give me a call and I 'll fill you in .",
 'V will be calling periumma today .',
 'Left you 

In [942]:
len(topic_model.get_topics())

35

In [485]:
topic_model.save("../../../data/bertopic_models/intents/02_call/call_processed")



In [1119]:
topic_model.visualize_hierarchy(custom_labels=True)

In [1121]:
topic_model.save("../../../data/bertopic_models/intents/02_call/call_final")



### Create Dataset for intent 'send'

####  Labels

Created custom labels for topics

| Label                             | Description                                       | Extraction Strings | Context needed |
|-----------------------------------|---------------------------------------------------|--------------------|----------------|
|'0_contact_me_in_case_of_questions',|
| '1_call_me_on_cell_phone',|
| '2_set_up_conference_call',|
| '3_promise_call',|
| '4_jsp_code',|
| '5_request_call',|
| '6_call_method_code',|
| '7_call_at_office',|
| '8_have_conference_call',|
| '9_call_with_meeting_as_context',|
| '10_apologies_for_missing_call',|
| '11_call_because_of_changes',|
| '12_ask_for_number',|
| '13_call_if_urgent',|
| '14_call_about_lunch',|
| '15_call_java_code',|
| '16_left_voicemail',|

#### Inspecting Topics

In [1672]:
'''
for  label in topic_model.custom_labels_:
   print(label)'''

'\nfor  label in topic_model.custom_labels_:\n   print(label)'

In [3]:
topic_model = BERTopic.load("../../data/bertopic_models/processed/send_final")

In [1218]:
topic_id = 11
topic_docs = inspect_topic_docs(topic_id=topic_id, n_docs=20)

Number of Documents: 52

I will call you on Monday to get more information .    2 ) Header customization support .
Please give me a call if you have any questions regarding the recent changes   to your account .
Feel free to call me to discuss my changes .
Please review and call me at your leisure .
I think we have a deal ;   please look over the few minor changes we suggest and feel free to call me to discuss .
If there are any changes that need to be made to the information above please contact me prior to noon our time today ( 2/19/01 - Monday PST ) .    thank you .
Once you have had a chance to review and if you feel there need to be more edits , please call my mobile :   408.836.9657 and we can go through .
Once you 've reviewed , please give me a call so can   further discuss .
I do n't know who you called to get changed , etc .    We need to get this resolved now- during thw show .
If you 'd like to have the order changed to Diverse T1 's , I 'll get a sales   person to call you

In [1219]:
# Based on the inspection of the sentences
white_list = [
    'review', 'update', 'change', 
]
black_list = [
    
]

In [1220]:
sents_to_keep, sents_to_discard = filter_topic_documents(topic_docs=topic_docs, white_list=white_list, black_list=black_list)
sents_to_keep

Amount of sentences: 50


['Once you have had a chance to review and if you feel there need to be more edits , please call my mobile :   408.836.9657 and we can go through .',
 'All the changes are in TradingHistoryForm.aml    Call me @ ( 408 ) 857 8516 if you have any questions .',
 'Please give me a call if you have any questions regarding the recent changes   to your account .',
 "If you 'd like to have the order changed to Diverse T1 's , I 'll get a sales   person to call you and discuss it in more detail .",
 'blair , please review and call me asap   thank you',
 'i have only gotten one edit / change on the website so i think we are functioning at a level that is as good as can be expected .     going back home and back to bed .   just call me rip .. see you in 20yrs .',
 'Please give me a call after you get a chance to review it and we can discuss your thoughts and make the necessary changes to accommodate your needs .',
 'If you want this changed , please call me and I can   tell you over the phone on h

In [1221]:
sents_to_discard

['I will call you on Monday to get more information .    2 ) Header customization support .',
 'Give me a call if you have any questions or want some help in modifying it .']

In [1222]:
fp = "../../../data/labeled/call.tsv"
df = pd.DataFrame({"text": sents_to_keep, "label": get_label_from_topic_nr(topic_id)})
concat_to_df(df, filepath=fp, overwrite=False)

Unnamed: 0,text,label
0,Please contact me if there is anything else ...,contact_me_in_case_of_questions
1,"If you have any questions , please do not hesi...",contact_me_in_case_of_questions
2,Call me if you need more help .,contact_me_in_case_of_questions
3,If you have any questions or comments regardin...,contact_me_in_case_of_questions
4,If you have questions as you look over our web...,contact_me_in_case_of_questions
...,...,...
2259,If you can submit your changes to my email via...,call_because_of_changes
2260,If anything changes I will call you .,call_because_of_changes
2261,I think we have a deal ; please look over th...,call_because_of_changes
2262,If I can be of any assistance in the review pr...,call_because_of_changes


In [None]:
df.head()

Unnamed: 0,text,label
0,Once you have had a chance to review and if yo...,call_because_of_changes
1,All the changes are in TradingHistoryForm.aml ...,call_because_of_changes
2,Please give me a call if you have any question...,call_because_of_changes
3,If you 'd like to have the order changed to Di...,call_because_of_changes
4,"blair , please review and call me asap thank...",call_because_of_changes


: 