# Hierarchical Clustering on Target Sentences
to help deriving a taxonomy for intents, hierarchical clustering is being leveraged.

#### Functions

In [1]:
def inspect_topic(topic_nr):
    top_words = topic_model.get_topic(topic_nr)
    print("Topic: "+str(topic_nr)+"\n\nTop Words:")
    for word in top_words:
        print(word[0], end=", ")
    print('\n\nRepresentative Target Sentences: ')
    rep_sents = []
    for doc in topic_model.get_topic_info().iloc[topic_nr+1]['Representative_Docs']:
        print(doc)
        rep_sents.append(doc)
    return top_words, rep_sents

def create_custom_label(custom_label, topic_id):
    custom_labels = {
        topic_id: str(topic_id)+"_"+custom_label,
    }
    topic_model.set_topic_labels(custom_labels)

# Function to retrieve custom label from topic model
def get_label_from_topic_nr(topic_nr):
    for label in topic_model.custom_labels_:
        if label.startswith(str(topic_nr)):
            label = label
            label = re.sub(r"^\d+_", "", label)
            return label
    raise Exception('Provided Topic Number has no label assigned!')

# Function to inspect a topic by its number and retrieve its documents
def inspect_topic_and_its_docs(topic_nr,n_docs=50):
    topic_docs = list(set([doc for doc, topic in zip(docs, topic_model.topics_) if topic == topic_nr]))
    print("Label: " + get_label_from_topic_nr(topic_nr)+"\n")
    print("Number of sentences for label: "+str(len(topic_docs)))
    print('\nSentences:\n')
    if len(topic_docs) < n_docs:
        n_docs = len(topic_docs)
    for i in range(n_docs):
        print(topic_docs[i])
    return topic_docs

# given a list of strings, returns a list of documents that contain at least one of the strings in the given list
def filter_topic_documents(topic_docs, white_list, black_list=[] ):
    sentences_to_keep = []
    for doc in topic_docs:
        if any(substring in doc for substring in white_list):
            sentences_to_keep.append(doc)
    sentences_to_keep = [
    item for item in sentences_to_keep
        if not any(substring in item for substring in black_list)
    ]
    sentences_to_keep = list(set(sentences_to_keep)) # remove duplicates
    print('Amount of sentences: '+str(len(sentences_to_keep)))
    return sentences_to_keep

# Function to facilitate the concatenation of new DFs
def concat_to_df(new_df,filepath, overwrite=False, ):
    try:
        df = pd.read_csv(filepath, sep='\t')
    except FileNotFoundError:
        df = pd.DataFrame(columns=["text", "label"])
    if new_df.iloc[0]['label'] not in df['label'].values: # Make sure that DF does not already contain entry for label
        new_df = new_df.reset_index(drop=True)
        df = df.reset_index(drop=True)
        df = pd.concat([df, new_df], ignore_index=True)
        df = df.reset_index(drop=True)
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        df.to_csv(filepath, sep='\t')
        return df
    else:
        if overwrite:
            df = df[~df['label'].str.contains(new_df.iloc[0]['label'])]
            df = pd.concat([df, new_df], ignore_index=True)
            df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
            df.to_csv("../../data/labeled/send.tsv", sep='\t', index=False)
            return df
        else:
            print("Label '"+new_df.iloc[0]['label']+"' already in the final DF, so it's being skipped.")


####

In [2]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re

In [6]:
df = pd.read_parquet('../../../data/processed/intents/remaining.parquet', engine='fastparquet')

In [7]:
df['target'] = df['target'].str.replace(r'\r\n', ' ', regex=True)
docs = list(set(df['target'].tolist()))

In [8]:
# Amount of unique target sentences
len(docs)

281466

## Training the Model

In [9]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("BERTopic")
logger.setLevel(logging.INFO)

In [13]:
# Initialize UMAP with a fixed random state
random_state = 1 # Other seeds sometimes caused negative values in the distanace matrix

# Set global random seeds
np.random.seed(random_state)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_state)
hdbscan_model = HDBSCAN(min_cluster_size=10, prediction_data=True)

topic_model = BERTopic(nr_topics=100, verbose=True, umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-06 12:50:18,552 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/8796 [00:00<?, ?it/s]

2024-11-06 12:55:19,092 - BERTopic - Embedding - Completed ✓
2024-11-06 12:55:19,093 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-06 12:58:49,702 - BERTopic - Dimensionality - Completed ✓
2024-11-06 12:58:49,704 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [14]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 98/98 [00:00<00:00, 218.30it/s]


In [12]:
topic_model.save("../../../data/bertopic_models/intents/09_remaining/remaining_unprocessed")



## Inspecting clustering results

In [None]:
#topic_model = BERTopic.load("../../../data/bertopic_models/intents/00_all_intents/all_unprocessed")

In [15]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Merging Topics

In [36]:
topic_id = 500
_, rep_sents = inspect_topic(topic_id)

Topic: 500

Top Words:
explorer, msn, intl, download, egroups, extremeprogramming, unsubscribe, jbr, free, courtesy, 

Representative Target Sentences: 
[PERSON]    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _   Get your FREE download of MSN Explorer at http://explorer.msn.com/intl.asp     To Post a message , send it to :    extremeprogramming@eGroups.com    To Unsubscribe , send a blank message to : extremeprogramming-unsubscribe@eGroups.com    add - free courtesy of objectmentor.com    Your use of Yahoo !
Thanks ,   [PERSON] [PERSON]     _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _   Get your FREE download of MSN Explorer at http://explorer.msn.com/intl.asp     To Post a message , send it to :    extremeprogramming@eGroups.com    To Unsubscribe , send a blank message to : extremeprogramming-unsubscribe@eGroups.com   

In [37]:
documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_id]
print('Number of Documents: '+str(len(documents))+'\n')
for i in range(25):
    print(documents[i])

Number of Documents: 53

Followup with Voice Genie for Number 	 Oct 23 		 Oct 27       announcement ( not in TTS )       I have listed the issues on which we will need support from Voice Genie .
If you are committed to driving sales of your product that incorporates Nuance 's speech engines and would like to work with us to optimize the development and delivery of your solution , please learn more about the Nuance OEM Program and complete a Nuance Partner Application .
However , SignalSoft do see the potential for AvocadoIT technology to sit between their middleware and other non - wirelessly enabled location specific content to enhance the proposition to carriers .
If you received   this message directly from David Allen & Co. , you are   automatically subscribed to this newsletter .
do you have any ideas / suggestions for the two gaiting items :   > - Lists - does n't look like partner will work and our list we could do it   >
I just spoke with him and he said to go ahead and pass hi

In [None]:
topic_model.merge_topics(docs, [-1,topic_id])

In [None]:
create_custom_label("", )

In [None]:
len(topic_model.get_topics())

In [None]:
#topic_model.save("../../../data/bertopic_models/intents/00_all_intents/all_processed")

In [4]:
topic_model = BERTopic.load("../../../data/bertopic_models/intents/00_all_intents/all_processed_cut_to_600_processed")

### Remove Topics now

In [None]:
topic_model.save("../../../data/bertopic_models/intents/00_all_intents/all_processed_cut_to_600_processed")

In [None]:

merge_list = []
for i in range(600,1000):
    item = [-1,i]
    merge_list.append(item)
topic_model.merge_topics(docs, merge_list)
#topic_model.save("../../../data/bertopic_models/intents/00_all_intents/all_processed_cut_to_600")

In [50]:
topic_id = 5
_, rep_sents = inspect_topic(topic_id)

Topic: 5

Top Words:
ass, person, com, to, it, the, please, that, on, you, 

Representative Target Sentences: 
Please call me with any questions you may have .
Let me know what help you need here .
Please call or e - mail [PERSON] to set up a call with him .


In [None]:
documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_id]
print('Number of Documents: '+str(len(documents))+'\n')
for i in range(25):
    print(documents[i])

Number of Documents: 1700

Tell us what you think . 	   E - mail this LifeMinder to a friend   	  * * * COOL STUFF * * *   Accident ?
Under our mutual NDA signed last month , [PERSON] will explain the status of this round and explore with you Chase.com 's possible participation .
This will give us 5 E*Trade servers in the cluster .
New readers receiving a free   trial will receive their issue the following day , in the afternoon .   _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _   For additional information about Cabot 's Internet Stock of the week , please   visit our Web site at http://www.icabot.com .
I personally attained [PERSON] 's address , printed out the presentation , wrote the memo note , and FedExed ( along with my business card ) to [PERSON] this afternoon .
I will send out a note that properly positions us and the beta .    Thanks .
I imagine you folks must have a power point presentation that you use to describe the service .
I had hoped to keep this a

: 

In [38]:
topic_model.merge_topics(docs, [-1,500])

In [39]:
topic_model.visualize_hierarchy(custom_labels=False)

### Keep Intent-like Topics

In [None]:
topic_id = 20
inspect_topic(topic_id)

In [None]:
topic_model.merge_topics(docs, [-1,topic_id])

In [None]:
documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_id]
print('Number of Documents: '+str(len(documents))+'\n')
for i in range(20):
    print(documents[i])

In [None]:
create_custom_label(custom_label='send_check', topic_id=topic_id)

In [None]:
topic_model.visualize_hierarchy(custom_labels=False)

In [None]:
topic_model.save("../../data/bertopic_models/intent/send_new/send_final")

### Create Dataset for intent 'send'

####  Labels

Created custom labels for topics

| Label                             | Description                                       | Extraction Strings | Context needed |
|-----------------------------------|---------------------------------------------------|--------------------|----------------|
| send_it                           | Something is being send.                          | Data 2   |
|0_send_something
|1_send_review  ||'review', 'comment', 'suggestion', 'feedback', 'correction'|
|2_send_presentation || 'powerpoint', 'presentation', 'slide', 'ppt', 
|3_send_files || ''
|4_send_contract
|5_send_copy   || copy
|6_send_pqa || pqa
|7_item_oca_data_osa
|8_send_fax ||fax
|9_send_update_changes || 'update','change' 
|10_send_pictures || 'pic','jpeg', 'image', 'photo'
|11_send_meeting_request || 'meeting'
|12_meeting_calendar_request_meetings
|13_send_test_results
|14_send_details_information || 'information', 'details'
|15_send_list
|16_send_invitation
|17_send_resume
|18_resume_resumes_my_me
|19_send_bug_status

#### Inspecting Topics

In [None]:
'''
for  label in topic_model.custom_labels_:
   print(label)'''

In [None]:
topic_model = BERTopic.load("../../data/bertopic_models/processed/send_final")

In [None]:
topic_nr_to_inspect = 31
topic_docs = inspect_topic_and_its_docs(topic_nr=topic_nr_to_inspect, n_docs=100)

In [None]:
# Based on the inspection of the sentences
white_list = [
    'resume', 'cv', 
]
black_list = [
]

In [None]:
sentences_to_keep = filter_topic_documents(topic_docs=topic_docs, white_list=white_list, black_list=black_list)
sentences_to_keep[:5]

In [None]:
df = pd.DataFrame({"text": sentences_to_keep, "label": get_label_from_topic_nr(topic_nr_to_inspect)})
concat_to_df(df, filepath="../../data/labeled/send.tsv", overwrite=True)

In [None]:
df.head()