# Hierarchical Clustering on Target Sentences
To help derive a taxonomy of intents, hierarchical clustering is applied to the target sentences.

#### Functions

In [1]:
def inspect_topic(topic_nr):
    """Print the top words and the representative sentences of one topic.

    Reads the notebook-level ``topic_model``.

    Args:
        topic_nr: Topic id, as listed in the ``Topic`` column of
            ``topic_model.get_topic_info()``.

    Returns:
        None. Everything is printed.
    """
    print('-----------')
    top_words = topic_model.get_topic(topic_nr)
    print("Topic: "+str(topic_nr)+"\n\nTop Words:")
    # BERTopic documents a falsy return (False) for an unknown topic id; iterate
    # over nothing in that case instead of raising a TypeError.
    for word in top_words or []:
        print(word[0], end=", ")
    print('\n\nRepresentative Target Sentences: ')
    topic_info = topic_model.get_topic_info()
    # Select the row by topic id rather than by position: the row offset is only
    # "topic_nr + 1" while the table starts with a -1 row.
    matches = topic_info.loc[topic_info['Topic'] == topic_nr, 'Representative_Docs']
    for representative_docs in matches:
        # A missing entry (None / NaN) has nothing to print.
        if representative_docs is None or isinstance(representative_docs, float):
            continue
        for doc in representative_docs:
            print(doc)

def create_custom_label(custom_label, topic_id):
    """Give one topic a custom label of the form ``"<topic_id>_<custom_label>"``.

    Args:
        custom_label: Human-readable label text (a string).
        topic_id: Id of the topic that receives the label.

    The mapping is handed to ``set_topic_labels`` of the notebook-level
    ``topic_model``; nothing is returned.
    """
    prefixed_label = "_".join((str(topic_id), custom_label))
    topic_model.set_topic_labels({topic_id: prefixed_label})

# Function to retrieve custom label from topic model
def get_label_from_topic_nr(topic_nr):
    """Return the custom label of a topic without its ``"<topic_nr>_"`` prefix.

    Reads ``custom_labels_`` of the notebook-level ``topic_model`` and returns
    the first label that starts with ``"<topic_nr>_"``.

    Args:
        topic_nr: Topic id whose label is wanted.

    Returns:
        The label text after the ``"<topic_nr>_"`` prefix.

    Raises:
        Exception: If no label starts with that prefix, or no custom labels
            have been set at all.
    """
    # Match the id together with its separator, otherwise topic 1 would also
    # match "10_...", "11_..." and so on.
    prefix = str(topic_nr) + "_"
    labels = topic_model.custom_labels_
    if labels is None:
        labels = []
    for label in labels:
        if label.startswith(prefix):
            # Slicing also strips negative ids such as "-1_".
            return label[len(prefix):]
    raise Exception('Provided Topic Number has no label assigned!')

# Disabled helper (kept only inside a string literal, so it is never defined): inspect a topic by its number and retrieve its documents
'''
def inspect_topic_and_its_docs(topic_nr,n_docs=50):
    topic_docs = list(set([doc for doc, topic in zip(docs, topic_model.topics_) if topic == topic_nr]))
    print("Label: " + get_label_from_topic_nr(topic_nr)+"\n")
    print("Number of sentences for label: "+str(len(topic_docs)))
    print('\nSentences:\n')
    if len(topic_docs) < n_docs:
        n_docs = len(topic_docs)
    for i in range(n_docs):
        print(topic_docs[i])
    return topic_docs
'''

def inspect_topic_docs(topic_id, n_docs=20):
    """Print up to ``n_docs`` documents of a topic and return all of them.

    Uses the notebook-level ``docs`` list and ``topic_model.topics_``; a
    document belongs to the topic when the entry of ``topics_`` at the same
    position equals ``topic_id``.

    Args:
        topic_id: Topic id to collect documents for.
        n_docs: Maximum number of documents to print.

    Returns:
        List with every document of the topic (not only the printed ones).
    """
    documents = []
    for position, assigned_topic in enumerate(topic_model.topics_):
        if assigned_topic == topic_id:
            documents.append(docs[position])

    print('Number of Documents: '+str(len(documents))+'\n')

    # Never try to print more documents than the topic actually has.
    limit = len(documents) if len(documents) < n_docs else n_docs
    for index in range(limit):
        print(documents[index])
    return documents

# Given a white list of substrings, returns the documents that contain at least one of them (and none of the black-listed substrings), plus the remaining documents
def filter_topic_documents(topic_docs, white_list, black_list=None):
    """Split a topic's documents into kept and discarded sentences.

    A document is kept when it contains at least one ``white_list`` substring
    and none of the ``black_list`` substrings. Duplicates are removed and the
    first-seen order of ``topic_docs`` is preserved, so repeated runs give the
    same result in the same order.

    Args:
        topic_docs: Iterable of document strings.
        white_list: Substrings of which at least one must occur in a kept
            document. An empty string matches every document.
        black_list: Substrings that exclude a document. ``None`` (the default)
            means no exclusions.

    Returns:
        Tuple ``(sentences_to_keep, sentences_to_discard)``; the second list
        holds every unique document that was not kept.
    """
    if black_list is None:
        black_list = []

    # dict.fromkeys de-duplicates while keeping insertion order.
    unique_docs = list(dict.fromkeys(topic_docs))

    sentences_to_keep = [
        doc for doc in unique_docs
        if any(substring in doc for substring in white_list)
        and not any(substring in doc for substring in black_list)
    ]
    kept = set(sentences_to_keep)
    sentences_to_discard = [doc for doc in unique_docs if doc not in kept]

    print('Amount of sentences: '+str(len(sentences_to_keep)))
    return sentences_to_keep, sentences_to_discard

# Function to facilitate the concatenation of new DFs
def concat_to_df(new_df, filepath, overwrite=False):
    """Add the rows of one label to the TSV at ``filepath``.

    The label is taken from the first row of ``new_df``; all rows of
    ``new_df`` are expected to carry that same label.

    Args:
        new_df: DataFrame with at least the columns ``text`` and ``label``.
        filepath: Tab-separated file to read from and write back to. A missing
            file is treated as an empty table.
        overwrite: If the label already exists in the file, replace its rows
            when True; otherwise leave the file untouched.

    Returns:
        The combined DataFrame that was written; the unchanged existing
        DataFrame when ``new_df`` is empty; ``None`` when the label already
        exists and ``overwrite`` is False.
    """
    try:
        existing = pd.read_csv(filepath, sep='\t')
    except FileNotFoundError:
        existing = pd.DataFrame(columns=["text", "label"])
    # Files written earlier with the index included come back with an
    # "Unnamed: 0" column; drop it.
    existing = existing.loc[:, ~existing.columns.str.contains('^Unnamed')]

    if new_df.empty:
        # Nothing to add, and no first row to read the label from.
        print("Received an empty DF, so nothing is being added.")
        return existing

    label = new_df.iloc[0]['label']
    if label in existing['label'].values:
        if not overwrite:
            print("Label '"+label+"' already in the final DF, so it's being skipped.")
            return None
        # Exact comparison: a substring/regex match would also remove other
        # labels that merely contain this one (e.g. "help" vs "offer_help").
        existing = existing[existing['label'] != label]

    frames = [new_df] if existing.empty else [existing, new_df]
    combined = pd.concat(frames, ignore_index=True)
    combined = combined.loc[:, ~combined.columns.str.contains('^Unnamed')]
    # Always write back to the file that was read, and never write the index.
    combined.to_csv(filepath, sep='\t', index=False)
    return combined


####

In [2]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re

In [3]:
df = pd.read_parquet('../../../data/processed/intents/help.parquet', engine='fastparquet')

In [4]:
# Replace Windows line breaks inside a target sentence with a single space.
df['target'] = df['target'].str.replace(r'\r\n', ' ', regex=True)
# De-duplicate while keeping first-seen order. list(set(...)) orders strings by
# their per-process hash, so `docs` would come out in a different order in every
# kernel session and no longer line up with `topic_model.topics_` of a model
# that was saved and loaded again.
docs = df['target'].drop_duplicates().tolist()

In [5]:
# Amount of unique target sentences
len(docs)

12672

## Training the Model

In [6]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("BERTopic")
logger.setLevel(logging.INFO)

In [7]:
# Fixed seed, shared by NumPy's global RNG and UMAP below
random_state = 1 # Other seeds sometimes caused negative values in the distance matrix

# Seed NumPy's global RNG (no other library RNG is seeded in this cell)
np.random.seed(random_state)

# Sentence embeddings, dimensionality reduction and clustering components handed to BERTopic
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_state)
hdbscan_model = HDBSCAN(min_cluster_size=10, prediction_data=True)

# Fit the topic model on the unique target sentences; `topics` and `probs` are the two values returned by fit_transform
topic_model = BERTopic(verbose=True, umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-06 18:28:39,644 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/396 [00:00<?, ?it/s]

2024-11-06 18:29:05,187 - BERTopic - Embedding - Completed ✓
2024-11-06 18:29:05,188 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-11-06 18:29:19,734 - BERTopic - Dimensionality - Completed ✓
2024-11-06 18:29:19,735 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-06 18:29:20,070 - BERTopic - Cluster - Completed ✓
2024-11-06 18:29:20,073 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-06 18:29:20,238 - BERTopic - Representation - Completed ✓


In [8]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 147/147 [00:00<00:00, 722.93it/s]


In [None]:
topic_model.save("../../../data/bertopic_models/intents/03_help/help_unprocessed")



## Inspecting clustering results

In [7]:
#topic_model = BERTopic.load("../../data/bertopic_models/send")

In [10]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Merging Topics

In [733]:
topic_id = 6
inspect_topic(topic_id)

-----------
Topic: 6

Top Words:
slides, presentation, slide, the, need, help, you, your, to, can, 

Representative Target Sentences: 
My secretary can put your 3 - 5 PowerPoint slides together   if need any help on the slides .
on that note -- do you still need help with the [PERSON] [PERSON] slides ?
Betty ,    Take a look at the competition slides in this presentation .


In [735]:
topic_model.custom_labels_

['-1_the_person_to_help',
 '0_offer_help',
 '1_request_help',
 '2_jsp_java_code',
 '3_refer_help',
 '4_person_help_out_you',
 '5_it_problems',
 '6_problem_it_the_bug',
 '7_meeting_schedule_the_you',
 '8_slides_presentation_slide_the',
 '9_build_cm_01_problems',
 '10_bugzilla_bugs_bug_show_bug',
 '11_hesitate_do_questions_any',
 '12_demo_the_demos_we',
 '13_pair_programming_code_coding',
 '14_changes_change_let_know']

In [689]:
topic_model.merge_topics(docs, [-1,topic_id])

In [732]:
create_custom_label("it_problems", topic_id=topic_id)

In [734]:
topic_docs = inspect_topic_docs(topic_id,n_docs=20)

Number of Documents: 63

Please let me know if you need any help getting others involvement with the presentation .
I hope that the presentation and the audio can help you .
i took a look at the presentation .
on that note -- do you still need help with the [PERSON] [PERSON] slides ?
Do you need my help preparing / formatting slides ?
Presentation given to an Accenture Communications & High - Tech Executive Meeting    Let me know if you have any questions .
His focus will be on helping the R&D presenters to hone their presentations to fit the timeframe available .
If anyone has problems accessing the slides , please let me know
You need help with the presentation this weekend .
My secretary can put your 3 - 5 PowerPoint slides together   if need any help on the slides .
Can you take a look at the attached slide and tell me if I 'm missing anything or inaccurate in any of the bullets .
Can you please have your slide presentation doen here by then ?     [PERSON] R can help you with the o

In [427]:
len(topic_model.get_topics())

59

In [485]:
# Save next to the unprocessed "help" model; the previous "02_call/call_processed"
# target was left over from another notebook and would overwrite that intent's model.
topic_model.save("../../../data/bertopic_models/intents/03_help/help_processed")



In [707]:
topic_model.visualize_hierarchy(custom_labels=False)

In [1121]:
# Save next to the unprocessed "help" model; the previous "02_call/call_final"
# target was left over from another notebook and would overwrite that intent's model.
topic_model.save("../../../data/bertopic_models/intents/03_help/help_final")



### Create Dataset for intent 'help'

####  Labels

Created custom labels for topics

| Label                             | Description                                       | Extraction Strings | Context needed |
|-----------------------------------|---------------------------------------------------|--------------------|----------------|
| `0_offer_help`                    |                                                   |                    |                |
| `1_request_help`                  |                                                   |                    |                |
| `2_jsp_java_code`                 |                                                   |                    |                |
| `3_refer_help`                    |                                                   |                    |                |
| `4_person_help_out_you`           |                                                   |                    |                |
| `5_it_problems`                   |                                                   |                    |                |

#### Inspecting Topics

In [1672]:
'''
for  label in topic_model.custom_labels_:
   print(label)'''

'\nfor  label in topic_model.custom_labels_:\n   print(label)'

In [3]:
# NOTE(review): this loads a model named "send_final" via a "../../" path, while the
# other cells of this notebook read and write "help" artefacts under "../../../data/".
# Confirm this is the intended model before relying on the cells below.
topic_model = BERTopic.load("../../data/bertopic_models/processed/send_final")

In [758]:
topic_id = 5
topic_docs = inspect_topic_docs(topic_id=topic_id, n_docs=20)

Number of Documents: 106

I also think that we will need help from [PERSON] on fixing some of the bugs which are filed by [PERSON] and hard to reproduce at [LOCATION] [LOCATION] .
Micro$oft software is having problems similar to this and there have been patches made available for this problem for a couple weeks .
Ravi ,   this is an interesting scenario , I think we have some problems with that , that needs to be fixed .
Simple errors like this should n't require [PERSON] 's help to resolve , but she is getting pulled in to them .
Hi Shailesh ,         Please try this patch for the project having problem .
Clone of ' [PERSON] ' virus infects the Internet    Promising help in the mating game , a new Trojan virus similar to the " [PERSON] " bug is proliferating quickly across the Internet , computer security specialists cautioned this week .    - FULL STORY -    Sci - Tech index 	  -Next category- 	  - Back to top - 	      _ _ _ _ _
We did n't have the second problem when we tested app l

In [759]:
# Based on the inspection of the sentences
# NOTE: the empty string is a substring of every string, so this white list
# keeps every document of the topic.
white_list = [
    ""
]
# Empty black list: no document is excluded.
black_list = [

]

In [760]:
sents_to_keep, sents_to_discard = filter_topic_documents(topic_docs=topic_docs, white_list=white_list, black_list=black_list)
sents_to_keep

Amount of sentences: 106


["My 2 cents :    - Old open bugs : Bugs can not be closed using [PERSON] 's help                            without seeing whether those bugs are relevant or not .",
 'have you had any problem with updating this spread sheet .',
 'But that has not helped me in pointing out the problem on the colocation server .',
 'Please take a look at this and see if you can reproduce this using the latest build .',
 "We did n't have the second problem when we tested app locally , but when we put it on COLO , it happened .",
 '[PERSON]    Can you help get some more information on this bug ?    Thanks   Srik',
 'Also , can you have [PERSON] take a look again at the 8 purge failures   in OSA .',
 "We did n't have the second problem when we tested app locally , but when we   > put it on COLO , it happened .",
 "[PERSON] ,    We may need Sreenivas ' help to reproduce bugs during weekend .",
 'Let us fix this then , this seems like a totally internal issue which will   help us all .',
 'If you need some 

In [761]:
sents_to_discard

[]

In [762]:
# TSV file that collects the labeled sentences.
fp = "../../../data/labeled/help.tsv"
# One row per kept sentence; the single label string is repeated on every row.
# NOTE(review): this rebinds `df`, which earlier in the notebook holds the data
# read from the parquet file.
df = pd.DataFrame({"text": sents_to_keep, "label": get_label_from_topic_nr(topic_id)})
# With overwrite=False a label that is already present in the file is skipped.
concat_to_df(df, filepath=fp, overwrite=False)

Unnamed: 0,text,label
0,Please let me know if you have any questions .,offer_help
1,"( [PERSON] ) If you have questions , please...",offer_help
2,> > Please let me know if you have any quest...,offer_help
3,"If you have any questions , please do not he...",offer_help
4,Please let me know if this helps and if you ha...,offer_help
...,...,...
987,[PERSON] is having some problems with the LC20...,it_problems
988,So I 'll try my best to fix the problems on th...,it_problems
989,[PERSON] does this have the changes [PERSON...,it_problems
990,This worked until a couple of weeks ago . H...,it_problems


In [1223]:
df.head()

Unnamed: 0,text,label
0,Once you have had a chance to review and if yo...,call_because_of_changes
1,All the changes are in TradingHistoryForm.aml ...,call_because_of_changes
2,Please give me a call if you have any question...,call_because_of_changes
3,If you 'd like to have the order changed to Di...,call_because_of_changes
4,"blair , please review and call me asap thank...",call_because_of_changes
