# Hierarchical Clustering on Target Sentences
to help deriving a taxonomy for intents, hierarchical clustering is being leveraged.

#### Functions

In [2]:
def inspect_topic(topic_nr):
    top_words = topic_model.get_topic(topic_nr)
    print("Topic: "+str(topic_nr)+"\n\nTop Words:")
    for word in top_words:
        print(word[0], end=", ")
    print('\n\nRepresentative Target Sentences: ')
    for doc in topic_model.get_topic_info().iloc[topic_nr+1]['Representative_Docs']:
        print(doc)

def create_custom_label(custom_label, topic_id):
    custom_labels = {
        topic_id: str(topic_id)+"_"+custom_label,
    }
    topic_model.set_topic_labels(custom_labels)

# Function to retrieve custom label from topic model
def get_label_from_topic_nr(topic_nr):
    for label in topic_model.custom_labels_:
        if label.startswith(str(topic_nr)):
            label = label
            label = re.sub(r"^\d+_", "", label)
            return label
    raise Exception('Provided Topic Number has no label assigned!')

# Function to inspect a topic by its number and retrieve its documents
def inspect_topic_and_its_docs(topic_nr,n_docs=50):
    topic_docs = list(set([doc for doc, topic in zip(docs, topic_model.topics_) if topic == topic_nr]))
    print("Label: " + get_label_from_topic_nr(topic_nr)+"\n")
    print("Number of sentences for label: "+str(len(topic_docs)))
    print('\nSentences:\n')
    if len(topic_docs) < n_docs:
        n_docs = len(topic_docs)
    for i in range(n_docs):
        print(topic_docs[i])
    return topic_docs

# given a list of strings, returns a list of documents that contain at least one of the strings in the given list
def filter_topic_documents(topic_docs, white_list, black_list=[] ):
    sentences_to_keep = []
    for doc in topic_docs:
        if any(substring in doc for substring in white_list):
            sentences_to_keep.append(doc)
    sentences_to_keep = [
    item for item in sentences_to_keep
        if not any(substring in item for substring in black_list)
    ]
    sentences_to_keep = list(set(sentences_to_keep)) # remove duplicates
    print('Amount of sentences: '+str(len(sentences_to_keep)))
    return sentences_to_keep

# Function to facilitate the concatenation of new DFs
def concat_to_df(new_df,filepath, overwrite=False, ):
    try:
        df = pd.read_csv(filepath, sep='\t')
    except FileNotFoundError:
        df = pd.DataFrame(columns=["text", "label"])
    if new_df.iloc[0]['label'] not in df['label'].values: # Make sure that DF does not already contain entry for label
        new_df = new_df.reset_index(drop=True)
        df = df.reset_index(drop=True)
        df = pd.concat([df, new_df], ignore_index=True)
        df = df.reset_index(drop=True)
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        df.to_csv(filepath, sep='\t')
        return df
    else:
        if overwrite:
            df = df[~df['label'].str.contains(new_df.iloc[0]['label'])]
            df = pd.concat([df, new_df], ignore_index=True)
            df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
            df.to_csv("../../data/labeled/send.tsv", sep='\t', index=False)
            return df
        else:
            print("Label '"+new_df.iloc[0]['label']+"' already in the final DF, so it's being skipped.")


####

In [3]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re

In [4]:
df = pd.read_parquet('../../../data/processed/targets/avocado_train_targets_exploded_cleaned_entities.parquet', engine='fastparquet')

In [5]:
df['target'] = df['target'].str.replace(r'\r\n', ' ', regex=True)
docs = list(set(df['target'].tolist()))

In [6]:
# Amount of unique target sentences
len(docs)

403329

## Training the Model

In [7]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("BERTopic")
logger.setLevel(logging.INFO)

In [None]:
# Initialize UMAP with a fixed random state
random_state = 1 # Other seeds sometimes caused negative values in the distanace matrix

# Set global random seeds
np.random.seed(random_state)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_state)
hdbscan_model = HDBSCAN(min_cluster_size=10, prediction_data=True)

topic_model = BERTopic(verbose=True, umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-05 14:30:04,963 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/12605 [00:00<?, ?it/s]

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 395/395 [00:00<00:00, 538.70it/s]


In [None]:
topic_model.save("../../../data/bertopic_models/intents/00_all_intents/all_unprocessed")



## Inspecting clustering results

In [None]:
#topic_model = BERTopic.load("../../../data/bertopic_models/intents/00_all_intents/all_unprocessed")

In [12]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

NameError: name 'hierarchical_topics' is not defined

## Merging Topics

In [702]:
topic_id = 19
inspect_topic(topic_id)

Topic: 19

Top Words:
servlet, archives, java, sun, listserv, interest, html, resources, user, manuals, 

Representative Target Sentences: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _   To unsubscribe , send email to listserv@java.sun.com and include in the body   of the message " signoff SERVLET - INTEREST " .    Archives : http://archives.java.sun.com/archives/servlet-interest.html   Resources : http://java.sun.com/products/servlet/external-resources.html   LISTSERV Help : http://www.lsoft.com/manuals/user/user.html
S.    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _   To unsubscribe , send email to listserv@java.sun.com and include in the body   of the message " signoff SERVLET - INTEREST " .    Archives : http://archives.java.sun.com/archives/servlet-interest.html   Resources

In [703]:
topic_model.merge_topics(docs, [-1,topic_id])

In [89]:
create_custom_label("", )

In [677]:
documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_id]
print('Number of Documents: '+str(len(documents))+'\n')
for i in range(20):
    print(documents[i])

Number of Documents: 16

We are going to send it today unless someone raises a flag that this is not a good idea .
I think they are sending that to us .
They do not always send these to us .
you 're not sure what they believe , or what they will   > > > >       think of you for sending it to them .
Are we sending this to them ?
i thought we decided not to send this because it would raise more questions ?
Let me know what you think of this -- it is much   stronger than what they sent us , so they might freak out .
We can still send them to nursing home later on if it is not possible .
Go ahead and send them through .
We can still send   > them to nursing home later   > on if it is not possible .
Alternatively , you can just send them the plug - in ahead of time .
However , if you feel that we need to send them something like this , then go ahead .
they need to send us the agmt .
They send stuff to me once in a while , most of it is not interesting - this is very interesting .
When in do

IndexError: list index out of range

In [668]:
len(topic_model.get_topics())

238

In [704]:
topic_model.save("../../data/bertopic_models/intent/send_new/send_processed")



In [705]:
topic_model.visualize_hierarchy(custom_labels=False)

### Keep Intent-like Topics

In [1671]:
topic_id = 20
inspect_topic(topic_id)

Topic: 20

Top Words:
call, cell, me, number, phone, or, touch, reach, home, at, 

Representative Target Sentences: 
If u have anything for me , call me on my cell or send me email .
call me on the cell or send email .
If you need me send email or call my cell .


In [1633]:
topic_model.merge_topics(docs, [-1,topic_id])

In [1598]:
documents = [docs[i] for i, topic in enumerate(topic_model.topics_) if topic == topic_id]
print('Number of Documents: '+str(len(documents))+'\n')
for i in range(20):
    print(documents[i])

Number of Documents: 1790

I 'll send it back to you as soon as I can finish it .
Please send it to us !
--------------------------   Sent from [PERSON] 's BB .   Instant E - mail messaging Rocks !
I 'm gone next week but have been assured she 'll send it to you Monday .
Please email him this along with the PDF I sent this am .
I sent this to you already
[PERSON] ,    This is the email that I had sent to [PERSON] on Friday in regards to the problem that I had talked to you about .
ty ;    i will work on this tomorrow and send it to you .
I 'll try to send it again , soon .
He 's now sending updates like these .
[PERSON] sent this to me .
just what you feel , make her laugh out loud , recall a special moment you shared , or send a heart - felt thanks .
I 'll copy you all on what I send to [PERSON] .
Please send [PERSON] ( bullet point )
I will send it to [PERSON] as soon as you receive it .
[PERSON] ,    This info below is from [PERSON] :    The last email that you sent to me is this th

In [1658]:
create_custom_label(custom_label='send_check', topic_id=topic_id)

In [1659]:
topic_model.visualize_hierarchy(custom_labels=True)

In [1660]:
topic_model.save("../../data/bertopic_models/intent/send_new/send_final")



### Create Dataset for intent 'send'

####  Labels

Created custom labels for topics

| Label                             | Description                                       | Extraction Strings | Context needed |
|-----------------------------------|---------------------------------------------------|--------------------|----------------|
| send_it                           | Something is being send.                          | Data 2   |
|0_send_something
|1_send_review  ||'review', 'comment', 'suggestion', 'feedback', 'correction'|
|2_send_presentation || 'powerpoint', 'presentation', 'slide', 'ppt', 
|3_send_files || ''
|4_send_contract
|5_send_copy   || copy
|6_send_pqa || pqa
|7_item_oca_data_osa
|8_send_fax ||fax
|9_send_update_changes || 'update','change' 
|10_send_pictures || 'pic','jpeg', 'image', 'photo'
|11_send_meeting_request || 'meeting'
|12_meeting_calendar_request_meetings
|13_send_test_results
|14_send_details_information || 'information', 'details'
|15_send_list
|16_send_invitation
|17_send_resume
|18_resume_resumes_my_me
|19_send_bug_status

#### Inspecting Topics

In [1672]:
'''
for  label in topic_model.custom_labels_:
   print(label)'''

'\nfor  label in topic_model.custom_labels_:\n   print(label)'

In [3]:
topic_model = BERTopic.load("../../data/bertopic_models/processed/send_final")

In [1833]:
topic_nr_to_inspect = 31
topic_docs = inspect_topic_and_its_docs(topic_nr=topic_nr_to_inspect, n_docs=100)

Label: press_release_releases_kits

Number of sentences for label: 67

Sentences:

Gents ,   Thanks for sending this along .
Thanks a lot for sending this .
Thanks for sending this our way .
[PERSON] ,    Thanks for sending the other materials .
Yes .     [PERSON] ,     Do you have one that you can send to me ?
[PERSON] ,   Thanks for sending this .
[PERSON] ,    Thank you for reading the things that I sent you .
Thanks for sending this out .
Send me what you have thus far .    Thanks ,    [PERSON] .
[PERSON] , thanks for sending out the email earlier .
[PERSON]    I meant to send this to you also .
[PERSON] , thanks for sending this along .
[PERSON] ,     Thanks for sending me the itineraries .
[PERSON] , Thanks for sending the calculation over .
[PERSON] ,    Thanks for sending this message .
I thank you for sending a good guy over here .
Thanks for sending these .
Thanks for sending it out last night !     Amit .
Thanks for the sending the request [PERSON] .    Regards ,   Shawn
[ S

In [1817]:
# Based on the inspection of the sentences
white_list = [
    'resume', 'cv', 
]
black_list = [
]

In [1818]:
sentences_to_keep = filter_topic_documents(topic_docs=topic_docs, white_list=white_list, black_list=black_list)
sentences_to_keep[:5]

Amount of sentences: 121


['Please send me your latest / updated resumes if you are still looking for .',
 "If he is interested , send me his resume and I 'll get [PERSON] to   have the right people look at it .",
 'I thought that you might have sent the wrong copy of your resume .',
 'I want   to get into Business Development and was wondering if you could send my   resume to your contacts .',
 'The other night we talked on the hallway   and you asked me to send you my resume , which I am attaching .']

In [1819]:
df = pd.DataFrame({"text": sentences_to_keep, "label": get_label_from_topic_nr(topic_nr_to_inspect)})
concat_to_df(df, filepath="../../data/labeled/send.tsv", overwrite=True)

Unnamed: 0,text,label
0,"Please send your comments to me ASAP , but n...",send_review
1,Please review and send feedback .,send_review
2,"If you would like ask a question , or if you j...",send_review
3,I am sending it to the customer to review an...,send_review
4,Thanks for capturing all the feedback and send...,send_review
...,...,...
1293,Do you want me to send you my resume ?,send_resume
1294,I am not sure if you have already sent me your...,send_resume
1295,"Per you request , I am sending you my resume .",send_resume
1296,I will look for my resume and send it to you .,send_resume


In [1770]:
df.head()

Unnamed: 0,text,label
