## Heuristics applied to pairs

In [1]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re
import sys
sys.path.append('../../')
from utility.utility_functions import *

In [4]:
# Load the pre-processed "problem" intent data (path is relative to this notebook).
df_problem = pd.read_parquet(
    '../../data/processed/intents/problem.parquet',
    engine='pyarrow',
)

## Feedback

In [5]:
# create_count_df is provided by the star import from utility.utility_functions.
# Judging by the displayed output it returns one row per action/object pair
# with its frequency in a `Count` column — TODO confirm against the helper.
df_problem_counts = create_count_df(df_problem)
# Show the first ten rows of the count table.
df_problem_counts.head(10)

Unnamed: 0,action_object_pairs,Count
0,have_questions,9476
1,thank_you,4120
2,need_help,2332
3,take_look,1900
4,contact_me,1775
5,help_you,1363
6,have_problems,1303
7,help_us,1044
8,answer_questions,805
9,provide_support,794


### Inspect Pairs and create groups

In [6]:
# Inspect the first twenty rows of the pair counts to decide which pairs belong together.
df_problem_counts.head(20)

Unnamed: 0,action_object_pairs,Count
0,have_questions,9476
1,thank_you,4120
2,need_help,2332
3,take_look,1900
4,contact_me,1775
5,help_you,1363
6,have_problems,1303
7,help_us,1044
8,answer_questions,805
9,provide_support,794


Based on the inspection, the following groups will be created:



reschedule_meeting

schedule_meeting, arrange_meeting, schedule_meetings, set_meetings, set_meeting, setup_meeting, set_appointment, have_meeting, have_discussion, have_discussions, have_meetings, have_appointment, have_conversation, hold_meeting,

 attend_conference, attend_meeting, attend_meetings


review_content

In [7]:
# Action/object pairs that make up the "offer help" group.
offer_help = [
    'have_questions',
    'need_help',
    'take_look',
    'contact_me',
    'help_you',
    'have_problems',
    'have_problem',
    'have_question',
    'need_assistance',
]

In [8]:
# Keep every row whose `action_object_pairs` contains at least one pair from
# `offer_help`; each pair is lower-cased before the membership check.
is_offer_help_row = df_problem['action_object_pairs'].apply(
    lambda pairs: any(pair.lower() in offer_help for pair in pairs)
)
df_offer_help = df_problem[is_offer_help_row]

In [9]:
# (rows, columns) of the rows selected by the pair-based filter.
df_offer_help.shape

(17926, 19)

In [None]:
# Spot-check the `target` value of one selected row (position 3).
df_offer_help.iloc[3]['target']

'Please send us your comments or ideas .'

In [10]:
# Keyword groups for the "offer help" text heuristic.
# A `target` text is kept only when it contains at least one whole-word match
# from EACH of the three groups (A AND B AND C), matched case-insensitively.
list_a = {'if','feel free','in case'}
list_b = {'have', 'need', 'take'}
list_c = {'question','questions','issue','issues','problems','help','assistance','look' }


# Inside a group the entries are OR-ed into a single whole-word alternation.
# Entries are sorted so the pattern text is identical on every run (set order
# is not), and escaped so a keyword can never be read as regex syntax.
pattern_a = '|'.join(rf'\b{re.escape(word)}\b' for word in sorted(list_a))
pattern_b = '|'.join(rf'\b{re.escape(word)}\b' for word in sorted(list_b))
pattern_c = '|'.join(rf'\b{re.escape(word)}\b' for word in sorted(list_c))

# Evaluate the three regex scans once; missing texts (na=False) never match.
target_text = df_offer_help['target']
offer_help_mask = (
    target_text.str.contains(pattern_a, case=False, na=False)
    & target_text.str.contains(pattern_b, case=False, na=False)
    & target_text.str.contains(pattern_c, case=False, na=False)
)

# Rows that satisfy the heuristic, and the exact complement for inspection.
df_offer_help_filtered = df_offer_help[offer_help_mask]
df_non_matching = df_offer_help[~offer_help_mask]

In [11]:
# (rows, columns) remaining after the keyword heuristic.
df_offer_help_filtered.shape

(11512, 19)

In [18]:
# Spot-check the `target` value of one row that passed the keyword heuristic (position 4).
df_offer_help_filtered.iloc[4]['target']

'-11:00 a.m. ( central ) \r\n Re :        AvocadoIT Fact Sheet Development \r\n Dial - in # :      ( 800 ) 953 - 6502 \r\n Passcode : 6117888 \r\n\r\n Please let me know if you have any questions or concerns .'

In [12]:
# Position of the rejected row to inspect; change it to browse other examples.
index = 7
# Show the `target` value of a row that did NOT pass the keyword heuristic.
df_non_matching.iloc[index]['target']

'Hi [PERSON] and [PERSON] , \r\n  Have you two had a chance to take a look at the inquiry below ?'

In [19]:
# Write the heuristic-filtered rows (all columns, index included) as a
# tab-separated file next to the notebook.
df_offer_help_filtered.to_csv(
    './help.tsv',
    sep='\t',
)

In [20]:
# Build the labelled dataset: the `target` column becomes `text`, and every
# row receives the constant label 'OfferHelp'. Written without the index.
offer_help_dataset = (
    df_offer_help_filtered[['target']]
    .rename(columns={'target': 'text'})
    .assign(label='OfferHelp')
)
offer_help_dataset.to_csv("./problem.tsv", sep='\t', index=False)