## Heuristics applied to pairs

In [1]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re
import sys
sys.path.append('../../')
from utility.utility_functions import *

In [2]:
df_meeting = pd.read_parquet('../../data/processed/intents/meeting.parquet', engine='fastparquet')

In [3]:
def create_count_df(df):
    all_strings = [item for sublist in df['action_object_pairs'] for item in sublist]
    all_strings = [s.lower() for s in all_strings]
    # Step 2: Count occurrences of each full string
    full_string_counts = Counter(all_strings)

    # Step 3: Convert the Counter to a DataFrame and sort by counts in descending order
    counts_df = pd.DataFrame(full_string_counts.items(), columns=['action_object_pairs', 'Count']).sort_values(by='Count', ascending=False)

    # Reset index for neatness (optional)
    counts_df = counts_df.reset_index(drop=True)
    return counts_df

## Meeting

In [4]:
df_meeting_counts = create_count_df(df_meeting)

In [5]:
df_meeting_counts

Unnamed: 0,action_object_pairs,Count
0,call_me,6191
1,join_us,2507
2,have_meeting,2337
3,tell_me,2028
4,have_questions,1403
...,...,...
25999,buddy_accenture,1
26000,take_weds,1
26001,join_os,1
26002,join_set,1


### Inspect Pairs and create groups

In [6]:
df_meeting_counts

Unnamed: 0,action_object_pairs,Count
0,call_me,6191
1,join_us,2507
2,have_meeting,2337
3,tell_me,2028
4,have_questions,1403
...,...,...
25999,buddy_accenture,1
26000,take_weds,1
26001,join_os,1
26002,join_set,1


Based on the Inspection, the following groups will be created:



reschedule_meeting

schedule_meeting, arrange_meeting, schedule_meetings, set_meetings, set_meeting, setup_meeting, set_appointment, have_meeting, have_discussion, have_discussions, have_meetings, have_appointment, have_conversation, hold_meeting,

 attend_conference, attend_meeting, attend_meetings


review_content

In [14]:
plan = ['schedule_meeting', 'schedule_meetings', 'setup_meeting', 'set_meetings', 'set_appointment', 'set_meeting', 'arrange_meeting','have_meeting', 'have_discussion', 'have_discussions', 'have_meetings', 'have_appointment', 'have_conversation', 'hold_meeting','reschedule_meeting', 'move_meeting']


In [None]:
df_plan_meeting =  df_meeting[df_meeting['action_object_pairs'].apply(lambda pairs: any(item.lower() in plan for item in pairs))]

In [15]:
df_plan_meeting.iloc[0]['target']

'[PERSON] has a meeting starting now at 6 pm EST , but call him soon to coordinate .'

In [62]:
# Define the set of words to look for (whitelist)
whitelist_words = {'could we schedule a meeting', 'can we schedule a meeting',  'can we arrange a meeting', } 

# Define the set of words or phrases to exclude (blacklist)
blacklist_words = {'I ',}  # Add any other terms to exclude

# Join the words to form regular expression patterns for whitelist and blacklist
whitelist_pattern = '|'.join(rf'\b{word}\b' for word in whitelist_words)
blacklist_pattern = '|'.join(rf'\b{word}\b' for word in blacklist_words)

# Filter the DataFrame to include rows that match whitelist and don't match blacklist
df_plan_meeting_filtered = df_plan_meeting[
    df_plan_meeting['target'].str.contains(whitelist_pattern, case=False, na=False) &
    ~df_plan_meeting['target'].str.contains(blacklist_pattern, case=False, na=False)
]

In [114]:
df_plan_meeting_filtered.shape

(301, 19)

In [64]:
df_plan_meeting_filtered.iloc[9]['target']

'Can we arrange a meeting with interested parties to see what is involved in making this possible . \r\n\r\n Rajeev'

In [112]:

# Define the words or phrases to filter
list_a = {'could we', 'can we', 'should we', 'could you', 'would you','let me know', 'can you'}
list_b = {'schedule', 'arrange', 'set','have','hold'}
list_c = {'appointment', 'meeting', }

# Combine List A and List B using OR, then combine with List C using AND
pattern_a = '|'.join(rf'\b{word}\b' for word in list_a)  
pattern_b = '|'.join(rf'\b{word}\b' for word in list_b)   
pattern_c = '|'.join(rf'\b{word}\b' for word in list_c)   

# Apply the filter
df_plan_meeting_filtered = df_plan_meeting[
    df_plan_meeting['target'].str.contains(pattern_a, case=False, na=False) &     
    df_plan_meeting['target'].str.contains(pattern_b, case=False, na=False) &    
    df_plan_meeting['target'].str.contains(pattern_c, case=False, na=False) 
]

df_non_matching = df_plan_meeting[~(
    df_plan_meeting['target'].str.contains(pattern_a, case=False, na=False) &
    df_plan_meeting['target'].str.contains(pattern_b, case=False, na=False) &
    df_plan_meeting['target'].str.contains(pattern_c, case=False, na=False)
)]

In [125]:
df_plan_meeting_filtered.iloc[8]['target']

'If you need me to set up another meeting , please let me know .'

In [110]:
index = 121
df_non_matching.iloc[index]['target']

'We appreciate your positioning AvocadoIT to the client and setting up an in - person client meeting .'

In [128]:
df_plan_meeting_filtered.to_csv('./meeting.tsv',sep='\t')

In [129]:
# Create dataset
pd.DataFrame({
    'text': df_plan_meeting_filtered['target'],         
    'label': 'PlanMeeting'   
}).to_csv("./meeting.tsv",sep='\t',index=False)