## Heuristics applied to pairs

In [2]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re
import sys
sys.path.append('../../')
from utility.utility_functions import *

In [3]:
# Load the processed feedback table from the parquet file (pyarrow backend).
df_feedback = pd.read_parquet(
    '../../data/processed/intents/feedback.parquet',
    engine='pyarrow',
)

## Feedback

In [4]:
# Build the count table for the feedback frame. `create_count_df` is not defined
# in this notebook -- presumably it comes from the `utility.utility_functions`
# star import (TODO confirm). The rendered output shows one row per
# `action_object_pairs` value together with its `Count`.
df_feedback_counts = create_count_df(df_feedback)

# Preview the first ten rows.
df_feedback_counts.head(n=10)

Unnamed: 0,action_object_pairs,Count
0,send_feedback,938
1,send_comments,805
2,provide_feedback,772
3,review_it,752
4,give_feedback,732
5,put_it,540
6,have_comments,433
7,get_feedback,414
8,put_them,311
9,review_document,304


### Inspect Pairs and create groups

In [5]:
# Look at the first 20 rows of the count table to decide which pairs to group.
df_feedback_counts.head(n=20)

Unnamed: 0,action_object_pairs,Count
0,send_feedback,938
1,send_comments,805
2,provide_feedback,772
3,review_it,752
4,give_feedback,732
5,put_it,540
6,have_comments,433
7,get_feedback,414
8,put_them,311
9,review_document,304


Based on the inspection above, the following groups will be created:



reschedule_meeting

schedule_meeting, arrange_meeting, schedule_meetings, set_meetings, set_meeting, setup_meeting, set_appointment, have_meeting, have_discussion, have_discussions, have_meetings, have_appointment, have_conversation, hold_meeting,

 attend_conference, attend_meeting, attend_meetings


review_content

In [6]:
# Action/object pairs that are grouped together as the "request feedback" intent.
request_feedback = [
    'send_feedback',
    'send_comments',
    'provide_feedback',
    'review_it',
    'give_feedback',
    'review_this',
    'review_them',
]

In [7]:
def _has_request_feedback_pair(pairs):
    """Return True when any entry of ``pairs``, lower-cased, is in ``request_feedback``."""
    for item in pairs:
        if item.lower() in request_feedback:
            return True
    return False


# Keep only the rows whose `action_object_pairs` contain a request-feedback pair.
has_request_pair = df_feedback['action_object_pairs'].apply(_has_request_feedback_pair)
df_request_feedback = df_feedback[has_request_pair]

In [8]:
# Spot-check the `target` text of the fourth row (position 3).
df_request_feedback['target'].iloc[3]

'Please send us your comments or ideas .'

In [22]:
# Keyword heuristic for explicit feedback requests: a `target` text is kept only
# if it contains at least one request cue (list_a) AND at least one feedback
# term (list_b). Matching is case-insensitive and on whole words.
list_a = {'need', 'please', 'can you'}
list_b = {'review', 'comment', 'comments', 'feedback'}

# Within each list the entries are alternatives (OR). `sorted` pins the order of
# the alternation (set iteration order is arbitrary) and `re.escape` keeps every
# entry literal should one ever contain a regex metacharacter.
pattern_a = '|'.join(rf'\b{re.escape(word)}\b' for word in sorted(list_a))
pattern_b = '|'.join(rf'\b{re.escape(word)}\b' for word in sorted(list_b))

# Evaluate each regex once and derive both splits from the same mask, so the
# matching and non-matching frames are exact complements by construction.
# `na=False` treats missing texts as non-matching.
target_text = df_request_feedback['target']
is_explicit_request = (
    target_text.str.contains(pattern_a, case=False, na=False)
    & target_text.str.contains(pattern_b, case=False, na=False)
)

df_request_feedback_filtered = df_request_feedback[is_explicit_request]
df_non_matching = df_request_feedback[~is_explicit_request]

In [24]:
# (rows, columns) of the frame that passed the keyword filter.
df_request_feedback_filtered.shape

(1977, 19)

In [34]:
# Spot-check one accepted text (position 13 of the filtered frame).
df_request_feedback_filtered['target'].iloc[13]

'Please provide any feedback as you may find fit . \r\n\r\n Thanks , \r\n\r\n [PERSON] \r\n\r\n --- \r\n Ricardo Garcia \r\n Technical Marketing Engineer \r\n AvocadoIT , Inc. \r\n 2211 North First Street , Suite 200 \r\n [LOCATION] [LOCATION] , [LOCATION]   95131 \r\n ( 408 ) 562 - 7984 office \r\n ( 408 ) 562 - 8100 fax \r\n ( 408 ) 829 - 4827 cell \r\n ricardo.garcia@avocadoit.com \r\n www.avocadoit.com'

In [31]:
# Spot-check one rejected text; change `index` to browse other rows.
index = 7
df_non_matching['target'].iloc[index]

"It is understood that fifteen banks have given [PERSON] initial feedback on \r\n the planned 10 - year deal 's structure and pricing ."

In [35]:
# Assemble the labelled dataset: every text that passed the keyword filter gets
# the constant label 'RequestFeedback'.
request_feedback_dataset = pd.DataFrame({
    'text': df_request_feedback_filtered['target'],
    'label': 'RequestFeedback',
})

# Write it as a tab-separated file without the index column.
request_feedback_dataset.to_csv("./feedback.tsv", sep='\t', index=False)