## Filter Action-Object Pairs

In [1]:
import pandas as pd
from collections import Counter
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm
from IPython.display import display, HTML
tqdm.pandas()
import sys
sys.path.append('../../')
from utility.utility_functions import *

# Load spaCy's large English pipeline once at notebook level; replace_entities()
# below reads this `nlp` object to find PERSON and GPE tokens.
nlp = spacy.load("en_core_web_lg")

#### Functions

In [None]:
# Mask PERSON and GPE entities in a text
def replace_entities(text):
    """Return ``text`` with every PERSON / GPE token swapped for "[ENTITY]".

    The text is parsed with the notebook-level spaCy pipeline ``nlp``. Masking
    happens token by token, so an entity spanning several tokens yields one
    placeholder per token. The result is the (possibly masked) token texts
    joined with single spaces, i.e. the original whitespace is not preserved.
    """
    masked_labels = ("PERSON", "GPE")
    return " ".join(
        "[ENTITY]" if token.ent_type_ in masked_labels else token.text
        for token in nlp(text)
    )

####

In [None]:
# Load the exploded training targets and expose the 'targets' column under the
# singular name 'target', which the cells below read.
df = pd.read_parquet(
    '../../data/processed/targets/avocado_train_targets_exploded.parquet',
    engine='fastparquet',
).rename(columns={'targets': 'target'})

In [None]:
# (rows, columns) of the raw frame, before any filtering.
df.shape

In [None]:
# Remove empty entries: rows whose pair value is None or a zero-length list
has_no_pairs = df['action_object_pairs'].apply(
    lambda pairs: pairs is None or (isinstance(pairs, list) and not pairs)
)
df_filtered = df[~has_no_pairs]

In [None]:
# (rows, columns) after dropping rows without action-object pairs.
df_filtered.shape

### Inspect Action-Object-Pairs results

#### Functions

In [None]:
def get_distinct_words_df(df):
    """Count how often each action-object pair occurs across all rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'action_object_pairs' column whose values are iterables of pairs.

    Returns
    -------
    pandas.DataFrame
        One row per distinct pair with columns 'action_object_pairs' and
        'count', sorted by 'count' in descending order. The index keeps the
        order in which each pair was first seen.
    """
    pair_counts = Counter()
    for pairs in df['action_object_pairs']:
        # Tally every pair of this row; first-seen order is kept by the Counter.
        pair_counts.update(pairs)

    counts_df = pd.DataFrame(
        list(pair_counts.items()),
        columns=['action_object_pairs', 'count'],
    )
    return counts_df.sort_values(by='count', ascending=False)

# Define a function to inspect entries for a given action-object pair
def inspect_action_object_pair(pair_to_inspect, entry, print_extracted_text=True, print_target=True, df=None):
    """Print one example row that contains a given action-object pair.

    Parameters
    ----------
    pair_to_inspect : str
        Pair to look for; a row matches when
        ``pair_to_inspect in row['action_object_pairs']``.
    entry : int
        Position (as used by ``iloc``) of the example among the matching rows.
    print_extracted_text : bool, default True
        Print the row's 'extracted_text'.
    print_target : bool, default True
        Print the row's 'target'. A separator line is printed before it when
        the extracted text is printed as well.
    df : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df_filtered``.

    Raises
    ------
    IndexError
        If ``entry`` is out of range for the matching rows.
    """
    if df is None:
        # Fall back to the notebook-level frame, as the original implementation did.
        df = df_filtered

    matches = df[df['action_object_pairs'].apply(lambda pairs: pair_to_inspect in pairs)]
    row = matches.iloc[entry]

    if print_extracted_text:
        print(row['extracted_text'])
    if print_target:
        if print_extracted_text:
            print('----------------------')
        # Previously this print sat outside the `if print_target` guard, so the
        # target was shown even when the caller passed print_target=False.
        print(row['target'])

####

In [None]:
# Twenty most frequent action-object pairs in df_filtered (the result is sorted by count, descending).
get_distinct_words_df(df_filtered).head(20)

### Inspect Messages


| Action-Object-Pair | Count |
|--------------------|-------|
| start_Server | 12831 |
| send_it | 12052 |
| have_questions | 11024 |
| send_message | 10704 |
| send_email | 8186 |
| fail_Message | 8065 |
| start_Failures | 6983 |
| start_Occurrences | 6428 |
| call_me | 6177 |
| thank_you | 4472 |
| contact_me | 3995 |
| post_message | 3790 |
| miss_UNIVERSE | 3742 |
| do_what | 3584 |
| send_mail | 3490 |
| start_occurrence | 3152 |
| do_it | 2816 |
| give_call | 2642 |
| need_help | 2541 |
| unsubscribe_mailto | 2468 |

In [None]:
# Show the target of the second row (iloc position 1) whose pairs contain 'send_e',
# without printing its extracted text.
inspect_action_object_pair('send_e', 1,print_extracted_text=False)

## Entry Removal

After inspecting the most-frequently occurring action-object-pairs, the following ones will not be considered further:

| Action-Object-Pair    | Reason |
|-------------|-------------|
|fail_Message|belongs to an error message and thus does not express a human intent|
|start_Failures|belongs to an error message and thus does not express a human intent|
|start_Occurrences|belongs to an error message and thus does not express a human intent|
|post_message|belongs to an automated message|
|unsubscribe_mailto|belongs to an automated message|
|miss_UNIVERSE|belongs to an automated message|
|start_occurrence|belongs to an automated message|

In addition, the extremeprogramming unsubscribe message and Java messages are filtered out of the original DataFrame.

In [None]:
# Pairs excluded from further analysis (see the removal table above).
# NOTE(review): 'set_sender' is not listed in that table - confirm why it is removed.
entries_to_remove = [
    'fail_Message',
    'start_Failures',
    'start_Occurrences',
    'post_message',
    'unsubscribe_mailto',
    'miss_UNIVERSE',
    'start_occurrence',
    'set_sender',
]

# Remove these entries: drop every row holding at least one excluded pair
contains_excluded_pair = df_filtered['action_object_pairs'].apply(
    lambda pairs: any(pair in entries_to_remove for pair in pairs)
)
df_filtered = df_filtered[~contains_excluded_pair]

In [None]:
# (rows, columns) after removing rows that contain an excluded pair.
df_filtered.shape

In [None]:
# Drop rows whose extracted text (converted with str()) contains this marker string.
test_string = "Unexpected flow reached by the server"

contains_marker = df_filtered['extracted_text'].apply(lambda text: test_string in str(text))
df_filtered = df_filtered[~contains_marker]


In [None]:
# (rows, columns) after removing rows that contain the marker string.
df_filtered.shape

In [None]:
# Save the cleaned DataFrame (not a model). The write is currently commented out, so the
# read in the next cell only works if this parquet file already exists from an earlier run.

#df_filtered.to_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned.parquet')

In [None]:
# Reload the cleaned frame from disk; this rebinds df_filtered and discards the
# in-memory result of the filtering cells above.
df_filtered = pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned.parquet', engine='fastparquet')

In [None]:
# Mask PERSON/GPE tokens in every 'target' value with replace_entities. Each row is
# parsed by the spaCy pipeline; progress_apply (enabled by tqdm.pandas()) shows a progress bar.
df_filtered['target'] = df_filtered['target'].progress_apply(replace_entities)

In [None]:
# Persist the entity-masked frame; the grouping section below reloads it from this path.
df_filtered.to_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned_entities.parquet')

## Group Action-Object-Pairs

#### Implementation

In [2]:
# Read filtered DF: the cleaned, entity-masked frame written by the to_parquet cell above.
df_filtered = pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned_entities.parquet', engine='fastparquet')

In [113]:
# Start the grouping from the per-pair counts (one row per distinct pair, most frequent first).
df_grouped = get_distinct_words_df(df_filtered)
df_grouped.head(20)

Unnamed: 0,action_object_pairs,count
172,have_questions,11024
296,send_email,8182
60,send_it,6188
389,call_me,6177
61,send_message,4856
99,thank_you,4433
229,contact_me,3994
404,do_what,3567
668,send_mail,3490
148,do_it,2806


### Grouping

In [138]:
# NOTE(review): this cell's execution count (138) is later than that of the aggregation
# loop further down (122), and the stored output shows comma-joined groups. It was run
# after the aggregation; on a fresh top-to-bottom run it would show ungrouped pairs here.
df_grouped.head(40)

Unnamed: 0,action_object_pairs,count
0,"do_what, do_it, do_this, tell_me, do_that, pro...",194854
1,"have_questions, need_help, take_look, help_you...",70971
2,"send_it, send_e, send_courtesy, send_this, sen...",43530
3,"send_email, send_mail, receive_email, receive_...",37063
4,"join_us, have_meeting, set_meeting, schedule_m...",34014
5,"call_me, give_call, call_you, have_call, set_c...",32895
6,"provide_information, contain_information, need...",32579
7,"make_changes, change_address, give_update, mak...",29808
8,"send_message, contact_me, contact_us, receive_...",29205
9,"send_feedback, provide_feedback, review_it, se...",20537


In [64]:
#for entry in df_grouped.iloc[10]['action_object_pairs'].split(', '):
#  print(entry)

- 'meeting','appointment'
- 'issue','problem','help','question', 'assistance', 'take_look', 'ask_'
- 'thank', 'appreciate'
- 'contact', 'reach', 'message',
- 'information'
- 'change', 'amend'
- 'call'
- 'mail'
- 'send'
- 'do_','use_'
- '_it','_us','_them','_you','_her','_him','_me','_what','_this','_that'


In [None]:
# Keyword lists, one per intent group. Each list is handed to
# aggregate_by_keywords_substring() in the loop below.
# NOTE(review): if that helper does plain substring matching (as its name suggests),
# short keywords such as 'aid', 'call' or '_con' may also match unrelated pairs - confirm.
kw_meeting = ['meeting', 'appointment', 'join_us', '_con', 'discussion']
kw_feedback = ['feedback', 'comment', 'review', 'input', 'criticism', 'critique']
kw_problem = [
    'issue', 'problem', 'help', 'question', 'assist', 'take_look', 'aid',
    'support', 'guide', 'give_hand', 'lend_hand', 'solution', 'solve',
]
kw_thank = ['thank', 'appreciate']
kw_contact = ['contact', 'reach', 'message']
kw_information = ['information', 'data', 'details', 'report']
kw_change = [
    'change', 'amend', 'modify', 'adjust', 'revise', 'alter', 'transform',
    'edit', 'update', 'convert', 'rework', 'refine', 'adapt', 'remodel',
    'reorganize', 'reform', 'shift', 'rebuild', 'vary', 'recalibrate', 'rectify',
]
kw_call = ['call']
kw_mail = ['mail']
kw_send = ['send_']
kw_this_and_that = [
    'do_', 'use_',
    '_it', '_us', '_them', '_you', '_her', '_him', '_me', '_what', '_this', '_that',
]

# The groups are aggregated one after another in exactly this order.
groups = [
    kw_meeting,
    kw_feedback,
    kw_problem,
    kw_thank,
    kw_contact,
    kw_information,
    kw_change,
    kw_call,
    kw_mail,
    kw_send,
    kw_this_and_that,
]


In [122]:
# Fold each keyword group into df_grouped, one group after another. Every pass rebinds
# df_grouped, so re-running this cell feeds the already-aggregated frame back in.
# NOTE(review): aggregate_by_keywords_substring is presumably provided by the star import
# from utility.utility_functions - confirm its matching and merge rules there.
for group in groups:
    df_grouped = aggregate_by_keywords_substring(df_grouped, group)

In [None]:
#df_grouped = aggregate_specific_indices(df_grouped, [(0,2)])

In [None]:
#df_grouped = remove_entries_containing(df_grouped, df_filtered, 'send_email')
#df_grouped.head(10)

#### Final Taxonomy

| Action-Object-Pairs | Overarching Intent |
|---------------------|--------------------|
| send_ | Sending something or requesting that somebody send something |
| call_ | Contacting somebody or requesting that somebody get in contact |
| have_questions, need_help | Requesting or providing help |
| do_ | Unspecified doing of something |
| meeting | Planning of meetings |
| information | Requesting or providing information |
| change | Making changes or requesting changes |
| thank_you | Thank-you messages |
| ... | Rest of the DataFrame |

### Create Dataframe Subsets for overarching intents

In [139]:
# Starting frame for the successive intent splits below; a copy, so df_filtered itself
# is not the object handed to split_dataframes.
df_start = df_filtered.copy()

In [140]:
# Positional row 4 of df_grouped is the meeting group in the head(40) output above
# ("join_us, have_meeting, ..."). The hard-coded position goes stale if the grouping order changes.
pairs_meeting = df_grouped.iloc[4]['action_object_pairs']
# First split of the cascade: starts from df_start; later cells keep splitting df_remaining.
df_meeting, df_remaining = split_dataframes(df_start, pairs_meeting)
print(f'Intent DF shape: {df_meeting.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 686570/686570 [00:47<00:00, 14394.11it/s]


Intent DF shape: (51036, 19); Remaining DF shape: (635534, 19)


In [141]:
# Positional row 9 of df_grouped is the feedback group in the head(40) output above
# ("send_feedback, provide_feedback, ...").
pairs_feedback = df_grouped.iloc[9]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_feedback, df_remaining = split_dataframes(df_remaining, pairs_feedback)
print(f'Intent DF shape: {df_feedback.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 635534/635534 [00:16<00:00, 37434.83it/s]


Intent DF shape: (22285, 19); Remaining DF shape: (613249, 19)


In [142]:
# Positional row 1 of df_grouped is the problem/help group in the head(40) output above
# ("have_questions, need_help, ...").
pairs_problem = df_grouped.iloc[1]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_problem, df_remaining = split_dataframes(df_remaining, pairs_problem)
print(f'Intent DF shape: {df_problem.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 613249/613249 [00:42<00:00, 14479.58it/s]


Intent DF shape: (64233, 19); Remaining DF shape: (549016, 19)


In [143]:
# NOTE(review): positional row 10 is presumably the thank/appreciate group; that row is
# not visible in the displayed head(40) output, so verify before relying on it.
pairs_thank = df_grouped.iloc[10]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_thank, df_remaining = split_dataframes(df_remaining, pairs_thank)
print(f'Intent DF shape: {df_thank.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 549016/549016 [00:07<00:00, 77068.45it/s]


Intent DF shape: (5772, 19); Remaining DF shape: (543244, 19)


In [144]:
# Positional row 8 of df_grouped is the contact/message group in the head(40) output above
# ("send_message, contact_me, ...").
pairs_contact = df_grouped.iloc[8]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_contact, df_remaining = split_dataframes(df_remaining, pairs_contact)
print(f'Intent DF shape: {df_contact.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 543244/543244 [00:15<00:00, 36125.04it/s]


Intent DF shape: (21185, 19); Remaining DF shape: (522059, 19)


In [145]:
# Positional row 6 of df_grouped is the information group in the head(40) output above
# ("provide_information, contain_information, ...").
pairs_information = df_grouped.iloc[6]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_info, df_remaining = split_dataframes(df_remaining, pairs_information)
print(f'Intent DF shape: {df_info.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 522059/522059 [00:14<00:00, 35324.34it/s]


Intent DF shape: (29829, 19); Remaining DF shape: (492230, 19)


In [146]:
# Positional row 7 of df_grouped is the change group in the head(40) output above
# ("make_changes, change_address, ...").
pairs_change = df_grouped.iloc[7]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_change, df_remaining = split_dataframes(df_remaining, pairs_change)
print(f'Intent DF shape: {df_change.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 492230/492230 [00:27<00:00, 18055.84it/s]


Intent DF shape: (27758, 19); Remaining DF shape: (464472, 19)


In [147]:
# Positional row 5 of df_grouped is the call group in the head(40) output above
# ("call_me, give_call, ...").
pairs_call = df_grouped.iloc[5]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_call, df_remaining = split_dataframes(df_remaining, pairs_call)
print(f'Intent DF shape: {df_call.shape}; Remaining DF shape: {df_remaining.shape}')

  0%|          | 0/464472 [00:00<?, ?it/s]

100%|██████████| 464472/464472 [00:12<00:00, 36678.14it/s]


Intent DF shape: (20327, 19); Remaining DF shape: (444145, 19)


In [148]:
# Positional row 3 of df_grouped is the mail group in the head(40) output above
# ("send_email, send_mail, ...").
pairs_mail = df_grouped.iloc[3]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_mail, df_remaining = split_dataframes(df_remaining, pairs_mail)
print(f'Intent DF shape: {df_mail.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 444145/444145 [00:10<00:00, 42025.87it/s]


Intent DF shape: (23250, 19); Remaining DF shape: (420895, 19)


In [149]:
# Positional row 2 of df_grouped is the send group in the head(40) output above
# ("send_it, send_e, ...").
pairs_send = df_grouped.iloc[2]['action_object_pairs']
# Splits only the rows left over from the previous split (df_remaining is rebound here).
df_send, df_remaining = split_dataframes(df_remaining, pairs_send)
print(f'Intent DF shape: {df_send.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 420895/420895 [00:12<00:00, 34315.86it/s]


Intent DF shape: (23710, 19); Remaining DF shape: (397185, 19)


In [150]:
# Positional row 0 of df_grouped is the generic do_/pronoun group in the head(40) output above
# ("do_what, do_it, ...").
pairs_this_that = df_grouped.iloc[0]['action_object_pairs']
# Last split of the cascade: whatever is left in df_remaining afterwards is saved as "remaining".
df_this_that, df_remaining = split_dataframes(df_remaining, pairs_this_that)
print(f'Intent DF shape: {df_this_that.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 397185/397185 [00:46<00:00, 8625.40it/s]


Intent DF shape: (112704, 19); Remaining DF shape: (284481, 19)


##### Save DF Subsets

In [151]:
# Write every intent subset, plus the unassigned remainder, to its own parquet file.
intent_outputs = [
    (df_meeting, '../../data/processed/intents/meeting.parquet'),
    (df_feedback, '../../data/processed/intents/feedback.parquet'),
    (df_problem, '../../data/processed/intents/problem.parquet'),
    (df_thank, '../../data/processed/intents/thank.parquet'),
    (df_contact, '../../data/processed/intents/contact.parquet'),
    (df_info, '../../data/processed/intents/info.parquet'),
    (df_change, '../../data/processed/intents/change.parquet'),
    (df_call, '../../data/processed/intents/call.parquet'),
    (df_mail, '../../data/processed/intents/mail.parquet'),
    (df_send, '../../data/processed/intents/send.parquet'),
    (df_this_that, '../../data/processed/intents/this_that.parquet'),
    (df_remaining, '../../data/processed/intents/remaining.parquet'),
]
for intent_frame, output_path in intent_outputs:
    intent_frame.to_parquet(output_path)