## Filter Action-Object Pairs

In [2]:
import pandas as pd
from collections import Counter
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm
from IPython.display import display, HTML
tqdm.pandas()


# Load the English model
nlp = spacy.load("en_core_web_lg")

#### Functions

In [None]:
# Function to replace PERSON and GPE entities
def replace_entities(text):
    doc = nlp(text)
    modified_text = []
    for token in doc:
        if (token.ent_type_ == "PERSON") or (token.ent_type_ == "GPE"):
            modified_text.append("[ENTITY]")
        else:
            modified_text.append(token.text)
    return " ".join(modified_text)

####

In [2]:
df = pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded.parquet', engine='fastparquet')
df = df.rename(columns={'targets': 'target'})

In [None]:
df.shape

In [4]:
# Remove empty entries
df_filtered = df[~df['action_object_pairs'].apply(lambda x: x is None or (isinstance(x, list) and len(x) == 0))]

In [None]:
df_filtered.shape

### Inspect Action-Object-Pairs results

#### Functions

In [6]:
def get_distinct_words_df(df):
    all_words = [word for sublist in df['action_object_pairs'] for word in sublist]

    # Use Counter to count occurrences of each word
    word_counts = Counter(all_words)

    # Convert to DataFrame (optional, if you want to keep it in tabular form)
    distinct_words_df = pd.DataFrame(word_counts.items(), columns=['action_object_pairs', 'count'])
    distinct_words_df = distinct_words_df.sort_values(by='count', ascending=False)
    #display(HTML(distinct_words_df.head(n).to_html()))
    return distinct_words_df

# Define a function to inspect entries for a given action-object pair
def inspect_action_object_pair(pair_to_inspect, entry, print_extracted_text=True, print_target=True):
    test = df_filtered[df_filtered['action_object_pairs'].apply(lambda x: pair_to_inspect in x)]
    if print_extracted_text:
        print(test.iloc[entry]['extracted_text'])
    if print_target:
        if print_extracted_text:
            print('----------------------')
    print(test.iloc[entry]['target'])
    del test

####

In [14]:
get_distinct_words_df(df_filtered).head(20)

Unnamed: 0,action_object_pairs,count
172,have_questions,11024
296,send_email,8182
60,send_it,6188
389,call_me,6177
61,send_message,4856
99,thank_you,4433
229,contact_me,3994
404,do_what,3567
668,send_mail,3490
148,do_it,2806


### Inspect Messages


| Action-Object-Pair    | Count |
|-------------|-------------|
 start_Server |	12831
send_it |	12052
have_questions |	11024
send_message |	10704
send_email |	8186
fail_Message |	8065
start_Failures |	6983
start_Occurrences |	6428
call_me	| 6177
thank_you | 4472
contact_me | 3995
post_message | 3790
miss_UNIVERSE | 3742
do_what | 3584
send_mail | 3490
start_occurrence | 3152
do_it | 2816
give_call | 2642
need_help | 2541
unsubscribe_mailto | 2468

In [16]:
inspect_action_object_pair('send_e', 1,print_extracted_text=False)

I might have sent the e - mail to the wrong person because you mentioned before that we should have a meeting with [PERSON]


## Entry Removal

After inspecting the most-frequently occurring action-object-pairs, the following ones will not be considered further:

| Action-Object-Pair    | Reason |
|-------------|-------------|
|fail_Message|belongs to error message and thus does not contain a humanly intent|
|start_Failures|belongs to error message and thus does not contain a humanly intent|
|start_Occurrences|belongs to error message and thus does not contain a humanly intent|
|post_message|belongs to an automated message|
|unsubscribe_mailto|belongs to an automated message|
|miss_UNIVERSE|belongs to an automated message|
|start_occurrence|belongs to an automated message|

also the extremeprogramming unsubscribe message is being filtered from the original dataframe, as well as java messages

In [39]:
entries_to_remove = [
    'fail_Message',
    'start_Failures',
    'start_Occurrences',
    'post_message',
    'unsubscribe_mailto',
    'miss_UNIVERSE',
    'start_occurrence',
    'set_sender'
]

# Remove these entries
df_filtered = df_filtered[~df_filtered['action_object_pairs'].apply(lambda x: any(item in entries_to_remove for item in x))]

In [None]:
df_filtered.shape

In [41]:
test_string = "Unexpected flow reached by the server"

df_filtered = df_filtered[~df_filtered['extracted_text'].apply(lambda x: test_string in str(x))]


In [None]:
df_filtered.shape

In [13]:
# Save the cleaned model

#df_filtered.to_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned.parquet')

In [17]:
df_filtered = pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned.parquet', engine='fastparquet')

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1048d1df0>>
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ma_exp_intent/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


: 

: 

In [4]:
# Apply the function to the 'target' column
df_filtered['target'] = df_filtered['target'].progress_apply(replace_entities)

100%|██████████| 686570/686570 [1:25:37<00:00, 133.64it/s]  


In [7]:
df_filtered.to_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned_entities.parquet')

## Group Action-Object-Pairs

#### Functions

In [3]:
def is_aggregated(entry):
    """
    Check if the entry is an aggregated row (i.e., a list of keywords).
    
    Parameters:
    - entry: The action-object pair to check.
    
    Returns:
    - True if the entry is aggregated, otherwise False.
    """
    return isinstance(entry, str) and ', ' in entry



def aggregate_by_keywords(df, keywords):
    """
    Aggregate action-object pairs based on a coherent list of keywords,
    excluding already aggregated entries from being re-aggregated.
    
    Parameters:
    - df: The DataFrame to aggregate.
    - keywords: A list of strings representing a coherent group of keywords.
    
    Returns:
    - A DataFrame with the newly aggregated action-object pairs.
    """
    # Create a single pattern to match any of the keywords
    pattern = '|'.join(keywords)
    
    # Filter entries that match the keywords but are not already aggregated
    matching_entries = df[
        df['action_object_pairs'].str.contains(pattern, case=False, na=False) &
        ~df['action_object_pairs'].apply(is_aggregated)
    ]
    
    # Create a new DataFrame for aggregated results
    aggregated_rows = []
    
    # Only create a new entry for the matched pairs if there are any
    if not matching_entries.empty:
        aggregated_row = {
            'action_object_pairs': ', '.join(matching_entries['action_object_pairs'].tolist()),
            'count': matching_entries['count'].sum()
        }
        aggregated_rows.append(aggregated_row)

        # Remove the matching entries from the original DataFrame
        df = df.drop(matching_entries.index)

    # Convert aggregated rows to DataFrame if any
    aggregated_df = pd.DataFrame(aggregated_rows)

    # Concatenate remaining entries with the aggregated DataFrame
    final_df = pd.concat([df, aggregated_df], ignore_index=True)

    return final_df.sort_values(by='count', ascending=False).reset_index(drop=True)

def remove_entries_containing(aggregated_df, original_df, substring):
    original_df = get_distinct_words_df(original_df)
    # Step 1: Identify the aggregated row index
    aggregated_row_index = aggregated_df[aggregated_df['action_object_pairs'].str.contains(',', case=False)].index
    if not aggregated_row_index.empty:
        aggregated_row_index = aggregated_row_index[0]
        
        # Step 2: Get the current list and split into individual entries
        current_list = aggregated_df.at[aggregated_row_index, 'action_object_pairs'].split(', ')
        
        # Step 3: Identify entries to remove
        entries_to_remove = [entry for entry in current_list if substring in entry]
        
        # Step 4: Create new rows for each removed entry using original counts
        new_rows = []
        for entry in entries_to_remove:
            # Get the count of the removed entry from the original DataFrame
            count_of_entry = original_df.loc[original_df['action_object_pairs'] == entry, 'count']
            if not count_of_entry.empty:
                new_rows.append({'action_object_pairs': entry, 'count': count_of_entry.values[0]})
            else:
                print(f"Warning: Could not find count for entry '{entry}'")

        # Step 5: Remove the entries from the current list
        updated_list = [entry for entry in current_list if entry not in entries_to_remove]

        # Step 6: Update the aggregated row
        if new_rows:
            aggregated_df.at[aggregated_row_index, 'action_object_pairs'] = ', '.join(updated_list)
            updated_count = aggregated_df.at[aggregated_row_index, 'count'] - sum(row['count'] for row in new_rows)
            aggregated_df.at[aggregated_row_index, 'count'] = updated_count

            # Step 7: Convert new_rows to DataFrame
            new_rows_df = pd.DataFrame(new_rows)

            # Step 8: Concatenate new rows for removed entries to the aggregated DataFrame
            aggregated_df = pd.concat([aggregated_df, new_rows_df], ignore_index=True)

            # Step 9: Sort the DataFrame by count in descending order
            aggregated_df = aggregated_df.sort_values(by='count', ascending=False).reset_index(drop=True)
    
    return aggregated_df


def aggregate_specific_indices(df, index_pairs):
    """
    Aggregate entries in the DataFrame based on specific index pairs.

    Parameters:
    - df: The DataFrame to aggregate.
    - index_pairs: A list of tuples, where each tuple contains indices of rows to aggregate.

    Returns:
    - A DataFrame with aggregated action-object pairs and their counts.
    """
    # Create a list to hold new aggregated rows
    aggregated_rows = []

    # Track which indices have been aggregated to avoid duplicates
    aggregated_indices = set()

    for indices in index_pairs:
        # Check if all indices are valid and not already aggregated
        if all(idx < len(df) for idx in indices) and not any(idx in aggregated_indices for idx in indices):
            # Select the rows to aggregate
            group = df.iloc[list(indices)]
            # Aggregate the action_object_pairs and the counts
            aggregated_row = {
                'action_object_pairs': ', '.join(group['action_object_pairs']),
                'count': group['count'].sum()
            }
            aggregated_rows.append(aggregated_row)
            # Mark these indices as aggregated
            aggregated_indices.update(indices)

    # Create a new DataFrame from aggregated rows
    aggregated_df = pd.DataFrame(aggregated_rows)

    # Add remaining unaggregated rows to the aggregated DataFrame
    unaggregated_rows = df[~df.index.isin(aggregated_indices)]
    aggregated_df = pd.concat([aggregated_df, unaggregated_rows], ignore_index=True)

    return aggregated_df.sort_values(by='count', ascending=False).reset_index(drop=True)

#### Implementation

In [4]:
# Read filtered DF
df_filtered = pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned_entities.parquet', engine='fastparquet')

In [7]:
df_grouped = get_distinct_words_df(df_filtered)
df_grouped.head(20)

Unnamed: 0,action_object_pairs,count
172,have_questions,11024
296,send_email,8182
60,send_it,6188
389,call_me,6177
61,send_message,4856
99,thank_you,4433
229,contact_me,3994
404,do_what,3567
668,send_mail,3490
148,do_it,2806


In [33]:
df_grouped = aggregate_by_keywords(df_grouped, ['attend', ])


In [35]:
df_grouped = aggregate_specific_indices(df_grouped, [(5,8)])
df_grouped

Unnamed: 0,action_object_pairs,count
0,"send_email, send_it, send_message, send_mail, ...",68255
1,"call_me, give_call, call_you, have_call, set_c...",37338
2,"have_questions, need_help, have_question, need...",29876
3,"do_what, do_it, do_this, do_that, do_What, do_...",28753
4,"have_meeting, set_meeting, schedule_meeting, a...",20347
...,...,...
196876,host_presentations,1
196877,drop_thread,1
196878,assess_overhaul,1
196879,make_margin,1


In [199]:
df_grouped = remove_entries_containing(df_grouped, df_filtered, 'send_email')
df_grouped.head(10)

Unnamed: 0,action_object_pairs,count
0,"send_it, send_message, send_mail, send_e, send...",59541
1,have_questions,11024
2,"have_meeting, set_meeting, schedule_meeting, a...",10989
3,send_email,8182
4,call_me,6177
5,thank_you,4433
6,contact_me,3994
7,do_what,3567
8,do_it,2806
9,give_call,2642


#### Iteratively apply the grouping-function to derive at a taxonomy and retrieve their action-object-pairs

In [38]:
df_grouped.head(10)

Unnamed: 0,action_object_pairs,count
0,"send_email, send_it, send_message, send_mail, ...",68255
1,"call_me, give_call, call_you, have_call, set_c...",37338
2,"have_questions, need_help, have_question, need...",29876
3,"do_what, do_it, do_this, do_that, do_What, do_...",28753
4,"have_meeting, set_meeting, schedule_meeting, a...",20347
5,"provide_information, contain_information, need...",17776
6,"make_changes, change_address, make_change, tes...",14091
7,"thank_you, have_Thanks, thank_You, thank_every...",10457
8,tell_me,2027
9,notify_sender,1923


#### Final Taxonomy

| Action-Object-Pairs    | Overarching Intent |
|-----------------------|-------------|
|send_  | Send something or request somebody to send something|
| call_ | Contacting somebody or request somebody to contact them |
| have_questions, need_help | request help or provide help |
| do_ | unspecified doing of something |
| meeting | planning of meetings |
| information | requesting or providing information |
| change | making changes or request for changes |
| thank_you | thank you messages |
| ... | rest of the Dataframe |

### Create Dataframe Subsets for overarching intents

In [49]:
def split_dataframes(df, action_object_pairs):
    mask = df['action_object_pairs'].progress_apply(lambda pairs: any(pair in action_object_pairs for pair in pairs))

    # Extract the rows that match the filter criteria into a new DataFrame
    df_intent = df[mask].copy()

    # Remove the matching rows from the original DataFrame
    df_remaining = df[~mask].copy()
    
    return df_intent, df_remaining


In [42]:
df_start = df_filtered.copy()

##### Send DF

In [40]:
pairs_send = df_grouped.iloc[0]['action_object_pairs']
pairs_send



In [50]:
df_send, df_remaining = split_dataframes(df_start,pairs_send)
print(f'Intent DF shape: {df_send.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 686570/686570 [00:23<00:00, 29597.29it/s]


Intent DF shape: (54316, 19); Remaining DF shape: (632254, 19)


##### Call DF

In [51]:
pairs_call = df_grouped.iloc[1]['action_object_pairs']
pairs_call

"call_me, give_call, call_you, have_call, set_call, call_him, call_it, call_us, make_calls, make_call, call_number, get_call, call_them, call_what, call_her, schedule_call, receive_call, call_Fazio, arrange_call, call_method, receive_calls, do_call, call_phone, call_toll, return_call, get_calls, handle_calls, expect_call, have_calls, call_that, call_office, place_call, take_call, call_cell, call_this, join_call, call_section, call_mail, follow_call, call_which, call_function, call_Center, call_reps, put_call, miss_call, attend_call, call_support, return_calls, call_methods, take_calls, answer_call, activate_call, call_What, call_release, setup_call, hold_call, call_line, call_Support, call_system, call_company, call_code, set_calls, call_Jose, schedule_calls, coordinate_call, initiate_call, call_Stewart, call_whatever, call_one, call_servlet, call_person, call_guy, call_people, place_calls, add_call, reschedule_call, confirm_call, call_Managers, call_u, call_Code, call_mobile, call_Pas

In [None]:
df_call, df_remaining = split_dataframes(df_remaining,pairs_call)
print(f'Intent DF shape: {df_call.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 632254/632254 [00:19<00:00, 32695.74it/s]


Intent DF shape: (54316, 19); Remaining DF shape: (596807, 19)


##### Help DF

In [53]:
pairs_help = df_grouped.iloc[2]['action_object_pairs']
pairs_help

'have_questions, need_help, have_question, need_HELP, have_QUESTIONS, need_Help, have_Question, have_Questions, need_helps, have_questionnaire, need_helpone, have_problems, help_you, help_me, help_us, appreciate_help, get_help, help_them, help_yourself, help_AvocadoIT, offer_help, help_others, help_him, help_lot, use_help, help_those, help_executives, want_help, provide_help, help_companies, have_help, help_that, contact_Helpdesk, help_yourselves, help_customers, help_her, take_help, help_progress, check_Help, help_business, help_team, help_company, help_other, help_Schramm, give_help, help_Advertising, help_Desk, help_students, help_people, receive_help, visit_Help, help_services, help_deal, ask_help, help_self, like_help, help_QA, help_redevelopment, help_much, help_users, help_process, help_Accenture, request_help, help_guys, help_customer, help_IDC, help_Tel, help_it, find_help, require_help, help_Regards, help_organization, help_which, enlist_help, help_law, help_Jose, get_Help, h

In [None]:
df_help, df_remaining = split_dataframes(df_remaining,pairs_help)
print(f'Intent DF shape: {df_help.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 596807/596807 [00:09<00:00, 61461.35it/s]


Intent DF shape: (54316, 19); Remaining DF shape: (572415, 19)


##### Do something DF

In [55]:
pairs_do = df_grouped.iloc[3]['action_object_pairs']
pairs_do



In [None]:
df_do, df_remaining = split_dataframes(df_remaining,pairs_do)
print(f'Intent DF shape: {df_do.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 572415/572415 [00:11<00:00, 48762.41it/s]


Intent DF shape: (54316, 19); Remaining DF shape: (548564, 19)


##### Meeting DF

In [57]:
pairs_meeting = df_grouped.iloc[4]['action_object_pairs']
pairs_meeting

'have_meeting, set_meeting, schedule_meeting, attend_meeting, have_meetings, arrange_meeting, join_meeting, schedule_meetings, attend_meetings, have_appointment, set_meetings, hold_meeting, get_meeting, reschedule_meeting, call_meeting, follow_meeting, make_meeting, setup_meeting, set_appointment, confirm_meeting, move_meeting, do_meeting, attend_Meeting, coordinate_meeting, announce_appointment, cancel_meeting, start_meeting, setup_meetings, schedule_appointment, miss_meeting, hold_Meeting, regard_meeting, hold_meetings, need_meeting, join_MEETING, enjoy_meeting, get_appointment, leave_meeting, create_meeting, make_appointment, arrange_meetings, host_meeting, request_meeting, facilitate_meeting, plan_meeting, get_meetings, make_appointments, complete_meetings, disrupt_meeting, move_meetings, discuss_meeting, conduct_meetings, propose_meeting, organize_meeting, start_MEETING, postpone_meeting, coordinate_meetings, cancel_appointment, record_meetings, have_appointments, start_meetings, 

In [None]:
df_meeting, df_remaining = split_dataframes(df_remaining,pairs_meeting)
print(f'Intent DF shape: {df_meeting.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 548564/548564 [00:11<00:00, 47522.00it/s]


Intent DF shape: (54316, 19); Remaining DF shape: (521583, 19)


##### Information DF

In [60]:
pairs_information = df_grouped.iloc[5]['action_object_pairs']
pairs_information

'provide_information, contain_information, need_information, have_information, get_information, receive_information, give_information, access_information, update_information, pass_information, include_information, verify_information, request_information, use_information, find_information, review_information, share_information, email_information, like_information, want_information, gather_information, add_information, require_information, deliver_information, collect_information, forward_information, obtain_information, capture_information, keep_information, submit_information, present_information, select_Information, take_information, enter_information, change_information, see_information, read_information, disclose_INFORMATION, note_information, support_information, communicate_information, know_information, publish_information, distribute_information, fill_information, put_information, store_information, retrieve_information, pull_information, manage_information, zap_information, ext

In [None]:
df_information, df_remaining = split_dataframes(df_remaining,pairs_information)
print(f'Intent DF shape: {df_information.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 521583/521583 [00:04<00:00, 104392.32it/s]


Intent DF shape: (54316, 19); Remaining DF shape: (503645, 19)


##### Change DF

In [62]:
pairs_change = df_grouped.iloc[6]['action_object_pairs']
pairs_change



In [None]:
df_change, df_remaining = split_dataframes(df_remaining,pairs_change)
print(f'Intent DF shape: {df_change.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 503645/503645 [00:11<00:00, 44552.05it/s]


Intent DF shape: (54316, 19); Remaining DF shape: (490806, 19)


##### Thank You DF

In [64]:
pairs_thanks = df_grouped.iloc[7]['action_object_pairs']
pairs_thanks

'thank_you, have_Thanks, thank_You, thank_everyone, give_Thanks, provide_Thanks, use_Thanks, know_Thanks, advise_Thanks, need_Thanks, thank_all, extend_thanks, thank_Jose, thank_YOU, tell_Thanks, help_Thanks, thank_Team, ask_Thanks, find_Thanks, thank_Manager, thank_each, do_Thanks, thank_Miyuki, thank_team, take_Thanks, thank_Kathy, thank_Trish, thank_Mohindra, make_Thanks, confirm_Thanks, get_Thanks, put_Thanks, want_Thanks, thank_jaime, check_Thanks, have_thanks, add_Thanks, contact_Thanks, include_Thanks, think_Thanks, thank_Regards, give_thanks, discuss_Thanks, update_Thanks, say_thanks, see_Thanks, copy_Thanks, thank_Betty, try_Thanks, change_Thanks, get_thanks, set_Thanks, like_Thanks, thank_Street, require_Thanks, say_Thanks, thank_Baxish, thank_success, thank_many, thank_Prakash, ask_thanks, test_Thanks, thank_Director, select_Thanks, attend_Thanks, show_Thanks, let_Thanks, create_Thanks, use_thanks, thank_them, email_Thanks, direct_Thanks, thank_letters, review_Thanks, thank_

In [65]:
df_thanks, df_remaining = split_dataframes(df_remaining,pairs_thanks)
print(f'Intent DF shape: {df_thanks.shape}; Remaining DF shape: {df_remaining.shape}')

100%|██████████| 490806/490806 [00:04<00:00, 110866.48it/s]


Intent DF shape: (7106, 19); Remaining DF shape: (483700, 19)


##### Save DF Subsets

In [66]:
df_send.to_parquet('../../data/processed/intents/send.parquet')
df_call.to_parquet('../../data/processed/intents/call.parquet')
df_help.to_parquet('../../data/processed/intents/help.parquet')
df_do.to_parquet('../../data/processed/intents/do.parquet')
df_meeting.to_parquet('../../data/processed/intents/meeting.parquet')
df_information.to_parquet('../../data/processed/intents/info.parquet')
df_change.to_parquet('../../data/processed/intents/change.parquet')
df_thanks.to_parquet('../../data/processed/intents/thanks.parquet')
df_remaining.to_parquet('../../data/processed/intents/remaining.parquet')