## Filter Action-Object Pairs

In [2]:
import pandas as pd
from collections import Counter
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm
from IPython.display import display, HTML
tqdm.pandas()


# Load the English model
nlp = spacy.load("en_core_web_lg")

In [12]:
# Function to replace PERSON and GPE entities
def replace_entities(text):
    """
    Mask person and place names in ``text`` with the placeholder ``[ENTITY]``.

    The text is processed with the notebook-level spaCy pipeline ``nlp``. Every
    token whose entity type is ``PERSON`` or ``GPE`` is replaced by the literal
    ``[ENTITY]``; all other tokens keep their text.

    Parameters:
    - text: The string to anonymise.

    Returns:
    - The tokens joined by single spaces. Original whitespace is therefore not
      preserved, and each token of a multi-token entity yields its own
      placeholder.
    """
    masked_types = ("PERSON", "GPE")
    return " ".join(
        "[ENTITY]" if token.ent_type_ in masked_types else token.text
        for token in nlp(text)
    )


In [2]:
# Load the exploded training targets and expose the 'targets' column as 'target'
df = (
    pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded.parquet', engine='fastparquet')
    .rename(columns={'targets': 'target'})
)

In [None]:
df.shape

In [4]:
# Keep only rows that carry at least one action-object pair
# (drops entries that are None or an empty list)
df_filtered = df[
    df['action_object_pairs'].apply(
        lambda pairs: pairs is not None and not (isinstance(pairs, list) and len(pairs) == 0)
    )
]

In [None]:
df_filtered.shape

### Inspect Action-Object-Pairs results

In [6]:
def get_distinct_words_df(df):
    """
    Count how often each action-object pair occurs across all rows of ``df``.

    Parameters:
    - df: DataFrame whose 'action_object_pairs' column holds an iterable of
      pairs per row.

    Returns:
    - A DataFrame with the columns 'action_object_pairs' and 'count', one row
      per distinct pair, sorted by 'count' in descending order. The index is
      not reset after sorting.
    """
    # Flatten the per-row collections and tally every pair in a single pass
    pair_counts = Counter(
        pair
        for pairs_in_row in df['action_object_pairs']
        for pair in pairs_in_row
    )

    counts_df = pd.DataFrame(pair_counts.items(), columns=['action_object_pairs', 'count'])
    return counts_df.sort_values(by='count', ascending=False)

In [14]:
get_distinct_words_df(df_filtered).head(20)

Unnamed: 0,action_object_pairs,count
172,have_questions,11024
296,send_email,8182
60,send_it,6188
389,call_me,6177
61,send_message,4856
99,thank_you,4433
229,contact_me,3994
404,do_what,3567
668,send_mail,3490
148,do_it,2806


### Inspect Messages


| Action-Object-Pair | Count |
|-------------|-------------|
| start_Server | 12831 |
| send_it | 12052 |
| have_questions | 11024 |
| send_message | 10704 |
| send_email | 8186 |
| fail_Message | 8065 |
| start_Failures | 6983 |
| start_Occurrences | 6428 |
| call_me | 6177 |
| thank_you | 4472 |
| contact_me | 3995 |
| post_message | 3790 |
| miss_UNIVERSE | 3742 |
| do_what | 3584 |
| send_mail | 3490 |
| start_occurrence | 3152 |
| do_it | 2816 |
| give_call | 2642 |
| need_help | 2541 |
| unsubscribe_mailto | 2468 |

In [15]:
# Define a function to inspect entries for a given action-object pair
def inspect_action_object_pair(pair_to_inspect, entry, print_extracted_text=True, print_target=True, df=None):
    """
    Print one example row that contains a given action-object pair.

    Parameters:
    - pair_to_inspect: The pair to look for. A row matches when this value is
      an element of its 'action_object_pairs' entry (exact membership, not a
      substring search).
    - entry: Zero-based position of the row to show among the matching rows.
    - print_extracted_text: If True, print the row's 'extracted_text'.
    - print_target: If True, print the row's 'target'. When both flags are
      True, a separator line is printed between the two texts.
    - df: DataFrame to search. Defaults to the notebook-level ``df_filtered``.

    Raises:
    - IndexError: If fewer than ``entry + 1`` rows contain the pair.
    """
    if df is None:
        df = df_filtered

    matches = df[df['action_object_pairs'].apply(lambda pairs: pair_to_inspect in pairs)]
    row = matches.iloc[entry]

    if print_extracted_text:
        print(row['extracted_text'])
    if print_target:
        if print_extracted_text:
            print('----------------------')
        # The target is printed only when requested; previously this line ran
        # unconditionally and ignored print_target=False.
        print(row['target'])

In [16]:
inspect_action_object_pair('send_e', 1,print_extracted_text=False)

I might have sent the e - mail to the wrong person because you mentioned before that we should have a meeting with [PERSON]


## Entry Removal

After inspecting the most-frequently occurring action-object-pairs, the following ones will not be considered further:

| Action-Object-Pair    | Reason |
|-------------|-------------|
|fail_Message|belongs to an error message and thus does not contain a human intent|
|start_Failures|belongs to an error message and thus does not contain a human intent|
|start_Occurrences|belongs to an error message and thus does not contain a human intent|
|post_message|belongs to an automated message|
|unsubscribe_mailto|belongs to an automated message|
|miss_UNIVERSE|belongs to an automated message|
|start_occurrence|belongs to an automated message|

In addition, the extremeprogramming unsubscribe message as well as Java messages are filtered out of the original DataFrame.

In [39]:
# Action-object pairs whose rows are excluded from further analysis
entries_to_remove = [
    'fail_Message',
    'start_Failures',
    'start_Occurrences',
    'post_message',
    'unsubscribe_mailto',
    'miss_UNIVERSE',
    'start_occurrence',
    'set_sender',  # NOTE(review): not listed in the removal table above - document the reason
]

# Keep a row only if none of its pairs is on the removal list
df_filtered = df_filtered[
    df_filtered['action_object_pairs'].apply(
        lambda pairs: all(pair not in entries_to_remove for pair in pairs)
    )
]

In [None]:
df_filtered.shape

In [41]:
# Drop every row whose extracted text contains the marker string below
test_string = "Unexpected flow reached by the server"

df_filtered = df_filtered[
    df_filtered['extracted_text'].apply(lambda text: test_string not in str(text))
]


In [None]:
df_filtered.shape

In [13]:
# Save the cleaned DataFrame

#df_filtered.to_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned.parquet')

In [17]:
df_filtered = pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned.parquet', engine='fastparquet')

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1048d1df0>>
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ma_exp_intent/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


: 

: 

In [4]:
# Replace PERSON/GPE tokens in every 'target' text with "[ENTITY]" (see replace_entities).
# NOTE: this is slow - the recorded run below processed 686,570 rows in about 1 h 25 min.
df_filtered['target'] = df_filtered['target'].progress_apply(replace_entities)

100%|██████████| 686570/686570 [1:25:37<00:00, 133.64it/s]  


In [7]:
df_filtered.to_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned_entities.parquet')

## Group Action-Object-Pairs

#### Functions

In [195]:



def is_aggregated(entry):
    """
    Tell whether ``entry`` is an aggregated label, i.e. pairs joined by ', '.

    Parameters:
    - entry: The action-object pair (or aggregated label) to check.

    Returns:
    - True if ``entry`` is a string containing ', ', otherwise False.
    """
    # Anything that is not a string can never be an aggregated label
    if not isinstance(entry, str):
        return False
    return entry.find(', ') != -1



def aggregate_by_keywords(df, keywords):
    """
    Aggregate action-object pairs based on a coherent list of keywords,
    excluding already aggregated entries from being re-aggregated.

    All not-yet-aggregated rows whose 'action_object_pairs' label matches any
    keyword are merged into one row: the labels are joined with ', ' and the
    counts are summed.

    Parameters:
    - df: The DataFrame to aggregate (columns 'action_object_pairs', 'count').
    - keywords: A list of strings representing a coherent group of keywords.
      A single string is treated as a one-element list. The keywords are
      joined with '|' and matched case-insensitively as a regular expression,
      so regex metacharacters in a keyword keep their special meaning.
      Empty keywords are ignored.

    Returns:
    - A DataFrame with the newly aggregated action-object pairs, sorted by
      'count' in descending order with a fresh index. If nothing matches, the
      rows of ``df`` are returned unchanged apart from sorting and re-indexing.
    """
    # A bare string would otherwise be joined character by character ('s|e|n|d|_')
    if isinstance(keywords, str):
        keywords = [keywords]
    # An empty keyword yields an empty alternative, which matches every row
    keywords = [keyword for keyword in keywords if keyword]

    if keywords:
        # Create a single pattern to match any of the keywords
        pattern = '|'.join(keywords)
        labels = df['action_object_pairs']
        # Rows that match the keywords but are not already aggregated
        is_candidate = (
            labels.str.contains(pattern, case=False, na=False) &
            ~labels.apply(is_aggregated)
        )
        matching_entries = df[is_candidate]
    else:
        is_candidate = None
        matching_entries = df.iloc[0:0]

    # Nothing to merge: avoid concatenating with an empty frame
    if matching_entries.empty:
        return df.sort_values(by='count', ascending=False).reset_index(drop=True)

    aggregated_df = pd.DataFrame([{
        'action_object_pairs': ', '.join(matching_entries['action_object_pairs'].tolist()),
        'count': matching_entries['count'].sum()
    }])

    # Select the remaining rows with the mask rather than by index label, so
    # that repeated index labels cannot remove unrelated rows
    remaining = df[~is_candidate]

    final_df = pd.concat([remaining, aggregated_df], ignore_index=True)
    return final_df.sort_values(by='count', ascending=False).reset_index(drop=True)

def remove_entries_containing(aggregated_df, original_df, substring):
    """
    Split entries containing ``substring`` back out of the first aggregated row.

    Only the first row whose label contains a comma is examined. Every entry of
    that row that contains ``substring`` and has a known count becomes a row of
    its own again, and the aggregated row's label and count are reduced
    accordingly.

    Parameters:
    - aggregated_df: Grouped DataFrame with columns 'action_object_pairs' and
      'count'. It is not modified; a new frame is returned when rows change.
    - original_df: Un-grouped DataFrame with a list-valued 'action_object_pairs'
      column; the individual counts are recomputed from it with
      ``get_distinct_words_df``.
    - substring: Entries containing this text are split out.

    Returns:
    - ``aggregated_df`` itself if there is no aggregated row or no entry could
      be split out; otherwise a new DataFrame sorted by 'count' in descending
      order with a fresh index.
    """
    # Step 1: Identify the first aggregated row (NaN / non-string labels never match)
    is_aggregate_row = aggregated_df['action_object_pairs'].str.contains(',', regex=False, na=False)
    aggregated_row_index = aggregated_df[is_aggregate_row].index
    if aggregated_row_index.empty:
        return aggregated_df
    aggregated_row_index = aggregated_row_index[0]

    # Step 2: Get the current list and split into individual entries
    current_list = aggregated_df.at[aggregated_row_index, 'action_object_pairs'].split(', ')

    # Step 3: Identify entries to remove
    entries_to_remove = [entry for entry in current_list if substring in entry]
    if not entries_to_remove:
        return aggregated_df

    # Step 4: Look up the original counts - only now, because counting all
    # pairs of the original frame is expensive
    original_counts = get_distinct_words_df(original_df)
    count_by_entry = dict(zip(original_counts['action_object_pairs'], original_counts['count']))

    new_rows = []
    for entry in entries_to_remove:
        if entry in count_by_entry:
            new_rows.append({'action_object_pairs': entry, 'count': count_by_entry[entry]})
        else:
            print(f"Warning: Could not find count for entry '{entry}'")
    if not new_rows:
        return aggregated_df

    # Step 5: Remove only entries that get a row of their own; entries without
    # a known count stay in the aggregated label instead of vanishing
    split_out = {row['action_object_pairs'] for row in new_rows}
    updated_list = [entry for entry in current_list if entry not in split_out]

    # Step 6: Update the aggregated row on a copy so the caller's frame is untouched
    result = aggregated_df.copy()
    result.at[aggregated_row_index, 'action_object_pairs'] = ', '.join(updated_list)
    updated_count = result.at[aggregated_row_index, 'count'] - sum(row['count'] for row in new_rows)
    result.at[aggregated_row_index, 'count'] = updated_count

    # Step 7: Append the split-out entries and sort by count in descending order
    result = pd.concat([result, pd.DataFrame(new_rows)], ignore_index=True)
    return result.sort_values(by='count', ascending=False).reset_index(drop=True)


def aggregate_specific_indices(df, index_pairs):
    """
    Aggregate entries in the DataFrame based on specific row positions.

    Parameters:
    - df: The DataFrame to aggregate (columns 'action_object_pairs', 'count').
    - index_pairs: A list of tuples, where each tuple contains the zero-based
      row positions (as used by ``df.iloc``) of the rows to merge. A tuple is
      skipped if it is empty, if any position is outside ``0 .. len(df) - 1``,
      or if any position was already merged by an earlier tuple. Positions
      repeated within a tuple are used once.

    Returns:
    - A DataFrame with aggregated action-object pairs and their counts, sorted
      by 'count' in descending order with a fresh index.
    """
    n_rows = len(df)

    # Create a list to hold new aggregated rows
    aggregated_rows = []

    # Track which row positions have been aggregated to avoid duplicates
    aggregated_positions = set()

    for indices in index_pairs:
        # Drop repeated positions but keep the given order
        positions = list(dict.fromkeys(indices))
        if not positions:
            continue
        # Negative positions are rejected as well: iloc would accept them, but
        # they could not be matched against the bookkeeping set below
        if not all(0 <= pos < n_rows for pos in positions):
            continue
        if any(pos in aggregated_positions for pos in positions):
            continue

        group = df.iloc[positions]
        aggregated_rows.append({
            'action_object_pairs': ', '.join(group['action_object_pairs']),
            'count': group['count'].sum()
        })
        aggregated_positions.update(positions)

    # Nothing was merged: avoid concatenating with an empty frame
    if not aggregated_rows:
        return df.sort_values(by='count', ascending=False).reset_index(drop=True)

    aggregated_df = pd.DataFrame(aggregated_rows)

    # Select the remaining rows by POSITION, matching the iloc selection above.
    # Filtering by index label is only correct for a default RangeIndex.
    keep_positions = [pos for pos in range(n_rows) if pos not in aggregated_positions]
    unaggregated_rows = df.iloc[keep_positions]
    aggregated_df = pd.concat([aggregated_df, unaggregated_rows], ignore_index=True)

    return aggregated_df.sort_values(by='count', ascending=False).reset_index(drop=True)

#### Implementation

In [61]:
# Read filtered DF
df_filtered = pd.read_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned_entities.parquet', engine='fastparquet')

In [190]:
df_grouped = get_distinct_words_df(df_filtered)
df_grouped.head(20)

Unnamed: 0,action_object_pairs,count
172,have_questions,11024
296,send_email,8182
60,send_it,6188
389,call_me,6177
61,send_message,4856
99,thank_you,4433
229,contact_me,3994
404,do_what,3567
668,send_mail,3490
148,do_it,2806


In [194]:
df_grouped = aggregate_by_keywords(df_grouped, ['send_', ])
df_grouped.head(20)

Unnamed: 0,action_object_pairs,count
0,"send_it, send_message, send_mail, send_e, send...",59541
1,have_questions,11024
2,"have_meeting, set_meeting, schedule_meeting, a...",10989
3,"send_email, send_emails, resend_email, resend_...",8714
4,call_me,6177
5,thank_you,4433
6,contact_me,3994
7,do_what,3567
8,do_it,2806
9,give_call,2642


In [198]:
df_grouped = aggregate_specific_indices(df_grouped, [(0,3)])
df_grouped

Unnamed: 0,action_object_pairs,count
0,"send_it, send_message, send_mail, send_e, send...",68255
1,have_questions,11024
2,"have_meeting, set_meeting, schedule_meeting, a...",10989
3,call_me,6177
4,thank_you,4433
...,...,...
207865,change_speaker,1
207866,investigate_framework,1
207867,forward_set,1
207868,enable_functionalities,1


In [175]:
df_grouped.tail()

Unnamed: 0,action_object_pairs,count
207864,touch_winter,1
207865,expose_infrastructure,1
207866,develop_alliances,1
207867,synche_times,1
207868,talk_AvocadoIT,1


In [199]:
df_grouped = remove_entries_containing(df_grouped, df_filtered, 'send_email')
df_grouped.head(10)

Unnamed: 0,action_object_pairs,count
0,"send_it, send_message, send_mail, send_e, send...",59541
1,have_questions,11024
2,"have_meeting, set_meeting, schedule_meeting, a...",10989
3,send_email,8182
4,call_me,6177
5,thank_you,4433
6,contact_me,3994
7,do_what,3567
8,do_it,2806
9,give_call,2642


#### Iteratively apply the grouping function to arrive at a taxonomy and retrieve its action-object pairs

In [None]:
# `group_action_object_pairs` is not defined anywhere in this notebook, so the
# original call raises NameError on a fresh kernel.
# NOTE(review): presumably an earlier name of aggregate_by_keywords - confirm.
df_grouped = aggregate_by_keywords(df_grouped, ['send_'])

### Group Pairs

In [9]:
df_start = df_filtered.copy()

In [33]:
df_filtered.shape

(686570, 19)

In [31]:
# Define the function to split the DataFrame based on a list of target substrings
def split_dataframe(df, target_substrings):
    """
    Splits the DataFrame into two: one with rows where any target substring is present in 'action_object_pairs',
    and another with the remaining rows.

    A row matches when its 'action_object_pairs' value is a list and at least
    one of its items contains at least one of the target substrings. Rows whose
    value is not a list (e.g. None) always end up in the remaining frame.

    Parameters:
    df (pd.DataFrame): The original DataFrame to filter.
    target_substrings (list): A list of substrings to search for in 'action_object_pairs'.
        A single string is treated as a one-element list.

    Returns:
    tuple: Two DataFrames - (df_send, df_remaining)
    """
    # A bare string would otherwise be searched character by character
    if isinstance(target_substrings, str):
        target_substrings = [target_substrings]

    def _has_target(pairs):
        """Return True if `pairs` is a list with an item containing any target substring."""
        return isinstance(pairs, list) and any(
            substring in item for item in pairs for substring in target_substrings
        )

    # Evaluate the row-wise check once and reuse it for both halves of the split
    is_match = df['action_object_pairs'].apply(_has_target).astype(bool)

    df_send = df[is_match]
    df_remaining = df[~is_match]

    return df_send, df_remaining

In [24]:
df_send, df_filtered = split_dataframe(df_filtered, target_substrings=['send_'])

In [25]:
print(df_send.shape)

(54000, 19)


In [26]:
df_filtered.shape

(632570, 19)

In [30]:
get_distinct_words_df(df_filtered).head(20)

Unnamed: 0,action_object_pairs,count
157,have_questions,10484
367,call_me,6007
85,thank_you,4028
213,contact_me,3925
382,do_what,3445
134,do_it,2723
963,give_call,2500
807,need_help,2452
57,join_us,2437
494,call_you,2341


In [10]:
df_send.to_parquet('../../data/processed/intents/send.parquet')

In [None]:
print(df_send.iloc[10]['target'])

#### obsolete code

In [15]:
df_distinct_words = get_distinct_words_df(df_filtered)

In [None]:
df_distinct_words.head(60)

In [15]:
# Define function to aggregate rows for a given word. If an action-object-pair contains the given word, entries and their occurrence-count will be aggregated.

def aggregate_rows(df, word_to_group):
    """
    Collapse all rows whose pair matches ``word_to_group`` into a single row.

    Parameters:
    - df: DataFrame with the columns 'action_object_pairs' and 'count'.
    - word_to_group: Pattern passed to ``str.contains`` (case-insensitive);
      rows whose 'action_object_pairs' value matches are grouped.

    Returns:
    - A DataFrame in which the matching rows are replaced by one row whose
      'action_object_pairs' cell is the list of the grouped labels and whose
      'count' is their sum, sorted by 'count' in descending order with a
      fresh index.
    """
    # One mask decides which rows are grouped and which stay as they are
    matches_word = df['action_object_pairs'].str.contains(word_to_group, case=False, na=False)
    grouped = df[matches_word]
    untouched = df[~matches_word]

    # Single replacement row: list of grouped labels plus their summed count
    merged_row = pd.DataFrame({
        'action_object_pairs': [grouped['action_object_pairs'].tolist()],
        'count': [grouped['count'].sum()]
    })

    combined = pd.concat([untouched, merged_row], ignore_index=True)
    return combined.sort_values(by='count', ascending=False).reset_index(drop=True)


In [93]:
# Group for words "send", "call", "meeting", "join", "thank", "do", "receive", "provide", "discuss", "changes", "need"
df_distinct_words = aggregate_rows(df_distinct_words,'see')

In [None]:
df_distinct_words.head(60)

In [None]:
df_distinct_words.shape