## Filter Action-Object Pairs

In [1]:
import pandas as pd
from collections import Counter
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm
from IPython.display import display, HTML
# Register tqdm's pandas integration (adds `progress_apply` to pandas objects).
tqdm.pandas()


# Load the English model (spaCy's large English pipeline).
# NOTE(review): `nlp` is not referenced in the cells visible here — confirm it is
# still needed before paying for the load on every run.
nlp = spacy.load("en_core_web_lg")

In [2]:
# Read the exploded training targets and expose the `targets` column under the
# singular name `target` used by the rest of the notebook.
df = (
    pd.read_parquet(
        '../../data/processed/targets/avocado_train_targets_exploded.parquet',
        engine='fastparquet',
    )
    .rename(columns={'targets': 'target'})
)

In [3]:
# (rows, columns) of the frame as loaded, before any filtering.
df.shape

(1205714, 19)

In [4]:
# Remove rows that carry no action-object pairs (missing value or empty list)
def _has_no_pairs(pairs):
    """Return True when a row's pair entry is None or an empty list."""
    if pairs is None:
        return True
    return isinstance(pairs, list) and not pairs


df_filtered = df[~df['action_object_pairs'].apply(_has_no_pairs)]

In [5]:
# (rows, columns) left after dropping rows without action-object pairs.
df_filtered.shape

(722472, 19)

### Inspect Action-Object-Pairs results

In [6]:
def get_distinct_words_df(df):
    """Count how often each action-object pair occurs across all rows.

    Args:
        df: Frame whose `action_object_pairs` column holds an iterable of
            pair labels per row.

    Returns:
        A DataFrame with columns `action_object_pairs` and `count`, sorted by
        `count` in descending order. The index keeps each label's
        first-occurrence position, so it is not consecutive after sorting.
    """
    # Flatten the per-row collections lazily and tally every label.
    pair_counts = Counter(
        pair
        for row_pairs in df['action_object_pairs']
        for pair in row_pairs
    )

    counts_df = pd.DataFrame(pair_counts.items(), columns=['action_object_pairs', 'count'])
    return counts_df.sort_values(by='count', ascending=False)

In [7]:
# Show the 20 most frequent action-object pairs.
get_distinct_words_df(df_filtered).head(20)

Unnamed: 0,action_object_pairs,count
20,start_Server,12831
63,send_it,12052
177,have_questions,11024
64,send_message,10704
301,send_email,8186
19,fail_Message,8065
18,start_Failures,6983
376,start_Occurrences,6428
395,call_me,6177
103,thank_you,4472


### Inspect Messages


| Action-Object-Pair | Count |
|-------------|-------------|
| start_Server | 12831 |
| send_it | 12052 |
| have_questions | 11024 |
| send_message | 10704 |
| send_email | 8186 |
| fail_Message | 8065 |
| start_Failures | 6983 |
| start_Occurrences | 6428 |
| call_me | 6177 |
| thank_you | 4472 |
| contact_me | 3995 |
| post_message | 3790 |
| miss_UNIVERSE | 3742 |
| do_what | 3584 |
| send_mail | 3490 |
| start_occurrence | 3152 |
| do_it | 2816 |
| give_call | 2642 |
| need_help | 2541 |
| unsubscribe_mailto | 2468 |

In [8]:
# Define a function to inspect entries for a given action-object pair
def inspect_action_object_pair(pair_to_inspect, entry, print_extracted_text=True, print_target=True, df=None):
    """Print one example row whose pair collection contains ``pair_to_inspect``.

    Args:
        pair_to_inspect: Pair label to look for; a row matches when
            ``pair_to_inspect in row['action_object_pairs']`` is true.
        entry: Zero-based position of the row to show among the matching rows.
        print_extracted_text: When True, print the row's `extracted_text`.
        print_target: When True, print the row's `target`. A separator line is
            printed first if the extracted text was printed as well.
        df: Frame to search. Defaults to the notebook-level `df_filtered`,
            looked up at call time so later re-assignments are picked up.

    Raises:
        IndexError: If `entry` is outside the range of matching rows
            (raised by `.iloc`).
    """
    source = df_filtered if df is None else df
    matches = source[source['action_object_pairs'].apply(lambda pairs: pair_to_inspect in pairs)]
    row = matches.iloc[entry]

    if print_extracted_text:
        print(row['extracted_text'])
    if print_target:
        if print_extracted_text:
            print('----------------------')
        # Fixed: this print used to sit outside the `if print_target` block,
        # so the target was printed even when print_target=False.
        print(row['target'])

In [9]:
# Show only the target of the second row tagged with the pair 'send_e'.
inspect_action_object_pair('send_e', entry=1, print_extracted_text=False)

I might have sent the e-mail to the wrong person because you mentioned before that we should have a meeting with Mark


## Entry Removal

After inspecting the most-frequently occurring action-object-pairs, the following ones will not be considered further:

| Action-Object-Pair    | Reason |
|-------------|-------------|
|fail_Message|belongs to an error message and thus does not express human intent|
|start_Failures|belongs to an error message and thus does not express human intent|
|start_Occurrences|belongs to an error message and thus does not express human intent|
|post_message|belongs to an automated message|
|unsubscribe_mailto|belongs to an automated message|
|miss_UNIVERSE|belongs to an automated message|
|start_occurrence|belongs to an automated message|

In addition, the extremeprogramming unsubscribe message and Java messages are filtered out of the original dataframe.

In [10]:
# Pairs that mark a row for exclusion
entries_to_remove = [
    'fail_Message',
    'start_Failures',
    'start_Occurrences',
    'post_message',
    'unsubscribe_mailto',
    'miss_UNIVERSE',
    'start_occurrence'
]

# Flag every row carrying at least one excluded pair, then keep the others
has_removed_pair = df_filtered['action_object_pairs'].apply(
    lambda pairs: any(pair in entries_to_remove for pair in pairs)
)
df_filtered = df_filtered[~has_removed_pair]

In [11]:
# (rows, columns) left after removing rows with excluded pairs.
df_filtered.shape

(699454, 19)

In [21]:
# Drop every row whose extracted text contains the phrase below
test_string = "Unexpected flow reached by the server"

contains_test_string = df_filtered['extracted_text'].apply(lambda text: test_string in str(text))
df_filtered = df_filtered[~contains_test_string]


In [13]:
# Save the cleaned DataFrame (not a model) next to the input file.
# NOTE(review): the execution counts show this cell ran as In [13], i.e. before the
# "Unexpected flow reached by the server" filter (In [21]) — re-run the notebook
# top to bottom so the saved file includes that filter.
# NOTE(review): the input was read with engine='fastparquet' while this write uses
# the default engine — confirm the mismatch is intended.

df_filtered.to_parquet('../../data/processed/targets/avocado_train_targets_exploded_cleaned.parquet')

In [22]:
# Show the 20 most frequent action-object pairs after cleaning.
get_distinct_words_df(df_filtered).head(20)

Unnamed: 0,action_object_pairs,count
172,have_questions,11024
296,send_email,8182
60,send_it,6188
389,call_me,6177
61,send_message,4856
99,thank_you,4433
229,contact_me,3994
404,do_what,3567
668,send_mail,3490
148,do_it,2806


In [31]:
# Keep only rows whose message ID occurs exactly once; with keep=False every row
# of a repeated ID is dropped, not just the later duplicates.
unique_df = df_filtered[~df_filtered['messageid'].duplicated(keep=False)]

# Position of the example row to display. The saved cell contained a bare
# `entry =` (a SyntaxError); 0 is a placeholder — change it to browse other rows.
entry = 0
example_row = unique_df.iloc[entry]
print(example_row['extracted_text'])
print('-------------------------------------')
print(example_row['target'])
print('-------------------------------------')
print(example_row['action_object_pairs'])

Steve,
Please disregard Wendy's request until I get a chance to determine from our
database engineer, Shari Brown, if we require suplemental data beyond the
mass files that we already load.

I think that we have the right mix of data in house.  I have been swamped
with various undertakings (RIM, Omnisky,etc.), and as a result have not had
time to close the loop with Shari Brown to get the final form of the
standard reports summarized and regularized.

In the meantime, thanks for your prompt response.

Regards,
Joe

   


-------------------------------------
Steve,
Please disregard Wendy's request until I get a chance to determine from our
database engineer, Shari Brown, if we require suplemental data beyond the
mass files that we already load.
-------------------------------------
['disregard_request', 'get_chance', 'require_data', 'load_that']


### Group Pairs

In [15]:
# Per-pair occurrence counts of the cleaned frame; the grouping cells below work on this table.
df_distinct_words = get_distinct_words_df(df_filtered)

In [None]:
# Show the 60 most frequent pairs before grouping.
df_distinct_words.head(60)

In [15]:
# Define function to aggregate rows for a given word. If an action-object-pair contains the given word, entries and their occurrence-count will be aggregated.

def aggregate_rows(df, word_to_group):
    """Collapse all pair labels containing ``word_to_group`` into one summed row.

    Args:
        df: Frame with an `action_object_pairs` label column and a numeric
            `count` column.
        word_to_group: Pattern handed to `Series.str.contains` with
            ``case=False``. pandas treats it as a regular expression by
            default and matches anywhere in the label, so short words also
            hit longer ones (e.g. 'do' matches 'send_document').

    Returns:
        A new frame sorted by `count` (descending) with a fresh index. The
        matching rows are replaced by a single row whose `action_object_pairs`
        is the list of merged labels and whose `count` is their sum. When no
        label matches, no row is added. The input frame is not modified.
    """
    # Rows created by an earlier call hold a list instead of a string; for those
    # `.str.contains` yields a missing value, which na=False turns into
    # "no match", so existing groups are left untouched.
    is_match = df['action_object_pairs'].str.contains(word_to_group, case=False, na=False)
    matched = df[is_match]
    remaining = df[~is_match]

    if matched.empty:
        # Nothing to group: do not append an empty ([] / 0) aggregate row.
        result = remaining
    else:
        group_row = pd.DataFrame({
            'action_object_pairs': [matched['action_object_pairs'].tolist()],  # merged labels
            'count': [matched['count'].sum()],                                 # summed counts
        })
        result = pd.concat([remaining, group_row], ignore_index=True)

    # Sort the DataFrame by 'count' column in descending order
    return result.sort_values(by='count', ascending=False).reset_index(drop=True)


In [93]:
# Group the pairs for every listed word, in this order, so that a fresh
# "Restart & Run All" reproduces the grouping. Previously the cell handled one
# word per manual re-run and only 'see' remained in the saved code.
# Order matters: once a label is merged into a group it no longer matches later words.
words_to_group = [
    "send", "call", "meeting", "join", "thank", "do",
    "receive", "provide", "discuss", "changes", "need", "see",
]
for word in words_to_group:
    df_distinct_words = aggregate_rows(df_distinct_words, word)

In [None]:
# Show the 60 most frequent entries after grouping.
df_distinct_words.head(60)

In [None]:
# (rows, columns) of the grouped counts table.
df_distinct_words.shape