In [1]:
import pandas as pd
from collections import Counter
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` (used in the cells below) is available
tqdm.pandas()


# Load the English model
# NOTE(review): the visible cells only read POS tags, lemmas and dependency labels;
# if nothing else needs named entities, loading with disable=["ner"] would likely
# shorten the multi-hour parsing runs — confirm before changing.
nlp = spacy.load("en_core_web_lg")

In [2]:
# Load the pre-processed data; the cells below read its 'sentences' and 'extracted_text' columns
df = pd.read_parquet('../../data/processed/avocado_train_individual_sentences.parquet')

### Create list of words for filtering requests

In [3]:
# Lower-case keywords/phrases used to pre-select sentences that may contain a request.
# NOTE(review): filter_sentences tests these as substrings of the lower-cased sentence,
# so short entries also hit longer words (e.g. 'how' in 'show', 'mail' in 'email') and
# some entries are subsumed by others ('send me' by 'send', 'email' by 'mail') —
# confirm that this breadth is intended.
intent_words = [
    'dinner', 'lunch', 'breakfast', 'meeting', 'appointment', 'reminder', 'review', 'send me', 
    'need', 'how', 'schedule', 'please', 'send', 'sent', 'join', 'make sure', 
    'discuss', 'email', 'attend', 'call', 'provide', 'help', 'are there', 'are you', 'available', 
    'can i', 'can you', 'can we', 'can he', 'can she', 'can they', 'could you', 'could we', 'could i', 
    'did you', 'did i', 'did we', 'did he', 'did she', 'did they', 'do you', 'do they', 'does he', 'does she', 'do we',
    'do not', "don't", 'want', 'does that', 'does this', 'give', 'go ahead',
    'have you', 'have there', 'mail', 'is it', 'possible', 
]

In [4]:
def filter_sentences(sentences, intent_words):
    """Return the sentences that contain at least one intent keyword.

    Matching is a case-insensitive *substring* test: each sentence is
    lower-cased and checked against every entry of ``intent_words`` (so
    ``'how'`` also matches ``'show'``). Keywords are compared as given and
    are therefore expected to be lower-case already.

    Args:
        sentences: Iterable of sentence strings, or ``None``.
        intent_words: Collection of keyword strings to look for.

    Returns:
        list: The matching sentences in their original order and casing;
        an empty list when ``sentences`` is ``None``.
    """
    if sentences is None:
        return []  # Return an empty list for None values

    matched = []
    for sentence in sentences:
        # Lower-case once per sentence instead of once per keyword.
        lowered = sentence.lower()
        if any(keyword in lowered for keyword in intent_words):
            matched.append(sentence)
    return matched

# Keep, per row, only the sentences that contain at least one intent keyword
df['targets'] = df['sentences'].progress_apply(
    lambda sentence_list: filter_sentences(sentence_list, intent_words)
)


100%|██████████| 503917/503917 [00:38<00:00, 12937.24it/s]


In [5]:
# Preview the first rows, now including the 'targets' column
df.head()

Unnamed: 0,messageid,sender_access,outlook_sender_name,outlook_recipient_name,sentto_address,cc_address,subject,processed_subject,sent_date,arrival_date,body_file_path,body_content,in_reply_to,Keywords,extracted_text,preceding_conversation,sentences,targets
281687,<-434009366.998315478988.JavaMail.sreddy@sreddy>,SMTP,MonitorEtradeFilter@avocadoit.com,DA,MonitorEtradeFilter@avocadoit.com,,App Failure: appid=testetrade88456 EPXML=main....,App Failure: appid=testetrade88456 EPXML=main....,2001-08-20T13:51:18Z,2001-08-20T13:46:45Z,text/062/062-035343-EM.txt,Received: from sreddy (10.7.10.74 [10.7.10.74]...,,,Script: session_4000007\r\nAppid: testetrade88...,,[Script: session_4000007\r\nAppid: testetrade8...,[]
453472,<01370F5B01AAD411BE8100D0B744338C24F5A0@HQEXCH>,EX,Jackie Valle,,Toshiya Otani,Jackie Valle,Tickets to Tokyo,Tickets to Tokyo,2001-02-22T19:38:41Z,2001-02-22T19:38:41Z,text/096/096-001089-EM.txt,"From: ""Jackie Valle"" <jackie.valle@avocadoit.c...",,,"Hi,\r\n\r\nToday I received your tickets to To...",,"[Hi,\r\n\r\nToday I received your tickets to T...",[Looks like Natasha from Atherton Travel sent ...
183547,<01370F5B01AAD411BE8100D0B744338C296A70@HQEXCH>,EX,Helen Spade,Amit Sethi,'jorg.g.heinemann@accenture.com',David Chan; Amit Sethi; Don Giesen; Kelsey Kerr,RE: FW: Sun info requested - VERY URGENT PLEAS...,FW: Sun info requested - VERY URGENT PLEASE READ,2001-02-12T22:36:06Z,2001-02-12T22:36:09Z,text/007/007-013632-EM.txt,"From: ""Helen Spade"" <hspade@avocadoit.com>\r\n...",<OFF3DEA0B6.F0E8B090-ON882569E7.006B687C@accen...,,"Hello Jorg,\r\n\r\nI just left you a voicemail...",,"[Hello Jorg,\r\n\r\nI just left you a voicemai...","[Hello Jorg,\r\n\r\nI just left you a voicemai..."
51933,<F57EA917353BD411B5BA00D0B708160C1F4808@COFFEE>,EX,Scott Weller,Marcia Kadanoff,Marcia Kadanoff,,RE: Vodafone-Airtouch prep mtg,Vodafone-Airtouch prep mtg,2000-06-27T01:10:18Z,2000-06-27T01:10:19Z,text/161/161-019215-EM.txt,"From: ""Scott Weller"" <sweller@avocadoit.com>\r...",<F57EA917353BD411B5BA00D0B708160C195A59@COFFEE>,,Thanks. Since Dan is out through this week (a...,,"[Thanks., Since Dan is out through this week (...",[Since Dan is out through this week (and you C...
665644,<FC38A9406AA4D411AB62009027DE9DA5037CDD3B@ntma...,EX,Prakash Iyer,Srikanth Raghavan,Steve Hirata; Rajeev Mohindra,Srikanth Raghavan; Dave Sulcer; Wilhan Martono...,RE: Is it okay to take the following EP Toront...,"Geoff, Kant, Scott and Rebecca",2002-05-15T23:20:15Z,2002-05-15T23:19:53Z,text/235/235-015706-EM.txt,"From: ""Prakash Iyer"" <piyer@avocadoit.com>\r\n...",<FC38A9406AA4D411AB62009027DE9DA502B3A283@ntma...,,"Steve,\r\nRichard I think wanted butters. let ...",,"[Steve,\r\nRichard I think wanted butters., le...","[Steve,\r\nRichard I think wanted butters., Ra..."


### EDA Action-Object Pairs

In [6]:
def filter_action_object(targets):
    """Extract ``<verb lemma>_<object text>`` pairs from a list of sentences.

    Every sentence is parsed with the module-level spaCy pipeline ``nlp``.
    For each token tagged ``VERB``, its direct dependency children labelled
    ``dobj`` or ``pobj`` are collected, provided the child is purely
    alphabetic.

    Args:
        targets: Iterable of sentence strings (may be empty), or ``None``.

    Returns:
        list[str]: One ``"<lemma>_<object>"`` string per verb/object hit, in
        document order; the object keeps its original casing. Empty when
        ``targets`` is ``None`` or contains no hits.
    """
    if targets is None:
        return []

    pairs = []
    # nlp.pipe processes the sentences of one row as a batch instead of
    # invoking the full pipeline once per sentence.
    for doc in nlp.pipe(targets):
        for token in doc:
            # A plain POS test selects the same tokens as a Matcher with the
            # single-token pattern [{"POS": "VERB"}], without rebuilding a
            # Matcher on every call.
            if token.pos_ != "VERB":
                continue
            for child in token.children:
                # Check for direct or prepositional object that is alphabetic
                if child.dep_ in ("dobj", "pobj") and child.is_alpha:
                    pairs.append(token.lemma_ + "_" + child.text)
    return pairs

# Derive the verb/object pairs for every row's target sentences (slow: full spaCy parse)
df['action_object_pairs'] = df['targets'].progress_apply(filter_action_object)

100%|██████████| 503917/503917 [2:00:54<00:00, 69.46it/s]   


In [7]:
# Flatten the per-row pair lists into one long list
all_words = []
for row_pairs in df['action_object_pairs']:
    all_words.extend(row_pairs)

# Tally how often each pair occurs
word_counts = Counter(all_words)

# Tabulate the tallies, most frequent pair first
distinct_words_df = pd.DataFrame(
    list(word_counts.items()), columns=['Word', 'Count']
).sort_values(by='Count', ascending=False)

In [8]:
# Show the 20 most frequent action-object pairs (the frame is sorted by 'Count', descending)
distinct_words_df.head(20)

Unnamed: 0,Word,Count
20,start_Server,12831
63,send_it,12052
177,have_questions,11024
64,send_message,10704
301,send_email,8186
19,fail_Message,8065
18,start_Failures,6983
376,start_Occurrences,6428
395,call_me,6177
103,thank_you,4472


### Most Frequent Action-Object Pairs

| Action-Object-Pair | Count |
|-------------|-------------|
| start_Server | 12831 |
| send_it | 12052 |
| have_questions | 11024 |
| send_message | 10704 |
| send_email | 8186 |
| fail_Message | 8065 |
| start_Failures | 6983 |
| start_Occurrences | 6428 |
| call_me | 6177 |
| thank_you | 4472 |
| contact_me | 3995 |
| post_message | 3790 |
| miss_UNIVERSE | 3742 |
| do_what | 3584 |
| send_mail | 3490 |
| start_occurrence | 3152 |
| do_it | 2816 |
| give_call | 2642 |
| need_help | 2541 |
| unsubscribe_mailto | 2468 |

### Inspecting Messages

After inspecting the most-frequently occurring action-object-pairs, the following ones will not be considered further:

| Action-Object-Pair    | Reason |
|-------------|-------------|
|fail_Message|belongs to an automated error message and therefore carries no human intent|
|start_Failures|belongs to an automated error message and therefore carries no human intent|
|start_Occurrences|belongs to an automated error message and therefore carries no human intent|
|post_message|belongs to an automated message|
|unsubscribe_mailto|belongs to an automated message|
|miss_UNIVERSE|belongs to an automated message|
|start_occurrence|belongs to an automated message|

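In addition, the footer of the extremeprogramming mailing list (post/unsubscribe instructions) and the footer of the JSP-INTEREST (java.sun.com) mailing list are removed from the `extracted_text` column of the original dataframe.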

In [49]:
# Remove unsubscribe text
# The footer is assembled from adjacent literals, one per footer line.
string_to_remove = (
    "To Post a message, send it to:   extremeprogramming@eGroups.com\r\n\r\n"
    "To Unsubscribe, send a blank message to: extremeprogramming-unsubscribe@eGroups.com\r\n\r\n"
    "ad-free courtesy of objectmentor.com \r\n\r\n"
    "Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/\r\n"
)

# Literal (non-regex) replacement: only exact copies of the footer are removed.
# NOTE(review): quoted or re-wrapped copies (e.g. lines prefixed with '>') do not match.
df['extracted_text'] = df['extracted_text'].str.replace(
    string_to_remove, "", regex=False
)

In [58]:
# Remove java text
strings_to_remove = [
    # mailing-list instructions
    'To unsubscribe: mailto listserv@java.sun.com with body: "signoff\r\nJSP-INTEREST"',
    'For digest: mailto listserv@java.sun.com with body: "set JSP-INTEREST DIGEST"',
    # FAQ pointer and its links
    'Some relevant FAQs on JSP/Servlets can be found at:',
    'http://java.sun.com/products/jsp/faq.html',
    'http://www.esperanto.org.nz/jsp/jspfaq.html',
    'http://www.jguru.com/jguru/faq/faqpage.jsp?name=JSP',
    'http://www.jguru.com/jguru/faq/faqpage.jsp?name=Servlets',
]

# Strip each fragment in turn (literal match), then write the result back once
cleaned_text = df['extracted_text']
for fragment in strings_to_remove:
    cleaned_text = cleaned_text.str.replace(fragment, "", regex=False)
df['extracted_text'] = cleaned_text

In [61]:
# Explorative Inspection

# Action-object pairs whose source rows should be inspected
target_pairs = ["post_message"]


def contains_target_pairs(pairs):
    """Return True if at least one entry of ``pairs`` is listed in ``target_pairs``."""
    for candidate in pairs:
        if candidate in target_pairs:
            return True
    return False

# Keep only the rows whose pair list contains one of the target pairs
is_target_row = df['action_object_pairs'].apply(contains_target_pairs)
filtered_df = df[is_target_row]

# repr() keeps control characters such as \r\n visible in the printed text
print(repr(filtered_df.iloc[1]['extracted_text']))

'I seem to have cut into a vital part of XP here,\r\ncommunication seems to be *the* central part of\r\nthis process. Sorry for disturbing the waters :-9\r\n\r\nImagine, someone offered you a highly paid job\r\nas XP-Coach on a very interesting project,\r\nonly in a location with a quite different\r\nlanguage than yours. \r\n\r\nSince I cannot disclose details, imagine you are\r\nin Clarke\'s short story "The nine billion\r\nnames of God", being sent to a monastery in Tibet. \r\nYou can talk in English to your partner, to\r\nthe lama (a manager) and to yourself.\r\n\r\nYou know how to program a computer, so does your\r\npartner, and let\'s assume the monks know this, too.\r\nBut they might be used to different programming\r\nlanguages with different paradigms than yours.\r\n\r\nWhat now?\r\n\r\nWhich of XP\'s components would be useable in\r\nthis context?\r\n\r\nAnd what is a moxie-detector?\r\n\r\ns.\r\n\r\nFrom:    Phlip <pplumlee@celterra.com>\r\nTo:      To extremeprogramming@yaho

In [62]:
# Row/column count before rows with excluded pairs are dropped
df.shape

(503917, 19)

In [63]:
# Pairs that stem from automated error messages and carry no human intent
exclude_list = ['fail_Message', 'start_Failures', 'start_Occurrences']

# Flag every row whose pair list contains at least one excluded pair
has_excluded_pair = df['action_object_pairs'].apply(
    lambda pairs: any(pair in exclude_list for pair in pairs)
)

# Drop the flagged rows. The explicit .copy() makes the result an independent
# frame, so later column assignments on `df` do not write to a slice of the
# original (which would raise pandas' SettingWithCopyWarning).
df = df[~has_excluded_pair].copy()


In [64]:
# Row/column count after rows with excluded pairs were dropped
df.shape

(489401, 19)

## Rerun Code after filtering

In [75]:
# NOTE(review): the pairs are derived from 'targets', which is built from 'sentences'.
# The cleaning cells only modify 'extracted_text', so unless 'sentences'/'targets' are
# regenerated first, this rerun reproduces the previous pairs for the remaining rows.
df['action_object_pairs'] = df['targets'].progress_apply(filter_action_object)

100%|██████████| 489401/489401 [6:14:35<00:00, 21.78it/s]     


In [66]:
# Collect every pair of every remaining row in a single flat list
all_words = [pair for pair_list in df['action_object_pairs'] for pair in pair_list]

# Frequency of each distinct pair
word_counts = Counter(all_words)

# Same information as a table, ordered from most to least frequent
distinct_words_df = (
    pd.DataFrame(word_counts.items(), columns=['Word', 'Count'])
    .sort_values(by='Count', ascending=False)
)

In [83]:
# Explorative Inspection

# Pairs to look up in the filtered data
target_pairs = ["post_message"]


def contains_target_pairs(pairs):
    """Tell whether any element of ``pairs`` occurs in ``target_pairs``."""
    # Membership test against target_pairs, applied lazily to each element
    return any(map(target_pairs.__contains__, pairs))

# Select the rows that contain at least one of the target pairs
filtered_df = df.loc[df['action_object_pairs'].apply(contains_target_pairs)]

# Print the text of one matching row for manual inspection
sample_row = filtered_df.iloc[5]
print(sample_row['extracted_text'])

Hi

Here is a good write up "XP VS RUP" by Robert Martin.
http://www.objectmentor.com/publications/RUPvsXP.pdf

Ashima


>From: king_nitz@yahoo.com
>Reply-To: extremeprogramming@yahoogroups.com
>To: extremeprogramming@yahoogroups.com
>Subject: [XP] (unknown)
>Date: Tue, 10 Jul 2001 05:31:34 -0000
>
>Hi All,
>
>Well I am very new to extreme programming. If I understand I think
>its a process like RUP and I believe its best used when the projects
>aren't too huge. I read somewhere that its best used in case the
>project doesn't involve more than 10 developers. Can anybody please
>put light on this as to when its best to use XP vis a vis RUP? Also
>is XP competing in any way with RUP?
>
>Awaiting your valued feedback...
>
>Regards,
>King Nitz
>
>
>To Post a message, send it to:   extremeprogramming@eGroups.com
>
>To Unsubscribe, send a blank message to: 
>extremeprogramming-unsubscribe@eGroups.com
>
>Don't miss XP UNIVERSE, the first US conference on XP and Agile Methods.  
>see www.xpuni