In [29]:
import pandas as pd
import spacy
import ast
nlp = spacy.load("en_core_web_lg")

In [77]:
df = pd.read_csv('../../data/labeled/send_new_refined.tsv', sep='\t')

In [71]:
df.head()

Unnamed: 0,text,label
0,I will be sending them our comments by 5 pm EST .,send_review
1,Please feel free to send me any corrections or...,send_review
2,keep sending us your feedbacks .,send_review
3,Please send all your feedbacks to [PERSON] .,send_review
4,Send your feedback to me no later than 2:00 PM...,send_review


In [19]:
df.shape

(1298, 4)

In [7]:
# Function to extract and format action-object pairs
def extract_action_object_pairs(text):
    doc = nlp(text)
    pairs = []
    for token in doc:
        # Check if the token is a verb (action) and has a direct object (dobj)
        if token.pos_ == "VERB":
            for child in token.children:
                if child.dep_ == "dobj":  # dobj = direct object
                    # Format as "action_object"
                    pairs.append(f"{token.text}_{child.text}")
    return pairs

# Apply the function to the DataFrame column
df['action_object_pairs'] = df['text'].apply(extract_action_object_pairs)

In [72]:
df.head()

Unnamed: 0,text,label
0,I will be sending them our comments by 5 pm EST .,send_review
1,Please feel free to send me any corrections or...,send_review
2,keep sending us your feedbacks .,send_review
3,Please send all your feedbacks to [PERSON] .,send_review
4,Send your feedback to me no later than 2:00 PM...,send_review


In [13]:
# Convert each entry in 'action_object_pairs' to a string if it’s not already
df['action_object_pairs'] = df['action_object_pairs'].apply(lambda x: str(x) if isinstance(x, list) else x)

# Group by 'label' and use set to get distinct values
distinct_pairs = df.groupby('label')['action_object_pairs'].apply(lambda x: list(set(x))).reset_index()


In [14]:
distinct_pairs

Unnamed: 0,label,action_object_pairs
0,send_fax,"[['sending_fax'], ['send_fax', 'write_number']..."
1,send_meeting_request,"[['sent_request'], ['send_info'], ['send_Take'..."
2,send_pictures,"[['send_info'], ['grab_Camera', 'scan_negative..."
3,send_presentation,"[['send_info'], ['includes_slides', 'sent_me',..."
4,send_resume,"[['networked_alumni', 'sent_resumes'], ['sendi..."
5,send_review,"[['send_suggestions'], ['send_this', ""send_'"",..."
6,send_update_changes,"[['sent_changes'], ['have_questions', 'send_ma..."


In [15]:
df_unique = df.drop_duplicates()

# Count occurrences of each action_object_pairs across all labels
pair_counts = df_unique.groupby('action_object_pairs')['label'].nunique().reset_index()
pair_counts = pair_counts.rename(columns={'label': 'label_count'})

# Filter for entries that appear in more than one label
multi_label_pairs = pair_counts[pair_counts['label_count'] > 1]

In [16]:
multi_label_pairs

Unnamed: 0,action_object_pairs,label_count
194,"['have_questions', 'send_mail']",2
327,"['review_it', 'sending_it']",2
362,"['send_[', 'send_PERSON']",2
385,['send_comments'],2
388,['send_copies'],2
396,['send_copy'],4
400,['send_details'],3
405,['send_document'],2
406,['send_draft'],2
417,['send_email'],5


In [24]:
df_exploded = df.explode('action_object_pairs')

In [25]:
df_exploded

Unnamed: 0.1,Unnamed: 0,text,label,action_object_pairs
0,0,"Please send your comments to me ASAP , but n...",send_review,['send_comments']
1,1,Please review and send feedback .,send_review,['send_feedback']
2,2,"If you would like ask a question , or if you j...",send_review,"['ask_question', 'have_comment', 'send_email']"
3,3,I am sending it to the customer to review an...,send_review,"['sending_it', 'finalize_it']"
4,4,Thanks for capturing all the feedback and send...,send_review,"['capturing_feedback', 'sending_it']"
...,...,...,...,...
1293,1293,Do you want me to send you my resume ?,send_resume,['send_resume']
1294,1294,I am not sure if you have already sent me your...,send_resume,['sent_resume']
1295,1295,"Per you request , I am sending you my resume .",send_resume,['sending_resume']
1296,1296,I will look for my resume and send it to you .,send_resume,['send_it']


In [22]:
# Step 4: Filter out rows that contain any of these multi-label pairs
df_filtered = df[~df['action_object_pairs'].apply(lambda pairs: any(pair in multi_label_pairs for pair in pairs))]


In [23]:
df_filtered

Unnamed: 0.1,Unnamed: 0,text,label,action_object_pairs
0,0,"Please send your comments to me ASAP , but n...",send_review,['send_comments']
1,1,Please review and send feedback .,send_review,['send_feedback']
2,2,"If you would like ask a question , or if you j...",send_review,"['ask_question', 'have_comment', 'send_email']"
3,3,I am sending it to the customer to review an...,send_review,"['sending_it', 'finalize_it']"
4,4,Thanks for capturing all the feedback and send...,send_review,"['capturing_feedback', 'sending_it']"
...,...,...,...,...
1293,1293,Do you want me to send you my resume ?,send_resume,['send_resume']
1294,1294,I am not sure if you have already sent me your...,send_resume,['sent_resume']
1295,1295,"Per you request , I am sending you my resume .",send_resume,['sending_resume']
1296,1296,I will look for my resume and send it to you .,send_resume,['send_it']


In [30]:
# Step 1: Convert string representations of lists to actual lists
if isinstance(df['action_object_pairs'].iloc[0], str):
    df['action_object_pairs'] = df['action_object_pairs'].apply(ast.literal_eval)

# Step 2: Explode the action_object_pairs to separate each pair into its own row
df_exploded = df.explode('action_object_pairs')

# Step 3: Count occurrences of each action_object_pair across unique labels
pair_counts = df_exploded.drop_duplicates().groupby('action_object_pairs')['label'].nunique().reset_index()
pair_counts = pair_counts.rename(columns={'label': 'label_count'})

# Step 4: Identify pairs that appear in more than one label
multi_label_pairs = pair_counts[pair_counts['label_count'] > 1]['action_object_pairs'].tolist()

# Step 5: Filter out rows that contain any of these multi-label pairs
df_filtered = df[~df['action_object_pairs'].apply(lambda pairs: any(pair in multi_label_pairs for pair in pairs))]


In [31]:
df_filtered

Unnamed: 0.1,Unnamed: 0,text,label,action_object_pairs
12,12,I will be sending them our comments by 5 pm EST .,send_review,[sending_comments]
15,15,Please feel free to send me any corrections or...,send_review,[send_corrections]
42,42,keep sending us your feedbacks .,send_review,[sending_feedbacks]
47,47,Please send all your feedbacks to [PERSON] .,send_review,[send_feedbacks]
54,54,Send your feedback to me no later than 2:00 PM...,send_review,[Send_feedback]
...,...,...,...,...
1292,1292,Can I send you my resume to forward ? Thanks .,send_resume,[send_resume]
1293,1293,Do you want me to send you my resume ?,send_resume,[send_resume]
1294,1294,I am not sure if you have already sent me your...,send_resume,[sent_resume]
1295,1295,"Per you request , I am sending you my resume .",send_resume,[sending_resume]


In [32]:
df_filtered = df_filtered[['text','label']]

In [35]:
df_filtered

Unnamed: 0,text,label
12,I will be sending them our comments by 5 pm EST .,send_review
15,Please feel free to send me any corrections or...,send_review
42,keep sending us your feedbacks .,send_review
47,Please send all your feedbacks to [PERSON] .,send_review
54,Send your feedback to me no later than 2:00 PM...,send_review
...,...,...
1292,Can I send you my resume to forward ? Thanks .,send_resume
1293,Do you want me to send you my resume ?,send_resume
1294,I am not sure if you have already sent me your...,send_resume
1295,"Per you request , I am sending you my resume .",send_resume


In [34]:
df_filtered.to_csv('send_new_refined.tsv',sep='\t', index=False)

In [36]:
df = df_filtered

## Distinguish between providing and requesting

In [37]:
labels = df['label'].unique()
labels

array(['send_review', 'send_presentation', 'send_fax',
       'send_update_changes', 'send_pictures', 'send_meeting_request',
       'send_resume'], dtype=object)

In [128]:
def filter_request(row):
    row_temp = row.copy()
    keywords_request = {' your ', ' please ', ' me ', ' us '}
    keywords_provide = {'i ',  ' i '}

    # Check for intersections with each set of keywords
    request_found = bool(any(keyword in row['text'].lower() for keyword in keywords_request))
    provide_found = bool(any(keyword in row['text'].lower() for keyword in keywords_provide))


    if (request_found and provide_found) or (not request_found and not provide_found):
        row_temp['label'] = row_temp['label']+"_ambiguous"
        return row_temp
    if request_found:
        row_temp['label'] =  row_temp['label']+"_request"
        return row_temp
    if provide_found:
        row_temp['label'] =  row_temp['label']+"_provide"
        return row_temp
    return row_temp

In [129]:
df_new = df.apply(filter_request, axis=1)

In [141]:
index = 10
print(df_new.iloc[index]['text'])
print(df_new.iloc[index]['label'])

So in the interest of time , I 'm sending a first draft of the release for your review in lieu of a phone interview .
send_review_ambiguous


In [101]:
df

Unnamed: 0,text,label
0,I will be sending them our comments by 5 pm EST .,send_review
1,Please feel free to send me any corrections or...,send_review
2,keep sending us your feedbacks .,send_review
3,Please send all your feedbacks to [PERSON] .,send_review
4,Send your feedback to me no later than 2:00 PM...,send_review
...,...,...
575,Can I send you my resume to forward ? Thanks .,send_resume
576,Do you want me to send you my resume ?,send_resume
577,I am not sure if you have already sent me your...,send_resume
578,"Per you request , I am sending you my resume .",send_resume


In [66]:
df_new[df_new['label']=='send_review_request'].iloc[0]['text']

IndexError: single positional indexer is out-of-bounds

In [42]:
df[df['label'] == 'send_review']

Unnamed: 0,text,label
12,I will be sending them our comments by 5 pm EST .,send_review
15,Please feel free to send me any corrections or...,send_review
42,keep sending us your feedbacks .,send_review
47,Please send all your feedbacks to [PERSON] .,send_review
54,Send your feedback to me no later than 2:00 PM...,send_review
56,I ca n't stress the importance of a review cyc...,send_review
65,I thought I had already sent you my feedback .,send_review
69,Are there any whitepapers or technical mater...,send_review
70,Send any questions or comments to [PERSON] and...,send_review
72,I have sent some information via overnight mai...,send_review


In [38]:
df[df['label'] == 'send_review']

Unnamed: 0,text,label
12,I will be sending them our comments by 5 pm EST .,send_review
15,Please feel free to send me any corrections or...,send_review
42,keep sending us your feedbacks .,send_review
47,Please send all your feedbacks to [PERSON] .,send_review
54,Send your feedback to me no later than 2:00 PM...,send_review
56,I ca n't stress the importance of a review cyc...,send_review
65,I thought I had already sent you my feedback .,send_review
69,Are there any whitepapers or technical mater...,send_review
70,Send any questions or comments to [PERSON] and...,send_review
72,I have sent some information via overnight mai...,send_review


In [163]:
df_request_provide = df_new[~df_new['label'].str.contains("ambiguous", case=False, na=False)]

In [164]:
df_request_provide

Unnamed: 0,text,label
0,I will be sending them our comments by 5 pm EST .,send_review_provide
1,Please feel free to send me any corrections or...,send_review_request
2,keep sending us your feedbacks .,send_review_request
3,Please send all your feedbacks to [PERSON] .,send_review_request
4,Send your feedback to me no later than 2:00 PM...,send_review_request
...,...,...
574,"If possible , i can have him send you his resu...",send_resume_provide
575,Can I send you my resume to forward ? Thanks .,send_resume_provide
576,Do you want me to send you my resume ?,send_resume_request
578,"Per you request , I am sending you my resume .",send_resume_provide


In [156]:
df_request = df_request[~df_request['label'].str.contains("provide", case=False, na=False)]
df_request = df_request[~df_request['label'].str.contains("ambiguous", case=False, na=False)]

In [157]:
df_request

Unnamed: 0,text,label
1,Please feel free to send me any corrections or...,send_review_request
2,keep sending us your feedbacks .,send_review_request
3,Please send all your feedbacks to [PERSON] .,send_review_request
4,Send your feedback to me no later than 2:00 PM...,send_review_request
13,Send me your questions & comments [PERSON] ...,send_review_request
...,...,...
561,Here are the areas in which we need people (...,send_resume_request
568,"Justin , No problem , Just send me your res...",send_resume_request
569,Can you please send this resume to your HR ? ...,send_resume_request
572,"Please send resumes , a brief 50 - 85 word sum...",send_resume_request


In [165]:
df_request_provide.to_csv('send_new_refined_request_provide.tsv', sep='\t', index=False)