# Split Dataset and create Target Sentence Column

In [33]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import spacy
from tqdm import tqdm
tqdm.pandas()  # Integrate tqdm with pandas

In [34]:
df = pd.read_parquet('../avocado-1.0.2/avocado_extracted_text.parquet', engine='pyarrow')

In [35]:
# First, split the data into training+validation (80%) and test sets (20%)
train_val_df, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Then, split the train+val set into training (75% of train+val, which is 60% of total) and validation (25% of train+val, which is 20% of total)
df_train, df_val = train_test_split(train_val_df, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2



## Split Dataset into 60:20:20 Train/Eval/Test

In [36]:
df_test.shape

(187592, 17)

In [37]:
df_val.shape

(187592, 17)

In [38]:
df_train.shape


(562774, 17)

In [39]:
#df_train.to_parquet('avocado_train.parquet', engine='pyarrow', compression='snappy')

In [40]:
df_val.shape[0] * 3

562776

## Intent Extraction with Heuristics

In [41]:
df['heuristics'] = ''

def filter_df(row):
    text = row['extracted_text'].lower()  # Convert to lower case for case-insensitive comparison
    if any(keyword in text for keyword in ['please', 'can you', '?']):
        return 'pos'
    return ''

# Apply the function to update the heuristics column
df['heuristics'] = df.apply(filter_df, axis=1)

In [42]:
# Filter the DataFrame based on the updated heuristics column
df_pos = df[df['heuristics'] == 'pos']

In [None]:
df_pos.to_parquet('avocado_train.parquet', engine='pyarrow', compression='snappy')

In [43]:
df_pos.head()

Unnamed: 0,messageid,sender_access,outlook_sender_name,outlook_recipient_name,sentto_address,cc_address,subject,processed_subject,sent_date,arrival_date,body_file_path,body_content,in_reply_to,Keywords,extracted_text,preceding_conversation,problematic,heuristics
0,<FC38A9406AA4D411AB62009027DE9DA5E6545D@HQEXCH01>,EX,Fortunata Hermoso,Meshele Ko,Meshele Ko,'ray_rahamin@hp.com'; Dan Baca; Elba Linscott,Fw: RE: AvocadoIT's CEO committed to winning B...,AvocadoIT's CEO committed to winning Banamex deal,2001-04-12T22:02:41Z,2001-04-12T22:02:41Z,text/162/162-000001-EM.txt,"From: ""Fortunata Hermoso"" <fortunata.hermoso@a...",,,Fyi on Venks commitment on Banamex. The next s...,,,pos
1,<19B3B310D020D311B57E00105A9A55241D5975@COFFEE>,EX,Paiman Komeilizadeh,Meshele Ko,All Employees,,Expense reimbursement request- Amendment,Expense reimbursement request- Amendment,2000-04-24T23:06:06Z,2000-04-24T23:06:15Z,text/162/162-000003-EM.txt,"From: ""Paiman Komeilizadeh"" <pkomeilizadeh@avo...",,,"Dear friends,\r\n\r\n\r\nTo expedite the expe...",,,pos
3,<9640A23C9075D411B5CB00D0B708160C23A93D@COFFEE>,EX,Meshele Ko,,'jbmcbryde@fedex.com',,Fedex,Fedex,2000-09-12T17:01:08Z,2000-09-12T17:01:08Z,text/162/162-000005-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",,,"Hello Jim,\r\n\r\nI am following up on a piece...",,,pos
6,<19B3B310D020D311B57E00105A9A55245BE600@COFFEE>,EX,Meshele Ko,,'eileenw@getsmart.com',,RE: GetSmart.com,GetSmart.com,2000-05-13T01:00:03Z,2000-05-13T01:00:03Z,text/162/162-000009-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",<35153F4F88CFD311B33300104B95BC493DD197@arabia...,,"Eileen,\r\n\r\nAvocadoIT is a vendor that take...",,,pos
10,<19B3B310D020D311B57E00105A9A5524557278@COFFEE>,EX,Debbie Margulies,Meshele Ko,All Employees,,Photo touchup,Photo touchup,2000-04-04T04:57:20Z,2000-04-04T04:57:30Z,text/162/162-000017-EM.txt,"From: ""Debbie Margulies"" <dmargulies@avocadoit...",,,does anyone know how to touch up a photo in ph...,,,pos


# SpaCy Stuff

In [44]:
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 10000000  # Increase as needed

In [46]:
df_pos['target_sentence'] = ''
keywords = {"please", "kindly", "can you", "could you", "would you", "would it", "i need", "i want", "send", "provide", "do this", "let me know"}

def find_target_sentence(text):
    doc = nlp(text.lower())

    # Split into sentences
    sentences = list(doc.sents)
    for sent in sentences:
        words = set(str(sent).split())
        if not keywords.isdisjoint(words):
            return sent

# Apply the function to update the heuristics column
df_pos['target_sentence'] = df_pos['extracted_text'].progress_apply(find_target_sentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pos['target_sentence'] = ''
100%|██████████| 448790/448790 [8:39:02<00:00, 14.41it/s]    
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pos['target_sentence'] = df_pos['extracted_text'].progress_apply(find_target_sentence)


In [47]:
df_pos.head(10)

Unnamed: 0,messageid,sender_access,outlook_sender_name,outlook_recipient_name,sentto_address,cc_address,subject,processed_subject,sent_date,arrival_date,body_file_path,body_content,in_reply_to,Keywords,extracted_text,preceding_conversation,problematic,heuristics,target_sentence
0,<FC38A9406AA4D411AB62009027DE9DA5E6545D@HQEXCH01>,EX,Fortunata Hermoso,Meshele Ko,Meshele Ko,'ray_rahamin@hp.com'; Dan Baca; Elba Linscott,Fw: RE: AvocadoIT's CEO committed to winning B...,AvocadoIT's CEO committed to winning Banamex deal,2001-04-12T22:02:41Z,2001-04-12T22:02:41Z,text/162/162-000001-EM.txt,"From: ""Fortunata Hermoso"" <fortunata.hermoso@a...",,,Fyi on Venks commitment on Banamex. The next s...,,,pos,"(please, use, me, to, escalate, any, support, ..."
1,<19B3B310D020D311B57E00105A9A55241D5975@COFFEE>,EX,Paiman Komeilizadeh,Meshele Ko,All Employees,,Expense reimbursement request- Amendment,Expense reimbursement request- Amendment,2000-04-24T23:06:06Z,2000-04-24T23:06:15Z,text/162/162-000003-EM.txt,"From: ""Paiman Komeilizadeh"" <pkomeilizadeh@avo...",,,"Dear friends,\r\n\r\n\r\nTo expedite the expe...",,,pos,"(2-, \t, please, make, a, copy, of, your, expe..."
3,<9640A23C9075D411B5CB00D0B708160C23A93D@COFFEE>,EX,Meshele Ko,,'jbmcbryde@fedex.com',,Fedex,Fedex,2000-09-12T17:01:08Z,2000-09-12T17:01:08Z,text/162/162-000005-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",,,"Hello Jim,\r\n\r\nI am following up on a piece...",,,pos,"(please, contact, me, with, any, questions, or..."
6,<19B3B310D020D311B57E00105A9A55245BE600@COFFEE>,EX,Meshele Ko,,'eileenw@getsmart.com',,RE: GetSmart.com,GetSmart.com,2000-05-13T01:00:03Z,2000-05-13T01:00:03Z,text/162/162-000009-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",<35153F4F88CFD311B33300104B95BC493DD197@arabia...,,"Eileen,\r\n\r\nAvocadoIT is a vendor that take...",,,pos,"(can, you, please, direct, me, to, the, right,..."
10,<19B3B310D020D311B57E00105A9A5524557278@COFFEE>,EX,Debbie Margulies,Meshele Ko,All Employees,,Photo touchup,Photo touchup,2000-04-04T04:57:20Z,2000-04-04T04:57:30Z,text/162/162-000017-EM.txt,"From: ""Debbie Margulies"" <dmargulies@avocadoit...",,,does anyone know how to touch up a photo in ph...,,,pos,"(if, so, ,, please, see, me, tuesday, morning,..."
12,<9640A23C9075D411B5CB00D0B708160C23A862@COFFEE>,EX,Meshele Ko,,'mark.ebel@bestbuy.com',,hello,hello,2000-08-25T00:16:19Z,2000-08-25T00:16:19Z,text/162/162-000019-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",,,"Mark,\r\n\r\nI did get a message from Nina?. ...",,,pos,
18,<19B3B310D020D311B57E00105A9A55240B4670@COFFEE>,EX,Meshele Ko,,Best Buy (E-mail),,Non-PC access to Web application,Wireless connection to Web application,2000-01-03T22:07:50Z,2000-01-03T22:07:50Z,text/162/162-000029-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",,,"Best Buy Chief Technology Officer,\r\n\r\nAs y...",,,pos,
21,<19B3B310D020D311B57E00105A9A55245BE715@COFFEE>,EX,Meshele Ko,,'greg@messageblaster.com',,AvocadoIT Wireless,AvocadoIT Wireless,2000-06-03T01:10:13Z,2000-06-03T01:10:13Z,text/162/162-000034-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",,,"Greg,\r\n\r\nI wanted to follow-up with your c...",,,pos,"(please, let, me, know, how, best, to, reach, ..."
26,<19B3B310D020D311B57E00105A9A55240B452C@COFFEE>,EX,Meshele Ko,,Lisa Chui; Mike Scolari; John Schemena; Dan De...,Kelsey Kerr,United Healthcare Demo,Re[2]: Demo,1999-11-15T17:50:04Z,1999-11-15T17:50:04Z,text/162/162-000040-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",,,"Dan,\r\n\r\nThe conference call has been sched...",,,pos,"(would, it, be, possible, to, send, me, a, lis..."
27,<19B3B310D020D311B57E00105A9A55240B46C0@COFFEE>,EX,Meshele Ko,,'Flavia.Spasiano@diax.ch',,Non-PC access to Web application,Wireless connection to Web application,2000-01-17T17:48:31Z,2000-01-17T17:48:31Z,text/162/162-000043-EM.txt,"From: ""Meshele Ko"" <mko@avocadoit.com>\r\nTo: ...",,,"Flavia,\r\nI appreciate your interest. I've a...",,,pos,


In [52]:
def span_to_string(span):
    if isinstance(span, spacy.tokens.Span):
        return span.text
    return str(span)  # Ensure anything else is converted to string if necessary

df_pos['target_sentence'] = df_pos['target_sentence'].apply(span_to_string)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pos['target_sentence'] = df_pos['target_sentence'].apply(span_to_string)


In [53]:
df_pos.to_parquet('avocado_target_sentences.parquet', engine='pyarrow', compression='snappy')

In [None]:
nr = 0
print(df_pos.iloc[nr]['target_sentence'])
print('----')
print(df_pos.iloc[nr]['extracted_text'])


----
Fyi on Venks commitment on Banamex. The next step is to set a conference call with Nicollo and Venk next week. EP is committed to do whatever it takes to close this accoiunt.

We have a lot of support. Let's keep the communcation going. Please use me to escalate any support needed to close this account.
--------------------------
Fortunata Hermoso
AvocadoIT, Inc.
Global Alliance Manager
Mobile Phone: 408.464.8998
email: fortunata.hermoso@avocadoit.com


In [None]:
print(df[df['heuristics'].str.len() > 0].iloc[1]['extracted_text'])

Dear  friends,


To expedite the expense report / reimbursement process and documentation, accounting department is amending the process.


The following steps are added to expense report process:

1-	All receipts must be taped flat on regular white sheet ( you may tape 2, 3 or more to one sheet -depending on length and size of your receipts), prior to filing your expense reimbursement request with accounts payable.

2-	Please make a copy of your expense report and its documentation for your record, prior to filing your expense reimbursement request with accounts payable.

3-  	Due to the volume of invoices and vouchers we are receiving on daily basis, any expense reports which have not been prepared correctly may be returned to requester for correction. This will delay the reimbursement.

4-	All expense reports should be filed on timely basis. No actual expenditures should be more than 15 days late.

5- 	On every month end, if you have not yet prepared your expense reimbursement reque

In [None]:
df.shape

(937958, 18)