# First experiments for intent extraction using heuristics

In [44]:
import pandas as pd
import os

In [35]:
# Load the data set from the parquet file into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_parquet('../../data/avocado_train.parquet')

In [36]:
# Draw 1000 random rows to label by hand; the fixed random_state keeps the
# sample (and therefore the positional `entry` indices used below) reproducible.
df_sample = df.sample(n=1000,random_state=42)

# Manual Labeling

### Inspecting entries to subsequently add labels to them

In [88]:
csv_file_path = '../../data/manual_labels.csv'

def add_entry(message_id, label_finegrain, label_coarse, number_intents, target_sentences, auxiliary_information, requires_action, csv_path=None):
    """Insert or update the manual labels of one message in the labels CSV.

    If the CSV already contains a row whose ``message_id`` equals the given
    one, that row's label columns are overwritten; otherwise a new row is
    appended. The whole file is rewritten on every call.

    Parameters
    ----------
    message_id : identifier of the labelled message (compared against the
        ``message_id`` column, which is read back from CSV as strings).
    label_finegrain, label_coarse, target_sentences, auxiliary_information,
    requires_action : per-intent label values; list values are stored in a
        single cell and end up in the CSV as their ``str()`` representation.
    number_intents : number of intents labelled for the message.
    csv_path : optional path of the CSV file to update. Defaults to the
        module-level ``csv_file_path``.
    """
    path = csv_file_path if csv_path is None else csv_path

    record = {
        "message_id": message_id,
        "label_finegrain": label_finegrain,
        "label_coarse": label_coarse,
        "number_intents": number_intents,
        "target_sentences": target_sentences,
        "auxiliary_information": auxiliary_information,
        "requires_action": requires_action,
    }

    if os.path.exists(path):
        # dtype=object keeps every cell exactly as written (no type inference).
        labels = pd.read_csv(path, dtype=object)
    else:
        labels = pd.DataFrame(columns=list(record), dtype=object)

    is_match = (labels['message_id'] == message_id).to_numpy()
    if is_match.any():
        # Write cell by cell with .at: the values are lists of varying length,
        # and a single multi-column .loc assignment would try to broadcast
        # them as an array instead of storing each list in one cell.
        for row_label in labels.index[is_match]:
            for column, value in record.items():
                if column != "message_id":
                    labels.at[row_label, column] = value
    else:
        new_entry = pd.DataFrame([record], columns=list(record))
        # Skip the concat for an empty frame; concatenating empty frames is
        # deprecated in recent pandas and adds nothing here.
        if labels.empty:
            labels = new_entry
        else:
            labels = pd.concat([labels, new_entry], ignore_index=True)

    # Save the updated DataFrame back to the CSV file
    labels.to_csv(path, index=False)

    print("Data saved successfully.")

In [125]:
def check_whether_lengths_match(label_finegrain, label_coarse, target_sentences, auxiliary_information, requires_action):
    """Verify that every per-intent list has one element per target sentence.

    The number of intents is taken to be ``len(target_sentences)``. Each of
    the other lists is compared against it, and an ``Exception`` naming the
    first list that differs is raised. Returns ``None`` when all lengths agree.
    """
    expected = len(target_sentences)

    # Evaluated top to bottom; only the first mismatch is reported.
    checks = (
        (requires_action, "Lengths of required actions and number of intents not matching!"),
        (label_coarse, "Lengths of coarsegrained intents and number of intents not matching!"),
        (label_finegrain, "Lengths of finegrained intents and number of intents not matching!"),
        (auxiliary_information, "Lengths of aux. information and number of intents not matching!"),
    )
    for values, message in checks:
        if len(values) != expected:
            raise Exception(message)

In [160]:
# Positional index (into df_sample) of the message to inspect; the labelling
# cell below uses the same `entry` to decide which row receives the labels.
entry = 37
# Print the message's extracted text so it can be read before labelling.
print(df_sample.iloc[entry]['extracted_text'])

Hi Dondi,
In was wondering if you'd be interested in working on the feature spec template again to get it into better shape?
I'm planning on bringing the issue up with Steve that we were in the process of developing the spec and never got a chance to complete it. Let me know if you still have an interest.

How's your day going?

Ruth


 

Ruth Gantly
Manager, Information Products
2211 North First Street
Suite 200
San Jose, CA 95131

email ruth.gantly@avocadoit.com
Phone: 408.562.8033
Fax: 408.562.8100


In [162]:
# Manual labels for the message selected by `entry`.
# All five lists are parallel: position i describes the i-th target sentence.
target_sentences = [
    "In was wondering if you'd be interested in working on the feature spec template again to get it into better shape?",
    "I'm planning on bringing the issue up with Steve that we were in the process of developing the spec and never got a chance to complete it.",
    "Let me know if you still have an interest.",
    "How's your day going?",
]
label_finegrain = ['request_work', 'inform_clarification', 'request_reply', 'ask_wellbeing']
label_coarse = ['request', 'inform', 'request', 'request']
auxiliary_information = ['feature spec template', 'Steve', '', '']
requires_action = [True, False, True, True]

# Fail early if the parallel lists are out of sync.
check_whether_lengths_match(label_finegrain, label_coarse, target_sentences, auxiliary_information, requires_action)
number_intents = len(target_sentences)

# Column name -> value, in the order the columns are written to df_sample.
entry_labels = {
    'label_finegrain': label_finegrain,
    'label_coarse': label_coarse,
    'number_intents': number_intents,
    'target_sentences': target_sentences,
    'auxiliary_information': auxiliary_information,
    'requires_action': requires_action,
}

# Store the labels on the sampled row ...
entry_row = df_sample.index[entry]
for label_column, label_value in entry_labels.items():
    df_sample.at[entry_row, label_column] = label_value

# ... and persist them to the labels CSV, keyed by the message id.
add_entry(message_id=df_sample.iloc[entry]['messageid'], **entry_labels)

Data saved successfully.


In [41]:
# Preview the first rows of the sample, including the label columns that the
# manual labelling cell adds to df_sample.
df_sample.head()

Unnamed: 0,messageid,sender_access,outlook_sender_name,outlook_recipient_name,sentto_address,cc_address,subject,processed_subject,sent_date,arrival_date,...,body_content,in_reply_to,Keywords,extracted_text,preceding_conversation,label_finegrain,label_coarse,number_intents,target_sentences,requires_action
469373,<FC38A9406AA4D411AB62009027DE9DA5013DC38C@HQEX...,EX,Don Giesen,Dan Baca,Anthony Tarsia; Peter Smialek,Dan Baca,FW: Carey Limo and MobileSys,Carey Limo and MobileSys,2001-05-02T23:27:12Z,2001-05-02T23:27:01Z,...,"From: ""Don Giesen"" <don.giesen@avocadoit.com>\...",,,Guys:\r\n\r\nIf Carey is comfortable with our ...,,"[request_opinion, request_commitment]","[request, request]",2.0,"[How do they fell about us?, Can we diplomatic...",True
622663,<BD17BD4D5123D31186BF00105A9C9FBC14423E@donald>,EX,Marcia Kadanoff,,Marcos Sanchez,,Richard's schedule - FYI,Richard's schedule - FYI,1999-10-11T16:20:23Z,1999-10-11T16:20:23Z,...,"From: ""Marcia Kadanoff"" <marciak@avocadoit.com...",,,"My current schedule is this: I go ""official"", ...",,,,,,
275534,<-1535613024.999603664513.JavaMail.sreddy@sreddy>,SMTP,MonitorEtrade@avocadoit.com,DA,MonitorEtrade@avocadoit.com,,App Failure: appid=testetraderim EPXML=logoff....,App Failure: appid=testetraderim EPXML=logoff....,2001-09-04T11:41:04Z,2001-09-04T11:36:20Z,...,Received: from sreddy (10.7.10.74 [10.7.10.74]...,,,Script: session_3000020\r\nAppid: testetraderi...,,,,,,
892295,<FC38A9406AA4D411AB62009027DE9DA5CB2697@ntmach...,EX,Rajeev Mohindra,Om Sonie,Howard Mora; Ravi Pachipala; Nihar Mehta; Srik...,,Branching diagram for codeline.,code names for releases,2001-06-06T02:19:06Z,2001-06-06T02:18:51Z,...,"From: ""Rajeev Mohindra"" <rajeev@avocadoit.com>...",<FC38A9406AA4D411AB62009027DE9DA501162802@ntma...,,"Om,\r\n\r\nWe need to create a branch for 3.3....",,,,,,
231442,<FC38A9406AA4D411AB62009027DE9DA5B86803@ntmach...,EX,Dondi Gaskill,Dondi Gaskill,EMDS,,app tree minutes,app tree minutes,2001-08-17T21:15:12Z,2001-08-17T21:15:13Z,...,"From: ""Dondi Gaskill"" <dondi.gaskill@avocadoit...",,,The user may rename the root of a screenset or...,,,,,,


# Automatic Labeling