In [30]:
import pandas as pd
from collections import Counter
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm
tqdm.pandas()


# Load spaCy's large English pipeline once for the whole notebook. The cells
# below rely on its part-of-speech tags, lemmas and dependency labels
# (see filter_action_object). The "en_core_web_lg" model package must be
# installed in the kernel's environment for this call to succeed.
nlp = spacy.load("en_core_web_lg")

In [9]:
# Load the pre-processed sentence data (the path is relative to the notebook's
# working directory). Later cells read a `sentences` column and iterate over
# each entry as a collection of sentence strings, with None for missing rows.
df = pd.read_parquet('../../data/processed/avocado_train_individual_sentences.parquet')

### Create list of words for filtering requests

In [11]:
# Lowercase keywords and phrases that mark a sentence as a potential request.
# filter_sentences keeps a sentence when its lowercased text CONTAINS any entry
# as a substring, which has two consequences worth knowing when editing this list:
#   * short entries also match inside longer words ('how' matches "show",
#     'sent' matches "present", 'call' matches "locally");
#   * some entries are subsumed by others ('send me' by 'send', 'email' by 'mail').
intent_words = [
    'dinner', 'lunch', 'breakfast', 'meeting', 'appointment', 'reminder', 'review', 'send me', 
    'need', 'how', 'schedule', 'please', 'send', 'sent', 'join', 'make sure', 
    'discuss', 'email', 'attend', 'call', 'provide', 'help', 'are there', 'are you', 'available', 
    'can i', 'can you', 'can we', 'can he', 'can she', 'can they', 'could you', 'could we', 'could i', 
    'did you', 'did i', 'did we', 'did he', 'did she', 'did they', 'do you', 'do they', 'does he', 'does she', 'do we',
    'do not', "don't", 'want', 'does that', 'does this', 'give', 'go ahead',
    'have you', 'have there', 'mail', 'is it', 'possible', 
]

In [None]:
def filter_sentences(sentences, intent_words):
    """Keep the sentences that mention at least one intent keyword.

    Matching is a case-insensitive *substring* test, so a keyword such as
    ``'how'`` also matches inside longer words (e.g. "show").

    Args:
        sentences: Iterable of sentence strings, or ``None``. Entries that are
            not strings (e.g. ``None``) are skipped.
        intent_words: Iterable of keywords/phrases to look for.

    Returns:
        list: The matching sentences, in their original order and with their
        original casing. Empty when ``sentences`` is ``None``.
    """
    if sentences is None:
        return []  # Missing rows yield no targets.

    # Normalise the keywords once per call so mixed-case keywords still match.
    keywords = [keyword.lower() for keyword in intent_words]

    matches = []
    for sentence in sentences:
        if not isinstance(sentence, str):
            continue  # Skip None/NaN entries instead of raising AttributeError.
        lowered = sentence.lower()  # Lowercase once, not once per keyword.
        if any(keyword in lowered for keyword in keywords):
            matches.append(sentence)
    return matches

# Keep, per row, only the sentences containing an intent keyword. Keyword
# arguments given to progress_apply are forwarded to the applied function.
df['targets'] = df['sentences'].progress_apply(filter_sentences, intent_words=intent_words)


In [None]:
# Preview the first rows to spot-check the new `targets` column.
df.head()

### EDA Action-Object Pairs

In [None]:
def filter_action_object(targets):
    """Extract ``<verb lemma>_<object text>`` pairs from a list of sentences.

    Each sentence is parsed with the module-level spaCy pipeline ``nlp``. For
    every token tagged ``VERB``, each direct child whose dependency label is
    ``dobj`` or ``pobj`` and whose text is purely alphabetic yields one pair:
    the verb's lemma and the child's text as written, joined by ``_``.

    Args:
        targets: Iterable of sentence strings, or ``None``.

    Returns:
        list[str]: Pairs in sentence order, then token order; duplicates are
        kept. Empty when ``targets`` is ``None`` or contains no sentences.
    """
    if targets is None:
        return []  # Same convention as filter_sentences for missing rows.

    pairs = []
    # nlp.pipe batches the sentences through the pipeline instead of invoking
    # the pipeline separately for each one.
    for doc in nlp.pipe(targets):
        for token in doc:
            # A single-token POS pattern needs no Matcher: testing pos_ directly
            # selects exactly the same tokens, in the same left-to-right order.
            if token.pos_ != "VERB":
                continue
            for child in token.children:
                # Check for direct or prepositional object that is alphabetic.
                # NOTE(review): in spaCy's English models a `pobj` normally hangs
                # off a preposition (`prep`), not directly off the verb, so the
                # "pobj" case probably rarely fires here - confirm whether
                # prepositional objects should be reached via `prep` children.
                if child.dep_ in ("dobj", "pobj") and child.is_alpha:
                    pairs.append(token.lemma_ + "_" + child.text)
    return pairs

# Extract the verb/object pairs from every row's filtered sentences.
df['action_object_pairs'] = df['targets'].progress_apply(filter_action_object)

In [39]:
# Flatten the per-row pair lists into one list of pair strings.
all_words = [pair for row_pairs in df['action_object_pairs'] for pair in row_pairs]

# Tally how many times each pair occurs across the whole dataset.
word_counts = Counter(all_words)

# Tabulate the tallies, most frequent pair first.
distinct_words_df = (
    pd.DataFrame(word_counts.items(), columns=['Word', 'Count'])
    .sort_values(by='Count', ascending=False)
)

In [None]:
# Show the 20 most frequent action-object pairs (the frame is sorted by Count, descending).
distinct_words_df.head(20)