In [1]:
import spacy
import pandas as pd
from tqdm import tqdm

# Register tqdm's progress-aware methods (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()
# NOTE(review): `Matcher` is not referenced in any of the visible cells — confirm
# whether this import is still needed.
from spacy.matcher import Matcher
# Load the `en_core_web_sm` spaCy pipeline once; `nlp` is read as a global by
# `extract_action_object_pairs` below.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Load the training split from parquet using the pyarrow engine.
df = pd.read_parquet(
    "../../data/avocado_train.parquet",
    engine="pyarrow",
)

In [3]:
def extract_action_object_pairs(text):
    """Extract verb/object relations from ``text`` via the spaCy dependency parse.

    Parameters
    ----------
    text : str
        Raw text to parse with the module-level ``nlp`` pipeline. Missing
        values (``None``, NaN), other non-string values, and blank strings
        are accepted and produce an empty result instead of an error.

    Returns
    -------
    tuple of (list, list, list)
        ``(action_object_pairs, unconnected_dobjs, unconnected_verbs)``:

        * ``action_object_pairs`` -- ``(verb_text, object_text)`` for every
          child of a ``VERB`` token whose dependency label is ``dobj``,
          ``obj`` or ``pobj``.
        * ``unconnected_dobjs`` -- text of ``NOUN`` tokens labelled ``dobj``
          whose head is not tagged ``VERB``.
        * ``unconnected_verbs`` -- text of ``VERB`` tokens that have no
          object-like child.
    """
    # Guard against null / non-string cells: a single bad row would otherwise
    # abort a long-running `progress_apply` over the whole frame. Blank text
    # has nothing to extract, so skip the parser for it as well.
    if not isinstance(text, str) or not text.strip():
        return [], [], []

    doc = nlp(text)
    action_object_pairs = []  # (action, object) pairs
    unconnected_dobjs = []    # dobj nouns whose head is not a verb
    unconnected_verbs = []    # verbs without any object-like child

    for token in doc:
        # Case 1: verb with one or more object-like children.
        if token.pos_ == "VERB":
            objects = [
                child
                for child in token.children
                if child.dep_ in ("dobj", "obj", "pobj")
            ]
            for obj in objects:
                action_object_pairs.append((token.text, obj.text))

            # Case 2: verb with no object-like children.
            if not objects:
                unconnected_verbs.append(token.text)

        # Case 3: a dobj noun whose syntactic head is not tagged as a verb.
        if token.dep_ == "dobj" and token.pos_ == "NOUN":
            if token.head.pos_ != "VERB":
                unconnected_dobjs.append(token.text)

    return action_object_pairs, unconnected_dobjs, unconnected_verbs

In [4]:
# Run the extractor on every row of the `text` column, with a tqdm progress bar.
# NOTE(review): each cell receives the full 3-tuple returned by
# `extract_action_object_pairs` (pairs, unconnected dobjs, unconnected verbs),
# not only the pairs the column name suggests — confirm this is intended.
df["action_object_pairs"] = df["text"].progress_apply(extract_action_object_pairs)

100%|██████████| 352123/352123 [1:46:41<00:00, 55.00it/s]  


In [6]:
# Keep a real snapshot of the frame before the column is rewritten below.
# A plain `df_backup = df` only binds a second name to the same DataFrame, so
# later in-place column assignments on `df` would show up in the "backup" too.
df_backup = df.copy()

In [9]:
def normalize_action_object_pairs(value):
    """Coerce one cell of the ``action_object_pairs`` column to a list.

    Rules, applied in order:

    * ``list`` containing at least one nested ``list``: nested lists are
      expanded one level; items that are not lists (e.g. ``(verb, obj)``
      tuples or scalars) are kept intact in their original position.
    * any other ``list``: returned unchanged (same object).
    * ``None`` or a scalar missing value (NaN, ``pd.NA``, ``NaT``): ``[]``.
    * anything else (tuple, array, string, number, ...): wrapped in a
      one-element list.

    Parameters
    ----------
    value : object
        The raw cell value.

    Returns
    -------
    list
        The normalized list.
    """
    if isinstance(value, list):
        if not any(isinstance(item, list) for item in value):
            return value
        flattened = []
        for item in value:
            # Only expand real lists; iterating a tuple pair here would split
            # it into loose strings, and iterating a scalar would raise.
            if isinstance(item, list):
                flattened.extend(item)
            else:
                flattened.append(item)
        return flattened

    # `pd.isna` returns an array for array-like input, which cannot be used as
    # a condition, so only test scalars for missingness.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return []

    return [value]

# Overwrite the column in place with its normalized form.
# NOTE(review): the cells produced by the extractor are tuples, not lists, so
# they reach the normalizer's final branch and are wrapped in a one-element
# list (visible in the displayed output) — confirm that shape is what
# downstream consumers expect.
df['action_object_pairs'] = df['action_object_pairs'].apply(normalize_action_object_pairs)


In [11]:
# Inspect the normalized column; pandas truncates the repr of a long Series.
df['action_object_pairs']

547880    [([('watch', 'demo'), ('used', 'events'), ('up...
507647    [([('adding', 'support'), ('using', 'crypto')]...
390433    [([('contacted', 'me'), ('setting', 'meeting')...
244808    [([('see', 'announcement'), ('editing', 'copy'...
532146    [([('sign', 'form'), ('have', 'questions')], [...
                                ...                        
110268    [([('has', 'list'), ('has', 'Darshan'), ('iden...
259178          [([('reach', 'me')], [], [Look, speaking])]
365838    [([('download', 'file'), ('provide', 'interfac...
131932    [([('do', 'what'), ('assist', 'you')], [], [le...
121958                                  [([], [], [Added])]
Name: action_object_pairs, Length: 352123, dtype: object

In [12]:
# Persist the frame, including the new `action_object_pairs` column.
# NOTE(review): the input was read with the pyarrow engine but the output is
# written with fastparquet — confirm the engine mix is intentional, since the
# two may serialize the nested object column differently.
df.to_parquet(
    "../../data/avocado_train_action_object_pairs.parquet",
    engine="fastparquet",
)