# Developing Heuristics
This approach tries to extract target sentences that require some sort of action. Multi-intents are not considered.

In [1]:
import pandas as pd
import spacy
from tqdm import tqdm

# Load the large English spaCy pipeline; split_into_sentences below uses it
nlp = spacy.load("en_core_web_lg")

# Raise spaCy's maximum allowed text length so long texts can be processed
nlp.max_length = 10_000_000  # adjust to the size of the longest text in the data


# Register tqdm's pandas integration so that progress_apply is available
tqdm.pandas()

In [2]:
# Load avocado_train.parquet into a DataFrame; later cells read its
# 'extracted_text' column
df = pd.read_parquet('../../data/avocado_train.parquet')

In [6]:
def split_into_sentences(text):
    """Split a text into sentences using the module-level spaCy pipeline.

    Args:
        text: The raw text to segment. Any value that is not a ``str``
            (for example the ``None``/``NaN`` pandas uses for a missing
            cell) is treated as "no text".

    Returns:
        A list with the whitespace-stripped text of every sentence spaCy
        detects, in document order. Non-string input yields an empty list.
    """
    # Missing cells arrive as None/NaN rather than str. The spaCy pipeline
    # expects a string, so a single missing value would otherwise raise and
    # abort the whole column-wide apply.
    if not isinstance(text, str):
        return []

    doc = nlp(text)  # Run the spaCy pipeline to obtain sentence boundaries
    return [sentence.text.strip() for sentence in doc.sents]

In [None]:
# Split every row's 'extracted_text' into a list of sentences and store the
# result in a new 'sentences' column. progress_apply comes from tqdm.pandas()
# and is used here in place of apply to show a progress bar.
df['sentences'] = df['extracted_text'].progress_apply(split_into_sentences)

In [None]:
# Reshape to one row per sentence: expand each list in 'sentences' into
# separate rows, rename the column to the singular 'sentence', and replace
# the duplicated index produced by explode with a fresh 0..n-1 index.
df_targets = (
    df.explode('sentences')
    .rename(columns={'sentences': 'sentence'})
    .reset_index(drop=True)
)

In [None]:
# Persist the per-sentence DataFrame as Parquet.
# NOTE(review): assumes the ../../data/processed/ directory already exists — confirm
df_targets.to_parquet('../../data/processed/avocado_train_targets.parquet')