# Developing Heuristics
The approach tries to extract target sentences, i.e. sentences that require some sort of action. Multi-intent cases are not considered.

In [1]:
from pathlib import Path

import pandas as pd
import spacy
from tqdm import tqdm

# Settings for the spaCy pipeline used for sentence splitting
SPACY_MODEL = "en_core_web_lg"
MAX_TEXT_LENGTH = 5_000_000  # raise further if texts still exceed the limit

# Register tqdm with pandas so that progress_apply shows a progress bar
tqdm.pandas()

# Load the pipeline and lift its max_length so larger texts are accepted
nlp = spacy.load(SPACY_MODEL)
nlp.max_length = MAX_TEXT_LENGTH

In [2]:
# Read the training data; later cells use its 'extracted_text' and 'messageid' columns.
train_path = '../../data/avocado_train.parquet'
df = pd.read_parquet(train_path)

In [3]:
def split_sentences(text, message_id):
    """Split one message text into stripped sentence strings using spaCy.

    Args:
        text: The text to segment. Anything that is not a ``str`` (for
            example ``None`` or ``NaN`` for a missing body) is logged and
            skipped instead of being passed to the pipeline.
        message_id: Identifier of the message; only used in the log output.

    Returns:
        A list containing the whitespace-stripped text of every sentence
        found by the module-level ``nlp`` pipeline, or ``None`` when the
        text was skipped or the pipeline raised a ``ValueError``.
    """
    # Guard against missing / non-text values up front: depending on the
    # spaCy version these raise TypeError (not caught below), which would
    # abort a long-running apply over the whole frame.
    if not isinstance(text, str):
        print(f"Message ID {message_id}: Error - expected text as str, got {type(text).__name__}")
        return None

    try:
        # Run the pipeline and collect the sentences; both steps stay inside
        # the try block so any ValueError they raise is reported per message.
        doc = nlp(text)
        return [sent.text.strip() for sent in doc.sents]
    except ValueError as e:
        # Report which message failed; texts longer than nlp.max_length get
        # a dedicated message, every other ValueError is printed verbatim.
        if "exceeds maximum of" in str(e):
            print(f"Message ID {message_id}: Error processing text due to exceeding length.")
        else:
            print(f"Message ID {message_id}: Error - {e}")
        return None  # None marks a message that could not be split

In [4]:
# Split every message into sentences, row by row, with a progress bar.
# The message ID is passed along so failures can be attributed in the log.
def _row_to_sentences(row):
    return split_sentences(row['extracted_text'], row['messageid'])

df['sentences'] = df.progress_apply(_row_to_sentences, axis=1)


 57%|█████▋    | 288642/503917 [2:43:14<1:47:59, 33.22it/s]  

Message ID <19B3B310D020D311B57E00105A9A55246E0C67@COFFEE>: Error processing text due to exceeding length.


 65%|██████▍   | 326311/503917 [3:03:23<1:54:48, 25.78it/s] 

Message ID <200012150128.eBF1SkD09210@mailer.avocadoit.com>: Error processing text due to exceeding length.


100%|██████████| 503917/503917 [4:44:54<00:00, 29.48it/s]   


In [5]:
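# Disabled step: would explode 'sentences' into one row per sentence (column renamed to 'sentence').
#df_targets = df.explode('sentences').rename(columns={'sentences': 'sentence'}).reset_index(drop=True)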

In [6]:
# Persist the frame to the processed-data folder.
# The folder is created first so that a missing directory cannot make the
# write fail after the long sentence-splitting run.
# NOTE(review): `df` is written as-is (the explode step is commented out), so
# 'sentences' holds a list per row despite the "individual_sentences" file name — confirm this is intended.
output_path = Path('../../data/processed/avocado_train_individual_sentences.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)