In [26]:
import pandas as pd
import re
import contractions
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from joblib import Parallel, delayed


# Preprocess text

In [29]:

# One-time setup: uncomment on a fresh environment to fetch the NLTK data that
# word_tokenize and the stopword list rely on.
# nltk.download(['punkt', 'punkt_tab', 'stopwords', 'wordnet'])

# Load the small English spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Preprocessing function (must be standalone and picklable)
# Cache for the stopword set so it is built once instead of on every call.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a set, building it on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = set(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def _notebook_pipeline():
    """Return the spaCy pipeline bound to the notebook-level name ``nlp``."""
    # Kept in its own function because ``preprocess_text`` has a parameter named
    # ``nlp`` that hides the global of the same name inside its body.
    return nlp


def preprocess_text(text, nlp=None):
    """Clean one raw text value into a space-joined string of lemmas.

    Steps: lower-case, expand contractions, strip ``@mentions`` and ``http``
    links, collapse whitespace, tokenize, drop stopwords and one-character
    tokens, then lemmatize with spaCy.

    Args:
        text: Raw text. Any value for which ``pd.isna`` is true (None, NaN, ...)
            yields an empty string.
        nlp: Callable spaCy pipeline used for lemmatization. Optional; when
            omitted, the notebook-level ``nlp`` pipeline is used, so
            single-argument calls such as ``preprocess_text(some_text)`` work.

    Returns:
        str: The lemmas joined by single spaces, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""

    if nlp is None:
        nlp = _notebook_pipeline()

    # Normalization
    text = str(text).lower()
    text = contractions.fix(text)
    text = re.sub(r'@\w+|http\S+', '', text)

    # Cleaning: collapse whitespace runs and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization and filtering (the len > 1 check also drops
    # single-character punctuation tokens)
    stop_words = _english_stopwords()
    tokens = [
        token for token in word_tokenize(text)
        if token not in stop_words and len(token) > 1
    ]

    # Lemmatization.
    # NOTE(review): '-PRON-' is the pronoun placeholder lemma of spaCy 2.x; on
    # 3.x this filter presumably matches nothing — confirm against the
    # installed version.
    doc = nlp(" ".join(tokens))
    lemmas = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']

    return ' '.join(lemmas)

# Read data

In [39]:
# Load the raw train / test splits from CSV (paths are relative to the notebook's
# working directory).
train_df = pd.read_csv('data/train.csv')  # Replace with your actual train file
test_df = pd.read_csv('data/test.csv')    # Replace with your actual test file


# Batch preprocessing

In [21]:
# Spot-check the cleaner on a single training post (row label 2)
preprocess_text(train_df.loc[2, 'content'])

'hello cas fair anxiety depression work lot come work become much hard last year pretty good job waitress could cope high expectation overthought everything big give feedback know well end leave shift time wrong suppose start 11 thought 1 . freak badly answer call never go back time smoke plant cope really help give coming year work drive really anyone help 27 driving lesson expensive right want give work ago feel concerned long even get hire anywhere especially licence vicious cycle need licence car get job need job get licence car haha especially age feel optimistic able get back work gap resume centrelink work though many bill want get bit ahead life know go process manufacturing course july get forklift licence soon many job available hope would something easy would find something mostly worried manage anxiety work manage overthinke managing tell thing wrong concerned make mistake point get fire anyone something similar manage ok anyone able completely transparent workplace go type

In [35]:
# Build the English stopword set once, at module level, so preprocess_text can do
# membership checks without rebuilding the set on every call.
STOPWORDS = set(stopwords.words('english'))

# Thin wrapper that owns one loaded spaCy pipeline.
# NOTE(review): the wrapper adds no locking of its own; whether a shared instance
# is safe to call from several threads depends on spaCy — confirm before relying on it.
class SpacyProcessor:
    """Holds an ``en_core_web_sm`` pipeline loaded with parser and NER disabled."""

    def __init__(self):
        """Load the pipeline and keep it on ``self.nlp``."""
        pipeline = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        self.nlp = pipeline

    def process(self, text):
        """Run ``text`` through the stored pipeline and return its result."""
        result = self.nlp(text)
        return result

# Create a single module-level instance. preprocess_text reads this global, so
# every call — including those issued from worker threads — shares the same one.
spacy_processor = SpacyProcessor()

def preprocess_text(text):
    """Turn one raw text value into a space-joined string of lemmas.

    Missing values (anything ``pd.isna`` accepts) come back as ``""``. Otherwise
    the text is lower-cased, contractions are expanded, ``@mentions`` and
    ``http`` links are removed, whitespace is collapsed, stopwords and
    one-character tokens are dropped, and the remainder is lemmatized through
    the module-level ``spacy_processor``.
    """
    if pd.isna(text):
        return ""

    # Lower-case and expand contractions, then drop @mentions and http links.
    lowered = contractions.fix(str(text).lower())
    without_links = re.sub(r'@\w+|http\S+', '', lowered)

    # Squash whitespace runs to single spaces and trim both ends.
    compact = re.sub(r'\s+', ' ', without_links).strip()

    # Keep tokens longer than one character that are not stopwords.
    kept = []
    for tok in word_tokenize(compact):
        if len(tok) > 1 and tok not in STOPWORDS:
            kept.append(tok)

    # Lemmatize the surviving tokens, skipping the '-PRON-' placeholder lemma.
    parsed = spacy_processor.process(" ".join(kept))
    lemmas = (word.lemma_ for word in parsed if word.lemma_ != '-PRON-')

    return ' '.join(lemmas)

# Read the raw train / test splits
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Replace missing content with the '[MISSING]' placeholder, in place, on both frames
for frame in (train_df, test_df):
    frame['content'] = frame['content'].fillna('[MISSING]')

# Parallel processing with thread-based backend
def parallel_preprocess(df):
    """Apply ``preprocess_text`` to every value of ``df['content']``.

    The work is dispatched through joblib with the threading backend and
    ``n_jobs=-1``; the value returned is whatever the ``Parallel`` call yields
    for the submitted tasks.
    """
    runner = Parallel(n_jobs=-1, backend='threading')
    tasks = (delayed(preprocess_text)(raw) for raw in df['content'])
    return runner(tasks)

# Clean the training frame first, then the test frame, announcing each pass
for banner, frame in (
    ("Preprocessing training data...", train_df),
    ("Preprocessing test data...", test_df),
):
    print(banner)
    frame['cleaned_text'] = parallel_preprocess(frame)


# Store preprocessed text for later reuse

In [34]:

# Write the id / cleaned_text (/ target) columns to CSV, without the index
train_export = train_df[['id', 'cleaned_text', 'target']]
train_export.to_csv('data/train_preprocessed.csv', index=False)

test_export = test_df[['id', 'cleaned_text']]
test_export.to_csv('data/test_preprocessed.csv', index=False)

print("Preprocessing complete!")

Preprocessing complete!


# Fill in missing values

In [41]:
# Identify rows whose content is missing.
# The batch-preprocessing cell overwrites NaN content with the '[MISSING]'
# placeholder in place, so a plain isna() finds nothing when the notebook runs
# top to bottom. Treat the placeholder as missing too, so the masks are correct
# whether the frames were freshly reloaded or already filled.
missing_train = train_df['content'].isna() | train_df['content'].eq('[MISSING]')
missing_test = test_df['content'].isna() | test_df['content'].eq('[MISSING]')
missing_train.sum(), missing_test.sum()

(242, 25)

In [42]:
# Reload the preprocessed frames from disk; the title fill below is applied to
# these copies rather than to train_df / test_df.
new_train_df = pd.read_csv('data/train_preprocessed.csv')
new_test_df = pd.read_csv('data/test_preprocessed.csv')

In [43]:
# Compare shapes: the boolean masks built from train_df are used to index
# new_train_df, so the two frames must have the same number of rows.
new_train_df.shape, train_df.shape

((22151, 3), (22151, 4))

In [44]:
# Fill missing content with the row's title. The title goes through the same
# preprocess_text cleaning as every other row, so 'cleaned_text' never mixes raw
# and preprocessed text (a missing title becomes an empty string).
new_train_df.loc[missing_train, 'cleaned_text'] = (
    train_df.loc[missing_train, 'title'].map(preprocess_text)
)
new_test_df.loc[missing_test, 'cleaned_text'] = (
    test_df.loc[missing_test, 'title'].map(preprocess_text)
)

In [45]:
# Write both imputed frames to CSV, without the index
for imputed, out_path in (
    (new_train_df, 'data/train_preprocessed_fill_missing.csv'),
    (new_test_df, 'data/test_preprocessed_fill_missing.csv'),
):
    imputed.to_csv(out_path, index=False)

print("Missing imputation complete!")

Missing imputation complete!
