Setup

In [None]:
import pandas as pd
from textblob import TextBlob
from tqdm import tqdm
import os


Load data

In [None]:
# Read the raw messages from disk.
raw_csv_path = '../data/raw/test.csv'
df = pd.read_csv(raw_csv_path)

# Show the fraction of missing values per column, most-missing first.
missing_fraction = df.isna().mean()
print(missing_fraction.sort_values(ascending=False))

# Replace NaNs with empty strings so later string operations see '' instead of NaN.
df = df.fillna('')

Combine 'body' and 'Subject' into a single 'text' column (use the body; fall back to the subject when the body is blank)


In [None]:
# Build the 'text' column: use the message body, falling back to the subject
# when the body is empty or whitespace-only.
# NOTE(review): the two column names differ in case ('body' vs 'Subject');
# they are kept exactly as written -- confirm against the CSV header.
#
# This is done with vectorised Series operations instead of a row-wise
# df.apply(axis=1): it is faster, it still yields a Series when the frame is
# empty, and astype(str) keeps .str.strip() from failing on non-string cells.
# (NaNs are expected to have been replaced with '' already, so astype(str)
# does not produce literal 'nan' strings.)
body_text = df['body'].astype(str)
subject_text = df['Subject'].astype(str)
has_body = body_text.str.strip() != ''
df['text'] = body_text.where(has_body, subject_text)

# Drop rows with no usable text. The explicit .copy() makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df[df['text'].str.strip() != ''].copy()

Label sentiment

In [None]:
def classify_sentiment(text, threshold=0.1):
    """Label *text* as 'Positive', 'Negative', or 'Neutral'.

    The label is derived from the TextBlob polarity score of *text*.

    Args:
        text: The message text to score.
        threshold: Non-negative half-width of the neutral band around zero.
            Polarity strictly above ``threshold`` is 'Positive', strictly
            below ``-threshold`` is 'Negative', anything else is 'Neutral'.
            Defaults to 0.1.

    Returns:
        One of the strings 'Positive', 'Negative', or 'Neutral'.

    Raises:
        ValueError: If *threshold* is negative (the bands would overlap).
    """
    # Validate first so a bad threshold fails fast, before any scoring work.
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")

    polarity = TextBlob(text).sentiment.polarity
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

# Register tqdm's progress_apply on pandas objects, then label every message
# while showing a progress bar.
tqdm.pandas()
sentiment_labels = df['text'].progress_apply(classify_sentiment)
df['Sentiment'] = sentiment_labels

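### Sentiment Threshold Justification
This project uses TextBlob to assign sentiment labels based on polarity scores, which range from -1.0 (most negative) to 1.0 (most positive). A band of ±0.1 around zero is treated as Neutral so that weakly polarized messages are not forced into a Positive or Negative label:
- Polarity > 0.1 → Positive  
- Polarity < -0.1 → Negative  
- Otherwise (-0.1 ≤ polarity ≤ 0.1) → Neutral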

Save labeled data

In [None]:
# Make sure the output directory exists, then write the labeled messages
# without the DataFrame index column.
processed_dir = '../data/processed'
labeled_csv_path = '../data/processed/labeled_messages.csv'
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(labeled_csv_path, index=False)

Summary

In [None]:
# Count how many messages received each sentiment label.
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)