# Data Preprocessing - 3-Class Emotion Classification

Cleaning, preprocessing, and balancing text data for 3-class emotion classification.


## Import Libraries


In [1]:
import pandas as pd
import numpy as np
import re
import os
import warnings
import json
from collections import Counter

# Suppress every warning for cleaner cell output (this hides deprecation warnings too).
warnings.filterwarnings('ignore')
# Create the output directory for processed files; no error if it already exists.
os.makedirs('../data/processed', exist_ok=True)

# Seed NumPy's global RNG for any code that draws from np.random.
np.random.seed(42)


## Load Data


In [2]:
# Read the raw train/validation/test splits from CSV files under ../data/raw.
train_df = pd.read_csv('../data/raw/train_raw.csv')
val_df = pd.read_csv('../data/raw/val_raw.csv')
test_df = pd.read_csv('../data/raw/test_raw.csv')

# Report the (rows, columns) shape of each split as a quick sanity check on the load.
print(f"Train: {train_df.shape} | Val: {val_df.shape} | Test: {test_df.shape}")


Train: (43410, 2) | Val: (5426, 2) | Test: (5427, 2)


## Sample Raw Data


In [3]:
# Show the first rows of the raw training split to inspect its columns.
train_df.head()


Unnamed: 0,text,emotion
0,My favourite food is anything I didn't have to...,neutral
1,"Now if he does off himself, everyone will thin...",neutral
2,WHY THE FUCK IS BAYLESS ISOING,anger
3,To make her feel threatened,fear
4,Dirty Southern Wankers,anger


## Preprocessing Function


In [4]:
# One character class covering every emoji/pictograph range this notebook strips.
# Compiled once here instead of re-building 14 patterns on every call.
#
# The previous list contained the range U+24C2..U+1F251, which spans most of the
# Basic Multilingual Plane and therefore deleted CJK, Hangul, kana, Latin ligatures,
# fullwidth letters, etc. Only the single code point U+24C2 is kept from it; the
# enclosed-ideograph code points near U+1F251 are already covered by the
# U+1F018..U+1F270 range, and any remaining non-word symbol is still removed by the
# character whitelist applied later in preprocess_text.
_EMOJI_RE = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs (includes skin-tone modifiers)
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # regional indicators (flags)
    '\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
    '\U0001FA00-\U0001FA6F'
    '\U0001FA70-\U0001FAFF'
    '\U00002600-\U000026FF'  # miscellaneous symbols
    '\U00002700-\U000027BF'  # dingbats
    '\U000024C2'             # circled M
    '\U0001F018-\U0001F270'  # tiles, cards, enclosed alphanumerics/ideographs
    '\U0001F700-\U0001F77F'
    '\U0001F780-\U0001F7FF'
    '\U0001F800-\U0001F8FF'
    '\U0000200D'             # zero-width joiner
    '\U0000FE00-\U0000FE0F'  # variation selectors
    ']'
)


def preprocess_text(text):
    """Normalize one raw text sample for emotion classification.

    Steps, in order: lowercase; remove URLs, HTML-like tags, emoji, bracketed
    spans (e.g. ``[NAME]``), e-mail addresses, @mentions and subreddit
    references; unwrap hashtags to their word; drop every character that is not
    a word character, whitespace or basic punctuation; collapse repeated
    ``!``/``?``/``.`` and dashes; collapse whitespace; finally strip non-word
    characters from both ends.

    Args:
        text: The raw text. Any value is accepted; it is converted with ``str``.

    Returns:
        The cleaned, lowercased string, or ``""`` when ``text`` is missing
        according to ``pd.isna``.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # URLs ("https..." is already matched by the "http\S+" alternative) and tags.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)

    text = _EMOJI_RE.sub('', text)

    text = re.sub(r'\[.*?\]', '', text)      # bracketed spans such as [name]
    text = re.sub(r'\S+@\S+', '', text)      # e-mail addresses (before bare @mentions)
    text = re.sub(r'@\w+', '', text)         # @mentions
    text = re.sub(r'#(\w+)', r'\1', text)    # "#tag" -> "tag"

    # Subreddit references. The leading \b is essential: without it the pattern
    # also matches inside ordinary words ("for/against" -> "fo", "her/his" -> "he").
    text = re.sub(r'\br/\w+', '', text)

    # Whitelist: keep word characters, whitespace and basic punctuation only.
    text = re.sub(r'[^\w\s!?.,:;\-\'\"()\[\]{}]', '', text)
    text = re.sub(r'([!?.]){2,}', r'\1', text)   # "!!!" -> "!", "?!" -> "!"
    text = re.sub(r'-{2,}', '-', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Trim leading/trailing punctuation and whitespace left over from the removals.
    text = re.sub(r'^[^\w]+|[^\w]+$', '', text)

    return text


## Apply Preprocessing


In [5]:
# Add a `text_cleaned` column to every split; the raw `text` column is left as-is.
for split_df in (train_df, val_df, test_df):
    split_df['text_cleaned'] = split_df['text'].apply(preprocess_text)

print("Preprocessing complete")


Preprocessing complete


## Before/After Comparison


In [6]:
# Print the raw and cleaned text of the first five training rows side by side.
for example_no in range(1, 6):
    print(f"\nExample {example_no}:")
    row = train_df.iloc[example_no - 1]
    print(f"  Before: {row['text']}")
    print(f"  After:  {row['text_cleaned']}")



Example 1:
  Before: My favourite food is anything I didn't have to cook myself.
  After:  my favourite food is anything i didn't have to cook myself

Example 2:
  Before: Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead
  After:  now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead

Example 3:
  Before: WHY THE FUCK IS BAYLESS ISOING
  After:  why the fuck is bayless isoing

Example 4:
  Before: To make her feel threatened
  After:  to make her feel threatened

Example 5:
  Before: Dirty Southern Wankers
  After:  dirty southern wankers


## Remove Short Texts


In [7]:
# Character length of the cleaned text, computed once per split.
train_len = train_df['text_cleaned'].str.len()
val_len = val_df['text_cleaned'].str.len()
test_len = test_df['text_cleaned'].str.len()

# Count the rows whose cleaned text is shorter than 3 characters.
train_empty = int((train_len < 3).sum())
val_empty = int((val_len < 3).sum())
test_empty = int((test_len < 3).sum())

print(f"Short texts (< 3 chars): Train={train_empty}, Val={val_empty}, Test={test_empty}")

# Keep only rows with at least 3 characters and renumber the index from zero.
train_df = train_df[train_len >= 3].reset_index(drop=True)
val_df = val_df[val_len >= 3].reset_index(drop=True)
test_df = test_df[test_len >= 3].reset_index(drop=True)

print(f"After removal: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")


Short texts (< 3 chars): Train=48, Val=4, Test=4
After removal: Train=43362, Val=5422, Test=5423


## 3-Class Emotion Mapping


In [8]:
# Collapse the seven source emotions into three target classes.
mapping_7_to_3 = {
    'anger': 'negative',
    'disgust': 'negative',
    'fear': 'negative',
    'sadness': 'negative',
    'joy': 'positive',
    'neutral': 'neutral',
    'surprise': 'neutral'
}

# The set of labels the mapping can produce.
target_classes = set(mapping_7_to_3.values())


def collapse_to_3class(labels):
    """Map a Series of emotion labels onto the three target classes.

    Labels that are already one of the target classes pass through unchanged, so
    re-running this cell does not wipe the column (a plain ``.map(mapping_7_to_3)``
    turns already-mapped 'negative'/'positive' values into NaN on a second run).

    Args:
        labels: pandas Series of emotion labels.

    Returns:
        A Series of the same length containing only values from ``target_classes``.

    Raises:
        ValueError: if any label (including a missing value) is neither a key of
            ``mapping_7_to_3`` nor already a target class.
    """
    collapsed = labels.map(lambda label: mapping_7_to_3.get(label, label))
    unexpected = collapsed[~collapsed.isin(target_classes)]
    if not unexpected.empty:
        raise ValueError(
            f"Unmapped emotion labels: {sorted(map(str, unexpected.unique()))}"
        )
    return collapsed


train_df['emotion'] = collapse_to_3class(train_df['emotion'])
val_df['emotion'] = collapse_to_3class(val_df['emotion'])
test_df['emotion'] = collapse_to_3class(test_df['emotion'])

print("3-class mapping applied")
print(f"Classes: {sorted(train_df['emotion'].unique())}")


3-class mapping applied
Classes: ['negative', 'neutral', 'positive']


## Class Distribution Before Balancing


In [9]:
# Absolute and relative frequency of each class in the training split.
emotion_counts_before = train_df['emotion'].value_counts()
emotion_percentages_before = train_df['emotion'].value_counts(normalize=True) * 100

# Combine both views into one table, aligned on the class label.
emotion_df_before = pd.DataFrame({
    'Count': emotion_counts_before,
    'Percentage': emotion_percentages_before.round(2),
})
print("Before balancing:")
print(emotion_df_before)

# Ratio of the largest class to the smallest class.
max_count, min_count = emotion_counts_before.max(), emotion_counts_before.min()
imbalance_ratio = max_count / min_count

print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")


Before balancing:
          Count  Percentage
emotion                    
neutral   25387       58.55
positive   9075       20.93
negative   8900       20.52

Imbalance ratio: 2.85:1


## Balance Classes


In [10]:
def balance_dataset(df, split_name, random_state=42):
    """Resample ``df`` so every emotion class has the same number of rows.

    The target size is the mean class count, truncated to an integer. Classes
    below the target keep all their rows and are topped up with rows drawn with
    replacement (i.e. duplicates); classes above the target are randomly
    down-sampled without replacement. The combined result is shuffled.

    All randomness is driven by ``random_state`` rather than NumPy's global RNG,
    so the result does not depend on which cells ran before this call. Sampling
    is positional, so it is also correct when ``df`` has a non-unique index.

    Args:
        df: DataFrame with an 'emotion' column.
        split_name: Label used only in the progress messages.
        random_state: Seed for over-sampling, under-sampling and the final shuffle.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. ``df`` itself is not modified.
        An empty ``df`` yields an empty result.
    """
    print(f"\nBalancing {split_name} dataset...")

    if df.empty:
        # No classes means there is no mean class size to target.
        print("  Empty dataset: nothing to balance")
        return df.reset_index(drop=True)

    emotion_counts = Counter(df['emotion'])
    mean_samples = int(np.mean(list(emotion_counts.values())))

    print(f"Target per class: {mean_samples:,} samples")
    print(f"Original distribution:")
    for emotion, count in sorted(emotion_counts.items()):
        print(f"  {emotion:10s}: {count:6,} samples")

    balanced_rows = []
    for emotion in sorted(emotion_counts):
        class_data = df[df['emotion'] == emotion]
        class_count = len(class_data)

        if class_count < mean_samples:
            needed = mean_samples - class_count
            # Positional sampling with replacement: unlike label-based .loc lookup,
            # this yields exactly `needed` rows even if index labels repeat.
            oversampled = class_data.sample(
                n=needed, replace=True, random_state=random_state
            )
            balanced_class = pd.concat([class_data, oversampled], ignore_index=True)
            print(f"  {emotion:10s}: oversampled {needed:,} samples")
        elif class_count > mean_samples:
            balanced_class = class_data.sample(
                n=mean_samples, random_state=random_state
            ).reset_index(drop=True)
            removed = class_count - mean_samples
            print(f"  {emotion:10s}: undersampled {removed:,} samples")
        else:
            balanced_class = class_data.reset_index(drop=True)
            print(f"  {emotion:10s}: no change")

        balanced_rows.append(balanced_class)

    df_balanced = pd.concat(balanced_rows, ignore_index=True)
    # Shuffle so the classes are interleaved instead of stored in sorted blocks.
    df_balanced = df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    balanced_counts = Counter(df_balanced['emotion'])
    print(f"\nAfter balancing:")
    for emotion, count in sorted(balanced_counts.items()):
        print(f"  {emotion:10s}: {count:6,} samples")
    print(f"  Total: {len(df_balanced):,} samples")

    return df_balanced

# Balance each split independently; every call targets that split's own mean class size.
# NOTE(review): this also resamples the validation and test splits, so they contain
# duplicated (oversampled) rows and no longer follow the original class distribution.
# Confirm that evaluating on resampled data is intended.
train_balanced = balance_dataset(train_df, "TRAIN")
val_balanced = balance_dataset(val_df, "VALIDATION")
test_balanced = balance_dataset(test_df, "TEST")



Balancing TRAIN dataset...
Target per class: 14,454 samples
Original distribution:
  negative  :  8,900 samples
  neutral   : 25,387 samples
  positive  :  9,075 samples
  negative  : oversampled 5,554 samples
  neutral   : undersampled 10,933 samples
  positive  : oversampled 5,379 samples

After balancing:
  negative  : 14,454 samples
  neutral   : 14,454 samples
  positive  : 14,454 samples
  Total: 43,362 samples

Balancing VALIDATION dataset...
Target per class: 1,807 samples
Original distribution:
  negative  :  1,114 samples
  neutral   :  3,088 samples
  positive  :  1,220 samples
  negative  : oversampled 693 samples
  neutral   : undersampled 1,281 samples
  positive  : oversampled 587 samples

After balancing:
  negative  :  1,807 samples
  neutral   :  1,807 samples
  positive  :  1,807 samples
  Total: 5,421 samples

Balancing TEST dataset...
Target per class: 1,807 samples
Original distribution:
  negative  :  1,164 samples
  neutral   :  3,150 samples
  positive  :  1,1

## Create Processed Datasets


In [11]:
# Keep only the cleaned text and the label, exposing the cleaned text as `text`.
final_column_names = {'text_cleaned': 'text'}

train_processed = train_balanced[['text_cleaned', 'emotion']].rename(columns=final_column_names)
val_processed = val_balanced[['text_cleaned', 'emotion']].rename(columns=final_column_names)
test_processed = test_balanced[['text_cleaned', 'emotion']].rename(columns=final_column_names)

print(f"Processed datasets: Train={train_processed.shape}, Val={val_processed.shape}, Test={test_processed.shape}")
print("\nSample:")
print(train_processed.head())


Processed datasets: Train=(43362, 2), Val=(5421, 2), Test=(5421, 2)

Sample:
                                                text   emotion
0                                   shes all natural   neutral
1  no, hes been with more women than i have been ...  negative
2  thanks for the quick reply i am very bored. wh...  positive
3                                    except is worse  negative
4                                          fake lous  negative


## Save Processed Data


In [12]:
# Write each processed split to ../data/processed as CSV, omitting the index column.
train_processed.to_csv('../data/processed/train_processed_3class.csv', index=False)
val_processed.to_csv('../data/processed/val_processed_3class.csv', index=False)
test_processed.to_csv('../data/processed/test_processed_3class.csv', index=False)

print("Saved processed 3-class datasets")


Saved processed 3-class datasets


## Create Emotion Mapping


In [13]:
# Fixed integer ID for each of the three target classes.
emotion_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}

# Reverse lookup: integer ID -> class name.
id_to_emotion = {class_id: name for name, class_id in emotion_to_id.items()}

print("3-Class Emotion to ID mapping:")
for name, class_id in emotion_to_id.items():
    print(f"  {name}: {class_id}")

# The reverse map is stored with string keys in the JSON file.
mapping_payload = {
    'emotion_to_id': emotion_to_id,
    'id_to_emotion': {str(class_id): name for class_id, name in id_to_emotion.items()},
}
with open('../data/processed/emotion_mapping_3class.json', 'w') as f:
    json.dump(mapping_payload, f, indent=2)

print("\nSaved to emotion_mapping_3class.json")


3-Class Emotion to ID mapping:
  negative: 0
  neutral: 1
  positive: 2

Saved to emotion_mapping_3class.json


## Summary


In [14]:
# Per-class row counts of the final training split.
final_counts = train_processed['emotion'].value_counts()

# Split sizes and average text length (characters and whitespace-separated words).
n_train, n_val, n_test = len(train_processed), len(val_processed), len(test_processed)
avg_chars = train_processed['text'].str.len().mean()
avg_words = train_processed['text'].str.split().str.len().mean()

print("3-Class Preprocessing Summary:")
print(f"Total samples: {n_train + n_val + n_test:,}")
print(f"Train: {n_train:,} | Val: {n_val:,} | Test: {n_test:,}")
print(f"\nAvg text length: {avg_chars:.0f} chars, {avg_words:.1f} words")
print(f"\nBalanced emotion distribution:")
for emotion, count in sorted(final_counts.items()):
    percentage = (count / n_train) * 100
    print(f"  {emotion:10s}: {count:6,} ({percentage:5.2f}%)")
print(f"\nFiles saved:")
print(f"  - train_processed_3class.csv")
print(f"  - val_processed_3class.csv")
print(f"  - test_processed_3class.csv")
print(f"  - emotion_mapping_3class.json")


3-Class Preprocessing Summary:
Total samples: 54,204
Train: 43,362 | Val: 5,421 | Test: 5,421

Avg text length: 66 chars, 12.8 words

Balanced emotion distribution:
  negative  : 14,454 (33.33%)
  neutral   : 14,454 (33.33%)
  positive  : 14,454 (33.33%)

Files saved:
  - train_processed_3class.csv
  - val_processed_3class.csv
  - test_processed_3class.csv
  - emotion_mapping_3class.json
