# Data Preprocessing

Cleaning and preprocessing text data for model training.


## Import Libraries


In [53]:
import pandas as pd
import numpy as np
import re
import os
import warnings
import json
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')
os.makedirs('../data/processed', exist_ok=True)


## Load Data


In [54]:
train_df = pd.read_csv('../data/raw/train_raw.csv')
val_df = pd.read_csv('../data/raw/val_raw.csv')
test_df = pd.read_csv('../data/raw/test_raw.csv')

print(f"Train: {train_df.shape} | Val: {val_df.shape} | Test: {test_df.shape}")


Train: (43410, 2) | Val: (5426, 2) | Test: (5427, 2)


## Sample Raw Data


In [55]:
train_df.head()


Unnamed: 0,text,emotion
0,My favourite food is anything I didn't have to...,neutral
1,"Now if he does off himself, everyone will thin...",neutral
2,WHY THE FUCK IS BAYLESS ISOING,anger
3,To make her feel threatened,fear
4,Dirty Southern Wankers,anger


## Preprocessing Function


In [56]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    
    emoji_patterns = [
        re.compile(u'[\U0001F600-\U0001F64F]', flags=re.UNICODE),
        re.compile(u'[\U0001F300-\U0001F5FF]', flags=re.UNICODE),
        re.compile(u'[\U0001F680-\U0001F6FF]', flags=re.UNICODE),
        re.compile(u'[\U0001F1E0-\U0001F1FF]', flags=re.UNICODE),
        re.compile(u'[\U0001F900-\U0001F9FF]', flags=re.UNICODE),
        re.compile(u'[\U0001FA00-\U0001FA6F]', flags=re.UNICODE),
        re.compile(u'[\U0001FA70-\U0001FAFF]', flags=re.UNICODE),
        re.compile(u'[\U00002600-\U000026FF]', flags=re.UNICODE),
        re.compile(u'[\U00002700-\U000027BF]', flags=re.UNICODE),
        re.compile(u'[\U000024C2-\U0001F251]', flags=re.UNICODE),
        re.compile(u'[\U0001F018-\U0001F270]', flags=re.UNICODE),
        re.compile(u'[\U0001F700-\U0001F77F]', flags=re.UNICODE),
        re.compile(u'[\U0001F780-\U0001F7FF]', flags=re.UNICODE),
        re.compile(u'[\U0001F800-\U0001F8FF]', flags=re.UNICODE),
    ]
    
    for pattern in emoji_patterns:
        text = pattern.sub('', text)
    
    text = re.sub(u'[\U0001F3FB-\U0001F3FF]', '', text)
    text = re.sub(u'[\U0000200D]', '', text)
    text = re.sub(u'[\U0000FE00-\U0000FE0F]', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'r/\w+', '', text)
    text = re.sub(r'[^\w\s!?.,:;\-\'\"()\[\]{}]', '', text)
    text = re.sub(r'([!?.]){2,}', r'\1', text)
    text = re.sub(r'-{2,}', '-', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'^[^\w]+|[^\w]+$', '', text)
    
    return text


## Apply Preprocessing


In [57]:
train_df['text_cleaned'] = train_df['text'].apply(preprocess_text)
val_df['text_cleaned'] = val_df['text'].apply(preprocess_text)
test_df['text_cleaned'] = test_df['text'].apply(preprocess_text)

print("Preprocessing complete")


Preprocessing complete


## Before/After Comparison


In [58]:
for i in range(5):
    print(f"\nExample {i+1}:")
    print(f"  Before: {train_df.iloc[i]['text']}")
    print(f"  After:  {train_df.iloc[i]['text_cleaned']}")



Example 1:
  Before: My favourite food is anything I didn't have to cook myself.
  After:  my favourite food is anything i didn't have to cook myself

Example 2:
  Before: Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead
  After:  now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead

Example 3:
  Before: WHY THE FUCK IS BAYLESS ISOING
  After:  why the fuck is bayless isoing

Example 4:
  Before: To make her feel threatened
  After:  to make her feel threatened

Example 5:
  Before: Dirty Southern Wankers
  After:  dirty southern wankers


## Remove Short Texts


In [59]:
train_empty = train_df[train_df['text_cleaned'].str.len() < 3].shape[0]
val_empty = val_df[val_df['text_cleaned'].str.len() < 3].shape[0]
test_empty = test_df[test_df['text_cleaned'].str.len() < 3].shape[0]

print(f"Short texts (< 3 chars): Train={train_empty}, Val={val_empty}, Test={test_empty}")

train_df = train_df[train_df['text_cleaned'].str.len() >= 3].reset_index(drop=True)
val_df = val_df[val_df['text_cleaned'].str.len() >= 3].reset_index(drop=True)
test_df = test_df[test_df['text_cleaned'].str.len() >= 3].reset_index(drop=True)

print(f"After removal: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")


Short texts (< 3 chars): Train=48, Val=4, Test=4
After removal: Train=43362, Val=5422, Test=5423


## Text Statistics


In [60]:
train_df['cleaned_length'] = train_df['text_cleaned'].str.len()
train_df['cleaned_word_count'] = train_df['text_cleaned'].str.split().str.len()

print("Cleaned text stats:")
print(train_df[['cleaned_length', 'cleaned_word_count']].describe())


Cleaned text stats:
       cleaned_length  cleaned_word_count
count    43362.000000        43362.000000
mean        65.768830           12.635995
std         36.171532            6.669857
min          3.000000            1.000000
25%         36.000000            7.000000
50%         62.000000           12.000000
75%         93.000000           18.000000
max        703.000000           33.000000


## Emotion Distribution


In [61]:
emotion_counts = train_df['emotion'].value_counts()
emotion_percentages = train_df['emotion'].value_counts(normalize=True) * 100

emotion_df = pd.DataFrame({
    'Count': emotion_counts,
    'Percentage': emotion_percentages.round(2)
})
print(emotion_df)

max_count = emotion_counts.max()
min_count = emotion_counts.min()
imbalance_ratio = max_count / min_count

print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 3:
    print("Note: Severe class imbalance - consider handling during training")


          Count  Percentage
emotion                    
neutral   23238       53.59
joy        9075       20.93
anger      5335       12.30
sadness    2371        5.47
surprise   2149        4.96
fear        615        1.42
disgust     579        1.34

Imbalance ratio: 40.13:1
Note: Severe class imbalance - consider handling during training


## Create Processed Datasets


In [62]:
train_processed = train_df[['text_cleaned', 'emotion']].copy()
train_processed.columns = ['text', 'emotion']

val_processed = val_df[['text_cleaned', 'emotion']].copy()
val_processed.columns = ['text', 'emotion']

test_processed = test_df[['text_cleaned', 'emotion']].copy()
test_processed.columns = ['text', 'emotion']

print(f"Processed datasets: Train={train_processed.shape}, Val={val_processed.shape}, Test={test_processed.shape}")
print("\nSample:")
print(train_processed.head())


Processed datasets: Train=(43362, 2), Val=(5422, 2), Test=(5423, 2)

Sample:
                                                text  emotion
0  my favourite food is anything i didn't have to...  neutral
1  now if he does off himself, everyone will thin...  neutral
2                     why the fuck is bayless isoing    anger
3                        to make her feel threatened     fear
4                             dirty southern wankers    anger


## Save Processed Data


In [63]:
train_processed.to_csv('../data/processed/train_processed.csv', index=False)
val_processed.to_csv('../data/processed/val_processed.csv', index=False)
test_processed.to_csv('../data/processed/test_processed.csv', index=False)

print("Saved processed datasets")


Saved processed datasets


## Label Encoding


In [64]:
label_encoder = LabelEncoder()
label_encoder.fit(train_processed['emotion'])

emotion_to_id = {emotion: idx for idx, emotion in enumerate(label_encoder.classes_)}
id_to_emotion = {idx: emotion for emotion, idx in emotion_to_id.items()}

print("Emotion to ID mapping:")
for emotion, idx in emotion_to_id.items():
    print(f"  {emotion}: {idx}")

with open('../data/processed/emotion_mapping.json', 'w') as f:
    json.dump({
        'emotion_to_id': emotion_to_id,
        'id_to_emotion': id_to_emotion
    }, f, indent=2)

print("\nSaved to emotion_mapping.json")


Emotion to ID mapping:
  anger: 0
  disgust: 1
  fear: 2
  joy: 3
  neutral: 4
  sadness: 5
  surprise: 6

Saved to emotion_mapping.json


## Summary


In [65]:
print("Preprocessing Summary:")
print(f"Total samples: {len(train_processed) + len(val_processed) + len(test_processed):,}")
print(f"Train: {len(train_processed):,} | Val: {len(val_processed):,} | Test: {len(test_processed):,}")
print(f"\nAvg text length: {train_processed['text'].str.len().mean():.0f} chars, {train_processed['text'].str.split().str.len().mean():.1f} words")
print(f"\nEmotion distribution:")
for emotion, count in emotion_counts.items():
    percentage = (count / len(train_processed)) * 100
    print(f"  {emotion:10s}: {count:6,} ({percentage:5.2f}%)")
print(f"\nFiles saved: train_processed.csv, val_processed.csv, test_processed.csv, emotion_mapping.json")


Preprocessing Summary:
Total samples: 54,207
Train: 43,362 | Val: 5,422 | Test: 5,423

Avg text length: 66 chars, 12.6 words

Emotion distribution:
  neutral   : 23,238 (53.59%)
  joy       :  9,075 (20.93%)
  anger     :  5,335 (12.30%)
  sadness   :  2,371 ( 5.47%)
  surprise  :  2,149 ( 4.96%)
  fear      :    615 ( 1.42%)
  disgust   :    579 ( 1.34%)

Files saved: train_processed.csv, val_processed.csv, test_processed.csv, emotion_mapping.json
