# Building Multi-Task NLP model with LSTM : Detect Emotions, Hate Speech & Violence in Text

> Detect Emotion , violence , Hate Speech



## Dataset

(i) Emotion Data: https://www.kaggle.com/datasets/nelgiriyewithana/emotions

(ii) Violence Data: https://www.kaggle.com/datasets/gauravduttakiit/gender-based-violence-tweet-classification?select=Train.csv

(iii) Hate Speech Data: https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset


**Emotions**

0 : Sadness

1 : Joy

2 : Love

3 : Anger

4 : Fear

5 : Surprise

**Violence**

0 : Harmful_Traditional_practice

1 : Physical_violence

2 : economic_violence

3 : emotional_violence

4 : sexual_violence

**Hate**

0 : Hate Speech

1 : Offensive Speech

2 : Neither


## 0.Setup


In [1]:
import pandas as pd
import numpy as np

# SK learn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

# NLP
import nltk
from nltk.corpus import stopwords

# DeepLearning
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras


# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

Unable to revert mtime: /Library/Fonts
Matplotlib is building the font cache; this may take a moment.


In [4]:
! mkdir ~/.kaggle
! cp ../kaggle-token.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle-token.json

mkdir: /Users/magesh/.kaggle: File exists


## Download Dataset

In [5]:
!kaggle datasets download -d nelgiriyewithana/emotions
!kaggle datasets download -d gauravduttakiit/gender-based-violence-tweet-classification
!kaggle datasets download -d mrmorj/hate-speech-and-offensive-language-dataset

Downloading emotions.zip to /Volumes/CodeHub/projects/ML/LSTM - P1/notebooks
100%|██████████████████████████████████████| 15.7M/15.7M [00:05<00:00, 3.28MB/s]
100%|██████████████████████████████████████| 15.7M/15.7M [00:05<00:00, 2.82MB/s]
Downloading gender-based-violence-tweet-classification.zip to /Volumes/CodeHub/projects/ML/LSTM - P1/notebooks
 97%|████████████████████████████████████▋ | 5.00M/5.17M [00:02<00:00, 2.46MB/s]
100%|██████████████████████████████████████| 5.17M/5.17M [00:02<00:00, 1.97MB/s]
Downloading hate-speech-and-offensive-language-dataset.zip to /Volumes/CodeHub/projects/ML/LSTM - P1/notebooks
 99%|██████████████████████████████████████▊| 1.00M/1.01M [00:01<00:00, 776kB/s]
100%|███████████████████████████████████████| 1.01M/1.01M [00:01<00:00, 780kB/s]


In [7]:
import zipfile

# Create the 'data' directory if it doesn't exist
!mkdir -p data

def unzip_files(zip_filepath, extract_dir):
    with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

# Unzip the files into the 'data' folder
unzip_files("../data/emotions.zip", "../data/")
unzip_files("../data/gender-based-violence-tweet-classification.zip", "../data/")
unzip_files("../data/hate-speech-and-offensive-language-dataset.zip", "../data/")


FileNotFoundError: [Errno 2] No such file or directory: './data/gender-based-violence-tweet-classification.zip'

## 1.Load Dataset

In [11]:
emotion_df=pd.read_csv("../data/text.csv")
violence_df=pd.read_csv("../data/Train.csv")
hate_df=pd.read_csv("../data/labeled_data.csv")

## 1. Data Cleaning (Common for All Datasets)


* Remove unwated columns

* Rename columns

* check for null values



In [12]:

def clean_dataset(emotion_df, violence_df, hate_df):
    """Cleans the datasets by removing unwanted columns.

    Args:
        emotion_df: Pandas DataFrame containing emotion data.
        violence_df: Pandas DataFrame containing violence data.
        hate_df: Pandas DataFrame containing hate speech data.

    Returns:
        A tuple containing the cleaned DataFrames: (emotion_df, violence_df, hate_df).
    """
    # remove unwanted columns
    emotion_df.drop(columns=["Unnamed: 0"], inplace=True)
    violence_df.drop(columns=["Tweet_ID"], inplace=True)
    new_hate_df = hate_df[["tweet", "class"]]  # pick only tweet and class

    # rename the column

    violence_df.rename(columns={"tweet":"text","type":"label"},inplace=True)
    new_hate_df.rename(columns={"tweet":"text","class":"label"},inplace=True)

    return emotion_df, violence_df, new_hate_df


# Clean the datasets
emotion_df, violence_df, hate_df = clean_dataset(emotion_df, violence_df, hate_df)

In [13]:
emotion_df.columns,violence_df.columns,hate_df.columns

(Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'))

### 2. Exploratory Data Analysis

* check for class distributions




In [14]:
emotion_df.isnull().sum(),violence_df.isnull().sum(),hate_df.isnull().sum()

(text     0
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64)

In [15]:
emotion_df['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [16]:
violence_df['label'].value_counts()

label
sexual_violence                 32648
Physical_violence                5946
emotional_violence                651
economic_violence                 217
Harmful_Traditional_practice      188
Name: count, dtype: int64

In [17]:
hate_df['label'].value_counts()

label
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [18]:
import pandas as pd
import numpy as np

def balance_datasets(emotion_df, violence_df, hate_df):
    # Balance Emotion Data
    e_df = pd.DataFrame()
    for i in range(6):
        subset = emotion_df[emotion_df['label'] == i].sample(n=2000, random_state=42)
        e_df = pd.concat([e_df, subset])
    emotion_df = e_df.copy()

    # Balance Violence Data
    sexual_violence = violence_df[violence_df['label'] == 'sexual_violence'].sample(n=4998, random_state=42)
    violence_df = violence_df[violence_df['label'] != 'sexual_violence']
    violence_df = pd.concat([sexual_violence, violence_df], axis=0)

    # Balance Hate Speech Data
    offensive_speech = hate_df[hate_df['label'] == 1].sample(n=6407, random_state=42)
    hate_df = hate_df[hate_df['label'] != 1]
    hate_df = pd.concat([offensive_speech, hate_df], axis=0)

    return emotion_df, violence_df, hate_df

emotion_df, violence_df, hate_df = balance_datasets(emotion_df, violence_df, hate_df)


In [19]:
emotion_df.shape, violence_df.shape, hate_df.shape

((12000, 2), (12000, 2), (12000, 2))

In [20]:
#resetting the indexes
emotion_df.reset_index(drop = True, inplace = True)
violence_df.reset_index(drop = True, inplace = True)
hate_df.reset_index(drop = True, inplace = True)

In [21]:
emotion_df.head()

Unnamed: 0,text,label
0,ive learned to surround myself with women who ...,0
1,i already feel crappy because of this and you ...,0
2,i feel like i have lost mourned and moved past...,0
3,i could write a whole lot more about why im fe...,0
4,i always seem to feel inadequate,0


In [22]:
violence_df.head()

Unnamed: 0,text,label
0,My cousin was raped by this guy Matthew. She w...,sexual_violence
1,HAHAHAHAHAHAHHA I DIDN’T SEE IT THE FIRST TIME...,sexual_violence
2,I remember how I almost got raped like it was ...,sexual_violence
3,He raped me 👈,sexual_violence
4,"A woman raped by A male: psychological horror,...",sexual_violence


In [23]:
hate_df.head()

Unnamed: 0,text,label
0,Why is it everytime I go to cracker barrel the...,1
1,"Run that nigga, you don't want that nigga, but...",1
2,I need a girl from Jamaica I can't fuck with t...,1
3,RT @ShadowBeatz_Inc: I know you have me blocke...,1
4,Put ya hands up if you a Grade A bitch,1


## 3. Text Preprocessing
* label encoding
* Clean text - remove URLs, special characters, etc.
* Tokenization
* Padding sequences

In [29]:
# for violence_df convert str to labels
label_encoder = LabelEncoder()
violence_df['label'] = label_encoder.fit_transform(violence_df['label'])

In [36]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/magesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/magesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/magesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/magesh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [37]:
# Download required NLTK resources

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import string

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
# Get English stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Clean and preprocess text data:
    - Convert to lowercase
    - Remove URLs, mentions, special characters
    - Remove punctuation
    - Remove stopwords
    - Lemmatize words
    
    Args:
        text: String containing text to be cleaned
        
    Returns:
        Cleaned text string
    """
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)
        
        # Remove mentions (@username)
        text = re.sub(r'@\w+', '', text)
        
        # Remove hashtags
        text = re.sub(r'#\w+', '', text)
        
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Tokenize text
        tokens = word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        
        # Join tokens back into string
        return ' '.join(tokens)
    return ''

In [38]:
emotion_df['text'] = emotion_df['text'].apply(clean_text)
violence_df['text'] = violence_df['text'].apply(clean_text)
hate_df['text'] = hate_df['text'].apply(clean_text)

## 4. Tokenization and Sequence Padding

* Create a tokenizer
* Convert text to sequences
* Pad sequences to uniform length

In [40]:
# Set parameters for tokenization and padding
MAX_NUM_WORDS = 10000  # Maximum number of words to keep in the vocabulary
MAX_SEQUENCE_LENGTH = 100  # Maximum length of all sequences
EMBEDDING_DIM = 100  # Dimensionality of word embeddings

In [41]:
def tokenize_and_pad(texts, max_words=MAX_NUM_WORDS, max_length=MAX_SEQUENCE_LENGTH):
    """
    Tokenize text data and pad sequences to a uniform length.
    
    Args:
        texts: List of text strings to tokenize and pad
        max_words: Maximum number of words to keep in the vocabulary
        max_length: Maximum length of all sequences
        
    Returns:
        Tuple containing (padded_sequences, tokenizer)
    """
    # Create a tokenizer
    tokenizer = Tokenizer(num_words=max_words)
    
    # Fit the tokenizer on the texts
    tokenizer.fit_on_texts(texts)
    
    # Convert texts to sequences
    sequences = tokenizer.texts_to_sequences(texts)
    
    # Pad sequences to a uniform length
    padded_sequences = pad_sequences(sequences, maxlen=max_length)
    
    # Get the word index
    word_index = tokenizer.word_index
    print(f"Found {len(word_index)} unique tokens.")
    
    return padded_sequences, tokenizer, word_index

In [42]:
# Process each dataset
emotion_sequences, emotion_tokenizer, emotion_word_index = tokenize_and_pad(emotion_df['text'])
violence_sequences, violence_tokenizer, violence_word_index = tokenize_and_pad(violence_df['text'])
hate_sequences, hate_tokenizer, hate_word_index = tokenize_and_pad(hate_df['text'])

Found 11776 unique tokens.
Found 19249 unique tokens.
Found 13228 unique tokens.


In [43]:
# Check the shape of the padded sequences
print(f"Emotion sequences shape: {emotion_sequences.shape}")
print(f"Violence sequences shape: {violence_sequences.shape}")
print(f"Hate sequences shape: {hate_sequences.shape}")

Emotion sequences shape: (12000, 100)
Violence sequences shape: (12000, 100)
Hate sequences shape: (12000, 100)


In [44]:
# Examine a sample sequence
print("Original text:")
print(emotion_df['text'][0])
print("\nTokenized sequence (first 10 tokens):")
print(emotion_sequences[0][:10])

# Display a few words from the vocabulary
word_dict = list(emotion_word_index.items())
print("\nSample words from vocabulary:")
for word, index in sorted(word_dict[:10], key=lambda x: x[1]):
    print(f"{word}: {index}")

Original text:
ive learned surround woman lift leave feeling nurtured rather drained

Tokenized sequence (first 10 tokens):
[0 0 0 0 0 0 0 0 0 0]

Sample words from vocabulary:
feel: 1
feeling: 2
like: 3
im: 4
really: 5
time: 6
know: 7
little: 8
get: 9
people: 10


### Save Tokenizers for Later Use

It's a good practice to save your tokenizers for later use in inference or deployment.

In [45]:
import pickle
import os

# Create a directory for saving tokenizers if it doesn't exist
os.makedirs('../models/tokenizers', exist_ok=True)

# Save the tokenizers
with open('../models/tokenizers/emotion_tokenizer.pickle', 'wb') as handle:
    pickle.dump(emotion_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../models/tokenizers/violence_tokenizer.pickle', 'wb') as handle:
    pickle.dump(violence_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('../models/tokenizers/hate_tokenizer.pickle', 'wb') as handle:
    pickle.dump(hate_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
print("Tokenizers saved successfully.")

Tokenizers saved successfully.


## 5. Data Preparation for Training

* Split data into training and validation sets
* Prepare labels (one-hot encoding if needed)

In [46]:
from sklearn.model_selection import train_test_split

# Function to split data into train and validation sets
def prepare_train_val_data(sequences, labels, test_size=0.2, random_state=42):
    """
    Split sequences and labels into training and validation sets.
    
    Args:
        sequences: Padded sequences
        labels: Corresponding labels
        test_size: Proportion of data to use for validation
        random_state: Random seed for reproducibility
        
    Returns:
        Tuple containing (X_train, X_val, y_train, y_val)
    """
    return train_test_split(sequences, labels, test_size=test_size, random_state=random_state)

# Split each dataset
X_emotion_train, X_emotion_val, y_emotion_train, y_emotion_val = prepare_train_val_data(
    emotion_sequences, emotion_df['label'].values
)

X_violence_train, X_violence_val, y_violence_train, y_violence_val = prepare_train_val_data(
    violence_sequences, violence_df['label'].values
)

X_hate_train, X_hate_val, y_hate_train, y_hate_val = prepare_train_val_data(
    hate_sequences, hate_df['label'].values
)

In [47]:
# One-hot encode labels for multi-class classification
from tensorflow.keras.utils import to_categorical

# Number of classes in each task
num_emotion_classes = len(np.unique(emotion_df['label']))
num_violence_classes = len(np.unique(violence_df['label']))
num_hate_classes = len(np.unique(hate_df['label']))

print(f"Number of emotion classes: {num_emotion_classes}")
print(f"Number of violence classes: {num_violence_classes}")
print(f"Number of hate classes: {num_hate_classes}")

# Convert labels to one-hot encoding
y_emotion_train_categorical = to_categorical(y_emotion_train, num_classes=num_emotion_classes)
y_emotion_val_categorical = to_categorical(y_emotion_val, num_classes=num_emotion_classes)

y_violence_train_categorical = to_categorical(y_violence_train, num_classes=num_violence_classes)
y_violence_val_categorical = to_categorical(y_violence_val, num_classes=num_violence_classes)

y_hate_train_categorical = to_categorical(y_hate_train, num_classes=num_hate_classes)
y_hate_val_categorical = to_categorical(y_hate_val, num_classes=num_hate_classes)

Number of emotion classes: 6
Number of violence classes: 5
Number of hate classes: 3
