In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.adapt import MLkNN
import sklearn.metrics as metrics
from sklearn.metrics import hamming_loss, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import multilabel_confusion_matrix
import os

In [2]:
# Process label 1 dataset
df_l1 = pd.read_csv('train_en_l1.csv')
df_l1 = df_l1.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l1[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']] = df_l1[
    ['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_1' based on majority voting while ignoring NaNs
df_l1['label_1'] = (df_l1[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 1 (gendered abuse)
df_l1['binary_label_1'] = df_l1['label_1'].apply(lambda x: 'hate' if x == 1 else 'not_hate')

# Process label 3 dataset
df_l3 = pd.read_csv('train_en_l3.csv')
df_l3 = df_l3.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l3[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']] = df_l3[
    ['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_3' based on majority voting while ignoring NaNs
df_l3['label_3'] = (df_l3[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 3 (explicit/aggressive)
df_l3['binary_label_3'] = df_l3['label_3'].apply(lambda x: 'explicit' if x == 1 else 'not_explicit')

# Select columns for merging
df_l1_slim = df_l1[['text', 'label_1', 'binary_label_1']]
df_l3_slim = df_l3[['text', 'label_3', 'binary_label_3']]

# Merge the datasets based on text field
merged_df = pd.merge(df_l1_slim, df_l3_slim, on='text', how='inner')

# Add a unique_id column to the merged dataset
merged_df['unique_id'] = [f'id_{i}' for i in range(len(merged_df))]

# Reorder columns
merged_df = merged_df[['unique_id', 'text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]

# Save the merged dataset
merged_df.to_csv('train_en_task3.csv', index=False)

# Display information
print(f"Final merged dataset shape: {merged_df.shape}")
print(merged_df.head())

# Check label distribution
print("\nLabel distribution:")
print(f"Label 1 (gendered abuse): {merged_df['binary_label_1'].value_counts().to_dict()}")
print(f"Label 3 (explicit/aggressive): {merged_df['binary_label_3'].value_counts().to_dict()}")

Final merged dataset shape: (6531, 6)
  unique_id                                               text binary_label_1  \
0      id_0      This is our history . For Shia and Sunni ,...       not_hate   
1      id_1     If this is "grilling" then wht wud u call t...           hate   
2      id_2    look at <handle replaced>madam. Locals attac...       not_hate   
3      id_3    our beloved yogi ji has given money home and...           hate   
4      id_4  "I gonna kill that bitch"  Bitvj imma kill you...           hate   

   label_1 binary_label_3  label_3  
0        0   not_explicit        0  
1        1       explicit        1  
2        0   not_explicit        0  
3        1       explicit        1  
4        1       explicit        1  

Label distribution:
Label 1 (gendered abuse): {'not_hate': 5159, 'hate': 1372}
Label 3 (explicit/aggressive): {'not_explicit': 4159, 'explicit': 2372}


In [3]:
import pandas as pd
import nltk
import string
import re

# Download necessary NLTK resources
nltk.download('stopwords')

# Set up tokenizer and stopwords
wpt = nltk.WordPunctTokenizer()
stop_words_init = nltk.corpus.stopwords.words('english')
stop_words = [i for i in stop_words_init if i not in ('not', 'and', 'for')]
print(stop_words)

# Function to normalize text for pre-processing
def normalize_text(text):
    text = str(text).lower()  # Ensure text is a string
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the multi-task dataset
merged_df = pd.read_csv('train_en_task3.csv')

# Apply text normalization
merged_df['processed_text'] = merged_df['text'].apply(lambda x: normalize_text(x))

# Further processing to remove '...'
merged_df['processed_text'] = merged_df['processed_text'].str.replace('...', '')

# Display samples of processed text
print("\nOriginal vs Processed Text Samples:")
for i in range(3):
    print(f"Original: {merged_df['text'].iloc[i]}")
    print(f"Processed: {merged_df['processed_text'].iloc[i]}")
    print("-" * 50)

# Keep all columns but add processed text
merged_df_final = merged_df[['unique_id', 'text', 'processed_text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]


print(f"\nProcessed dataset shape: {merged_df_final.shape}")
print(merged_df_final.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krmri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she'd", "she'll",

In [4]:
# Extract features (processed text)
X = list(merged_df_final['processed_text'])

# Extract labels for both tasks
y_task1 = merged_df_final['label_1'].values
y_task3 = merged_df_final['label_3'].values

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    LSTM, Activation, Dropout, Dense, Flatten,
    Bidirectional, GRU, concatenate, SpatialDropout1D,
    GlobalMaxPooling1D, GlobalAveragePooling1D, Conv1D,
    Embedding, Input, Concatenate
)
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.losses import MeanSquaredError

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

######## Textual Features for Embedding ###################
max_len = 100
max_features = 4479

# Tokenization
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Padding
X = pad_sequences(X, padding='post', maxlen=max_len)

print(X)  # Check the processed sequences

[[  12    6   68 ...    0    0    0]
 [  34   12    6 ...    0    0    0]
 [ 158   64  239 ...    0    0    0]
 ...
 [  10   41  387 ...    0    0    0]
 [1590 1170   69 ...    0    0    0]
 [ 461  294   27 ...    0    0    0]]


In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_task1 = label_encoder.fit_transform(y_task1)
y_task3 = label_encoder.fit_transform(y_task3)

In [7]:
from keras.utils import to_categorical
y_task1 = to_categorical(y_task1, num_classes=2)
y_task3 = to_categorical(y_task3, num_classes=2)

In [8]:
import numpy as np
import json

# Load GloVe embeddings from JSON
with open('glove_embeddings.json', encoding="utf8") as f:
    embeddings_list = json.load(f)

# Convert the list of vectors to a dictionary with word indices as keys
embeddings_dictionary = {str(i): vector for i, vector in enumerate(embeddings_list)}

# Define tokenizer 
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
word_index = tokenizer.word_index
num_words = min(max_features, vocab_size)  # Limit vocab to max_features

# Get embedding dimension (from first vector in list)
embed_size = len(embeddings_list[0]) if embeddings_list else 0

# Initialize embedding matrix
embedding_matrix = np.zeros((num_words, embed_size))

# Fill embedding matrix with corresponding word vectors
for word, index in word_index.items():
    if index >= max_features:
        continue
    embedding_vector = embeddings_dictionary.get(word) or embeddings_dictionary.get(str(index))
    if embedding_vector is not None:
        embedding_matrix[index] = np.asarray(embedding_vector, dtype=np.float32)

print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (4479, 50)


In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Embedding, SpatialDropout1D, Conv1D,
    Bidirectional, LSTM, GRU, Dense, Dropout,
    GlobalAveragePooling1D, GlobalMaxPooling1D
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Configure GPU for optimal performance
def configure_gpu():
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Enable memory growth for each GPU
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.list_logical_devices('GPU')
            print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
            # Use mixed precision for better performance
            policy = tf.keras.mixed_precision.Policy('mixed_float16')
            tf.keras.mixed_precision.set_global_policy(policy)
            print('Mixed precision enabled')
        except RuntimeError as e:
            print(e)

configure_gpu()

# Multi-Task Model Definition with GRU and Attention
def create_multi_task_gru_attention_model(max_len, max_features, embedding_matrix, embed_size=300):
    """
    Creates a multi-task GRU model with hierarchical attention mechanism for joint prediction of
    gendered abuse (task1) and explicit language (task3)
    """
    # Input layer
    input_layer = Input(shape=(max_len,))
    
    # Embedding layer with pretrained weights
    embedding_layer = Embedding(
        input_dim=max_features,
        output_dim=embed_size,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=True  # Make embeddings trainable for fine-tuning
    )(input_layer)
    
    # Spatial Dropout with higher rate
    spatial_dropout = SpatialDropout1D(0.3)(embedding_layer)
    
    # Multiple GRU layers with different window sizes
    gru_layer1 = Bidirectional(
        GRU(
            units=128,
            return_sequences=True,
            dropout=0.2,
            recurrent_dropout=0.2,
            kernel_regularizer=tf.keras.regularizers.l2(1e-5)
        )
    )(spatial_dropout)
    
    gru_layer2 = Bidirectional(
        GRU(
            units=64,
            return_sequences=True,
            dropout=0.2,
            recurrent_dropout=0.2
        )
    )(gru_layer1)
    
    # Multi-head self-attention (simplified version)
    attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=8,
        key_dim=16
    )(gru_layer2, gru_layer2)
    
    # Skip connection
    concat_layer = tf.keras.layers.Concatenate()([gru_layer2, attention_layer])
    
    # Feature extraction with pooling operations
    avg_pool = GlobalAveragePooling1D()(concat_layer)
    max_pool = GlobalMaxPooling1D()(concat_layer)
    
    # Combine pooled features
    shared_features = tf.keras.layers.Concatenate()([avg_pool, max_pool])
    
    # Task-specific layers for Task 1 (Gendered Abuse)
    task1_dense1 = Dense(256, activation='relu', name='task1_dense1')(shared_features)
    task1_bn1 = tf.keras.layers.BatchNormalization(name='task1_bn1')(task1_dense1)
    task1_dropout1 = Dropout(0.3)(task1_bn1)
    
    task1_dense2 = Dense(128, activation='relu', name='task1_dense2')(task1_dropout1)
    task1_bn2 = tf.keras.layers.BatchNormalization(name='task1_bn2')(task1_dense2)
    task1_dropout2 = Dropout(0.2)(task1_bn2)
    
    task1_output = Dense(2, activation='softmax', name='task1_output', dtype='float32')(task1_dropout2)
    
    # Task-specific layers for Task 3 (Explicit Language)
    task3_dense1 = Dense(256, activation='relu', name='task3_dense1')(shared_features)
    task3_bn1 = tf.keras.layers.BatchNormalization(name='task3_bn1')(task3_dense1)
    task3_dropout1 = Dropout(0.3)(task3_bn1)
    
    task3_dense2 = Dense(128, activation='relu', name='task3_dense2')(task3_dropout1)
    task3_bn2 = tf.keras.layers.BatchNormalization(name='task3_bn2')(task3_dense2)
    task3_dropout2 = Dropout(0.2)(task3_bn2)
    
    task3_output = Dense(2, activation='softmax', name='task3_output', dtype='float32')(task3_dropout2)
    
    # Create model with multiple outputs
    model = Model(inputs=input_layer, outputs=[task1_output, task3_output])
    
    return model

# Custom MacroF1Score Metric for multi-task learning
class MacroF1Score(tf.keras.metrics.Metric):
    def __init__(self, num_classes=2, name='macro_f1_score', **kwargs):
        super(MacroF1Score, self).__init__(name=name, **kwargs)
        self.num_classes = num_classes
        self.tp = self.add_weight(name='tp', initializer='zeros')
        self.fp = self.add_weight(name='fp', initializer='zeros')
        self.fn = self.add_weight(name='fn', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Convert probabilities to predicted class indices
        y_pred = tf.argmax(y_pred, axis=-1)
        
        # Convert one-hot encoded y_true to class indices if needed
        if len(y_true.shape) > 1 and y_true.shape[-1] > 1:
            y_true = tf.argmax(y_true, axis=-1)
        
        # Initialize confusion matrix
        conf_matrix = tf.math.confusion_matrix(
            y_true,
            y_pred,
            num_classes=self.num_classes,
            dtype=tf.float32
        )
        
        # Calculate TP, FP, FN for each class
        diag = tf.linalg.diag_part(conf_matrix)
        row_sum = tf.reduce_sum(conf_matrix, axis=1)
        col_sum = tf.reduce_sum(conf_matrix, axis=0)
        
        tp = diag
        fp = col_sum - diag
        fn = row_sum - diag
        
        # Update the state variables
        self.tp.assign_add(tf.reduce_sum(tp))
        self.fp.assign_add(tf.reduce_sum(fp))
        self.fn.assign_add(tf.reduce_sum(fn))
        self.count.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))

    def result(self):
        # Calculate precision and recall
        precision = self.tp / (self.tp + self.fp + tf.keras.backend.epsilon())
        recall = self.tp / (self.tp + self.fn + tf.keras.backend.epsilon())
        
        # Calculate F1 score
        f1 = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())
        
        # Return macro F1 (average of per-class F1 scores)
        return f1

    def reset_states(self):
        self.tp.assign(0.)
        self.fp.assign(0.)
        self.fn.assign(0.)
        self.count.assign(0.)

# Model Training for multi-task learning
def train_and_validate_multi_task_model(model, X_train, y_train_task1, y_train_task3, 
                                         X_val, y_val_task1, y_val_task3, 
                                         batch_size=32, epochs=15, model_dir='models_en_multi_task_gru'):
    """
    Trains the multi-task GRU-Attention model with early stopping and model checkpointing
    Returns the best model and training history
    """
    # Create directory for saving models if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        mode='min',
        verbose=1
    )
    
    model_checkpoint = ModelCheckpoint(
        os.path.join(model_dir, 'best_model_en_multi_task_gru.h5'),
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1
    )
    
    # Compile model with Adam optimizer
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={
            'task1_output': 'categorical_crossentropy',
            'task3_output': 'categorical_crossentropy'
        },
        loss_weights={
            'task1_output': 1.0,  # Weight for gendered abuse task
            'task3_output': 1.0   # Weight for explicit language task
        },
        metrics={
            'task1_output': ['accuracy', MacroF1Score(num_classes=2)],
            'task3_output': ['accuracy', MacroF1Score(num_classes=2)]
        }
    )
    
    # Train the model
    history = model.fit(
        X_train, 
        {'task1_output': y_train_task1, 'task3_output': y_train_task3},
        validation_data=(
            X_val, 
            {'task1_output': y_val_task1, 'task3_output': y_val_task3}
        ),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[early_stopping, model_checkpoint],
        verbose=1
    )
    
    # Load the best model found during training
    best_model = load_model(
        os.path.join(model_dir, 'best_model_en_multi_task_gru.h5'), 
        custom_objects={'MacroF1Score': MacroF1Score}
    )
    
    return history, best_model

# Plot Training History for multi-task model
def plot_multi_task_training_history(history, plot_dir='plots_en_multi_task_gru'):
    """
    Plots training history for both tasks (accuracy and loss curves)
    Saves plots to specified directory
    """
    os.makedirs(plot_dir, exist_ok=True)
    
    # Task 1 (Gendered Abuse) plots
    plt.figure(figsize=(12, 10))
    
    # Plot task1 accuracy
    plt.subplot(2, 2, 1)
    plt.plot(history.history['task1_output_accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_task1_output_accuracy'], label='Validation Accuracy')
    plt.title('Task 1 (Gendered Abuse) Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Plot task1 loss
    plt.subplot(2, 2, 2)
    plt.plot(history.history['task1_output_loss'], label='Train Loss')
    plt.plot(history.history['val_task1_output_loss'], label='Validation Loss')
    plt.title('Task 1 (Gendered Abuse) Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Task 3 (Explicit Language) plots
    # Plot task3 accuracy
    plt.subplot(2, 2, 3)
    plt.plot(history.history['task3_output_accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_task3_output_accuracy'], label='Validation Accuracy')
    plt.title('Task 3 (Explicit Language) Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Plot task3 loss
    plt.subplot(2, 2, 4)
    plt.plot(history.history['task3_output_loss'], label='Train Loss')
    plt.plot(history.history['val_task3_output_loss'], label='Validation Loss')
    plt.title('Task 3 (Explicit Language) Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, 'training_history_en_multi_task_gru.png'))
    plt.close()
    
    # Plot combined loss
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['loss'], label='Total Train Loss')
    plt.plot(history.history['val_loss'], label='Total Validation Loss')
    plt.title('Multi-Task Model Combined Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, 'combined_loss_en_multi_task_gru.png'))
    plt.close()
    
    # Plot macro F1 scores
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['task1_output_macro_f1_score'], label='Task1 Train F1')
    plt.plot(history.history['val_task1_output_macro_f1_score'], label='Task1 Val F1')
    plt.plot(history.history['task3_output_macro_f1_score'], label='Task3 Train F1')
    plt.plot(history.history['val_task3_output_macro_f1_score'], label='Task3 Val F1')
    plt.title('Multi-Task Model Macro F1 Scores')
    plt.ylabel('F1 Score')
    plt.xlabel('Epoch')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, 'f1_scores_en_multi_task_gru.png'))
    plt.close()

# Evaluation function for multi-task model
def evaluate_multi_task_validation(model, X_val, y_val_task1, y_val_task3, plot_dir='plots_en_multi_task_gru'):
    """
    Evaluates the multi-task model on validation data and saves metrics and plots
    """
    os.makedirs(plot_dir, exist_ok=True)
    
    # Predict probabilities
    y_pred_task1, y_pred_task3 = model.predict(X_val, batch_size=32)
    
    # Convert to class labels
    y_pred_task1_labels = np.argmax(y_pred_task1, axis=1)
    y_true_task1_labels = np.argmax(y_val_task1, axis=1)
    
    y_pred_task3_labels = np.argmax(y_pred_task3, axis=1)
    y_true_task3_labels = np.argmax(y_val_task3, axis=1)
    
    # Task 1 (Gendered Abuse) metrics
    task1_precision = precision_score(y_true_task1_labels, y_pred_task1_labels, average='weighted')
    task1_recall = recall_score(y_true_task1_labels, y_pred_task1_labels, average='weighted')
    task1_weighted_f1 = f1_score(y_true_task1_labels, y_pred_task1_labels, average='weighted')
    task1_macro_f1 = f1_score(y_true_task1_labels, y_pred_task1_labels, average='macro')
    
    # Task 3 (Explicit Language) metrics
    task3_precision = precision_score(y_true_task3_labels, y_pred_task3_labels, average='weighted')
    task3_recall = recall_score(y_true_task3_labels, y_pred_task3_labels, average='weighted')
    task3_weighted_f1 = f1_score(y_true_task3_labels, y_pred_task3_labels, average='weighted')
    task3_macro_f1 = f1_score(y_true_task3_labels, y_pred_task3_labels, average='macro')
    
    # Classification reports
    task1_report = classification_report(y_true_task1_labels, y_pred_task1_labels, 
                                        target_names=['not_hate', 'hate'])
    
    task3_report = classification_report(y_true_task3_labels, y_pred_task3_labels, 
                                        target_names=['not_explicit', 'explicit'])
    
    # Confusion matrices
    task1_conf_matrix = confusion_matrix(y_true_task1_labels, y_pred_task1_labels)
    task3_conf_matrix = confusion_matrix(y_true_task3_labels, y_pred_task3_labels)
    
    # Plot confusion matrices
    # Task 1 confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(task1_conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Hate', 'Hate'],
                yticklabels=['Not Hate', 'Hate'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Task 1 (Gendered Abuse) Confusion Matrix')
    plt.savefig(os.path.join(plot_dir, 'task1_confusion_matrix_en_multi_task_gru.png'))
    plt.close()
    
    # Task 3 confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(task3_conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Explicit', 'Explicit'],
                yticklabels=['Not Explicit', 'Explicit'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Task 3 (Explicit Language) Confusion Matrix')
    plt.savefig(os.path.join(plot_dir, 'task3_confusion_matrix_en_multi_task_gru.png'))
    plt.close()
    
    return {
        'task1': {
            'precision': task1_precision,
            'recall': task1_recall,
            'f1_score_weighted': task1_weighted_f1,
            'f1_score_macro': task1_macro_f1,
            'classification_report': task1_report,
            'confusion_matrix': task1_conf_matrix
        },
        'task3': {
            'precision': task3_precision,
            'recall': task3_recall,
            'f1_score_weighted': task3_weighted_f1,
            'f1_score_macro': task3_macro_f1,
            'classification_report': task3_report,
            'confusion_matrix': task3_conf_matrix
        }
    }

# Main Execution for Training and Validation
if __name__ == "__main__":
    # Split into train (80%) and validation (20%)
    X_train, X_val, y_train_task1, y_val_task1, y_train_task3, y_val_task3 = train_test_split(
        X, y_task1, y_task3, test_size=0.2, random_state=42, stratify=y_task1
    )
    
    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")
    
    # Create multi-task GRU-Attention model
    embed_size = embedding_matrix.shape[1]
    model = create_multi_task_gru_attention_model(max_len, max_features, embedding_matrix, embed_size)
    
    # Print model summary
    model.summary()
    
    # Train multi-task model
    history, trained_model = train_and_validate_multi_task_model(
        model, X_train, y_train_task1, y_train_task3, 
        X_val, y_val_task1, y_val_task3,
        batch_size=32,
        epochs=15  
    )
    
    # Plot training history
    plot_multi_task_training_history(history)
    
    # Evaluate on validation set
    val_results = evaluate_multi_task_validation(
        trained_model, X_val, y_val_task1, y_val_task3
    )
    
    # Print Task 1 (Gendered Abuse) results
    print("\nTask 1 (Gendered Abuse) Validation Results:")
    print(f"Precision: {val_results['task1']['precision']:.4f}")
    print(f"Recall: {val_results['task1']['recall']:.4f}")
    print(f"weighted F1 Score: {val_results['task1']['f1_score_weighted']:.4f}")
    print(f"macro F1 Score: {val_results['task1']['f1_score_macro']:.4f}")
    print("\nClassification Report:")
    print(val_results['task1']['classification_report'])
    
    # Print Task 3 (Explicit Language) results
    print("\nTask 3 (Explicit Language) Validation Results:")
    print(f"Precision: {val_results['task3']['precision']:.4f}")
    print(f"Recall: {val_results['task3']['recall']:.4f}")
    print(f"weighted F1 Score: {val_results['task3']['f1_score_weighted']:.4f}")
    print(f"macro F1 Score: {val_results['task3']['f1_score_macro']:.4f}")
    print("\nClassification Report:")
    print(val_results['task3']['classification_report'])

Training samples: 5224
Validation samples: 1307




Epoch 1/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step - loss: 1.7473 - task1_output_accuracy: 0.6014 - task1_output_loss: 0.8008 - task1_output_macro_f1_score: 0.6014 - task3_output_accuracy: 0.5357 - task3_output_loss: 0.9447 - task3_output_macro_f1_score: 0.5357 
Epoch 1: val_loss improved from inf to 1.16966, saving model to models_en_multi_task_gru\best_model_en_multi_task_gru.h5




[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 220ms/step - loss: 1.7461 - task1_output_accuracy: 0.6018 - task1_output_loss: 0.8002 - task1_output_macro_f1_score: 0.6018 - task3_output_accuracy: 0.5358 - task3_output_loss: 0.9441 - task3_output_macro_f1_score: 0.5358 - val_loss: 1.1697 - val_task1_output_accuracy: 0.7896 - val_task1_output_loss: 0.5133 - val_task1_output_macro_f1_score: 0.7896 - val_task3_output_accuracy: 0.6366 - val_task3_output_loss: 0.6560 - val_task3_output_macro_f1_score: 0.6366
Epoch 2/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step - loss: 1.3507 - task1_output_accuracy: 0.7557 - task1_output_loss: 0.5873 - task1_output_macro_f1_score: 0.7557 - task3_output_accuracy: 0.5816 - task3_output_loss: 0.7616 - task3_output_macro_f1_score: 0.5816 
Epoch 2: val_loss improved from 1.16966 to 1.16340, saving model to models_en_multi_task_gru\best_model_en_multi_task_gru.h5




[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 191ms/step - loss: 1.3505 - task1_output_accuracy: 0.7557 - task1_output_loss: 0.5872 - task1_output_macro_f1_score: 0.7557 - task3_output_accuracy: 0.5816 - task3_output_loss: 0.7615 - task3_output_macro_f1_score: 0.5816 - val_loss: 1.1634 - val_task1_output_accuracy: 0.7896 - val_task1_output_loss: 0.5109 - val_task1_output_macro_f1_score: 0.7896 - val_task3_output_accuracy: 0.6366 - val_task3_output_loss: 0.6524 - val_task3_output_macro_f1_score: 0.6366
Epoch 3/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step - loss: 1.2589 - task1_output_accuracy: 0.7720 - task1_output_loss: 0.5587 - task1_output_macro_f1_score: 0.7720 - task3_output_accuracy: 0.5993 - task3_output_loss: 0.6984 - task3_output_macro_f1_score: 0.5993 
Epoch 3: val_loss did not improve from 1.16340
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 191ms/step - loss: 1.2587 - task1_output_accuracy: 0.7720 



[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 197ms/step - loss: 1.0774 - task1_output_accuracy: 0.8023 - task1_output_loss: 0.4622 - task1_output_macro_f1_score: 0.8023 - task3_output_accuracy: 0.6695 - task3_output_loss: 0.6134 - task3_output_macro_f1_score: 0.6695 - val_loss: 1.1625 - val_task1_output_accuracy: 0.7682 - val_task1_output_loss: 0.5270 - val_task1_output_macro_f1_score: 0.7682 - val_task3_output_accuracy: 0.6442 - val_task3_output_loss: 0.6329 - val_task3_output_macro_f1_score: 0.6442
Epoch 6/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step - loss: 1.0018 - task1_output_accuracy: 0.8144 - task1_output_loss: 0.4214 - task1_output_macro_f1_score: 0.8144 - task3_output_accuracy: 0.6917 - task3_output_loss: 0.5787 - task3_output_macro_f1_score: 0.6917 
Epoch 6: val_loss improved from 1.16251 to 1.07816, saving model to models_en_multi_task_gru\best_model_en_multi_task_gru.h5




[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 204ms/step - loss: 1.0018 - task1_output_accuracy: 0.8144 - task1_output_loss: 0.4213 - task1_output_macro_f1_score: 0.8144 - task3_output_accuracy: 0.6917 - task3_output_loss: 0.5787 - task3_output_macro_f1_score: 0.6917 - val_loss: 1.0782 - val_task1_output_accuracy: 0.8041 - val_task1_output_loss: 0.4495 - val_task1_output_macro_f1_score: 0.8041 - val_task3_output_accuracy: 0.6626 - val_task3_output_loss: 0.6265 - val_task3_output_macro_f1_score: 0.6626
Epoch 7/15
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - loss: 0.9443 - task1_output_accuracy: 0.8178 - task1_output_loss: 0.3992 - task1_output_macro_f1_score: 0.8178 - task3_output_accuracy: 0.7208 - task3_output_loss: 0.5432 - task3_output_macro_f1_score: 0.7208 
Epoch 7: val_loss did not improve from 1.07816
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 203ms/step - loss: 0.9442 - task1_output_accuracy: 0.8179 



[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 71ms/step

Task 1 (Gendered Abuse) Validation Results:
Precision: 0.7810
Recall: 0.8041
weighted F1 Score: 0.7855
macro F1 Score: 0.6527

Classification Report:
              precision    recall  f1-score   support

    not_hate       0.84      0.93      0.88      1032
        hate       0.56      0.34      0.42       275

    accuracy                           0.80      1307
   macro avg       0.70      0.63      0.65      1307
weighted avg       0.78      0.80      0.79      1307


Task 3 (Explicit Language) Validation Results:
Precision: 0.6586
Recall: 0.6626
weighted F1 Score: 0.6603
macro F1 Score: 0.6296

Classification Report:
              precision    recall  f1-score   support

not_explicit       0.73      0.75      0.74       835
    explicit       0.53      0.50      0.52       472

    accuracy                           0.66      1307
   macro avg       0.63      0.63      0.63      1307
weighted avg       0.6

In [10]:
# Process label 1 dataset
df_l1 = pd.read_csv('test_en_l1.csv')
df_l1 = df_l1.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l1[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']] = df_l1[
    ['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_1' based on majority voting while ignoring NaNs
df_l1['label_1'] = (df_l1[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 1 (gendered abuse)
df_l1['binary_label_1'] = df_l1['label_1'].apply(lambda x: 'hate' if x == 1 else 'not_hate')

# Process label 3 dataset
df_l3 = pd.read_csv('test_en_l3.csv')
df_l3 = df_l3.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l3[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']] = df_l3[
    ['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_3' based on majority voting while ignoring NaNs
df_l3['label_3'] = (df_l3[['en_a1', 'en_a2', 'en_a3', 'en_a4', 'en_a5', 'en_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 3 (explicit/aggressive)
df_l3['binary_label_3'] = df_l3['label_3'].apply(lambda x: 'explicit' if x == 1 else 'not_explicit')

# Select columns for merging
df_l1_slim = df_l1[['text', 'label_1', 'binary_label_1']]
df_l3_slim = df_l3[['text', 'label_3', 'binary_label_3']]

# Merge the datasets based on text field
merged_df = pd.merge(df_l1_slim, df_l3_slim, on='text', how='inner')

# Add a unique_id column to the merged dataset
merged_df['unique_id'] = [f'id_{i}' for i in range(len(merged_df))]

# Reorder columns
merged_df = merged_df[['unique_id', 'text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]

# Save the merged dataset
merged_df.to_csv('test_en_task3.csv', index=False)

# Display information
print(f"Final merged dataset shape: {merged_df.shape}")
print(merged_df.head())

# Check label distribution
print("\nLabel distribution:")
print(f"Label 1 (gendered abuse): {merged_df['binary_label_1'].value_counts().to_dict()}")
print(f"Label 3 (explicit/aggressive): {merged_df['binary_label_3'].value_counts().to_dict()}")

Final merged dataset shape: (1107, 6)
  unique_id                                               text binary_label_1  \
0      id_0  "who's this tweet about" bitch it's about you ...       not_hate   
1      id_1  #AlkaLamba should’ve used her slippers to slap...       not_hate   
2      id_2  #CoronaVillains  #COVIDIDIOT   #COVID2019 is a...           hate   
3      id_3  (Acts 29:19994) LIKE the female "governors" of...           hate   
4      id_4  ***Let us celebrate “secular” as a beautiful w...       not_hate   

   label_1 binary_label_3  label_3  
0        0   not_explicit        0  
1        0   not_explicit        0  
2        1       explicit        1  
3        1       explicit        1  
4        0   not_explicit        0  

Label distribution:
Label 1 (gendered abuse): {'not_hate': 877, 'hate': 230}
Label 3 (explicit/aggressive): {'not_explicit': 699, 'explicit': 408}


In [11]:
merged_df = pd.read_csv('test_en_task3.csv')

# Apply text normalization
merged_df['processed_text'] = merged_df['text'].apply(lambda x: normalize_text(x))

# Further processing to remove '...'
merged_df['processed_text'] = merged_df['processed_text'].str.replace('...', '')

# Display samples of processed text
print("\nOriginal vs Processed Text Samples:")
for i in range(3):
    print(f"Original: {merged_df['text'].iloc[i]}")
    print(f"Processed: {merged_df['processed_text'].iloc[i]}")
    print("-" * 50)

# Keep all columns but add processed text
merged_df_final = merged_df[['unique_id', 'text', 'processed_text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]


print(f"\nProcessed dataset shape: {merged_df_final.shape}")
print(merged_df_final.head())

# Extract features (processed text)
X = list(merged_df_final['processed_text'])

# Extract labels for both tasks
y_task1 = merged_df_final['label_1'].values
y_task3 = merged_df_final['label_3'].values
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Padding
X = pad_sequences(X, padding='post', maxlen=max_len)

y_task1 = label_encoder.fit_transform(y_task1)
y_task3 = label_encoder.fit_transform(y_task3)

y_task1 = to_categorical(y_task1, num_classes=2)
y_task3 = to_categorical(y_task3, num_classes=2)


Original vs Processed Text Samples:
Original: "who's this tweet about" bitch it's about you fucking kiss me already
Processed: who s this tweet about bitch it s about you fucking kiss me already
--------------------------------------------------
Original: #AlkaLamba should’ve used her slippers to slap that misogynistic idiot.
Processed: alkalamba should’ve used her slippers to slap that misogynistic idiot
--------------------------------------------------
Original: #CoronaVillains  #COVIDIDIOT   #COVID2019 is a #PandemicCovid19   Don't spread your #FakeNewsAlert promoting #FakeCases  #womenaretrash
Processed: coronavillains covididiot is a don t spread your fakenewsalert promoting fakecases womenaretrash
--------------------------------------------------

Processed dataset shape: (1107, 7)
  unique_id                                               text  \
0      id_0  "who's this tweet about" bitch it's about you ...   
1      id_1  #AlkaLamba should’ve used her slippers to slap...   


In [12]:
# Evaluate on validation set
test_results = evaluate_multi_task_validation(
    trained_model, X, y_task1, y_task3
)

# Print Task 1 (Gendered Abuse) results
print("\nTask 1 (Gendered Abuse) Test Results:")
print(f"Precision: {test_results['task1']['precision']:.4f}")
print(f"Recall: {test_results['task1']['recall']:.4f}")
print(f"weighted F1 Score: {test_results['task1']['f1_score_weighted']:.4f}")
print(f"macro F1 Score: {test_results['task1']['f1_score_macro']:.4f}")
print("\nClassification Report:")
print(test_results['task1']['classification_report'])

# Print Task 3 (Explicit Language) results
print("\nTask 3 (Explicit Language) Test Results:")
print(f"Precision: {test_results['task3']['precision']:.4f}")
print(f"Recall: {test_results['task3']['recall']:.4f}")
print(f"weighted F1 Score: {test_results['task3']['f1_score_weighted']:.4f}")
print(f"macro F1 Score: {test_results['task3']['f1_score_macro']:.4f}")
print("\nClassification Report:")
print(test_results['task3']['classification_report'])

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step

Task 1 (Gendered Abuse) Test Results:
Precision: 0.7018
Recall: 0.7498
weighted F1 Score: 0.7190
macro F1 Score: 0.5332

Classification Report:
              precision    recall  f1-score   support

    not_hate       0.80      0.90      0.85       877
        hate       0.31      0.17      0.22       230

    accuracy                           0.75      1107
   macro avg       0.56      0.53      0.53      1107
weighted avg       0.70      0.75      0.72      1107


Task 3 (Explicit Language) Test Results:
Precision: 0.5594
Recall: 0.5682
weighted F1 Score: 0.5631
macro F1 Score: 0.5258

Classification Report:
              precision    recall  f1-score   support

not_explicit       0.65      0.69      0.67       699
    explicit       0.40      0.37      0.38       408

    accuracy                           0.57      1107
   macro avg       0.53      0.53      0.53      1107
weighted avg       0.56      0.57 