In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.adapt import MLkNN
import sklearn.metrics as metrics
from sklearn.metrics import hamming_loss, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import multilabel_confusion_matrix
import os

In [2]:
# Process label 1 dataset
df_l1 = pd.read_csv('train_ta_l1.csv')
df_l1 = df_l1.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l1[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']] = df_l1[
    ['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_1' based on majority voting while ignoring NaNs
df_l1['label_1'] = (df_l1[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 1 (gendered abuse)
df_l1['binary_label_1'] = df_l1['label_1'].apply(lambda x: 'hate' if x == 1 else 'not_hate')

# Process label 3 dataset
df_l3 = pd.read_csv('train_ta_l3.csv')
df_l3 = df_l3.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l3[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']] = df_l3[
    ['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_3' based on majority voting while ignoring NaNs
df_l3['label_3'] = (df_l3[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 3 (explicit/aggressive)
df_l3['binary_label_3'] = df_l3['label_3'].apply(lambda x: 'explicit' if x == 1 else 'not_explicit')

# Select columns for merging
df_l1_slim = df_l1[['text', 'label_1', 'binary_label_1']]
df_l3_slim = df_l3[['text', 'label_3', 'binary_label_3']]

# Merge the datasets based on text field
merged_df = pd.merge(df_l1_slim, df_l3_slim, on='text', how='inner')

# Add a unique_id column to the merged dataset
merged_df['unique_id'] = [f'id_{i}' for i in range(len(merged_df))]

# Reorder columns
merged_df = merged_df[['unique_id', 'text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]

# Save the merged dataset
merged_df.to_csv('train_ta_task3.csv', index=False)

# Display information
print(f"Final merged dataset shape: {merged_df.shape}")
print(merged_df.head())

# Check label distribution
print("\nLabel distribution:")
print(f"Label 1 (gendered abuse): {merged_df['binary_label_1'].value_counts().to_dict()}")
print(f"Label 3 (explicit/aggressive): {merged_df['binary_label_3'].value_counts().to_dict()}")

Final merged dataset shape: (6779, 6)
  unique_id                                               text binary_label_1  \
0      id_0     *1. முரசொலி அலுவலகம் அமைந்துள்ள இடம் பஞ்சமி...       not_hate   
1      id_1     சோத்துக்கு பிச்சை எடுக்கிற கடங்கார நாய்களுக...       not_hate   
2      id_2           தத்தபுத்த தத்தபுத்த ன்னு எதாவது புரியுதா       not_hate   
3      id_3      பச்சை மொளகா காரம் vicky அம்மா புண்டை நாறும் 😆           hate   
4      id_4    என்ன உடம்பு டா சாமி- சும்மா வளுவளுனு.. முலை ...           hate   

   label_1 binary_label_3  label_3  
0        0   not_explicit        0  
1        0       explicit        1  
2        0   not_explicit        0  
3        1       explicit        1  
4        1       explicit        1  

Label distribution:
Label 1 (gendered abuse): {'not_hate': 3822, 'hate': 2957}
Label 3 (explicit/aggressive): {'explicit': 4400, 'not_explicit': 2379}


In [3]:
import nltk
import string
import re

def normalize_text(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F700-\U0001F77F"
                               u"\U0001F780-\U0001F7FF"
                               u"\U0001F800-\U0001F8FF"
                               u"\U0001F900-\U0001F9FF"
                               u"\U0001FA00-\U0001FA6F"
                               u"\U0001FA70-\U0001FAFF"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = text.lower()
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = re.sub(r'<handle replaced>', '', text)
    text = emoji_pattern.sub(r'', text)
    return text

# Load the multi-task dataset
merged_df = pd.read_csv('train_ta_task3.csv')

# Apply text normalization
merged_df['processed_text'] = merged_df['text'].apply(lambda x: normalize_text(x))

# Further processing to remove '...'
merged_df['processed_text'] = merged_df['processed_text'].str.replace('...', '')

# Display samples of processed text
print("\nOriginal vs Processed Text Samples:")
for i in range(3):
    print(f"Original: {merged_df['text'].iloc[i]}")
    print(f"Processed: {merged_df['processed_text'].iloc[i]}")
    print("-" * 50)

# Keep all columns but add processed text
merged_df_final = merged_df[['unique_id', 'text', 'processed_text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]


print(f"\nProcessed dataset shape: {merged_df_final.shape}")
print(merged_df_final.head())


Original vs Processed Text Samples:
Original:    *1. முரசொலி அலுவலகம் அமைந்துள்ள இடம் பஞ்சமி நிலம் இல்லை என்பதை நிரூபிக்க 1985-ஆம் ஆண்டு வாங்கப்பட்ட பட்டாவை ஆதாரமாகக் காட்டியிருக்கிறார் மு.க.ஸ்டாலின்.  இதற்கு காட்ட வேண்டிய...  
Processed:        முரசொலி அலுவலகம் அமைந்துள்ள இடம் பஞ்சமி நிலம் இல்லை என்பதை நிரூபிக்க   ஆம் ஆண்டு வாங்கப்பட்ட பட்டாவை ஆதாரமாகக் காட்டியிருக்கிறார் மு க ஸ்டாலின்   இதற்கு காட்ட வேண்டிய     
--------------------------------------------------
Original:    சோத்துக்கு பிச்சை எடுக்கிற கடங்கார நாய்களுக்கு பேச்சு புண்டையை  பாரு..   போய் சீனா காரனுக்கு சூத்து குடு போ
Processed:    சோத்துக்கு பிச்சை எடுக்கிற கடங்கார நாய்களுக்கு பேச்சு புண்டையை  பாரு     போய் சீனா காரனுக்கு சூத்து குடு போ
--------------------------------------------------
Original:    தத்தபுத்த தத்தபுத்த ன்னு எதாவது புரியுதா
Processed:    தத்தபுத்த தத்தபுத்த ன்னு எதாவது புரியுதா
--------------------------------------------------

Processed dataset shape: (6779, 7)
  unique_id                             

In [4]:
# Extract features (processed text)
X = list(merged_df_final['processed_text'])

# Extract labels for both tasks
y_task1 = merged_df_final['label_1'].values
y_task3 = merged_df_final['label_3'].values

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    LSTM, Activation, Dropout, Dense, Flatten,
    Bidirectional, GRU, concatenate, SpatialDropout1D,
    GlobalMaxPooling1D, GlobalAveragePooling1D, Conv1D,
    Embedding, Input, Concatenate
)
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.losses import MeanSquaredError

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

######## Textual Features for Embedding ###################
max_len = 100
max_features = 4479

# Tokenization
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Padding
X = pad_sequences(X, padding='post', maxlen=max_len)

print(X)  # Check the processed sequences

[[1557 2466  444 ...    0    0    0]
 [1396  323 2468 ...    0    0    0]
 [ 135  520 2470 ...    0    0    0]
 ...
 [3879 2142 1100 ...    0    0    0]
 [ 406  430  602 ...    0    0    0]
 [   4  850   13 ...    0    0    0]]


In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_task1 = label_encoder.fit_transform(y_task1)
y_task3 = label_encoder.fit_transform(y_task3)

In [7]:
from keras.utils import to_categorical
y_task1 = to_categorical(y_task1, num_classes=2)
y_task3 = to_categorical(y_task3, num_classes=2)

In [8]:
import numpy as np
import json

# Load GloVe embeddings from JSON
with open('glove_embeddings.json', encoding="utf8") as f:
    embeddings_list = json.load(f)

# Convert the list of vectors to a dictionary with word indices as keys
embeddings_dictionary = {str(i): vector for i, vector in enumerate(embeddings_list)}

# Define tokenizer 
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
word_index = tokenizer.word_index
num_words = min(max_features, vocab_size)  # Limit vocab to max_features

# Get embedding dimension (from first vector in list)
embed_size = len(embeddings_list[0]) if embeddings_list else 0

# Initialize embedding matrix
embedding_matrix = np.zeros((num_words, embed_size))

# Fill embedding matrix with corresponding word vectors
for word, index in word_index.items():
    if index >= max_features:
        continue
    embedding_vector = embeddings_dictionary.get(word) or embeddings_dictionary.get(str(index))
    if embedding_vector is not None:
        embedding_matrix[index] = np.asarray(embedding_vector, dtype=np.float32)

print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (4479, 50)


In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Embedding, SpatialDropout1D, Conv1D,
    Bidirectional, LSTM, Dense, Dropout,
    GlobalAveragePooling1D
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Configure GPU for optimal performance
def configure_gpu():
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Enable memory growth for each GPU
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.list_logical_devices('GPU')
            print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
            # Use mixed precision for better performance
            policy = tf.keras.mixed_precision.Policy('mixed_float16')
            tf.keras.mixed_precision.set_global_policy(policy)
            print('Mixed precision enabled')
        except RuntimeError as e:
            print(e)

configure_gpu()

# Multi-Task Model Definition
def create_multi_task_cnn_bilstm_model(max_len, max_features, embedding_matrix, embed_size=300):
    """
    Creates a multi-task CNN-BiLSTM model architecture for joint prediction of
    gendered abuse (task1) and explicit language (task3)
    """
    # Input layer
    input_layer = Input(shape=(max_len,))
    
    # Embedding layer with pretrained weights (GloVe/FastText)
    embedding_layer = Embedding(
        input_dim=max_features,
        output_dim=embed_size,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False  # As per paper, embeddings are non-trainable
    )(input_layer)
    
    # Spatial Dropout to prevent overfitting
    spatial_dropout = SpatialDropout1D(0.2)(embedding_layer)
    
    # CNN Layer - shared representation
    conv_layer = Conv1D(
        filters=64,
        kernel_size=2,
        activation='relu',
        padding='same'
    )(spatial_dropout)
    
    # Bidirectional LSTM Layer - shared representation
    bilstm_layer = Bidirectional(
        LSTM(
            units=128,
            return_sequences=True,
            dropout=0.1,
            recurrent_dropout=0.1
        )
    )(conv_layer)
    
    # Global Average Pooling - shared representation
    gap_layer = GlobalAveragePooling1D()(bilstm_layer)
    
    # Task-specific layers for Task 1 (Gendered Abuse)
    task1_dense = Dense(128, activation='relu', name='task1_dense')(gap_layer)
    task1_dropout = Dropout(0.1)(task1_dense)
    task1_output = Dense(2, activation='softmax', name='task1_output', dtype='float32')(task1_dropout)
    
    # Task-specific layers for Task 3 (Explicit Language)
    task3_dense = Dense(128, activation='relu', name='task3_dense')(gap_layer)
    task3_dropout = Dropout(0.1)(task3_dense)
    task3_output = Dense(2, activation='softmax', name='task3_output', dtype='float32')(task3_dropout)
    
    # Create model with multiple outputs
    model = Model(inputs=input_layer, outputs=[task1_output, task3_output])
    
    return model

# Custom MacroF1Score Metric for multi-task learning
class MacroF1Score(tf.keras.metrics.Metric):
    def __init__(self, num_classes=2, name='macro_f1_score', **kwargs):
        super(MacroF1Score, self).__init__(name=name, **kwargs)
        self.num_classes = num_classes
        self.tp = self.add_weight(name='tp', initializer='zeros')
        self.fp = self.add_weight(name='fp', initializer='zeros')
        self.fn = self.add_weight(name='fn', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Convert probabilities to predicted class indices
        y_pred = tf.argmax(y_pred, axis=-1)
        
        # Convert one-hot encoded y_true to class indices if needed
        if len(y_true.shape) > 1 and y_true.shape[-1] > 1:
            y_true = tf.argmax(y_true, axis=-1)
        
        # Initialize confusion matrix
        conf_matrix = tf.math.confusion_matrix(
            y_true,
            y_pred,
            num_classes=self.num_classes,
            dtype=tf.float32
        )
        
        # Calculate TP, FP, FN for each class
        diag = tf.linalg.diag_part(conf_matrix)
        row_sum = tf.reduce_sum(conf_matrix, axis=1)
        col_sum = tf.reduce_sum(conf_matrix, axis=0)
        
        tp = diag
        fp = col_sum - diag
        fn = row_sum - diag
        
        # Update the state variables
        self.tp.assign_add(tf.reduce_sum(tp))
        self.fp.assign_add(tf.reduce_sum(fp))
        self.fn.assign_add(tf.reduce_sum(fn))
        self.count.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))

    def result(self):
        # Calculate precision and recall
        precision = self.tp / (self.tp + self.fp + tf.keras.backend.epsilon())
        recall = self.tp / (self.tp + self.fn + tf.keras.backend.epsilon())
        
        # Calculate F1 score
        f1 = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())
        
        # Return macro F1 (average of per-class F1 scores)
        return f1

    def reset_states(self):
        self.tp.assign(0.)
        self.fp.assign(0.)
        self.fn.assign(0.)
        self.count.assign(0.)

# Model Training for multi-task learning
def train_and_validate_multi_task_model(model, X_train, y_train_task1, y_train_task3, 
                                         X_val, y_val_task1, y_val_task3, 
                                         batch_size=32, epochs=15, model_dir='models_ta_task3_m1'):
    """
    Trains the multi-task CNN-BiLSTM model with early stopping and model checkpointing
    Returns the best model and training history
    """
    # Create directory for saving models if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        mode='min',
        verbose=1
    )
    
    model_checkpoint = ModelCheckpoint(
        os.path.join(model_dir, 'best_model_ta_task3_m1.h5'),
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1
    )
    
    # Compile model with Adam optimizer
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={
            'task1_output': 'categorical_crossentropy',
            'task3_output': 'categorical_crossentropy'
        },
        loss_weights={
            'task1_output': 1.0,  # Weight for gendered abuse task
            'task3_output': 1.0   # Weight for explicit language task
        },
        metrics={
            'task1_output': ['accuracy', MacroF1Score(num_classes=2)],
            'task3_output': ['accuracy', MacroF1Score(num_classes=2)]
        }
    )
    
    # Train the model
    history = model.fit(
        X_train, 
        {'task1_output': y_train_task1, 'task3_output': y_train_task3},
        validation_data=(
            X_val, 
            {'task1_output': y_val_task1, 'task3_output': y_val_task3}
        ),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[early_stopping, model_checkpoint],
        verbose=1
    )
    
    # Load the best model found during training
    best_model = load_model(
        os.path.join(model_dir, 'best_model_ta_task3_m1.h5'), 
        custom_objects={'MacroF1Score': MacroF1Score}
    )
    
    return history, best_model

# Plot Training History for multi-task model
def plot_multi_task_training_history(history, plot_dir='plots_ta_task3_m1'):
    """
    Plots training history for both tasks (accuracy and loss curves)
    Saves plots to specified directory
    """
    os.makedirs(plot_dir, exist_ok=True)
    
    # Task 1 (Gendered Abuse) plots
    plt.figure(figsize=(12, 10))
    
    # Plot task1 accuracy
    plt.subplot(2, 2, 1)
    plt.plot(history.history['task1_output_accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_task1_output_accuracy'], label='Validation Accuracy')
    plt.title('Task 1 (Gendered Abuse) Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Plot task1 loss
    plt.subplot(2, 2, 2)
    plt.plot(history.history['task1_output_loss'], label='Train Loss')
    plt.plot(history.history['val_task1_output_loss'], label='Validation Loss')
    plt.title('Task 1 (Gendered Abuse) Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Task 3 (Explicit Language) plots
    # Plot task3 accuracy
    plt.subplot(2, 2, 3)
    plt.plot(history.history['task3_output_accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_task3_output_accuracy'], label='Validation Accuracy')
    plt.title('Task 3 (Explicit Language) Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Plot task3 loss
    plt.subplot(2, 2, 4)
    plt.plot(history.history['task3_output_loss'], label='Train Loss')
    plt.plot(history.history['val_task3_output_loss'], label='Validation Loss')
    plt.title('Task 3 (Explicit Language) Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, 'training_history_ta_task3_m1.png'))
    plt.close()
    
    # Plot combined loss
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['loss'], label='Total Train Loss')
    plt.plot(history.history['val_loss'], label='Total Validation Loss')
    plt.title('Multi-Task Model Combined Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, 'combined_loss_ta_task3_m1.png'))
    plt.close()

# Evaluation function for multi-task model
def evaluate_multi_task_validation(model, X_val, y_val_task1, y_val_task3, plot_dir='plots_ta_task3_m1'):
    """
    Evaluates the multi-task model on validation data and saves metrics and plots
    """
    os.makedirs(plot_dir, exist_ok=True)
    
    # Predict probabilities
    y_pred_task1, y_pred_task3 = model.predict(X_val, batch_size=32)
    
    # Convert to class labels
    y_pred_task1_labels = np.argmax(y_pred_task1, axis=1)
    y_true_task1_labels = np.argmax(y_val_task1, axis=1)
    
    y_pred_task3_labels = np.argmax(y_pred_task3, axis=1)
    y_true_task3_labels = np.argmax(y_val_task3, axis=1)
    
    # Task 1 (Gendered Abuse) metrics
    task1_precision = precision_score(y_true_task1_labels, y_pred_task1_labels, average='weighted')
    task1_recall = recall_score(y_true_task1_labels, y_pred_task1_labels, average='weighted')
    task1_weighted_f1 = f1_score(y_true_task1_labels, y_pred_task1_labels, average='weighted')
    task1_macro_f1 = f1_score(y_true_task1_labels, y_pred_task1_labels, average='macro')
    
    # Task 3 (Explicit Language) metrics
    task3_precision = precision_score(y_true_task3_labels, y_pred_task3_labels, average='weighted')
    task3_recall = recall_score(y_true_task3_labels, y_pred_task3_labels, average='weighted')
    task3_weighted_f1 = f1_score(y_true_task3_labels, y_pred_task3_labels, average='weighted')
    task3_macro_f1 = f1_score(y_true_task3_labels, y_pred_task3_labels, average='macro')
    
    # Classification reports
    task1_report = classification_report(y_true_task1_labels, y_pred_task1_labels, 
                                        target_names=['not_hate', 'hate'])
    
    task3_report = classification_report(y_true_task3_labels, y_pred_task3_labels, 
                                        target_names=['not_explicit', 'explicit'])
    
    # Confusion matrices
    task1_conf_matrix = confusion_matrix(y_true_task1_labels, y_pred_task1_labels)
    task3_conf_matrix = confusion_matrix(y_true_task3_labels, y_pred_task3_labels)
    
    # Plot confusion matrices
    # Task 1 confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(task1_conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Hate', 'Hate'],
                yticklabels=['Not Hate', 'Hate'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Task 1 (Gendered Abuse) Confusion Matrix')
    plt.savefig(os.path.join(plot_dir, 'task1_confusion_matrix_ta_task3_m1.png'))
    plt.close()
    
    # Task 3 confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(task3_conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Explicit', 'Explicit'],
                yticklabels=['Not Explicit', 'Explicit'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Task 3 (Explicit Language) Confusion Matrix')
    plt.savefig(os.path.join(plot_dir, 'task3_confusion_matrix_ta_task3_m1.png'))
    plt.close()
    
    return {
        'task1': {
            'precision': task1_precision,
            'recall': task1_recall,
            'f1_score_weighted': task1_weighted_f1,
            'f1_score_macro': task1_macro_f1,
            'classification_report': task1_report,
            'confusion_matrix': task1_conf_matrix
        },
        'task3': {
            'precision': task3_precision,
            'recall': task3_recall,
            'f1_score_weighted': task3_weighted_f1,
            'f1_score_macro': task3_macro_f1,
            'classification_report': task3_report,
            'confusion_matrix': task3_conf_matrix
        }
    }

# Main Execution for Training and Validation
if __name__ == "__main__":
    # Split into train (80%) and validation (20%)
    X_train, X_val, y_train_task1, y_val_task1, y_train_task3, y_val_task3 = train_test_split(
        X, y_task1, y_task3, test_size=0.2, random_state=42, stratify=y_task1
    )
    
    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")
    
    # Create multi-task model
    embed_size = embedding_matrix.shape[1]
    model = create_multi_task_cnn_bilstm_model(max_len, max_features, embedding_matrix, embed_size)
    
    # Print model summary
    model.summary()
    
    # Train multi-task model
    history, trained_model = train_and_validate_multi_task_model(
        model, X_train, y_train_task1, y_train_task3, 
        X_val, y_val_task1, y_val_task3,
        batch_size=32,
        epochs=15  
    )
    
    # Plot training history
    plot_multi_task_training_history(history)
    
    # Evaluate on validation set
    val_results = evaluate_multi_task_validation(
        trained_model, X_val, y_val_task1, y_val_task3
    )
    
    # Print Task 1 (Gendered Abuse) results
    print("\nTask 1 (Gendered Abuse) Validation Results:")
    print(f"Precision: {val_results['task1']['precision']:.4f}")
    print(f"Recall: {val_results['task1']['recall']:.4f}")
    print(f"weighted F1 Score: {val_results['task1']['f1_score_weighted']:.4f}")
    print(f"macro F1 Score: {val_results['task1']['f1_score_macro']:.4f}")
    print("\nClassification Report:")
    print(val_results['task1']['classification_report'])
    
    # Print Task 3 (Explicit Language) results
    print("\nTask 3 (Explicit Language) Validation Results:")
    print(f"Precision: {val_results['task3']['precision']:.4f}")
    print(f"Recall: {val_results['task3']['recall']:.4f}")
    print(f"weighted F1 Score: {val_results['task3']['f1_score_weighted']:.4f}")
    print(f"macro F1 Score: {val_results['task3']['f1_score_macro']:.4f}")
    print("\nClassification Report:")
    print(val_results['task3']['classification_report'])

Training samples: 5423
Validation samples: 1356




Epoch 1/15
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - loss: 1.3422 - task1_output_accuracy: 0.5496 - task1_output_loss: 0.6877 - task1_output_macro_f1_score: 0.5496 - task3_output_accuracy: 0.6516 - task3_output_loss: 0.6545 - task3_output_macro_f1_score: 0.6516  
Epoch 1: val_loss improved from inf to 1.32825, saving model to models_ta_task3_m1\best_model_ta_task3_m1.h5




[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 124ms/step - loss: 1.3421 - task1_output_accuracy: 0.5497 - task1_output_loss: 0.6877 - task1_output_macro_f1_score: 0.5497 - task3_output_accuracy: 0.6516 - task3_output_loss: 0.6544 - task3_output_macro_f1_score: 0.6516 - val_loss: 1.3283 - val_task1_output_accuracy: 0.5642 - val_task1_output_loss: 0.6834 - val_task1_output_macro_f1_score: 0.5642 - val_task3_output_accuracy: 0.6527 - val_task3_output_loss: 0.6436 - val_task3_output_macro_f1_score: 0.6527
Epoch 2/15
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - loss: 1.3267 - task1_output_accuracy: 0.5641 - task1_output_loss: 0.6819 - task1_output_macro_f1_score: 0.5641 - task3_output_accuracy: 0.6468 - task3_output_loss: 0.6449 - task3_output_macro_f1_score: 0.6468  
Epoch 2: val_loss improved from 1.32825 to 1.30570, saving model to models_ta_task3_m1\best_model_ta_task3_m1.h5




[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 103ms/step - loss: 1.3267 - task1_output_accuracy: 0.5641 - task1_output_loss: 0.6819 - task1_output_macro_f1_score: 0.5641 - task3_output_accuracy: 0.6468 - task3_output_loss: 0.6449 - task3_output_macro_f1_score: 0.6468 - val_loss: 1.3057 - val_task1_output_accuracy: 0.5737 - val_task1_output_loss: 0.6743 - val_task1_output_macro_f1_score: 0.5737 - val_task3_output_accuracy: 0.6593 - val_task3_output_loss: 0.6308 - val_task3_output_macro_f1_score: 0.6593
Epoch 3/15
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - loss: 1.3037 - task1_output_accuracy: 0.5842 - task1_output_loss: 0.6731 - task1_output_macro_f1_score: 0.5842 - task3_output_accuracy: 0.6536 - task3_output_loss: 0.6306 - task3_output_macro_f1_score: 0.6536  
Epoch 3: val_loss improved from 1.30570 to 1.28204, saving model to models_ta_task3_m1\best_model_ta_task3_m1.h5




[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 106ms/step - loss: 1.3037 - task1_output_accuracy: 0.5842 - task1_output_loss: 0.6731 - task1_output_macro_f1_score: 0.5842 - task3_output_accuracy: 0.6536 - task3_output_loss: 0.6306 - task3_output_macro_f1_score: 0.6536 - val_loss: 1.2820 - val_task1_output_accuracy: 0.5988 - val_task1_output_loss: 0.6639 - val_task1_output_macro_f1_score: 0.5988 - val_task3_output_accuracy: 0.6652 - val_task3_output_loss: 0.6181 - val_task3_output_macro_f1_score: 0.6652
Epoch 4/15
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - loss: 1.2904 - task1_output_accuracy: 0.5832 - task1_output_loss: 0.6705 - task1_output_macro_f1_score: 0.5832 - task3_output_accuracy: 0.6703 - task3_output_loss: 0.6199 - task3_output_macro_f1_score: 0.6703 
Epoch 4: val_loss did not improve from 1.28204
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 128ms/step - loss: 1.2904 - task1_output_accuracy: 0.5832 



[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 123ms/step - loss: 1.2641 - task1_output_accuracy: 0.5891 - task1_output_loss: 0.6546 - task1_output_macro_f1_score: 0.5891 - task3_output_accuracy: 0.6792 - task3_output_loss: 0.6095 - task3_output_macro_f1_score: 0.6792 - val_loss: 1.2761 - val_task1_output_accuracy: 0.5914 - val_task1_output_loss: 0.6654 - val_task1_output_macro_f1_score: 0.5914 - val_task3_output_accuracy: 0.6681 - val_task3_output_loss: 0.6107 - val_task3_output_macro_f1_score: 0.6681
Epoch 7/15
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - loss: 1.2484 - task1_output_accuracy: 0.5941 - task1_output_loss: 0.6490 - task1_output_macro_f1_score: 0.5941 - task3_output_accuracy: 0.6890 - task3_output_loss: 0.5995 - task3_output_macro_f1_score: 0.6890 
Epoch 7: val_loss improved from 1.27609 to 1.27368, saving model to models_ta_task3_m1\best_model_ta_task3_m1.h5




[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 123ms/step - loss: 1.2485 - task1_output_accuracy: 0.5941 - task1_output_loss: 0.6490 - task1_output_macro_f1_score: 0.5941 - task3_output_accuracy: 0.6890 - task3_output_loss: 0.5995 - task3_output_macro_f1_score: 0.6890 - val_loss: 1.2737 - val_task1_output_accuracy: 0.6069 - val_task1_output_loss: 0.6632 - val_task1_output_macro_f1_score: 0.6069 - val_task3_output_accuracy: 0.6637 - val_task3_output_loss: 0.6101 - val_task3_output_macro_f1_score: 0.6637
Epoch 8/15
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - loss: 1.2549 - task1_output_accuracy: 0.5905 - task1_output_loss: 0.6530 - task1_output_macro_f1_score: 0.5905 - task3_output_accuracy: 0.6877 - task3_output_loss: 0.6018 - task3_output_macro_f1_score: 0.6877 
Epoch 8: val_loss did not improve from 1.27368
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 109ms/step - loss: 1.2548 - task1_output_accuracy: 0.5905 



[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step  

Task 1 (Gendered Abuse) Validation Results:
Precision: 0.6188
Recall: 0.6069
weighted F1 Score: 0.5509
macro F1 Score: 0.5258

Classification Report:
              precision    recall  f1-score   support

    not_hate       0.60      0.90      0.72       765
        hate       0.64      0.22      0.33       591

    accuracy                           0.61      1356
   macro avg       0.62      0.56      0.53      1356
weighted avg       0.62      0.61      0.55      1356


Task 3 (Explicit Language) Validation Results:
Precision: 0.6376
Recall: 0.6637
weighted F1 Score: 0.6067
macro F1 Score: 0.5337

Classification Report:
              precision    recall  f1-score   support

not_explicit       0.56      0.19      0.29       477
    explicit       0.68      0.92      0.78       879

    accuracy                           0.66      1356
   macro avg       0.62      0.56      0.53      1356
weighted avg       0

In [10]:
# Process label 1 dataset
df_l1 = pd.read_csv('test_ta_l1.csv')
df_l1 = df_l1.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l1[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']] = df_l1[
    ['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_1' based on majority voting while ignoring NaNs
df_l1['label_1'] = (df_l1[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 1 (gendered abuse)
df_l1['binary_label_1'] = df_l1['label_1'].apply(lambda x: 'hate' if x == 1 else 'not_hate')

# Process label 3 dataset
df_l3 = pd.read_csv('test_ta_l3.csv')
df_l3 = df_l3.rename(columns={'key': 'unique_id', 'sentence': 'text'})

# Convert annotator columns to numeric without replacing NaNs
df_l3[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']] = df_l3[
    ['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']
].apply(pd.to_numeric, errors='coerce')

# Compute 'label_3' based on majority voting while ignoring NaNs
df_l3['label_3'] = (df_l3[['ta_a1', 'ta_a2', 'ta_a3', 'ta_a4', 'ta_a5', 'ta_a6']].mean(axis=1, skipna=True) >= 0.5).astype(int)

# Create proper binary label for task 3 (explicit/aggressive)
df_l3['binary_label_3'] = df_l3['label_3'].apply(lambda x: 'explicit' if x == 1 else 'not_explicit')

# Select columns for merging
df_l1_slim = df_l1[['text', 'label_1', 'binary_label_1']]
df_l3_slim = df_l3[['text', 'label_3', 'binary_label_3']]

# Merge the datasets based on text field
merged_df = pd.merge(df_l1_slim, df_l3_slim, on='text', how='inner')

# Add a unique_id column to the merged dataset
merged_df['unique_id'] = [f'id_{i}' for i in range(len(merged_df))]

# Reorder columns
merged_df = merged_df[['unique_id', 'text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]

# Save the merged dataset
merged_df.to_csv('test_ta_task3.csv', index=False)

# Display information
print(f"Final merged dataset shape: {merged_df.shape}")
print(merged_df.head())

# Check label distribution
print("\nLabel distribution:")
print(f"Label 1 (gendered abuse): {merged_df['binary_label_1'].value_counts().to_dict()}")
print(f"Label 3 (explicit/aggressive): {merged_df['binary_label_3'].value_counts().to_dict()}")

Final merged dataset shape: (1135, 6)
  unique_id                                               text binary_label_1  \
0      id_0     வைரமுத்து ஒரு காம மிருகம் என்பது சினிமா துற...       not_hate   
1      id_1  #4YrsOfValiantVIVEGAM  #Valimai #AjithKumar   ...       not_hate   
2      id_2  #AmbedkarBlueShirtRally  இந்த போராட்டத்துக்கு ...       not_hate   
3      id_3  #BREAKING | திருச்சி மாவட்டம்  மணப்பாறையை அடுத...       not_hate   
4      id_4  #Bachelor 😤😤😤😤😤படமாடா இது கோத்தா <handle repla...       not_hate   

   label_1 binary_label_3  label_3  
0        0   not_explicit        0  
1        0       explicit        1  
2        0       explicit        1  
3        0   not_explicit        0  
4        0       explicit        1  

Label distribution:
Label 1 (gendered abuse): {'not_hate': 596, 'hate': 539}
Label 3 (explicit/aggressive): {'explicit': 765, 'not_explicit': 370}


In [11]:
# Apply text normalization
merged_df['processed_text'] = merged_df['text'].apply(lambda x: normalize_text(x))

# Further processing to remove '...'
merged_df['processed_text'] = merged_df['processed_text'].str.replace('...', '')

# Display samples of processed text
print("\nOriginal vs Processed Text Samples:")
for i in range(3):
    print(f"Original: {merged_df['text'].iloc[i]}")
    print(f"Processed: {merged_df['processed_text'].iloc[i]}")
    print("-" * 50)

# Keep all columns but add processed text
merged_df_final = merged_df[['unique_id', 'text', 'processed_text', 'binary_label_1', 'label_1', 'binary_label_3', 'label_3']]


print(f"\nProcessed dataset shape: {merged_df_final.shape}")
print(merged_df_final.head())

# Extract features (processed text)
X = list(merged_df_final['processed_text'])

# Extract labels for both tasks
y_task1 = merged_df_final['label_1'].values
y_task3 = merged_df_final['label_3'].values
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Padding
X = pad_sequences(X, padding='post', maxlen=max_len)

y_task1 = label_encoder.fit_transform(y_task1)
y_task3 = label_encoder.fit_transform(y_task3)

y_task1 = to_categorical(y_task1, num_classes=2)
y_task3 = to_categorical(y_task3, num_classes=2)


Original vs Processed Text Samples:
Original:    வைரமுத்து ஒரு காம மிருகம் என்பது சினிமா துறையில் இருக்கும் அனைவருக்கும் தெரியும்.  சமுதாயத்துக்கு எத்தனை பேரவலம் !  #SexualPredatorVairamuthu #பொம்பள_பொருக்கி_வைரமுத்து #SexualPredatorVairamuthu #பொம்பள_பொருக்கி_வைரமுத்து #SexualPredatorVairamuthu
Processed:    வைரமுத்து ஒரு காம மிருகம் என்பது சினிமா துறையில் இருக்கும் அனைவருக்கும் தெரியும்   சமுதாயத்துக்கு எத்தனை பேரவலம்     sexualpredatorvairamuthu  பொம்பள பொருக்கி வைரமுத்து  sexualpredatorvairamuthu  பொம்பள பொருக்கி வைரமுத்து  sexualpredatorvairamuthu
--------------------------------------------------
Original: #4YrsOfValiantVIVEGAM  #Valimai #AjithKumar   ஏண்டா குள்ள புண்டைகலா,நீங்கலாம் அஜீத்,விஜய் பேன்ஸ்க்கு போட்டியா,சூத்து அடிச்சி விட்ருவோம்,போயி தனுஸ்,சிவா பேன்ஸ் கூட சண்ட போடுங்கடா <handle replaced>💦எச்ச பேன்ஸ்
Processed:      valimai  ajithkumar   ஏண்டா குள்ள புண்டைகலா நீங்கலாம் அஜீத் விஜய் பேன்ஸ்க்கு போட்டியா சூத்து அடிச்சி விட்ருவோம் போயி தனுஸ் சிவா பேன்ஸ் கூட சண்ட போடுங்கடா  

In [12]:
# Evaluate on validation set
test_results = evaluate_multi_task_validation(
    trained_model, X, y_task1, y_task3
)

# Print Task 1 (Gendered Abuse) results
print("\nTask 1 (Gendered Abuse) Test Results:")
print(f"Precision: {test_results['task1']['precision']:.4f}")
print(f"Recall: {test_results['task1']['recall']:.4f}")
print(f"weighted F1 Score: {test_results['task1']['f1_score_weighted']:.4f}")
print(f"macro F1 Score: {test_results['task1']['f1_score_macro']:.4f}")
print("\nClassification Report:")
print(test_results['task1']['classification_report'])

# Print Task 3 (Explicit Language) results
print("\nTask 3 (Explicit Language) Test Results:")
print(f"Precision: {test_results['task3']['precision']:.4f}")
print(f"Recall: {test_results['task3']['recall']:.4f}")
print(f"weighted F1 Score: {test_results['task3']['f1_score_weighted']:.4f}")
print(f"macro F1 Score: {test_results['task3']['f1_score_macro']:.4f}")
print("\nClassification Report:")
print(test_results['task3']['classification_report'])

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step

Task 1 (Gendered Abuse) Test Results:
Precision: 0.5311
Recall: 0.5330
weighted F1 Score: 0.4632
macro F1 Score: 0.4526

Classification Report:
              precision    recall  f1-score   support

    not_hate       0.53      0.87      0.66       596
        hate       0.53      0.16      0.24       539

    accuracy                           0.53      1135
   macro avg       0.53      0.52      0.45      1135
weighted avg       0.53      0.53      0.46      1135


Task 3 (Explicit Language) Test Results:
Precision: 0.5782
Recall: 0.6396
weighted F1 Score: 0.5849
macro F1 Score: 0.4878

Classification Report:
              precision    recall  f1-score   support

not_explicit       0.37      0.15      0.21       370
    explicit       0.68      0.88      0.77       765

    accuracy                           0.64      1135
   macro avg       0.52      0.51      0.49      1135
weighted avg       0.58      0.64 