In [2]:
import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from gensim.models import FastText
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence

In [3]:
df = pd.read_csv('final_dataset.csv')

df.shape

(67378, 2)

In [4]:
df['hate'].value_counts()

hate
0    36583
1    30795
Name: count, dtype: int64

Start

In [5]:
from sklearn.model_selection import train_test_split

def split_data(dataset, stratify_column, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, random_state=42):
    """
    Splits the dataset into training, validation, and test sets.

    Parameters:
    - dataset: pd.DataFrame, the input dataset.
    - stratify_column: str, the column used for stratified splitting.
    - train_ratio: float, the proportion of the dataset for training (default 0.7).
    - val_ratio: float, the proportion of the dataset for validation (default 0.2).
    - test_ratio: float, the proportion of the dataset for testing (default 0.1).
    - random_state: int, random seed for reproducibility.

    Returns:
    - train_data, val_data, test_data: pd.DataFrames, the split datasets.
    """
    # Ensure the split ratios sum to 1
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-5, "Ratios must sum to 1."

    # Initial train + validation split
    train_data, temp_data = train_test_split(
        dataset,
        test_size=(val_ratio + test_ratio),
        random_state=random_state,
        stratify=dataset[stratify_column] if stratify_column else None
    )

    # Split the remaining data into validation and test sets
    val_data, test_data = train_test_split(
        temp_data,
        test_size=test_ratio / (val_ratio + test_ratio),
        random_state=random_state,
        stratify=temp_data[stratify_column] if stratify_column else None
    )

    return train_data, val_data, test_data

In [6]:
nltk.download('stopwords')
bengali_stopwords = set(stopwords.words('bengali'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shazzad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def clean_text(text):
    """Cleans text by removing unnecessary characters and symbols."""
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep Bengali characters and whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text, tokenizer):
    """Tokenizes text using Bangla BERT tokenizer."""
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Removes Bengali stopwords from tokens."""
    return [token for token in tokens if token not in bengali_stopwords]

def preprocess_text(text, tokenizer):
    """Combines cleaning, tokenization, and stopword removal."""
    text = clean_text(text)
    tokens = tokenize_text(text, tokenizer)
    tokens = remove_stopwords(tokens)
    return tokens

In [8]:
bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

In [9]:
train_data, val_data, test_data = split_data(df, stratify_column='hate')

train_data['processed_sentence'] = train_data['sentence'].apply(lambda x: preprocess_text(x, bert_tokenizer))
val_data['processed_sentence'] = val_data['sentence'].apply(lambda x: preprocess_text(x, bert_tokenizer))
test_data['processed_sentence'] = test_data['sentence'].apply(lambda x: preprocess_text(x, bert_tokenizer))

In [10]:
df['processed_sentence'] = df['sentence'].apply(lambda x: preprocess_text(x, bert_tokenizer))

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [12]:
def train_fasttext(corpus, embedding_dim=100, window=5, min_count=1, sg=1):
    """
    Trains a FastText embedding model using gensim.

    Parameters:
    - corpus: list of lists, tokenized sentences.
    - embedding_dim: int, size of word vectors.
    - window: int, context window size.
    - min_count: int, minimum word frequency for inclusion.
    - sg: int, skip-gram (1) or CBOW (0).

    Returns:
    - model: Trained FastText model.
    """
    model = FastText(
        sentences=corpus, 
        vector_size=embedding_dim, 
        window=window, 
        min_count=min_count, 
        sg=sg
    )
    return model

In [13]:
corpus = train_data['processed_sentence'].tolist()
fasttext_model = train_fasttext(corpus)

fasttext_model.save("fasttext_embeddings.model")

In [14]:
def sentence_embedding(sentence_tokens, model):
    """
    Generates sentence embeddings by averaging word embeddings.

    Parameters:
    - sentence_tokens: list, tokens in a sentence.
    - model: FastText model.

    Returns:
    - np.array: Sentence embedding vector.
    """
    embeddings = [model.wv[word] for word in sentence_tokens if word in model.wv]
    if len(embeddings) == 0:  # Handle case with no valid tokens
        return np.zeros(model.vector_size)
    return sum(embeddings) / len(embeddings)

In [15]:
df['fasttext_embedding'] = df['processed_sentence'].apply(lambda tokens: sentence_embedding(tokens, fasttext_model))
train_data['fasttext_embedding'] = train_data['processed_sentence'].apply(lambda x: sentence_embedding(x, fasttext_model))
val_data['fasttext_embedding'] = val_data['processed_sentence'].apply(lambda x: sentence_embedding(x, fasttext_model))
test_data['fasttext_embedding'] = test_data['processed_sentence'].apply(lambda x: sentence_embedding(x, fasttext_model))

In [16]:
df['fasttext_embedding'].head()

0    [-0.2294447, -0.0554792, 0.43374178, 0.0877842...
1    [-0.09326634, -0.058718935, 0.25895825, 0.0064...
2    [-0.15248016, -0.1341862, 0.38996747, 0.208053...
3    [-0.24937002, -0.071896076, 0.29962087, 0.0268...
4    [-0.1607902, 0.07257928, 0.41274455, 0.0196789...
Name: fasttext_embedding, dtype: object

In [17]:
bert_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base").to(device)

In [18]:
df['processed_sentence'].head()

0            [##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]
1      [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]
2    [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, #...
3             [শালা, ল, ##চ, ##চা, পাঠ, ##ার, মত, ##য]
4    [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...
Name: processed_sentence, dtype: object

In [19]:
def bert_sentence_embedding(text):
    """
    Args:
        text (str): Input text
    Returns:
        torch.Tensor of shape [bert_hidden_size]
    """
    inputs = bert_tokenizer(
        text, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=512
    ).to(device)

    # Inference with no gradient to save memory and compute
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # outputs.last_hidden_state has shape [batch_size, seq_len, hidden_dim]
    # If you're only handling a single text, you can squeeze the batch dimension
    return outputs.last_hidden_state.squeeze(0)

In [21]:
def combine_embeddings(tokens, fasttext_model, bert_embeddings, embedding_dim=100):
    """
    Args:
        tokens (list): List of tokens
        fasttext_model: Gensim's FastText model
        bert_embeddings (torch.Tensor): BERT embeddings on GPU
        embedding_dim (int): Dimension of the FastText embeddings
    Returns:
        torch.Tensor of shape [len(tokens), embedding_dim + bert_hidden_size]
    """
    combined_embeddings = []

    for idx, token in enumerate(tokens):
        # 1. Get FastText vector (on CPU), then move to GPU
        if token in fasttext_model.wv:
            fasttext_vec = fasttext_model.wv[token]
        else:
            fasttext_vec = np.zeros(embedding_dim)
        fasttext_vec = torch.tensor(fasttext_vec, dtype=torch.float32, device=device)

        # 2. Get the corresponding BERT embedding (already on GPU)
        if idx < bert_embeddings.size(0):
            bert_vec = bert_embeddings[idx]
        else:
            bert_vec = torch.zeros_like(bert_embeddings[0], device=device)

        # 3. Concatenate FastText and BERT embedding along the last dimension
        combined_vec = torch.cat((fasttext_vec, bert_vec), dim=0)
        combined_embeddings.append(combined_vec)

    # Convert list of tensors to a single tensor of shape [num_tokens, total_dim]
    combined_embeddings = torch.stack(combined_embeddings)
    return combined_embeddings

In [22]:
def get_embeddings_safe(text, fasttext_model):
    """
    Returns a zero embedding if the text is empty
    """
    text = text.strip()
    if not text:
        # For example, return a zero tensor with shape [1, your_dimension]
        # or skip. This is up to your design.
        return torch.zeros((1, 868), device=device)  # If your BERT hidden dim is 768
    tokens = text.split()
    bert_emb = bert_sentence_embedding(" ".join(tokens))  # get_bert_embeddings also on GPU
    return combine_embeddings(tokens, fasttext_model, bert_emb)

In [23]:
X = df['processed_sentence'].apply(lambda tokens: ' '.join(tokens))
y = df['hate']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [24]:
train_embeddings = [
    get_embeddings_safe(tokens, fasttext_model) 
    for tokens in tqdm(X_train, desc="Generating train embeddings")
]
test_embeddings = [
    get_embeddings_safe(tokens, fasttext_model) 
    for tokens in tqdm(X_test, desc="Generating test embeddings")
]

Generating train embeddings: 100%|██████████| 53902/53902 [08:40<00:00, 103.65it/s]
Generating test embeddings: 100%|██████████| 13476/13476 [02:12<00:00, 101.85it/s]


In [57]:
class HateSpeechDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx].float(), torch.tensor(self.labels[idx], dtype=torch.float)

In [50]:
import torch
import torch.nn as nn

class HAN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes, dropout=0.3):
        super(HAN, self).__init__()

        self.word_lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.word_attention = nn.Linear(2 * hidden_dim, 1)
        self.sentence_lstm = nn.LSTM(2 * hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.sentence_attention = nn.Linear(2 * hidden_dim, 1)
        self.fc = nn.Linear(2 * hidden_dim, 1)

    def attention(self, lstm_output, attention_layer):
        """
        Computes attention scores and applies attention mechanism.

        Parameters:
        - lstm_output: Tensor [batch_size, seq_len, hidden_dim]
        - attention_layer: nn.Linear, attention layer
        
        Returns:
        - weighted_output: Tensor [batch_size, hidden_dim]
        """
        attention_weights = torch.softmax(attention_layer(lstm_output), dim=1)
        weighted_output = torch.sum(attention_weights * lstm_output, dim=1)
        return weighted_output

    def forward(self, x):
        """
        Forward pass of the HAN model.

        Parameters:
        - x: Tensor [batch_size, num_sentences, num_words, embedding_dim] or [batch_size, num_words, embedding_dim]
        
        Returns:
        - logits: Tensor [batch_size, num_classes]
        """
        # Preprocess input to ensure it has four dimensions
        word_output, _ = self.word_lstm(x)
        sentence_input = self.attention(word_output, self.word_attention)
        sentence_output, _ = self.sentence_lstm(sentence_input.unsqueeze(1))
        document_representation = self.attention(sentence_output, self.sentence_attention)
        logits = self.fc(document_representation)
        return logits


In [51]:
def collate_fn(batch):
    """
    Each item in 'batch' is a tuple: (embeddings, label)
    - embeddings: shape [seq_len, embedding_dim]
    - label: scalar
    We'll pad embeddings so they all match the longest seq_len in this batch.
    """
    embeddings_list = [item[0] for item in batch]  # list of [seq_len, embed_dim] tensors
    labels_list = [item[1] for item in batch]      # list of label tensors

    # Pad embeddings to [batch_size, max_seq_len_in_batch, embed_dim]
    padded_embeddings = pad_sequence(embeddings_list, batch_first=True)

    # Stack labels, shape [batch_size]
    labels_tensor = torch.stack(labels_list)

    return padded_embeddings, labels_tensor

In [52]:
batch_size = 32

train_dataset_han = HateSpeechDataset(train_embeddings, y_train.tolist())
test_dataset_han = HateSpeechDataset(test_embeddings, y_test.tolist())
train_loader_han = DataLoader(train_dataset_han, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader_han = DataLoader(test_dataset_han, batch_size=batch_size, collate_fn=collate_fn)

In [53]:
len(train_dataset_han.embeddings)

53902

In [54]:
import math
math.ceil(len(train_loader_han.dataset) / batch_size)

1685

In [55]:
embedding_dim = 868  # 100 (FastText) + 768 (Bangla BERT)
hidden_dim = 256
num_classes = len(set(y_train))

In [56]:
model = HAN(embedding_dim, hidden_dim, num_classes).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

TypeError: Linear.__init__() missing 1 required positional argument: 'out_features'

In [47]:
def train_han(model, 
             train_loader, 
             test_loader, 
             criterion, 
             optimizer, 
             epochs, 
             device, 
             save_path=None,
             early_stopping_patience=None):
    """
    Trains the HAN model and evaluates on the test set after each epoch.

    Parameters:
    - model (nn.Module): The HAN model to train.
    - train_loader (DataLoader): DataLoader for the training data.
    - test_loader (DataLoader): DataLoader for the testing/validation data.
    - criterion (nn.Module): Loss function.
    - optimizer (torch.optim.Optimizer): Optimizer.
    - epochs (int): Number of training epochs.
    - device (torch.device): Device to run the model on.
    - save_path (str, optional): Path to save the best model. Defaults to None.
    - early_stopping_patience (int, optional): Number of epochs with no improvement after which training will be stopped. Defaults to None.

    Returns:
    - None
    """
    model.to(device)
    best_f1 = 0.0
    epochs_no_improve = 0

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    for epoch in range(1, epochs + 1):
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Current Learning Rate: {current_lr}")

        model.train()
        total_loss = 0.0
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} - Training", leave=False)

        for batch_idx, (embeddings, labels) in enumerate(train_bar):
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

            total_loss += loss.item()
            train_bar.set_postfix({'Loss': loss.item()})

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch}/{epochs}] - Training Loss: {avg_loss:.4f}")

        # Evaluation
        model.eval()
        all_preds, all_labels = [], []

        with torch.no_grad():
            eval_bar = tqdm(test_loader, desc=f"Epoch {epoch}/{epochs} - Evaluating", leave=False)
            for embeddings, labels in eval_bar:
                embeddings, labels = embeddings.to(device), labels.to(device)
                outputs = model(embeddings)
                preds = torch.argmax(outputs, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
        recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

        print(f"Epoch [{epoch}/{epochs}] - "
              f"Accuracy: {accuracy:.4f} | "
              f"Precision: {precision:.4f} | "
              f"Recall: {recall:.4f} | "
              f"F1 Score: {f1:.4f}")

        # Check for improvement
        if f1 > best_f1:
            best_f1 = f1
            epochs_no_improve = 0
            if save_path:
                torch.save(model.state_dict(), save_path)
                print(f"Best model saved to {save_path}")
        else:
            epochs_no_improve += 1
            if early_stopping_patience and epochs_no_improve >= early_stopping_patience:
                print("Early stopping triggered.")
                break

        # if save_path:
        #     checkpoint_path = f"epoch_{epoch}_model.pth"
        #     torch.save(model.state_dict(), checkpoint_path)
        #     print(f"Model checkpoint saved to {checkpoint_path}")


        scheduler.step()

        print("-" * 50)

    print(f"Training completed. Best F1 Score: {best_f1:.4f}")

In [48]:
save_path = "best_model.pth"
early_stopping_patience = 5
learning_rate = 0.001
epochs = 20

In [49]:
train_han(model=model, 
         train_loader=train_loader_han, 
         test_loader=test_loader_han, 
         criterion=criterion, 
         optimizer=optimizer, 
         epochs=epochs, 
         device=device, 
         save_path=save_path,
         early_stopping_patience=20)

Current Learning Rate: 0.001


                                                               

ValueError: Target size (torch.Size([32])) must be the same as input size (torch.Size([32, 1]))

In [58]:
class ConvLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_filters, filter_sizes, output_dim, dropout):
        """
        Conv-LSTM model for hate speech detection.

        Parameters:
        - embedding_dim: Dimensionality of input embeddings (FastText + BERT combined).
        - hidden_dim: Hidden size of the LSTM layer.
        - num_filters: Number of filters for convolutional layers.
        - filter_sizes: List of filter sizes (e.g., [2, 3, 4]).
        - output_dim: Number of output classes (e.g., 1 for binary classification).
        - dropout: Dropout rate.
        """
        super(ConvLSTM, self).__init__()

        # Convolutional Layers
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, 
                      out_channels=num_filters, 
                      kernel_size=(fs, embedding_dim)) 
            for fs in filter_sizes
        ])
        
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.lstm = nn.LSTM(num_filters * len(filter_sizes), hidden_dim, bidirectional=True, batch_first=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Forward pass of the Conv-LSTM model.

        Parameters:
        - x: Tensor [batch_size, seq_len, embedding_dim]

        Returns:
        - logits: Tensor [batch_size, output_dim]
        """
        # Add a channel dimension for Conv2D
        x = x.transpose(1, 2)
        
        # Apply convolution + ReLU + MaxPooling
        conv_outs = [torch.relu(conv(x)) for conv in self.convs]  # [batch_size, num_filters, seq_len - kernel_size + 1]
        pooled_outs = [torch.max(conv_out, dim=2)[0] for conv_out in conv_outs]  # [batch_size, num_filters]
        concat_pooled = torch.cat(pooled_outs, dim=1)  # [batch_size, num_filters * len(filter_sizes)]
        
        # LSTM
        lstm_out, _ = self.lstm(concat_pooled.unsqueeze(1))  # [batch_size, seq_len=1, hidden_dim*2]
        lstm_out = lstm_out.squeeze(1)  # [batch_size, hidden_dim*2]
        
        logits = self.fc(lstm_out)  # [batch_size, output_dim]
        return logits

In [54]:
num_filters = 100
filter_sizes = [2, 3, 4, 5]
output_dim = 1

In [55]:
clmodel = ConvLSTM(embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_filters=num_filters, filter_sizes=filter_sizes, output_dim=output_dim, dropout=0.2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [56]:
def train_conv_lstm(model, 
                    train_loader, 
                    test_loader, 
                    criterion, 
                    optimizer,  
                    device='cuda', 
                    epochs=10, 
                    save_path=None, 
                    early_stopping_patience=None):
    """
    Trains the Conv-LSTM model and evaluates it on the test set after each epoch.

    Parameters:
    - model (nn.Module): The Conv-LSTM model to train.
    - train_loader (DataLoader): DataLoader for the training data.
    - test_loader (DataLoader): DataLoader for the testing/validation data.
    - criterion (nn.Module): Loss function (e.g., BCEWithLogitsLoss or CrossEntropyLoss).
    - optimizer (torch.optim.Optimizer): Optimizer (e.g., Adam).
    - scheduler (torch.optim.lr_scheduler, optional): Learning rate scheduler. Defaults to None.
    - device (str): Device to run the model on ('cuda' or 'cpu'). Defaults to 'cuda'.
    - epochs (int): Number of training epochs. Defaults to 10.
    - save_path (str, optional): Path to save the best model. Defaults to None.
    - early_stopping_patience (int, optional): Early stopping patience. Defaults to None.

    Returns:
    - None
    """
    model.to(device)
    best_f1 = 0.0
    epochs_no_improve = 0

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")
        print("=" * 50)

        # Training Phase
        model.train()
        total_loss = 0.0
        train_bar = tqdm(train_loader, desc="Training", leave=False)

        for embeddings, labels in train_bar:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(embeddings)  # Forward pass
            loss = criterion(outputs.squeeze(1), labels.float())  # Adjust for binary classification
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # Gradient clipping
            optimizer.step()  # Optimizer step

            total_loss += loss.item()
            train_bar.set_postfix({'Loss': loss.item()})

        avg_train_loss = total_loss / len(train_loader)
        print(f"Training Loss: {avg_train_loss:.4f}")

        # Validation Phase
        model.eval()
        all_preds, all_labels = [], []
        val_loss = 0.0

        with torch.no_grad():
            eval_bar = tqdm(test_loader, desc="Evaluating", leave=False)
            for embeddings, labels in eval_bar:
                embeddings, labels = embeddings.to(device), labels.to(device)

                outputs = model(embeddings)
                loss = criterion(outputs.squeeze(1), labels.float())  # Validation loss
                val_loss += loss.item()

                preds = torch.sigmoid(outputs.squeeze(1)) >= 0.5  # Binary prediction threshold
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(test_loader)

        # Compute metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, zero_division=0)
        recall = recall_score(all_labels, all_preds, zero_division=0)
        f1 = f1_score(all_labels, all_preds, zero_division=0)

        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")

        # Save the best model
        if f1 > best_f1:
            best_f1 = f1
            epochs_no_improve = 0
            if save_path:
                torch.save(model.state_dict(), save_path)
                print(f"Best model saved to {save_path}")
        else:
            epochs_no_improve += 1
            if early_stopping_patience and epochs_no_improve >= early_stopping_patience:
                print("Early stopping triggered.")
                break

        
        scheduler.step()

        print("-" * 50)

    print(f"Training completed. Best F1 Score: {best_f1:.4f}")


In [59]:
train_conv_lstm(
    model=clmodel,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    epochs=20,
    save_path="best_conv_lstm_model.pth",
    early_stopping_patience=5
)


Epoch 1/20


                                                                         

Training Loss: 0.7031


                                                              

Validation Loss: 0.7028
Accuracy: 0.4072 | Precision: 0.6250 | Recall: 0.0034 | F1 Score: 0.0068
Best model saved to best_conv_lstm_model.pth
--------------------------------------------------

Epoch 2/20


                                                                         

Training Loss: 0.7030


                                                              

Validation Loss: 0.7028
Accuracy: 0.4072 | Precision: 0.6250 | Recall: 0.0034 | F1 Score: 0.0068
--------------------------------------------------

Epoch 3/20


                                                                         

Training Loss: 0.7031


                                                              

Validation Loss: 0.7028
Accuracy: 0.4072 | Precision: 0.6250 | Recall: 0.0034 | F1 Score: 0.0068
--------------------------------------------------

Epoch 4/20


                                                                         

Training Loss: 0.7031


                                                              

Validation Loss: 0.7028
Accuracy: 0.4072 | Precision: 0.6250 | Recall: 0.0034 | F1 Score: 0.0068
--------------------------------------------------

Epoch 5/20


                                                                         

Training Loss: 0.7029


                                                              

Validation Loss: 0.7028
Accuracy: 0.4072 | Precision: 0.6250 | Recall: 0.0034 | F1 Score: 0.0068
--------------------------------------------------

Epoch 6/20


                                                                         

Training Loss: 0.7031


                                                              

Validation Loss: 0.7028
Accuracy: 0.4072 | Precision: 0.6250 | Recall: 0.0034 | F1 Score: 0.0068
Early stopping triggered.
Training completed. Best F1 Score: 0.0068




In [66]:
class HateSpeechDatasetBERT(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        """
        Custom Dataset for Bangla-BERT Fine-Tuning.
        Parameters:
        - texts: List of sentences.
        - labels: List of binary labels (0 or 1).
        - tokenizer: Pre-trained Bangla-BERT tokenizer.
        - max_length: Maximum sequence length for padding/truncation.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [77]:
# Load and split your dataset
from sklearn.model_selection import train_test_split

texts = df['sentence'].tolist()
labels = df['hate'].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Create datasets and dataloaders
train_dataset_bert = HateSpeechDatasetBERT(train_texts, train_labels, bert_tokenizer)
test_dataset_bert = HateSpeechDatasetBERT(test_texts, test_labels, bert_tokenizer)

train_loader_bert = DataLoader(train_dataset_bert, batch_size=16, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=16, shuffle=False)