In [3]:
%pip install pandas numpy nltk transformers fasttext-wheel scikit-learn torch shap matplotlib seaborn





In [4]:
# Step 1: Import Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer


In [5]:
# Download the Bengali stopwords
nltk.download('stopwords')

# Load Bengali stopwords
bengali_stopwords = set(stopwords.words('bengali'))

# Initialize tokenizer for later use
tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shazzad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:

# Step 2: Load the Dataset
# Replace 'dataset.csv' with the path to your dataset file
file_path = 'bangla_hate_speech.csv'  # Update this with your actual dataset path
df = pd.read_csv(file_path)

# Display the first few rows to verify data loading
print("Data Preview:")
print(df.head())

Data Preview:
                                            sentence  hate category
0                     যত্তসব পাপন শালার ফাজলামী!!!!!     1   sports
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার     1   sports
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...     1   sports
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়     1   sports
4   তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব     1   sports


In [7]:


# Step 3: Text Cleaning
def clean_text(text):
    """
    Cleans text by removing unnecessary characters and symbols.
    """
    # Remove special characters, numbers, and symbols, keeping only Bengali letters
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep Bengali characters and whitespace
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to the 'sentence' column
df['cleaned_sentence'] = df['sentence'].apply(clean_text)

print("After Cleaning:")
print(df[['sentence', 'cleaned_sentence']].head())

After Cleaning:
                                            sentence  \
0                     যত্তসব পাপন শালার ফাজলামী!!!!!   
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার   
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...   
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়   
4   তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব   

                                    cleaned_sentence  
0                          যত্তসব পাপন শালার ফাজলামী  
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার  
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...  
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়  
4    তুই তো শালা গাজা খাইছচতুর মার হেডায় খেলবে সাকিব  


In [8]:


# Step 4: Tokenization with Bangla BERT
def tokenize_text(text):
    """
    Tokenizes text using Bangla BERT tokenizer.
    """
    # Tokenize the text into subwords for better context understanding
    tokens = tokenizer.tokenize(text)
    # Convert tokens to a single string for training input
    return tokens

# Tokenize cleaned sentences
df['tokens'] = df['cleaned_sentence'].apply(tokenize_text)

print("After Tokenization:")
print(df[['cleaned_sentence', 'tokens']].head())

After Tokenization:
                                    cleaned_sentence  \
0                          যত্তসব পাপন শালার ফাজলামী   
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার   
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...   
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়   
4    তুই তো শালা গাজা খাইছচতুর মার হেডায় খেলবে সাকিব   

                                              tokens  
0      [যত, ##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]  
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]  
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, এ...  
3  [শালা, ল, ##চ, ##চা, দেখতে, পাঠ, ##ার, মত, দেখ...  
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...  


In [9]:


# Step 5: Stopword Removal
def remove_stopwords(tokens):
    """
    Removes Bengali stopwords from the token list.
    """
    return [token for token in tokens if token not in bengali_stopwords]

# Apply stopword removal on tokens
df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)

print("After Stopword Removal:")
print(df[['tokens', 'filtered_tokens']].head())

After Stopword Removal:
                                              tokens  \
0      [যত, ##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]   
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]   
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, এ...   
3  [শালা, ল, ##চ, ##চা, দেখতে, পাঠ, ##ার, মত, দেখ...   
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...   

                                     filtered_tokens  
0          [##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]  
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]  
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, #...  
3           [শালা, ল, ##চ, ##চা, পাঠ, ##ার, মত, ##য]  
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...  


In [10]:
# Step 6: Prepare Data for Modeling
# Map the sentences and their respective labels
X = df['filtered_tokens'].apply(lambda tokens: ' '.join(tokens))  # Join tokens for input
y = df['hate']  # Assuming the 'hate' column contains the labels

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display dataset statistics
print("Training Data Size:", len(X_train))
print("Testing Data Size:", len(X_test))


Training Data Size: 24000
Testing Data Size: 6000


In [11]:

# Step 7: Save Preprocessed Data
# Save the train and test splits for later use
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

print("Preprocessing complete. Data saved to disk.")


Preprocessing complete. Data saved to disk.


In [12]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [13]:
# Import necessary libraries
from gensim.models import FastText
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

In [14]:


# Step 1: FastText Embeddings
def train_fasttext(corpus, embedding_dim=100, window_size=5, min_count=1):
    """
    Train a FastText model on the dataset.
    :param corpus: List of tokenized sentences.
    :param embedding_dim: Dimension of the embeddings.
    :param window_size: Context window size.
    :param min_count: Minimum word count threshold.
    :return: Trained FastText model.
    """
    model = FastText(sentences=corpus, vector_size=embedding_dim, window=window_size, min_count=min_count, sg=1)
    return model

# Convert the filtered tokens to a list of tokenized sentences
corpus = df['filtered_tokens'].tolist()

# Train the FastText model
fasttext_model = train_fasttext(corpus)
print("FastText training complete!")

# Example: Retrieve FastText vector for a word
word = "বাংলাদেশ"
if word in fasttext_model.wv:
    print(f"FastText vector for '{word}': {fasttext_model.wv[word]}")


FastText training complete!
FastText vector for 'বাংলাদেশ': [-0.3095373  -0.31599662  0.588646    0.269986   -0.19336222 -0.3560959
  0.16978024  0.5795253  -0.40246212 -0.42315122  0.25908586  0.35319906
  0.08453769 -0.5304741  -0.58906287 -0.10794803  0.06659401 -0.7531366
  0.13634633 -0.01830969 -0.44294015  0.49936166 -0.21476266 -0.01546686
  0.4388483  -0.37863943 -0.17825086  0.29273772  0.26894897 -0.0129158
 -0.18571742  0.47350574 -0.20383324  0.38840663  0.07102347 -0.19870542
 -0.00942242 -0.3821738  -0.09808816 -0.18958327  0.5214082   0.25697157
 -0.30335066 -0.33748052  0.27202594 -0.2883737  -0.15581124  0.45094603
 -0.21410267 -0.2759681   0.6139857  -0.19837175  0.5427403  -0.11001705
 -0.20757355  0.0792667   0.18060471 -0.24376367 -0.48467174  0.32077911
  0.2312523  -0.00254642  0.03805562 -0.18511496  0.07985787 -0.53999215
 -0.03044677 -0.30813614 -0.11011249  0.08694138 -0.2902234   0.24777886
  0.5570032   0.10141779  0.49019662 -0.1341573  -0.25346276  0.086

In [18]:


# Check if GPU is available
device = torch.device("cuda")

# Load Bangla BERT model and tokenizer
bangla_bert_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base").to(device)
bangla_bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

def get_bert_embeddings(text):
    """
    Extract BERT embeddings for the input text.
    :param text: Input sentence.
    :return: Token-level embeddings.
    """
    # Tokenize input and move tensors to the same device as the model
    inputs = bangla_bert_tokenizer(text, return_tensors="pt", 
                                   padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = bangla_bert_model(**inputs)
        
    # Get the last hidden state (token embeddings) and move back to CPU if needed
    token_embeddings = outputs.last_hidden_state.squeeze(0).cpu()
    return token_embeddings

# Example: Get Bangla BERT embeddings for a sentence
sentence = "বাংলাদেশ একটি সুন্দর দেশ।"
bert_embeddings = get_bert_embeddings(sentence)
print(f"BERT Embeddings shape: {bert_embeddings.shape}")


BERT Embeddings shape: torch.Size([8, 768])


In [19]:


# Step 3: Combine FastText and BERT Embeddings
def combine_embeddings(tokens, fasttext_model, bert_embeddings, embedding_dim=100):
    """
    Combine FastText and BERT embeddings for each token.
    :param tokens: List of tokens for the sentence.
    :param fasttext_model: Trained FastText model.
    :param bert_embeddings: BERT embeddings for the tokens.
    :param embedding_dim: Dimension of FastText embeddings.
    :return: Combined embeddings for each token.
    """
    combined_embeddings = []
    for idx, token in enumerate(tokens):
        # Get FastText embedding (zeros if not in vocab)
        fasttext_vec = fasttext_model.wv[token] if token in fasttext_model.wv else np.zeros(embedding_dim)
        
        # Get BERT embedding for the token
        bert_vec = bert_embeddings[idx].numpy() if idx < len(bert_embeddings) else np.zeros_like(bert_embeddings[0].numpy())
        
        # Concatenate FastText and BERT embeddings
        combined_vec = np.concatenate((fasttext_vec, bert_vec))
        combined_embeddings.append(combined_vec)
    
    return np.array(combined_embeddings)

# Example: Combine embeddings for a sentence
tokens = df['filtered_tokens'][0]  # Use the first sentence in the dataset
bert_embs = get_bert_embeddings(" ".join(tokens))
combined_embs = combine_embeddings(tokens, fasttext_model, bert_embs)
print(f"Combined Embeddings shape: {combined_embs.shape}")


Combined Embeddings shape: (7, 868)


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [21]:
# Step 1: Define the Custom Dataset
class HateSpeechDataset(Dataset):
    """
    Custom Dataset for loading hate speech data with embeddings.
    """
    def __init__(self, texts, labels, embeddings):
        self.texts = texts
        self.labels = labels
        self.embeddings = embeddings

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text_embedding = self.embeddings[idx]
        label = self.labels[idx]
        return torch.tensor(text_embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


In [22]:

# Step 2: Define the HAN Model
class HAN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(HAN, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes

        # Word-level BiLSTM
        self.word_lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.word_attention = nn.Linear(2 * hidden_dim, 1)  # Attention layer for words

        # Sentence-level BiLSTM
        self.sentence_lstm = nn.LSTM(2 * hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.sentence_attention = nn.Linear(2 * hidden_dim, 1)  # Attention layer for sentences

        # Fully connected output layer
        self.fc = nn.Linear(2 * hidden_dim, num_classes)

    def attention(self, lstm_output, attention_layer):
        """
        Apply attention mechanism.
        """
        attention_weights = torch.softmax(attention_layer(lstm_output), dim=1)
        weighted_output = torch.sum(attention_weights * lstm_output, dim=1)
        return weighted_output

    def forward(self, x):
        # Word-level BiLSTM and attention
        word_output, _ = self.word_lstm(x)
        sentence_input = self.attention(word_output, self.word_attention)

        # Sentence-level BiLSTM and attention
        sentence_output, _ = self.sentence_lstm(sentence_input.unsqueeze(0))
        document_representation = self.attention(sentence_output, self.sentence_attention)

        # Classification layer
        logits = self.fc(document_representation)
        return logits

In [23]:


# Step 3: Training and Evaluation Functions
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for embeddings, labels in train_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return total_loss / len(train_loader)

def evaluate_model(model, val_loader, device):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for embeddings, labels in val_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            preds = torch.argmax(outputs, dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return accuracy, precision, recall, f1


In [None]:
from torch.utils.data import Dataset

# Convert labels to numpy arrays for easier manipulation
y_train = np.array(y_train)
y_test = np.array(y_test)

# Step 1: Combine Embeddings and Labels for Training Dataset
train_dataset = [
    (torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.long))
    for embedding, label in zip(combined_embs, y_train)
]

# Step 2: Combine Embeddings and Labels for Testing Dataset
test_dataset = [
    (torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.long))
    for embedding, label in zip(test_embeddings, y_test)
]

# Step 3: Create PyTorch DataLoaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

# Print statistics to verify
print(f"Number of training samples: {len(train_loader.dataset)}")
print(f"Number of testing samples: {len(test_loader.dataset)}")

In [25]:

# Step 4: Preparing Data and Training the Model
# Assuming you have combined embeddings (X_train, X_test) and labels (y_train, y_test)
train_dataset = HateSpeechDataset(X_train, y_train, combined_train_embeddings)
test_dataset = HateSpeechDataset(X_test, y_test, combined_test_embeddings)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model Parameters
embedding_dim = 868  # Example: 100 from FastText + 768 from Bangla BERT
hidden_dim = 128
num_classes = len(set(y_train))  # Number of unique labels
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model, loss function, and optimizer
model = HAN(embedding_dim, hidden_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    accuracy, precision, recall, f1 = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


NameError: name 'combined_train_embeddings' is not defined

In [26]:
# Import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
from gensim.models import FastText
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Download stopwords
nltk.download('stopwords')
bengali_stopwords = set(stopwords.words('bengali'))

# Step 1: Data Preprocessing
def clean_text(text):
    """Cleans text by removing unnecessary characters and symbols."""
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep Bengali characters and whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text, tokenizer):
    """Tokenizes text using Bangla BERT tokenizer."""
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Removes Bengali stopwords from tokens."""
    return [token for token in tokens if token not in bengali_stopwords]

# Load dataset
file_path = "bangla_hate_speech.csv"  # Replace with actual file path
df = pd.read_csv(file_path)
df['cleaned_sentence'] = df['sentence'].apply(clean_text)

# Load tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

df['tokens'] = df['cleaned_sentence'].apply(lambda x: tokenize_text(x, bert_tokenizer))
df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)

# Split data
X = df['filtered_tokens'].apply(lambda tokens: ' '.join(tokens))
y = df['hate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shazzad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Ensure PyTorch is set up to use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Bangla BERT model onto GPU
bert_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base").to(device)


Using device: cuda


In [31]:

# Step 2: Hybrid Embeddings Generation
# Train FastText
def train_fasttext(corpus, embedding_dim=100):
    model = FastText(sentences=corpus, vector_size=embedding_dim, window=5, min_count=1, sg=1)
    return model

# Prepare corpus for FastText
corpus = df['filtered_tokens'].tolist()
fasttext_model = train_fasttext(corpus)


In [40]:


# def get_bert_embeddings(text):
#     inputs = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     with torch.no_grad():
#         outputs = bert_model(**inputs)
#     return outputs.last_hidden_state.squeeze(0)

# def combine_embeddings(tokens, fasttext_model, bert_embeddings, embedding_dim=100):
#     combined_embeddings = []
#     for idx, token in enumerate(tokens):
#         fasttext_vec = fasttext_model.wv[token] if token in fasttext_model.wv else np.zeros(embedding_dim)
#         bert_vec = bert_embeddings[idx].numpy() if idx < len(bert_embeddings) else np.zeros_like(bert_embeddings[0].numpy())
#         combined_vec = np.concatenate((fasttext_vec, bert_vec))
#         combined_embeddings.append(combined_vec)
#     return np.array(combined_embeddings)


def get_bert_embeddings(text):
    # Tokenize and move tensors to GPU
    inputs = bert_tokenizer(
        text, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=512
    ).to(device)

    # Inference with no gradient to save memory and compute
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # outputs.last_hidden_state has shape [batch_size, seq_len, hidden_dim]
    # If you're only handling a single text, you can squeeze the batch dimension
    return outputs.last_hidden_state.squeeze(0)

def combine_embeddings(tokens, fasttext_model, bert_embeddings, embedding_dim=100):
    """
    Args:
        tokens (list): List of tokens
        fasttext_model: Gensim's FastText model
        bert_embeddings (torch.Tensor): BERT embeddings on GPU
        embedding_dim (int): Dimension of the FastText embeddings
    Returns:
        torch.Tensor of shape [len(tokens), embedding_dim + bert_hidden_size]
    """
    combined_embeddings = []

    for idx, token in enumerate(tokens):
        # 1. Get FastText vector (on CPU), then move to GPU
        if token in fasttext_model.wv:
            fasttext_vec = fasttext_model.wv[token]
        else:
            fasttext_vec = np.zeros(embedding_dim)
        fasttext_vec = torch.tensor(fasttext_vec, dtype=torch.float32, device=device)

        # 2. Get the corresponding BERT embedding (already on GPU)
        if idx < bert_embeddings.size(0):
            bert_vec = bert_embeddings[idx]
        else:
            bert_vec = torch.zeros_like(bert_embeddings[0], device=device)

        # 3. Concatenate FastText and BERT embedding along the last dimension
        combined_vec = torch.cat((fasttext_vec, bert_vec), dim=0)
        combined_embeddings.append(combined_vec)

    # Convert list of tensors to a single tensor of shape [num_tokens, total_dim]
    combined_embeddings = torch.stack(combined_embeddings)
    return combined_embeddings

# Usage example:
# tokens = ["hello", "world", "this", "is", "gpu"]
# text = "hello world this is gpu"
# bert_emb = get_bert_embeddings(text)
# combined = combine_embeddings(tokens, fasttext_model, bert_emb)



In [56]:
def get_embeddings_safe(text, fasttext_model):
    """
    Returns a zero embedding if the text is empty
    """
    text = text.strip()
    if not text:
        # For example, return a zero tensor with shape [1, your_dimension]
        # or skip. This is up to your design.
        return torch.zeros((1, 868), device=device)  # If your BERT hidden dim is 768
    tokens = text.split()
    bert_emb = get_bert_embeddings(" ".join(tokens))  # get_bert_embeddings also on GPU
    return combine_embeddings(tokens, fasttext_model, bert_emb)


In [57]:
from tqdm.auto import tqdm
train_embeddings = [
    get_embeddings_safe(tokens, fasttext_model) 
    for tokens in tqdm(X_train, desc="Generating train embeddings")
]
test_embeddings = [
    get_embeddings_safe(tokens, fasttext_model) 
    for tokens in tqdm(X_test, desc="Generating test embeddings")
]

Generating train embeddings: 100%|██████████| 24000/24000 [07:22<00:00, 54.25it/s] 
Generating test embeddings: 100%|██████████| 6000/6000 [01:54<00:00, 52.35it/s]


In [58]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """
    Each item in 'batch' is a tuple: (embeddings, label)
    - embeddings: shape [seq_len, embedding_dim]
    - label: scalar
    We'll pad embeddings so they all match the longest seq_len in this batch.
    """
    embeddings_list = [item[0] for item in batch]  # list of [seq_len, embed_dim] tensors
    labels_list = [item[1] for item in batch]      # list of label tensors

    # Pad embeddings to [batch_size, max_seq_len_in_batch, embed_dim]
    padded_embeddings = pad_sequence(embeddings_list, batch_first=True)

    # Stack labels, shape [batch_size]
    labels_tensor = torch.stack(labels_list)

    return padded_embeddings, labels_tensor


In [59]:
for i, emb in enumerate(train_embeddings):
    if emb.shape[1] == 768:
        print(i, emb.shape)


In [65]:

# Step 3: Hierarchical Attention Network (HAN)
class HateSpeechDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx].float(), torch.tensor(self.labels[idx], dtype=torch.long)


        

class HAN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(HAN, self).__init__()
        self.word_lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.word_attention = nn.Linear(2 * hidden_dim, 1)
        self.sentence_lstm = nn.LSTM(2 * hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.sentence_attention = nn.Linear(2 * hidden_dim, 1)
        self.fc = nn.Linear(2 * hidden_dim, num_classes)

    def attention(self, lstm_output, attention_layer):
        attention_weights = torch.softmax(attention_layer(lstm_output), dim=1)
        weighted_output = torch.sum(attention_weights * lstm_output, dim=1)
        return weighted_output

    def forward(self, x):
        word_output, _ = self.word_lstm(x)
        sentence_input = self.attention(word_output, self.word_attention)
        sentence_output, _ = self.sentence_lstm(sentence_input.unsqueeze(1))
        document_representation = self.attention(sentence_output, self.sentence_attention)
        logits = self.fc(document_representation)
        return logits

train_dataset = HateSpeechDataset(train_embeddings, y_train.tolist())
test_dataset = HateSpeechDataset(test_embeddings, y_test.tolist())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

embedding_dim = 868  # 100 (FastText) + 768 (Bangla BERT)
hidden_dim = 128
num_classes = len(set(y_train))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = HAN(embedding_dim, hidden_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
for epoch in range(10):
    model.train()
    for embeddings, labels in train_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for embeddings, labels in test_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch + 1}: Accuracy = {accuracy:.4f}")


Epoch 1: Accuracy = 0.8293
Epoch 2: Accuracy = 0.8355
Epoch 3: Accuracy = 0.8642
Epoch 4: Accuracy = 0.8655
Epoch 5: Accuracy = 0.8683
Epoch 6: Accuracy = 0.8688
Epoch 7: Accuracy = 0.8613
Epoch 8: Accuracy = 0.8678
Epoch 9: Accuracy = 0.8662
Epoch 10: Accuracy = 0.8717


In [66]:
def predict_text(text, model, fasttext_model, bert_tokenizer, bert_model, device):
    """
    Predict the label for a single piece of text using the trained HAN model.
    """
    model.eval()

    # 1) Preprocess text (tokenize, embed)
    tokens = text.split()
    # Embed using your known pipeline:
    # e.g. get_bert_embeddings, then combine_embeddings
    with torch.no_grad():
        bert_emb = get_bert_embeddings(" ".join(tokens))  # the function you used before
        sample_emb = combine_embeddings(tokens, fasttext_model, bert_emb)  # shape [seq_len, 868]
    
    # 2) Model expects shape [batch_size, seq_len, embedding_dim]
    sample_emb = sample_emb.unsqueeze(0).to(device)  # shape [1, seq_len, 868]
    
    # 3) Forward pass
    with torch.no_grad():
        outputs = model(sample_emb)
        # outputs: [1, num_classes]

    # 4) Predicted class
    predicted_class = torch.argmax(outputs, dim=1).item()
    return predicted_class


In [70]:
# Example usage (Bangla text)
example_text = "পিলখানা হত্যাকান্ড বাংলাদেশের প্রতিরক্ষা ব্যবস্থা ধ্বংসের জন্য ভারতের প্রত্যক্ষ সহযোগিতায় এই হত্যাকা- ঘটানো হয়েছিল"
pred_label = predict_text(
    text=example_text, 
    model=model, 
    fasttext_model=fasttext_model,
    bert_tokenizer=bert_tokenizer,
    bert_model=bert_model,
    device=device
)

print(f"Predicted label for '{example_text}': {pred_label}")


Predicted label for 'পিলখানা হত্যাকান্ড বাংলাদেশের প্রতিরক্ষা ব্যবস্থা ধ্বংসের জন্য ভারতের প্রত্যক্ষ সহযোগিতায় এই হত্যাকা- ঘটানো হয়েছিল': 1
