In [5]:
%pip install pandas numpy nltk transformers fasttext-wheel scikit-learn torch shap matplotlib seaborn


Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting transformers
  Using cached transformers-4.47.1-py3-none-any.whl (10.1 MB)
Collecting torch
  Using cached torch-2.5.1-cp310-cp310-win_amd64.whl (203.1 MB)
Collecting shap
  Using cached shap-0.46.0-cp310-cp310-win_amd64.whl (456 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting tokenizers<0.22,>=0.21
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
Collecting filelock
  Using cached filelock-3.16.1-py3-none-any.whl (16 kB)
Collecting safetensors>=0.4.1
  Using cached safetensors-0.4.5-cp310-none-win_amd64.whl (285 kB)
Collecting huggingface-hub<1.0,>=0.24.0
  Using cached huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
Collecting fsspec
  Using cached fsspec-2024.12.0-py3-none-any.whl (183 kB)
Collecting sympy==1.13.1
  Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Collecting jinja2
  Using cached jinja2-3.1.5-py3-none-any.whl (134 kB)
Collecti


[notice] A new release of pip available: 22.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Step 1: Import Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Download the Bengali stopwords
nltk.download('stopwords')

# Load Bengali stopwords
bengali_stopwords = set(stopwords.words('bengali'))

# Initialize tokenizer for later use
tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:

# Step 2: Load the Dataset
# Replace 'dataset.csv' with the path to your dataset file
file_path = 'bangla_hate_speech.csv'  # Update this with your actual dataset path
df = pd.read_csv(file_path)

# Display the first few rows to verify data loading
print("Data Preview:")
print(df.head())

Data Preview:
                                            sentence  hate category
0                     যত্তসব পাপন শালার ফাজলামী!!!!!     1   sports
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার     1   sports
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...     1   sports
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়     1   sports
4   তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব     1   sports


In [14]:


# Step 3: Text Cleaning
def clean_text(text):
    """
    Cleans text by removing unnecessary characters and symbols.
    """
    # Remove special characters, numbers, and symbols, keeping only Bengali letters
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep Bengali characters and whitespace
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to the 'sentence' column
df['cleaned_sentence'] = df['sentence'].apply(clean_text)

print("After Cleaning:")
print(df[['sentence', 'cleaned_sentence']].head())

After Cleaning:
                                            sentence  \
0                     যত্তসব পাপন শালার ফাজলামী!!!!!   
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার   
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...   
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়   
4   তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব   

                                    cleaned_sentence  
0                          যত্তসব পাপন শালার ফাজলামী  
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার  
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...  
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়  
4    তুই তো শালা গাজা খাইছচতুর মার হেডায় খেলবে সাকিব  


In [15]:


# Step 4: Tokenization with Bangla BERT
def tokenize_text(text):
    """
    Tokenizes text using Bangla BERT tokenizer.
    """
    # Tokenize the text into subwords for better context understanding
    tokens = tokenizer.tokenize(text)
    # Convert tokens to a single string for training input
    return tokens

# Tokenize cleaned sentences
df['tokens'] = df['cleaned_sentence'].apply(tokenize_text)

print("After Tokenization:")
print(df[['cleaned_sentence', 'tokens']].head())

After Tokenization:
                                    cleaned_sentence  \
0                          যত্তসব পাপন শালার ফাজলামী   
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার   
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...   
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়   
4    তুই তো শালা গাজা খাইছচতুর মার হেডায় খেলবে সাকিব   

                                              tokens  
0      [যত, ##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]  
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]  
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, এ...  
3  [শালা, ল, ##চ, ##চা, দেখতে, পাঠ, ##ার, মত, দেখ...  
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...  


In [16]:


# Step 5: Stopword Removal
def remove_stopwords(tokens):
    """
    Removes Bengali stopwords from the token list.
    """
    return [token for token in tokens if token not in bengali_stopwords]

# Apply stopword removal on tokens
df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)

print("After Stopword Removal:")
print(df[['tokens', 'filtered_tokens']].head())

After Stopword Removal:
                                              tokens  \
0      [যত, ##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]   
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]   
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, এ...   
3  [শালা, ল, ##চ, ##চা, দেখতে, পাঠ, ##ার, মত, দেখ...   
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...   

                                     filtered_tokens  
0          [##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]  
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]  
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, #...  
3           [শালা, ল, ##চ, ##চা, পাঠ, ##ার, মত, ##য]  
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...  


In [17]:
# Step 6: Prepare Data for Modeling
# Map the sentences and their respective labels
X = df['filtered_tokens'].apply(lambda tokens: ' '.join(tokens))  # Join tokens for input
y = df['hate']  # Assuming the 'hate' column contains the labels

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display dataset statistics
print("Training Data Size:", len(X_train))
print("Testing Data Size:", len(X_test))


Training Data Size: 24000
Testing Data Size: 6000


In [18]:

# Step 7: Save Preprocessed Data
# Save the train and test splits for later use
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

print("Preprocessing complete. Data saved to disk.")


Preprocessing complete. Data saved to disk.


In [22]:
%pip install gensim




[notice] A new release of pip available: 22.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [23]:
# Import necessary libraries
from gensim.models import FastText
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

In [24]:


# Step 1: FastText Embeddings
def train_fasttext(corpus, embedding_dim=100, window_size=5, min_count=1):
    """
    Train a FastText model on the dataset.
    :param corpus: List of tokenized sentences.
    :param embedding_dim: Dimension of the embeddings.
    :param window_size: Context window size.
    :param min_count: Minimum word count threshold.
    :return: Trained FastText model.
    """
    model = FastText(sentences=corpus, vector_size=embedding_dim, window=window_size, min_count=min_count, sg=1)
    return model

# Convert the filtered tokens to a list of tokenized sentences
corpus = df['filtered_tokens'].tolist()

# Train the FastText model
fasttext_model = train_fasttext(corpus)
print("FastText training complete!")

# Example: Retrieve FastText vector for a word
word = "বাংলাদেশ"
if word in fasttext_model.wv:
    print(f"FastText vector for '{word}': {fasttext_model.wv[word]}")


FastText training complete!
FastText vector for 'বাংলাদেশ': [-2.93207377e-01 -9.67192203e-02  2.26068422e-01  8.45103711e-02
 -1.01818047e-01 -2.56076187e-01  5.44458777e-02  6.24997854e-01
 -3.01259190e-01 -2.64037192e-01 -9.29759890e-02  3.88862878e-01
  5.01257218e-02 -1.72663674e-01 -2.25367174e-01  1.17577925e-04
  2.78682798e-01 -7.68309176e-01  2.65721846e-02 -5.96940070e-02
 -3.49264205e-01  4.37141538e-01 -1.29414722e-01  1.65042341e-01
  4.27348584e-01 -4.40854579e-01 -1.81770399e-01  1.98105887e-01
  2.59890705e-01  4.10499386e-02 -2.85143435e-01  4.05519307e-01
 -3.30519706e-01  3.43494058e-01  1.33104116e-01 -4.05534923e-01
  2.35861167e-01 -2.89712064e-02 -2.29251921e-01 -3.33831102e-01
  6.43422067e-01  2.79015750e-01 -2.07290664e-01 -2.65391529e-01
  2.53665864e-01 -1.76009789e-01 -2.61550754e-01  5.06602108e-01
  1.09816402e-01 -4.24761653e-01  3.24022740e-01 -6.66299909e-02
  5.67849159e-01 -2.38398537e-01 -3.40567410e-01  1.73078343e-01
  2.49055699e-01 -4.71430600e-

In [25]:

# Step 2: Bangla BERT Embeddings
# Load Bangla BERT model and tokenizer
bangla_bert_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base")
bangla_bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

def get_bert_embeddings(text):
    """
    Extract BERT embeddings for the input text.
    :param text: Input sentence.
    :return: Token-level embeddings.
    """
    inputs = bangla_bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bangla_bert_model(**inputs)
    # Get the last hidden state (token embeddings)
    return outputs.last_hidden_state.squeeze(0)

# Example: Get Bangla BERT embeddings for a sentence
sentence = "বাংলাদেশ একটি সুন্দর দেশ।"
bert_embeddings = get_bert_embeddings(sentence)
print(f"BERT Embeddings shape: {bert_embeddings.shape}")

BERT Embeddings shape: torch.Size([8, 768])


In [26]:


# Step 3: Combine FastText and BERT Embeddings
def combine_embeddings(tokens, fasttext_model, bert_embeddings, embedding_dim=100):
    """
    Combine FastText and BERT embeddings for each token.
    :param tokens: List of tokens for the sentence.
    :param fasttext_model: Trained FastText model.
    :param bert_embeddings: BERT embeddings for the tokens.
    :param embedding_dim: Dimension of FastText embeddings.
    :return: Combined embeddings for each token.
    """
    combined_embeddings = []
    for idx, token in enumerate(tokens):
        # Get FastText embedding (zeros if not in vocab)
        fasttext_vec = fasttext_model.wv[token] if token in fasttext_model.wv else np.zeros(embedding_dim)
        
        # Get BERT embedding for the token
        bert_vec = bert_embeddings[idx].numpy() if idx < len(bert_embeddings) else np.zeros_like(bert_embeddings[0].numpy())
        
        # Concatenate FastText and BERT embeddings
        combined_vec = np.concatenate((fasttext_vec, bert_vec))
        combined_embeddings.append(combined_vec)
    
    return np.array(combined_embeddings)

# Example: Combine embeddings for a sentence
tokens = df['filtered_tokens'][0]  # Use the first sentence in the dataset
bert_embs = get_bert_embeddings(" ".join(tokens))
combined_embs = combine_embeddings(tokens, fasttext_model, bert_embs)
print(f"Combined Embeddings shape: {combined_embs.shape}")


Combined Embeddings shape: (7, 868)


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [28]:
# Step 1: Define the Custom Dataset
class HateSpeechDataset(Dataset):
    """
    Custom Dataset for loading hate speech data with embeddings.
    """
    def __init__(self, texts, labels, embeddings):
        self.texts = texts
        self.labels = labels
        self.embeddings = embeddings

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text_embedding = self.embeddings[idx]
        label = self.labels[idx]
        return torch.tensor(text_embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


In [29]:

# Step 2: Define the HAN Model
class HAN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(HAN, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes

        # Word-level BiLSTM
        self.word_lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.word_attention = nn.Linear(2 * hidden_dim, 1)  # Attention layer for words

        # Sentence-level BiLSTM
        self.sentence_lstm = nn.LSTM(2 * hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.sentence_attention = nn.Linear(2 * hidden_dim, 1)  # Attention layer for sentences

        # Fully connected output layer
        self.fc = nn.Linear(2 * hidden_dim, num_classes)

    def attention(self, lstm_output, attention_layer):
        """
        Apply attention mechanism.
        """
        attention_weights = torch.softmax(attention_layer(lstm_output), dim=1)
        weighted_output = torch.sum(attention_weights * lstm_output, dim=1)
        return weighted_output

    def forward(self, x):
        # Word-level BiLSTM and attention
        word_output, _ = self.word_lstm(x)
        sentence_input = self.attention(word_output, self.word_attention)

        # Sentence-level BiLSTM and attention
        sentence_output, _ = self.sentence_lstm(sentence_input.unsqueeze(0))
        document_representation = self.attention(sentence_output, self.sentence_attention)

        # Classification layer
        logits = self.fc(document_representation)
        return logits

In [30]:


# Step 3: Training and Evaluation Functions
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for embeddings, labels in train_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return total_loss / len(train_loader)

def evaluate_model(model, val_loader, device):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for embeddings, labels in val_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            preds = torch.argmax(outputs, dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return accuracy, precision, recall, f1


In [31]:

# Step 4: Preparing Data and Training the Model
# Assuming you have combined embeddings (X_train, X_test) and labels (y_train, y_test)
train_dataset = HateSpeechDataset(X_train, y_train, combined_train_embeddings)
test_dataset = HateSpeechDataset(X_test, y_test, combined_test_embeddings)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model Parameters
embedding_dim = 868  # Example: 100 from FastText + 768 from Bangla BERT
hidden_dim = 128
num_classes = len(set(y_train))  # Number of unique labels
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model, loss function, and optimizer
model = HAN(embedding_dim, hidden_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    accuracy, precision, recall, f1 = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


NameError: name 'combined_train_embeddings' is not defined

In [32]:
# Import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
from gensim.models import FastText
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Download stopwords
nltk.download('stopwords')
bengali_stopwords = set(stopwords.words('bengali'))

# Step 1: Data Preprocessing
def clean_text(text):
    """Cleans text by removing unnecessary characters and symbols."""
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep Bengali characters and whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text, tokenizer):
    """Tokenizes text using Bangla BERT tokenizer."""
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Removes Bengali stopwords from tokens."""
    return [token for token in tokens if token not in bengali_stopwords]

# Load dataset
file_path = "bangla_hate_speech.csv"  # Replace with actual file path
df = pd.read_csv(file_path)
df['cleaned_sentence'] = df['sentence'].apply(clean_text)

# Load tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

df['tokens'] = df['cleaned_sentence'].apply(lambda x: tokenize_text(x, bert_tokenizer))
df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)

# Split data
X = df['filtered_tokens'].apply(lambda tokens: ' '.join(tokens))
y = df['hate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:

# Step 2: Hybrid Embeddings Generation
# Train FastText
def train_fasttext(corpus, embedding_dim=100):
    model = FastText(sentences=corpus, vector_size=embedding_dim, window=5, min_count=1, sg=1)
    return model

# Prepare corpus for FastText
corpus = df['filtered_tokens'].tolist()
fasttext_model = train_fasttext(corpus)

# Load Bangla BERT
bert_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base")

In [35]:


def get_bert_embeddings(text):
    inputs = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.squeeze(0)

def combine_embeddings(tokens, fasttext_model, bert_embeddings, embedding_dim=100):
    combined_embeddings = []
    for idx, token in enumerate(tokens):
        fasttext_vec = fasttext_model.wv[token] if token in fasttext_model.wv else np.zeros(embedding_dim)
        bert_vec = bert_embeddings[idx].numpy() if idx < len(bert_embeddings) else np.zeros_like(bert_embeddings[0].numpy())
        combined_vec = np.concatenate((fasttext_vec, bert_vec))
        combined_embeddings.append(combined_vec)
    return np.array(combined_embeddings)


In [37]:
from tqdm import tqdm

def precompute_fasttext_embeddings(fasttext_model, unique_tokens, embedding_dim=100):
    """
    Precompute FastText embeddings for all unique tokens.
    :param fasttext_model: Trained FastText model.
    :param unique_tokens: Set of unique tokens from the dataset.
    :param embedding_dim: Dimension of FastText embeddings.
    :return: Dictionary of token to embedding.
    """
    token_embeddings = {}
    for token in unique_tokens:
        if token in fasttext_model.wv:
            token_embeddings[token] = fasttext_model.wv[token]
        else:
            token_embeddings[token] = np.zeros(embedding_dim)  # OOV tokens
    return token_embeddings

# Step 1: Precompute FastText Embeddings
unique_tokens = set(token for tokens in df['filtered_tokens'] for token in tokens)
fasttext_embeddings = precompute_fasttext_embeddings(fasttext_model, unique_tokens)

def batch_bert_embeddings(sentences, bert_tokenizer, bert_model, batch_size=16):
    """
    Generate BERT embeddings in batches for efficiency.
    :param sentences: List of sentences.
    :param bert_tokenizer: Bangla BERT tokenizer.
    :param bert_model: Bangla BERT model.
    :param batch_size: Number of sentences per batch.
    :return: List of BERT embeddings for each sentence.
    """
    all_embeddings = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert_model = bert_model.to(device)
    bert_model.eval()
    
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        inputs = bert_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = bert_model(**inputs)
        batch_embeddings = outputs.last_hidden_state.cpu().numpy()
        all_embeddings.extend(batch_embeddings)
    
    return all_embeddings

# Step 2: Generate BERT Embeddings in Batches
train_sentences = [" ".join(tokens) for tokens in X_train]
test_sentences = [" ".join(tokens) for tokens in X_test]

train_bert_embeddings = batch_bert_embeddings(train_sentences, bert_tokenizer, bert_model)
test_bert_embeddings = batch_bert_embeddings(test_sentences, bert_tokenizer, bert_model)

def combine_embeddings_optimized(tokens, fasttext_embeddings, bert_embedding):
    """
    Combine FastText and BERT embeddings for each token.
    :param tokens: List of tokens for the sentence.
    :param fasttext_embeddings: Precomputed FastText embeddings.
    :param bert_embedding: BERT embedding for the sentence.
    :return: Combined embeddings for the sentence.
    """
    combined_embeddings = []
    for idx, token in enumerate(tokens):
        fasttext_vec = fasttext_embeddings.get(token, np.zeros(len(next(iter(fasttext_embeddings.values())))))
        bert_vec = bert_embedding[idx] if idx < len(bert_embedding) else np.zeros(bert_embedding.shape[1])
        combined_vec = np.concatenate((fasttext_vec, bert_vec))
        combined_embeddings.append(combined_vec)
    return np.array(combined_embeddings)

# Step 3: Combine FastText and BERT Embeddings
train_embeddings = [
    combine_embeddings_optimized(tokens.split(), fasttext_embeddings, bert_emb)
    for tokens, bert_emb in zip(X_train, train_bert_embeddings)
]

test_embeddings = [
    combine_embeddings_optimized(tokens.split(), fasttext_embeddings, bert_emb)
    for tokens, bert_emb in zip(X_test, test_bert_embeddings)
]


 58%|█████▊    | 867/1500 [51:09<37:20,  3.54s/it]  


KeyboardInterrupt: 

In [None]:

# Generate embeddings for the dataset
train_embeddings = [combine_embeddings(tokens.split(), fasttext_model, get_bert_embeddings(" ".join(tokens.split()))) for tokens in X_train]
test_embeddings = [combine_embeddings(tokens.split(), fasttext_model, get_bert_embeddings(" ".join(tokens.split()))) for tokens in X_test]


In [None]:

# Step 3: Hierarchical Attention Network (HAN)
class HateSpeechDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

class HAN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(HAN, self).__init__()
        self.word_lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.word_attention = nn.Linear(2 * hidden_dim, 1)
        self.sentence_lstm = nn.LSTM(2 * hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.sentence_attention = nn.Linear(2 * hidden_dim, 1)
        self.fc = nn.Linear(2 * hidden_dim, num_classes)

    def attention(self, lstm_output, attention_layer):
        attention_weights = torch.softmax(attention_layer(lstm_output), dim=1)
        weighted_output = torch.sum(attention_weights * lstm_output, dim=1)
        return weighted_output

    def forward(self, x):
        word_output, _ = self.word_lstm(x)
        sentence_input = self.attention(word_output, self.word_attention)
        sentence_output, _ = self.sentence_lstm(sentence_input.unsqueeze(0))
        document_representation = self.attention(sentence_output, self.sentence_attention)
        logits = self.fc(document_representation)
        return logits

train_dataset = HateSpeechDataset(train_embeddings, y_train.tolist())
test_dataset = HateSpeechDataset(test_embeddings, y_test.tolist())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

embedding_dim = 868  # 100 (FastText) + 768 (Bangla BERT)
hidden_dim = 128
num_classes = len(set(y_train))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = HAN(embedding_dim, hidden_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
for epoch in range(10):
    model.train()
    for embeddings, labels in train_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for embeddings, labels in test_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch + 1}: Accuracy = {accuracy:.4f}")
