In [5]:
%pip install pandas numpy nltk transformers fasttext-wheel scikit-learn torch shap matplotlib seaborn


Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting transformers
  Using cached transformers-4.47.1-py3-none-any.whl (10.1 MB)
Collecting torch
  Using cached torch-2.5.1-cp310-cp310-win_amd64.whl (203.1 MB)
Collecting shap
  Using cached shap-0.46.0-cp310-cp310-win_amd64.whl (456 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting tokenizers<0.22,>=0.21
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
Collecting filelock
  Using cached filelock-3.16.1-py3-none-any.whl (16 kB)
Collecting safetensors>=0.4.1
  Using cached safetensors-0.4.5-cp310-none-win_amd64.whl (285 kB)
Collecting huggingface-hub<1.0,>=0.24.0
  Using cached huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
Collecting fsspec
  Using cached fsspec-2024.12.0-py3-none-any.whl (183 kB)
Collecting sympy==1.13.1
  Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Collecting jinja2
  Using cached jinja2-3.1.5-py3-none-any.whl (134 kB)
Collecti


[notice] A new release of pip available: 22.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Step 1: Import Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Download the Bengali stopwords
nltk.download('stopwords')

# Load Bengali stopwords
bengali_stopwords = set(stopwords.words('bengali'))

# Initialize tokenizer for later use
tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:

# Step 2: Load the Dataset
# Replace 'dataset.csv' with the path to your dataset file
file_path = 'bangla_hate_speech.csv'  # Update this with your actual dataset path
df = pd.read_csv(file_path)

# Display the first few rows to verify data loading
print("Data Preview:")
print(df.head())

Data Preview:
                                            sentence  hate category
0                     যত্তসব পাপন শালার ফাজলামী!!!!!     1   sports
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার     1   sports
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...     1   sports
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়     1   sports
4   তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব     1   sports


In [14]:


# Step 3: Text Cleaning
def clean_text(text):
    """
    Cleans text by removing unnecessary characters and symbols.
    """
    # Remove special characters, numbers, and symbols, keeping only Bengali letters
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)  # Keep Bengali characters and whitespace
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to the 'sentence' column
df['cleaned_sentence'] = df['sentence'].apply(clean_text)

print("After Cleaning:")
print(df[['sentence', 'cleaned_sentence']].head())

After Cleaning:
                                            sentence  \
0                     যত্তসব পাপন শালার ফাজলামী!!!!!   
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার   
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...   
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়   
4   তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব   

                                    cleaned_sentence  
0                          যত্তসব পাপন শালার ফাজলামী  
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার  
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...  
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়  
4    তুই তো শালা গাজা খাইছচতুর মার হেডায় খেলবে সাকিব  


In [15]:


# Step 4: Tokenization with Bangla BERT
def tokenize_text(text):
    """
    Tokenizes text using Bangla BERT tokenizer.
    """
    # Tokenize the text into subwords for better context understanding
    tokens = tokenizer.tokenize(text)
    # Convert tokens to a single string for training input
    return tokens

# Tokenize cleaned sentences
df['tokens'] = df['cleaned_sentence'].apply(tokenize_text)

print("After Tokenization:")
print(df[['cleaned_sentence', 'tokens']].head())

After Tokenization:
                                    cleaned_sentence  \
0                          যত্তসব পাপন শালার ফাজলামী   
1                  পাপন শালা রে রিমান্ডে নেওয়া দরকার   
2  জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...   
3                শালা লুচ্চা দেখতে পাঠার মত দেখা যায়   
4    তুই তো শালা গাজা খাইছচতুর মার হেডায় খেলবে সাকিব   

                                              tokens  
0      [যত, ##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]  
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]  
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, এ...  
3  [শালা, ল, ##চ, ##চা, দেখতে, পাঠ, ##ার, মত, দেখ...  
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...  


In [16]:


# Step 5: Stopword Removal
def remove_stopwords(tokens):
    """
    Removes Bengali stopwords from the token list.
    """
    return [token for token in tokens if token not in bengali_stopwords]

# Apply stopword removal on tokens
df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)

print("After Stopword Removal:")
print(df[['tokens', 'filtered_tokens']].head())

After Stopword Removal:
                                              tokens  \
0      [যত, ##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]   
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]   
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, এ...   
3  [শালা, ল, ##চ, ##চা, দেখতে, পাঠ, ##ার, মত, দেখ...   
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...   

                                     filtered_tokens  
0          [##ত, ##সব, পাপন, শালা, ##র, ফাজলাম, ##ী]  
1    [পাপন, শালা, রে, রিমান, ##ডে, নেও, ##যা, দরকার]  
2  [জিল, ##ল, ##র, রহমান, স, ##যার, ##ের, ছেলে, #...  
3           [শালা, ল, ##চ, ##চা, পাঠ, ##ার, মত, ##য]  
4  [ত, ##ই, তে, ##া, শালা, গাজা, খাই, ##ছ, ##চতর,...  


In [17]:
# Step 6: Prepare Data for Modeling
# Map the sentences and their respective labels
X = df['filtered_tokens'].apply(lambda tokens: ' '.join(tokens))  # Join tokens for input
y = df['hate']  # Assuming the 'hate' column contains the labels

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display dataset statistics
print("Training Data Size:", len(X_train))
print("Testing Data Size:", len(X_test))


Training Data Size: 24000
Testing Data Size: 6000


In [18]:

# Step 7: Save Preprocessed Data
# Save the train and test splits for later use
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

print("Preprocessing complete. Data saved to disk.")


Preprocessing complete. Data saved to disk.


In [20]:
%pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)
     ---------------------------------------- 24.0/24.0 MB 1.3 MB/s eta 0:00:00
Collecting smart-open>=1.8.1
  Downloading smart_open-7.1.0-py3-none-any.whl (61 kB)
     ---------------------------------------- 61.7/61.7 kB 3.2 MB/s eta 0:00:00
Collecting scipy<1.14.0,>=1.7.0
  Downloading scipy-1.13.1-cp310-cp310-win_amd64.whl (46.2 MB)
     -------------------------------------- 46.2/46.2 MB 889.3 kB/s eta 0:00:00
Installing collected packages: smart-open, scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\ASUS\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~cipy.libs\\libscipy_openblas-5b1ec8b915dfb81d11cebc0788069d2d.dll'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip available: 22.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
# Import necessary libraries
from gensim.models import FastText
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

ModuleNotFoundError: No module named 'gensim'

In [None]:


# Step 1: FastText Embeddings
def train_fasttext(corpus, embedding_dim=100, window_size=5, min_count=1):
    """
    Train a FastText model on the dataset.
    :param corpus: List of tokenized sentences.
    :param embedding_dim: Dimension of the embeddings.
    :param window_size: Context window size.
    :param min_count: Minimum word count threshold.
    :return: Trained FastText model.
    """
    model = FastText(sentences=corpus, vector_size=embedding_dim, window=window_size, min_count=min_count, sg=1)
    return model

# Convert the filtered tokens to a list of tokenized sentences
corpus = df['filtered_tokens'].tolist()

# Train the FastText model
fasttext_model = train_fasttext(corpus)
print("FastText training complete!")

# Example: Retrieve FastText vector for a word
word = "বাংলাদেশ"
if word in fasttext_model.wv:
    print(f"FastText vector for '{word}': {fasttext_model.wv[word]}")


In [None]:

# Step 2: Bangla BERT Embeddings
# Load Bangla BERT model and tokenizer
bangla_bert_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base")
bangla_bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

def get_bert_embeddings(text):
    """
    Extract BERT embeddings for the input text.
    :param text: Input sentence.
    :return: Token-level embeddings.
    """
    inputs = bangla_bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bangla_bert_model(**inputs)
    # Get the last hidden state (token embeddings)
    return outputs.last_hidden_state.squeeze(0)

# Example: Get Bangla BERT embeddings for a sentence
sentence = "বাংলাদেশ একটি সুন্দর দেশ।"
bert_embeddings = get_bert_embeddings(sentence)
print(f"BERT Embeddings shape: {bert_embeddings.shape}")

In [None]:


# Step 3: Combine FastText and BERT Embeddings
def combine_embeddings(tokens, fasttext_model, bert_embeddings, embedding_dim=100):
    """
    Combine FastText and BERT embeddings for each token.
    :param tokens: List of tokens for the sentence.
    :param fasttext_model: Trained FastText model.
    :param bert_embeddings: BERT embeddings for the tokens.
    :param embedding_dim: Dimension of FastText embeddings.
    :return: Combined embeddings for each token.
    """
    combined_embeddings = []
    for idx, token in enumerate(tokens):
        # Get FastText embedding (zeros if not in vocab)
        fasttext_vec = fasttext_model.wv[token] if token in fasttext_model.wv else np.zeros(embedding_dim)
        
        # Get BERT embedding for the token
        bert_vec = bert_embeddings[idx].numpy() if idx < len(bert_embeddings) else np.zeros_like(bert_embeddings[0].numpy())
        
        # Concatenate FastText and BERT embeddings
        combined_vec = np.concatenate((fasttext_vec, bert_vec))
        combined_embeddings.append(combined_vec)
    
    return np.array(combined_embeddings)

# Example: Combine embeddings for a sentence
tokens = df['filtered_tokens'][0]  # Use the first sentence in the dataset
bert_embs = get_bert_embeddings(" ".join(tokens))
combined_embs = combine_embeddings(tokens, fasttext_model, bert_embs)
print(f"Combined Embeddings shape: {combined_embs.shape}")
