In [9]:
from google.colab import files
uploaded = files.upload()

Saving BanglaBERT_ONNX.onnx to BanglaBERT_ONNX.onnx


In [4]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pack

# Step 1: Importing Libraries for Model and Tokenizer

In [5]:
# Install if not already installed
# !pip install transformers onnxruntime

import re
import unicodedata
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Step 2: Defining Text Preprocessing Function

In [6]:
def preprocess_bangla_text(text):
    if not isinstance(text, str):
        return ""

    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F700-\U0001F77F"
                               u"\U0001F780-\U0001F7FF"
                               u"\U0001F800-\U0001F8FF"
                               u"\U0001F900-\U0001F9FF"
                               u"\U0001FA00-\U0001FA6F"
                               u"\U0001FA70-\U0001FAFF"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    text = re.sub(r'[^\w\s\u0980-\u09FF]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = unicodedata.normalize('NFC', text)
    return text

# Step 3: Loading Tokenizer and ONNX Model

In [3]:
# Load tokenizer
TOKENIZER_NAME = 'sagorsarker/bangla-bert-base'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Load ONNX model
MODEL_PATH = 'BanglaBERT_ONNX.onnx'
session = ort.InferenceSession(MODEL_PATH)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Step 4: Defining Label Map

In [4]:
label_map = {
    0: 'not bully',
    1: 'religious',
    2: 'sexual',
    3: 'troll',
    4: 'threat'
}

# Step 5: Defining Prediction Function

In [10]:
def predict_bangla_sentiment(text):
    if not text.strip():
        return {"error": "Empty input"}

    processed = preprocess_bangla_text(text)
    enc = tokenizer(processed, truncation=True, padding='max_length', max_length=128, return_tensors='np')
    input_ids = enc['input_ids']
    attention_mask = enc['attention_mask']

    ort_inputs = {
        session.get_inputs()[0].name: input_ids.astype(np.int64),
        session.get_inputs()[1].name: attention_mask.astype(np.int64)
    }
    ort_outs = session.run(None, ort_inputs)
    logits = ort_outs[0]
    pred_id = int(np.argmax(logits, axis=1)[0])
    label = label_map.get(pred_id, "unknown")

    return {"label": label, "label_id": pred_id}

# Step 6: Testing Prediction Function

In [11]:
test_text = "এই পণ্যের মান খুবই খারাপ।"
result = predict_bangla_sentiment(test_text)
print(result)

{'label': 'not bully', 'label_id': 0}


In [9]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=2819832b8e3e7bc245deffa1ce7031c2f2bbf8f57da5fb4ea72d004fd6a5824f
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


# Step 7: Defining Softmax and LIME Prediction Function

In [10]:
from lime.lime_text import LimeTextExplainer

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)

# Step 8: Defining LIME Prediction Wrapper

In [6]:
def lime_predict(texts):
    processed = [preprocess_bangla_text(t) for t in texts]

    enc = tokenizer(
        processed,
        truncation=True,
        padding='max_length',
        max_length=64,  # Reduce memory
        return_tensors='np'
    )

    input_ids = enc['input_ids'].astype(np.int64)
    attention_mask = enc['attention_mask'].astype(np.int64)

    ort_inputs = {
        session.get_inputs()[0].name: input_ids,
        session.get_inputs()[1].name: attention_mask
    }

    logits = session.run(None, ort_inputs)[0]
    return softmax(logits)

# Step 9: Defining LIME Explanation Function

In [7]:
def explain_with_lime(text, top_k=3, num_samples=300):
    explainer = LimeTextExplainer(
        class_names=list(label_map.values()),
        split_expression=r'\s+'   # 👈 Tokenize by whitespace
    )

    # Generate explanation
    explanation = explainer.explain_instance(
        text,
        lime_predict,
        num_features=top_k,
        num_samples=num_samples
    )

    # Get prediction
    pred_probs = lime_predict([text])[0]
    pred_label = label_map[int(np.argmax(pred_probs))]

    # Get word-level importance
    top_words = explanation.as_list()
    total = sum(abs(score) for _, score in top_words) or 1e-6
    token_contributions = [
        f"{word} ({int(100 * abs(score) / total)}%)"
        for word, score in top_words if word.strip()
    ]

    # Display
    print("📌 Input Text:", text)
    print("🔮 Predicted Sentiment:", pred_label)
    print("🧠 Explanation (Word-Level Importance):")
    for e in token_contributions:
        print("  •", e)


# Step 10: Explaining Text Examples

In [8]:
text = "আপনি অভিনয় করেন তাই ফ্যান বাচাতে এগুলা বলছেন এখন মুখ দিয়া সত্যিটা বলে ফেসে গেছেন তাই এই কবিতার লাইনটা কারো কাছ থেকে ভাড়া করে আনছেন আপনি একজন নাস্তিক এটাই চরম সত্য unfollowsafakabir"
explain_with_lime(text)

📌 Input Text: আপনি অভিনয় করেন তাই ফ্যান বাচাতে এগুলা বলছেন এখন মুখ দিয়া সত্যিটা বলে ফেসে গেছেন তাই এই কবিতার লাইনটা কারো কাছ থেকে ভাড়া করে আনছেন আপনি একজন নাস্তিক এটাই চরম সত্য unfollowsafakabir
🔮 Predicted Sentiment: religious
🧠 Explanation (Word-Level Importance):
  • নাস্তিক (99%)
  • আপনি (0%)
  • আনছেন (0%)


In [12]:
text = "ওই হালার পুত এখন কি মদ খাওয়ার সময়"
explain_with_lime(text)

📌 Input Text: ওই হালার পুত এখন কি মদ খাওয়ার সময়
🔮 Predicted Sentiment: troll
🧠 Explanation (Word-Level Importance):
  • হালার (35%)
  • খাওয়ার (34%)
  • পুত (29%)
