In [None]:
!pip install onnxruntime

In [1]:
import re
import torch
import unicodedata
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction

In [2]:
# Install if not already installed
# !pip install transformers onnxruntime

import re
import unicodedata
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer


In [3]:
# Compiled once when the cell runs instead of on every call.
# NOTE(review): the last range (U+24C2-U+1F251) is far wider than emoji and
# also strips CJK, Hangul and other scripts in that span. It lies above the
# Bangla block (U+0980-U+09FF), so Bangla text is unaffected; the pattern is
# kept unchanged so cleaning stays identical to what it was before.
_EMOJI_PATTERN = re.compile("["u"\U0001F600-\U0001F64F"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F700-\U0001F77F"
                            u"\U0001F780-\U0001F7FF"
                            u"\U0001F800-\U0001F8FF"
                            u"\U0001F900-\U0001F9FF"
                            u"\U0001FA00-\U0001FA6F"
                            u"\U0001FA70-\U0001FAFF"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def preprocess_bangla_text(text):
    """Clean a raw string before tokenisation.

    Steps, in order: NFC-normalise, drop URLs, drop characters matched by the
    emoji pattern, drop everything that is not a word character, whitespace
    or a code point in the Bangla block, collapse whitespace runs to a single
    space, strip the ends, and NFC-normalise once more.

    Args:
        text: The raw input. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Compose first: a base letter followed by a combining mark (e.g. "e" +
    # U+0301) must become one word character *before* the filter below runs,
    # otherwise the mark is stripped and canonically equivalent inputs end up
    # with different outputs.
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = _EMOJI_PATTERN.sub(r'', text)
    text = re.sub(r'[^\w\s\u0980-\u09FF]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Deleting characters (e.g. a zero-width non-joiner) can leave two code
    # points adjacent that now compose, so normalise again to guarantee the
    # returned string is in NFC.
    return unicodedata.normalize('NFC', text)


In [4]:
# Load the tokenizer identified by TOKENIZER_NAME; the prediction helpers in
# later cells read the resulting module-level `tokenizer`.
TOKENIZER_NAME = 'sagorsarker/bangla-bert-base'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Open an ONNX Runtime inference session on the exported model file.
# MODEL_PATH is a relative path, so the file is looked up relative to the
# kernel's current working directory.
MODEL_PATH = 'BanglaBERT_ONNX.onnx'
session = ort.InferenceSession(MODEL_PATH)


In [5]:
# Class index -> category name; position in the tuple is the class id.
label_map = dict(enumerate((
    'not bully',
    'religious',
    'sexual',
    'troll',
    'threat',
)))

In [7]:
def predict_bangla_sentiment(text):
    """Classify one piece of text with the ONNX session.

    The text is cleaned with ``preprocess_bangla_text``, tokenised to a fixed
    length of 128, run through the module-level ``session`` and the arg-max
    class id is looked up in the module-level ``label_map``.

    Args:
        text: The raw input string.

    Returns:
        ``{"label": <name>, "label_id": <int>}`` on success, where the label
        is ``"unknown"`` if the id is not a key of ``label_map``; or
        ``{"error": "Empty input"}`` when ``text`` is not a string, is blank,
        or is empty after cleaning.
    """
    # Non-string input (e.g. None) used to raise AttributeError on .strip();
    # treat it as empty, in line with preprocess_bangla_text.
    if not isinstance(text, str) or not text.strip():
        return {"error": "Empty input"}

    processed = preprocess_bangla_text(text)
    # Text made only of URLs, emoji or punctuation cleans down to "", which
    # would otherwise be classified as if it were a real sentence.
    if not processed:
        return {"error": "Empty input"}

    enc = tokenizer(processed, truncation=True, padding='max_length', max_length=128, return_tensors='np')

    # The session's first two inputs are fed by position: ids, then mask.
    model_inputs = session.get_inputs()
    ort_inputs = {
        model_inputs[0].name: enc['input_ids'].astype(np.int64),
        model_inputs[1].name: enc['attention_mask'].astype(np.int64),
    }
    logits = session.run(None, ort_inputs)[0]
    pred_id = int(np.argmax(logits, axis=1)[0])

    # `label_map` is resolved at call time, so rebinding that name to a
    # different mapping elsewhere in the notebook changes what is returned.
    return {"label": label_map.get(pred_id, "unknown"), "label_id": pred_id}


In [8]:
# Smoke test: run the ONNX-backed predictor on one sample sentence and print
# the dict it returns.
test_text = "এই পণ্যের মান খুবই খারাপ।"
result = predict_bangla_sentiment(test_text)
print(result)

{'label': 'not bully', 'label_id': 0}


In [12]:
# The name -> id mapping gets its own name. Binding it to `label_map` would
# overwrite the id -> name dict that predict_bangla_sentiment looks up with
# label_map.get(pred_id, "unknown"), so every later call would return
# "unknown".
label2id = {'not bully': 0, 'religious': 1, 'sexual': 2, 'troll': 3, 'threat': 4}
# Inverse view (id -> name) used to turn a predicted class id into a label.
id2label = {idx: name for name, idx in label2id.items()}

In [13]:
def predict_bangla_text(text, model=None):
    """Classify one piece of text with a PyTorch sequence-classification model.

    Args:
        text: The raw input string. It is cleaned with
            ``preprocess_bangla_text`` first, the same way the ONNX-backed
            predictor in this notebook cleans its input.
        model: The model to run. When omitted, a module-level variable named
            ``model`` is used, which keeps existing one-argument calls
            working.

    Returns:
        The label name that ``id2label`` maps the arg-max class id to.

    Raises:
        NameError: If no model is passed and no global ``model`` exists.
    """
    # Resolve the model up front so a missing one fails with an actionable
    # message rather than part-way through inference.
    if model is None:
        model = globals().get("model")
    if model is None:
        raise NameError(
            "name 'model' is not defined: pass model=... or load a fine-tuned "
            "sequence-classification model into a global named 'model' first"
        )

    cleaned = preprocess_bangla_text(text)
    inputs = tokenizer(
        cleaned,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128
    )

    # Inputs must live on the same device as the model's weights; skip the
    # move for objects that do not expose a `device` attribute.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Disable dropout and similar train-only behaviour so predictions are
    # deterministic even if the model was left in training mode.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class_id = torch.argmax(logits, dim=1).item()

    return id2label[predicted_class_id]


In [14]:
# Run the PyTorch-backed predictor on one sample sentence and print the label.
# NOTE(review): predict_bangla_text needs a `model`; the saved output of this
# cell is a NameError because no visible cell defines one.
text = "ওই হালার পুত এখন কি মদ খাওয়ার সময়"
prediction = predict_bangla_text(text)
print("Predicted label:", prediction)


NameError: name 'model' is not defined