In [1]:
import re
import time
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Define Bangla, Hindi, and English character sets
bangla_chars = 'অআইঈউঊএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়ৠৡ০১২৩৪৫৬৭৮৯'
hindi_chars = 'अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलशषसह०१२३४५६७८९'
english_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [3]:
# Function to detect the dominant language in a text
def detect_dominant_language(text):
    bangla_count = 0
    hindi_count = 0
    english_count = 0

    # Split text into words
    words = re.findall(r'\b\w+\b', text)

    # Count the words for each language
    for word in words:
        if any(char in bangla_chars for char in word):
            bangla_count += 1
        elif any(char in hindi_chars for char in word):
            hindi_count += 1
        elif any(char in english_chars for char in word):
            english_count += 1

    # Determine the dominant language
    counts = {'Bangla': bangla_count, 'Hindi': hindi_count, 'English': english_count}
    dominant_language = max(counts, key=counts.get)

    return dominant_language

In [4]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load Dataset
data = pd.read_csv("/kaggle/input/100k-bangla-english-hindi-sentiment/option 4.csv")

In [6]:
# Handle missing values
data = data.dropna().reset_index(drop=True)

In [7]:
# Add `text_length` and `language` columns
data["text_length"] = data["text"].apply(len)
data["language"] = data["text"].apply(detect_dominant_language)

In [8]:
data

Unnamed: 0,label,text,text_length,language
0,Negative,DO NOT let the blu ray symbol on the box fool ...,388,Hindi
1,Neutral,আমার কোন ভাগ্য ছিল না trying to recover my फ़ा...,126,Hindi
2,Negative,यह बात वाकई में उलझ सकती है your computer. Che...,169,Bangla
3,Negative,मैं टर्बो लोड नहीं कर सकता Tax 22015 on Window...,159,Hindi
4,Neutral,"It's okay, but not as good as the মাইক্রোসফ্ট ...",253,Hindi
...,...,...,...,...
59989,Neutral,it is extremely cumbersome to navigate. while ...,397,Hindi
59990,Positive,some say it's hard to load. you need आपके पास ...,218,Hindi
59991,Negative,मैंने पिछले 3 वर्षों से टर्बोटैक्स का उपयोग कि...,1918,Hindi
59992,Negative,यदि आप कर करने के लिए इस सॉफ़्टवेयर का उपयोग क...,771,Hindi


In [9]:
# Group by label and language, and calculate average text length
avg_text_length = data.groupby(['label', 'language'])['text_length'].mean()

# Print the result
print(avg_text_length)

label     language
Negative  Bangla      555.763115
          English     589.500000
          Hindi       557.917066
Neutral   Bangla      563.324702
          English     612.340470
          Hindi       555.606427
Positive  Bangla      352.785996
          English     429.512168
          Hindi       366.801426
Name: text_length, dtype: float64


In [10]:
# Encode `label` and `language`
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
data["label"] = data["label"].map(label_map)

language_map = {"Bangla": 0, "Hindi": 1, "English": 2}
data["language_encoded"] = data["language"].map(language_map)

In [11]:
character_limit = 512

In [12]:
# Filter long texts and reset index
data = data[data["text_length"] <= character_limit].reset_index(drop=True)

In [13]:
# Split dataset into train, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["text"], data["label"], test_size=0.1, random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

In [14]:
# Corresponding lengths and languages
train_texts_length = data.loc[train_texts.index, "text_length"]
val_texts_length = data.loc[val_texts.index, "text_length"]
test_texts_length = data.loc[test_texts.index, "text_length"]

train_texts_language = data.loc[train_texts.index, "language_encoded"]
val_texts_language = data.loc[val_texts.index, "language_encoded"]
test_texts_language = data.loc[test_texts.index, "language_encoded"]

In [15]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



In [16]:
# Tokenize data
def tokenize_data_with_features(texts, lengths, languages, labels, max_length=character_limit):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length)
    encodings["text_length"] = torch.tensor(lengths.values, dtype=torch.float32)
    encodings["language"] = torch.tensor(languages.values, dtype=torch.long)
    return encodings, torch.tensor(labels.values)

train_encodings, train_labels = tokenize_data_with_features(train_texts, train_texts_length, train_texts_language, train_labels)
val_encodings, val_labels = tokenize_data_with_features(val_texts, val_texts_length, val_texts_language, val_labels)
test_encodings, test_labels = tokenize_data_with_features(test_texts, test_texts_length, test_texts_language, test_labels)

In [17]:
# Custom Dataset class
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [18]:
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

In [19]:
# Define the model
class SentimentClassifier(nn.Module):
    def __init__(self, model_name, num_labels, extra_features=2):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size + extra_features, num_labels)

    def forward(self, input_ids, attention_mask, text_length, language):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        combined_features = torch.cat([cls_output, text_length.unsqueeze(1), language.unsqueeze(1)], dim=1)
        logits = self.classifier(combined_features)
        return logits

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentClassifier("xlm-roberta-base", num_labels=3).to(device)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [21]:
# Training setup
batch_size = 16
learning_rate = 1e-5

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [22]:
# Training loop
batch_see = 100

def train_model(model, train_loader, val_loader, epochs):
    for epoch in range(epochs):
        epoch_start_time = time.time()
        model.train()
        train_loss = 0
        correct, total = 0, 0
        
        for batch_idx, batch in enumerate(train_loader):
            
            if (batch_idx + 1) % batch_see == 1:
                batch_start_time = time.time()

            
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            text_length = batch["text_length"].to(device)
            language = batch["language"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask, text_length, language)
            loss = criterion(outputs, labels)
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            if (batch_idx + 1) % batch_see == 0:  # Display progress every 10 batches
                batch_time = time.time() - batch_start_time
                batch_time_str = time.strftime('%H:%M:%S', time.gmtime(batch_time))
                print(f"Epoch [{epoch + 1}/{epochs}], Step [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {correct / total:.4f}, time = {batch_time_str}")

            
        val_loss = 0
        val_correct, val_total = 0, 0
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                text_length = batch["text_length"].to(device)
                language = batch["language"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask, text_length, language)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)

        epoch_time = time.time() - epoch_start_time
        epoch_time_str = time.strftime('%H:%M:%S', time.gmtime(epoch_time))
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, "
              f"Train Accuracy: {correct / total:.4f}, Val Loss: {val_loss / len(val_loader):.4f}, "
              f"Val Accuracy: {val_correct / val_total:.4f}, Time Spent = {epoch_time_str}")

In [23]:
train_model(model, train_loader, val_loader, epochs=15)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch [1/15], Step [100/2142], Loss: 1.1770, Accuracy: 0.3850, time = 00:00:35
Epoch [1/15], Step [200/2142], Loss: 1.1545, Accuracy: 0.3972, time = 00:00:34
Epoch [1/15], Step [300/2142], Loss: 1.0107, Accuracy: 0.4000, time = 00:00:34
Epoch [1/15], Step [400/2142], Loss: 1.2808, Accuracy: 0.4044, time = 00:00:34
Epoch [1/15], Step [500/2142], Loss: 0.9840, Accuracy: 0.4265, time = 00:00:35
Epoch [1/15], Step [600/2142], Loss: 0.5377, Accuracy: 0.4519, time = 00:00:35
Epoch [1/15], Step [700/2142], Loss: 1.1638, Accuracy: 0.4811, time = 00:00:35
Epoch [1/15], Step [800/2142], Loss: 0.6925, Accuracy: 0.5033, time = 00:00:35
Epoch [1/15], Step [900/2142], Loss: 0.5908, Accuracy: 0.5217, time = 00:00:35
Epoch [1/15], Step [1000/2142], Loss: 0.7714, Accuracy: 0.5391, time = 00:00:35
Epoch [1/15], Step [1100/2142], Loss: 0.8334, Accuracy: 0.5524, time = 00:00:35
Epoch [1/15], Step [1200/2142], Loss: 0.5653, Accuracy: 0.5645, time = 00:00:35
Epoch [1/15], Step [1300/2142], Loss: 0.5360, Acc

In [24]:
# Evaluation
def evaluate_model(model, test_loader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            text_length = batch["text_length"].to(device)
            language = batch["language"].to(device)

            outputs = model(input_ids, attention_mask, text_length, language)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels)

    print(classification_report(true_labels, predictions, target_names=label_map.keys()))

In [25]:
evaluate_model(model, test_loader)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

    Negative       0.77      0.75      0.76      1271
     Neutral       0.63      0.64      0.63      1291
    Positive       0.84      0.84      0.84      1669

    accuracy                           0.75      4231
   macro avg       0.75      0.74      0.75      4231
weighted avg       0.75      0.75      0.75      4231



In [26]:
# Predict custom input
def predict_sentiment(text, model, tokenizer):
    text_length = len(text)
    language = detect_dominant_language(text)
    language_encoded = language_map[language]

    model.eval()
    with torch.no_grad():
        encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=character_limit).to(device)
        text_length_tensor = torch.tensor([text_length], dtype=torch.float32).to(device)
        language_tensor = torch.tensor([language_encoded], dtype=torch.long).to(device)

        output = model(encoding["input_ids"], encoding["attention_mask"], text_length_tensor, language_tensor)
        prediction = torch.argmax(output, dim=1).item()
        return list(label_map.keys())[prediction]


In [27]:
user_input = "আপনি আমাকে ভোটকা বলতে পারেন না"
sentiment = predict_sentiment(user_input, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [28]:
user_input = "জয় বাংলা"
sentiment = predict_sentiment(user_input, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Positive


Fun fact is if we train our model using option 3 dataset, it would predict জয় বাংলা as negative! because it has updated data for training!!

for easy checking, below examples are generated from chatgpt

In [29]:
sentences = [
    # Positive Sentences
    {"text": "আমি আজ খুব খুশি।", "language": "Bangla", "sentiment": "Positive"},  # I'm very happy today.
    {"text": "आज का दिन बहुत अच्छा है।", "language": "Hindi", "sentiment": "Positive"},  # Today is a very good day.
    {"text": "I am so excited about the results.", "language": "English", "sentiment": "Positive"},
    {"text": "তোমার কাজ সত্যিই অসাধারণ!", "language": "Bangla", "sentiment": "Positive"},  # Your work is truly amazing!
    {"text": "यह मेरे जीवन का सबसे अच्छा समय है।", "language": "Hindi", "sentiment": "Positive"},  # This is the best time of my life.
    {"text": "The weather is absolutely delightful.", "language": "English", "sentiment": "Positive"},
    {"text": "তোমাকে পেয়ে আমি সত্যিই ভাগ্যবান।", "language": "Bangla", "sentiment": "Positive"},  # I'm truly lucky to have you.
    {"text": "आपकी मदद से मेरा काम आसान हो गया।", "language": "Hindi", "sentiment": "Positive"},  # Your help made my work easier.
    {"text": "I really appreciate everything you've done for me.", "language": "English", "sentiment": "Positive"},
    {"text": "আমার জীবনে আজ একটা নতুন অধ্যায় শুরু হলো।", "language": "Bangla", "sentiment": "Positive"},  # A new chapter started in my life today.
    {"text": "यह सफलता मेरे कठिन परिश्रम का परिणाम है।", "language": "Hindi", "sentiment": "Positive"},  # This success is the result of my hard work.
    {"text": "I feel incredibly grateful for this opportunity.", "language": "English", "sentiment": "Positive"},

    # Neutral Sentences
    {"text": "আমি দুপুরে ভাত খেয়েছি।", "language": "Bangla", "sentiment": "Neutral"},  # I had rice for lunch.
    {"text": "यह एक सामान्य दिन है।", "language": "Hindi", "sentiment": "Neutral"},  # It's a normal day.
    {"text": "The report was submitted on time.", "language": "English", "sentiment": "Neutral"},
    {"text": "তোমার চিঠি আজ সকালে পেলাম।", "language": "Bangla", "sentiment": "Neutral"},  # I received your letter this morning.
    {"text": "मुझे अभी कुछ कहना नहीं है।", "language": "Hindi", "sentiment": "Neutral"},  # I don't have anything to say right now.
    {"text": "The book is on the table.", "language": "English", "sentiment": "Neutral"},
    {"text": "আজকের আবহাওয়া ভালো।", "language": "Bangla", "sentiment": "Neutral"},  # Today's weather is fine.
    {"text": "मैंने सुबह चाय पी।", "language": "Hindi", "sentiment": "Neutral"},  # I had tea in the morning.
    {"text": "The meeting will start at 3 PM.", "language": "English", "sentiment": "Neutral"},
    {"text": "আমি আজ বই পড়ছি।", "language": "Bangla", "sentiment": "Neutral"},  # I am reading a book today.
    {"text": "मुझे नए कार्यक्रम के बारे में बताया गया।", "language": "Hindi", "sentiment": "Neutral"},  # I was informed about the new program.
    {"text": "There are no updates about the project yet.", "language": "English", "sentiment": "Neutral"},

    # Strongly Negative Sentences
    {"text": "আমি এত হতাশ যে আমি আর সহ্য করতে পারছি না।", "language": "Bangla", "sentiment": "Negative"},  # I am so frustrated that I can't take it anymore.
    {"text": "यह सबसे बेकार अनुभव था।", "language": "Hindi", "sentiment": "Negative"},  # This was the worst experience ever.
    {"text": "I hate how everything is falling apart in my life.", "language": "English", "sentiment": "Negative"},
    {"text": "তোমার আচরণ আমাকে চরম অপমানিত করেছে।", "language": "Bangla", "sentiment": "Negative"},  # Your behavior has deeply insulted me.
    {"text": "यह एक बड़ी गलती थी, और अब मुझे पछतावा हो रहा है।", "language": "Hindi", "sentiment": "Negative"},  # It was a big mistake, and I regret it now.
    {"text": "This is the most disappointing service I have ever received.", "language": "English", "sentiment": "Negative"},
    {"text": "এই পরিস্থিতি সম্পূর্ণ অগ্রহণযোগ্য।", "language": "Bangla", "sentiment": "Negative"},  # This situation is completely unacceptable.
    {"text": "आपकी लापरवाही ने सब बर्बाद कर दिया।", "language": "Hindi", "sentiment": "Negative"},  # Your carelessness has ruined everything.
    {"text": "I feel like giving up because nothing is going right.", "language": "English", "sentiment": "Negative"},
    {"text": "আমি নিজেকে সম্পূর্ণ ব্যর্থ মনে করছি।", "language": "Bangla", "sentiment": "Negative"},  # I feel like a complete failure.
    {"text": "मुझे अपने जीवन में कुछ भी अच्छा नहीं दिख रहा।", "language": "Hindi", "sentiment": "Negative"},  # I can't see anything good in my life.
    {"text": "Why does everything I do turn into a disaster?", "language": "English", "sentiment": "Negative"},
]

# Add more strongly negative examples
for i in range(15):
    sentences.append(
        {"text": f"এটা আমার জীবনের সবচেয়ে খারাপ দিন {i+1}।", "language": "Bangla", "sentiment": "Negative"}  # This is the worst day of my life {i+1}.
    )
    sentences.append(
        {"text": f"यह मेरी सबसे बड़ी असफलता है {i+1}।", "language": "Hindi", "sentiment": "Negative"}  # This is my biggest failure {i+1}.
    )
    sentences.append(
        {"text": f"This is unacceptable and I feel cheated {i+1}.", "language": "English", "sentiment": "Negative"}
    )


In [30]:
results = []

# Predict sentiments for the dataset
for sentence in sentences:
    predicted_sentiment = predict_sentiment(sentence["text"], model, tokenizer)
    results.append({
        "text": sentence["text"],
        "language": sentence["language"],
        "true_sentiment": sentence["sentiment"],
        "predicted_sentiment": predicted_sentiment
    })

# Display results
for res in results:
    print(f"Text: {res['text']}")
    print(f"Language: {res['language']}")
    print(f"True Sentiment: {res['true_sentiment']}")
    print(f"Predicted Sentiment: {res['predicted_sentiment']}")
    print("-" * 50)


Text: আমি আজ খুব খুশি।
Language: Bangla
True Sentiment: Positive
Predicted Sentiment: Positive
--------------------------------------------------
Text: आज का दिन बहुत अच्छा है।
Language: Hindi
True Sentiment: Positive
Predicted Sentiment: Positive
--------------------------------------------------
Text: I am so excited about the results.
Language: English
True Sentiment: Positive
Predicted Sentiment: Positive
--------------------------------------------------
Text: তোমার কাজ সত্যিই অসাধারণ!
Language: Bangla
True Sentiment: Positive
Predicted Sentiment: Positive
--------------------------------------------------
Text: यह मेरे जीवन का सबसे अच्छा समय है।
Language: Hindi
True Sentiment: Positive
Predicted Sentiment: Positive
--------------------------------------------------
Text: The weather is absolutely delightful.
Language: English
True Sentiment: Positive
Predicted Sentiment: Positive
--------------------------------------------------
Text: তোমাকে পেয়ে আমি সত্যিই ভাগ্যবান।
Language: B