In [None]:
!python --version


In [None]:
!python -c "import torch; print(torch.cuda.is_available())"


In [None]:
!pip install transformers datasets torch scikit-learn pandas openpyxl


In [26]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
import numpy as np
import datasets.formatting.formatting
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit
import torch.nn as nn
from torch.nn import functional as F

# -----------------------------------------------------
# ✅ Monkey patch for NumPy 2.0 error in datasets
def fixed_arrow_array_to_numpy(self, pa_array):
    """Replacement body for ``NumpyArrowExtractor._arrow_array_to_numpy``.

    Converts ``pa_array`` to pandas, takes its ``.values`` and returns an
    independent NumPy copy of them. ``self`` is accepted only because the
    function is installed as a method; it is not used.
    """
    return np.array(pa_array.to_pandas().values, copy=True)

# Install the patched extractor method defined above
datasets.formatting.formatting.NumpyArrowExtractor._arrow_array_to_numpy = fixed_arrow_array_to_numpy
# -----------------------------------------------------

# ✅ Disable wandb
os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ✅ Load dataframe
df = pd.read_excel(r"BanglaBlendCleanedDataWithEnglishTranslation.xlsx")
# Only "Sentence" and "Labels" are used downstream, so only rows missing one of
# those are dropped. A bare dropna() would also discard usable rows that merely
# lack a value in an unused column (e.g. the English translation).
df = df.dropna(subset=["Sentence", "Labels"])



Using device: cuda


In [27]:
df

Unnamed: 0,Sentence,Sentence(English Translation),Labels(English),Labels(Bangla),Labels
0,সেখানকার জানালা দিয়ে সমুদ্র দেখা যাইতেছিল,The sea could be seen from the window there,Saint,Sadhu(সাধু ),0
1,আমি কিছু দেখিতে পারিতেছি না,I can't see anything,Saint,Sadhu(সাধু ),0
2,সকলেরই অনাবৃত দেহ সকলের সেই অনাবৃত বক্ষে আরশির...,Everyone's naked body is burning once in the m...,Saint,Sadhu(সাধু ),0
3,মেয়েটি সেদিন ভিক্ষুককে সাহায্য করিয়াছিল,The girl had helped the beggar that day,Saint,Sadhu(সাধু ),0
4,তুমি প্রশংসা কর না কর বৃদ্ধ বসিয়া তোমায় পুরা...,You don't praise the old man and he will tell ...,Saint,Sadhu(সাধু ),0
...,...,...,...,...,...
7345,রন্ধনপ্রণালী দোপেয়াজা জনপ্রিয়,The cuisine Dopeyazza is popular,Common,Cholito(চলিত),1
7346,শেষে রহিম করিমকে বিপদে ফেলল,"At the end, Rahim put Karim in danger",Common,Cholito(চলিত),1
7347,হাফিজ কে তারাই বিপদে ফেলল,They put Hafiz in danger,Common,Cholito(চলিত),1
7348,সালে আলেকজান্ডার কানিংহাম প্রথম হড়প্পা সিলমোহ...,Alexander Cunningham published the first Harap...,Common,Cholito(চলিত),1


In [28]:

# ✅ Keep only the Bangla sentence and its numeric label.
# .copy() yields an independent frame, so the column assignments made in later
# cells do not write into a slice of the full spreadsheet.
df = df[["Sentence", "Labels"]].copy()

In [30]:
# Rename to the column names the rest of the notebook relies on:
# "text" is what tokenize() reads, "label" is what the splits stratify on.
df = df.rename(columns={"Sentence": "text", "Labels": "label"})
df["label"] = df["label"].astype(int)

# ✅ Basic text preprocessing
def preprocess_text(text):
    """Normalize one raw cell value into a clean single-line string.

    Missing values (as judged by ``pd.isna``) become ``""``. Anything else is
    cast to ``str``, trimmed, and has every run of whitespace squeezed into a
    single space.
    """
    if not pd.isna(text):
        # str.split() without a separator splits on any run of whitespace
        pieces = str(text).strip().split()
        return ' '.join(pieces)
    return ""

# Normalize whitespace in every sentence
df["text"] = df["text"].apply(preprocess_text)

# ✅ Remove empty texts and duplicates
# Duplicates are judged on "text" alone, so a sentence that occurs with two
# different labels keeps only its first occurrence (pandas default keep="first").
df = df[df["text"].str.len() > 0]
df = df.drop_duplicates(subset=["text"])

# Quick sanity report of what is left after cleaning
print("Columns after clean & rename:", df.columns)
print("Data shape:", df.shape)
print("Label distribution:")
print(df["label"].value_counts())
print(df.head())

# ✅ Load tokenizer and model with built-in regularization
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use standard model with built-in dropout configuration
# The three dropout keyword arguments are passed together with num_labels when
# the pretrained checkpoint is loaded; the model is then moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2,
    hidden_dropout_prob=0.3,  # Increase dropout for regularization
    attention_probs_dropout_prob=0.3,  # Attention dropout
    classifier_dropout=0.3  # Classifier dropout
).to(device)

# ✅ Advanced tokenization with dynamic padding
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the module-level tokenizer.

    Sequences are truncated to at most 256 tokens and are left unpadded;
    padding is deferred to the data collator.
    """
    encode_options = {
        "padding": False,        # Dynamic padding in data collator
        "truncation": True,
        "max_length": 256,
        "return_tensors": None,
    }
    return tokenizer(batch["text"], **encode_options)

# ✅ Create the Hugging Face dataset and tokenize it
# preserve_index=False: after filtering and de-duplication the frame no longer
# has a default index, and from_pandas would otherwise store that index as an
# extra "__index_level_0__" column that nothing downstream uses.
dataset = Dataset.from_pandas(df, preserve_index=False)
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# ✅ Stratified train-validation-test split
def create_stratified_splits(dataset, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """Split ``dataset`` into stratified train / validation / test subsets.

    Stratification is on the ``"label"`` column. The hold-out fraction
    ``val_size + test_size`` is carved off first and is then divided between
    validation and test in the ratio ``val_size : test_size``.

    Args:
        dataset: Object exposing a ``"label"`` column and a ``select`` method.
        train_size: Expected training fraction. The training split always
            receives the remainder ``1 - val_size - test_size``; a warning is
            issued when this argument disagrees with that remainder.
        val_size: Fraction of all rows used for validation.
        test_size: Fraction of all rows used for testing.
        random_state: Seed shared by both shuffle splits.

    Returns:
        dict with the keys ``'train'``, ``'validation'`` and ``'test'``.
    """
    import warnings

    holdout_fraction = val_size + test_size
    if abs(train_size + holdout_fraction - 1.0) > 1e-9:
        # train_size never reaches the splitter, so say so instead of silently ignoring it
        warnings.warn(
            f"train_size={train_size} is inconsistent with val_size + test_size="
            f"{holdout_fraction}; the training split receives the remaining "
            f"{1 - holdout_fraction:.4f} of the data.",
            stacklevel=2,
        )

    # Reading the column directly avoids materializing every row just for its label
    labels = list(dataset['label'])

    # First split: train vs. hold-out (validation + test)
    outer = StratifiedShuffleSplit(n_splits=1, test_size=holdout_fraction, random_state=random_state)
    train_idx, holdout_idx = next(outer.split(range(len(labels)), labels))

    # Second split: validation vs. test inside the hold-out part
    holdout_labels = [labels[i] for i in holdout_idx]
    val_ratio = val_size / holdout_fraction
    inner = StratifiedShuffleSplit(n_splits=1, test_size=(1 - val_ratio), random_state=random_state)
    val_pos, test_pos = next(inner.split(range(len(holdout_idx)), holdout_labels))

    # val_pos / test_pos are positions within holdout_idx; map them back to dataset rows
    return {
        'train': dataset.select(train_idx),
        'validation': dataset.select(holdout_idx[val_pos]),
        'test': dataset.select(holdout_idx[test_pos]),
    }

# Build the splits with the function's default fractions (0.7 / 0.15 / 0.15)
dataset_splits = create_stratified_splits(dataset)

# ✅ Set torch format
# Only the three listed columns are exposed, as torch tensors, when rows are read
for split in dataset_splits:
    dataset_splits[split].set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

print(f"Train size: {len(dataset_splits['train'])}")
print(f"Validation size: {len(dataset_splits['validation'])}")
print(f"Test size: {len(dataset_splits['test'])}")

# ✅ Data collator for dynamic padding
# (tokenize() leaves sequences unpadded, so padding happens here per batch)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ Enhanced metrics function
def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    The predicted class is the arg-max over the last axis of
    ``pred.predictions``; it is compared against ``pred.label_ids``.
    Precision, recall and F1 use the ``"weighted"`` average.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    averaging = {"average": "weighted"}

    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': f1_score(y_true, y_hat, **averaging),
        'precision': precision_score(y_true, y_hat, **averaging),
        'recall': recall_score(y_true, y_hat, **averaging),
    }



# ✅ Training arguments with regularization
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=25,  # Upper bound; the best checkpoint (by F1) is restored at the end
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # Effective train batch size = 32 * 2 = 64 per device
    eval_strategy="epoch",
    save_strategy="epoch",  # kept identical to eval_strategy, as load_best_model_at_end needs
    logging_dir="./logs",
    logging_steps=50,

    # ✅ Learning rate scheduling
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,  # counted in optimizer steps, i.e. after gradient accumulation

    # ✅ Regularization techniques
    weight_decay=0.01,  # Weight decay applied by the optimizer
    adam_epsilon=1e-6,
    max_grad_norm=1.0,  # Gradient clipping
    label_smoothing_factor=0.1,  # Built-in label smoothing

    # ✅ Model selection
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # ✅ Checkpoint retention
    # eval_steps / save_steps were removed: with both strategies set to "epoch"
    # they have no effect and only suggested a step-based schedule.
    save_total_limit=3,

    # ✅ Other settings
    dataloader_num_workers=2,
    remove_unused_columns=False,
    report_to=[]
)

# ✅ Standard trainer with built-in label smoothing
# Validation data drives the per-epoch evaluation; the test split is never
# passed to the Trainer constructor.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator
    # NOTE: early stopping is currently disabled, so every configured epoch runs.
    # To enable it, add a comma after data_collator above and uncomment:
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# ✅ Train (no fp16/bf16 flag is set in training_args, so this is not mixed precision)
print("🚀 Starting training...")
trainer.train()

# ✅ Evaluate on validation set
print("\n📊 Validation Results:")
val_metrics = trainer.evaluate()
for key, value in val_metrics.items():
    print(f"{key}: {value:.4f}")

# ✅ Evaluate on test set
# predict() computes the same metrics as evaluate() but does not append them to
# trainer.state.log_history, so curves plotted from that history are not
# contaminated with test/train points. metric_key_prefix="eval" keeps the
# metric key names ("eval_accuracy", "eval_f1", ...) unchanged.
print("\n📊 Test Results:")
test_metrics = trainer.predict(dataset_splits["test"], metric_key_prefix="eval").metrics
for key, value in test_metrics.items():
    print(f"test_{key}: {value:.4f}")

# ✅ Calculate overfitting metrics (train score minus test score)
train_metrics = trainer.predict(dataset_splits["train"], metric_key_prefix="eval").metrics
overfitting_accuracy = train_metrics['eval_accuracy'] - test_metrics['eval_accuracy']
overfitting_f1 = train_metrics['eval_f1'] - test_metrics['eval_f1']

print(f"\n🎯 Overfitting Analysis:")
print(f"Training Accuracy: {train_metrics['eval_accuracy']:.4f}")
print(f"Test Accuracy: {test_metrics['eval_accuracy']:.4f}")
print(f"Accuracy Gap (overfitting): {overfitting_accuracy:.4f}")
print(f"F1 Gap (overfitting): {overfitting_f1:.4f}")

# ✅ Save the best model (load_best_model_at_end restored it into `model`)
model.save_pretrained("./final_bangla_bert_model")
tokenizer.save_pretrained("./final_bangla_bert_model")
print("\n✅ Best model saved at ./final_bangla_bert_model")

# ✅ Optional: Model inference example
def predict_text(text, model, tokenizer, device):
    """Classify one text or a batch of texts.

    Args:
        text: A single string, or a list of strings to classify together.
        model: Callable returning an object with a ``logits`` attribute when
            invoked with the tokenizer output as keyword arguments.
        tokenizer: Callable producing the model inputs; its result must
            support ``.to(device)`` and ``**`` unpacking.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        ``(predicted_class, probabilities)``. For a single string
        ``predicted_class`` is an ``int``; for a list it is a list of ints,
        one per input. ``probabilities`` is always a NumPy array with one row
        of softmax probabilities per input.
    """
    model.eval()  # switch off dropout for inference
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted = torch.argmax(probabilities, dim=-1)

    probabilities_np = probabilities.cpu().numpy()
    if isinstance(text, str):
        # Single input: keep the original scalar return value
        return predicted.item(), probabilities_np
    # Batch input: .item() would raise on a multi-element tensor
    return predicted.tolist(), probabilities_np

# ✅ Example usage
# The broad except is a deliberate best-effort guard: a failure in this demo
# prediction is reported but does not abort the rest of the cell.
sample_text = "এটি একটি পরীক্ষা বাক্য।"
try:
    pred_class, pred_probs = predict_text(sample_text, model, tokenizer, device)
    print(f"\nSample prediction:")
    print(f"Text: {sample_text}")
    print(f"Predicted class: {pred_class}")
    print(f"Probabilities: {pred_probs}")
except Exception as e:
    print(f"Prediction error: {e}")

Columns after clean & rename: Index(['text', 'label'], dtype='object')
Data shape: (7066, 2)
Label distribution:
label
1    3608
0    3458
Name: count, dtype: int64
                                                text  label
0          সেখানকার জানালা দিয়ে সমুদ্র দেখা যাইতেছিল      0
1                        আমি কিছু দেখিতে পারিতেছি না      0
2  সকলেরই অনাবৃত দেহ সকলের সেই অনাবৃত বক্ষে আরশির...      0
3            মেয়েটি সেদিন ভিক্ষুককে সাহায্য করিয়াছিল      0
4  তুমি প্রশংসা কর না কর বৃদ্ধ বসিয়া তোমায় পুরা...      0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7066 [00:00<?, ? examples/s]

Train size: 4946
Validation size: 1060
Test size: 1060
🚀 Starting training...


  0%|          | 0/1925 [00:00<?, ?it/s]

{'loss': 0.7189, 'grad_norm': 2.8138017654418945, 'learning_rate': 3e-06, 'epoch': 0.65}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.4733998477458954, 'eval_accuracy': 0.8509433962264151, 'eval_f1': 0.8508982454257666, 'eval_precision': 0.8510047605821217, 'eval_recall': 0.8509433962264151, 'eval_runtime': 8.1247, 'eval_samples_per_second': 130.467, 'eval_steps_per_second': 4.185, 'epoch': 0.99}
{'loss': 0.535, 'grad_norm': 3.4255077838897705, 'learning_rate': 6e-06, 'epoch': 1.29}
{'loss': 0.4081, 'grad_norm': 7.16697359085083, 'learning_rate': 9e-06, 'epoch': 1.94}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3474562168121338, 'eval_accuracy': 0.9122641509433962, 'eval_f1': 0.9122578219850621, 'eval_precision': 0.9122661172275058, 'eval_recall': 0.9122641509433962, 'eval_runtime': 7.8355, 'eval_samples_per_second': 135.282, 'eval_steps_per_second': 4.339, 'epoch': 2.0}
{'loss': 0.3602, 'grad_norm': 4.683022975921631, 'learning_rate': 1.2e-05, 'epoch': 2.58}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.32172346115112305, 'eval_accuracy': 0.9245283018867925, 'eval_f1': 0.9244075971760487, 'eval_precision': 0.9259827399112566, 'eval_recall': 0.9245283018867925, 'eval_runtime': 7.9948, 'eval_samples_per_second': 132.587, 'eval_steps_per_second': 4.253, 'epoch': 2.99}
{'loss': 0.3418, 'grad_norm': 4.497650623321533, 'learning_rate': 1.5e-05, 'epoch': 3.23}
{'loss': 0.3177, 'grad_norm': 2.5612897872924805, 'learning_rate': 1.8e-05, 'epoch': 3.87}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.31336912512779236, 'eval_accuracy': 0.939622641509434, 'eval_f1': 0.9395994009089657, 'eval_precision': 0.9398174565119123, 'eval_recall': 0.939622641509434, 'eval_runtime': 7.9076, 'eval_samples_per_second': 134.048, 'eval_steps_per_second': 4.3, 'epoch': 4.0}
{'loss': 0.3062, 'grad_norm': 5.7067484855651855, 'learning_rate': 2.1e-05, 'epoch': 4.52}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.31750261783599854, 'eval_accuracy': 0.9367924528301886, 'eval_f1': 0.9367315219341047, 'eval_precision': 0.9375562605517523, 'eval_recall': 0.9367924528301886, 'eval_runtime': 7.904, 'eval_samples_per_second': 134.109, 'eval_steps_per_second': 4.302, 'epoch': 4.99}
{'loss': 0.2874, 'grad_norm': 6.30911111831665, 'learning_rate': 2.4e-05, 'epoch': 5.16}
{'loss': 0.2858, 'grad_norm': 4.2460222244262695, 'learning_rate': 2.7000000000000002e-05, 'epoch': 5.81}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.31065577268600464, 'eval_accuracy': 0.940566037735849, 'eval_f1': 0.9405011013563359, 'eval_precision': 0.9414873079524106, 'eval_recall': 0.940566037735849, 'eval_runtime': 7.9772, 'eval_samples_per_second': 132.879, 'eval_steps_per_second': 4.262, 'epoch': 6.0}
{'loss': 0.2757, 'grad_norm': 6.265702724456787, 'learning_rate': 3e-05, 'epoch': 6.45}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3196853995323181, 'eval_accuracy': 0.9367924528301886, 'eval_f1': 0.9367937472929474, 'eval_precision': 0.9367967098544635, 'eval_recall': 0.9367924528301886, 'eval_runtime': 7.8186, 'eval_samples_per_second': 135.574, 'eval_steps_per_second': 4.349, 'epoch': 6.99}
{'loss': 0.2595, 'grad_norm': 5.290413856506348, 'learning_rate': 2.9908960159769243e-05, 'epoch': 7.1}
{'loss': 0.2471, 'grad_norm': 11.927033424377441, 'learning_rate': 2.9636945739411533e-05, 'epoch': 7.74}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.31967398524284363, 'eval_accuracy': 0.9433962264150944, 'eval_f1': 0.9433056978820366, 'eval_precision': 0.944945895500684, 'eval_recall': 0.9433962264150944, 'eval_runtime': 8.3189, 'eval_samples_per_second': 127.42, 'eval_steps_per_second': 4.087, 'epoch': 8.0}
{'loss': 0.2522, 'grad_norm': 2.297903060913086, 'learning_rate': 2.9187258625509518e-05, 'epoch': 8.39}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3327191174030304, 'eval_accuracy': 0.9349056603773584, 'eval_f1': 0.9349042120328063, 'eval_precision': 0.9359434664608428, 'eval_recall': 0.9349056603773584, 'eval_runtime': 7.7617, 'eval_samples_per_second': 136.569, 'eval_steps_per_second': 4.381, 'epoch': 8.99}
{'loss': 0.2625, 'grad_norm': 2.0402607917785645, 'learning_rate': 2.8565357410463664e-05, 'epoch': 9.03}
{'loss': 0.2366, 'grad_norm': 13.379804611206055, 'learning_rate': 2.7778791132574908e-05, 'epoch': 9.68}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3165685832500458, 'eval_accuracy': 0.9433962264150944, 'eval_f1': 0.9433744383521554, 'eval_precision': 0.9435949183330448, 'eval_recall': 0.9433962264150944, 'eval_runtime': 7.959, 'eval_samples_per_second': 133.183, 'eval_steps_per_second': 4.272, 'epoch': 10.0}
{'loss': 0.243, 'grad_norm': 10.441143989562988, 'learning_rate': 2.6837107640945904e-05, 'epoch': 10.32}
{'loss': 0.2278, 'grad_norm': 5.561850547790527, 'learning_rate': 2.575173769752677e-05, 'epoch': 10.97}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3327750861644745, 'eval_accuracy': 0.9415094339622642, 'eval_f1': 0.9415000562995878, 'eval_precision': 0.9415510733714365, 'eval_recall': 0.9415094339622642, 'eval_runtime': 7.7384, 'eval_samples_per_second': 136.978, 'eval_steps_per_second': 4.394, 'epoch': 10.99}
{'loss': 0.2212, 'grad_norm': 4.853788375854492, 'learning_rate': 2.4535856223149525e-05, 'epoch': 11.61}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3402288854122162, 'eval_accuracy': 0.940566037735849, 'eval_f1': 0.9404356115103488, 'eval_precision': 0.9428739362516778, 'eval_recall': 0.940566037735849, 'eval_runtime': 7.6409, 'eval_samples_per_second': 138.728, 'eval_steps_per_second': 4.45, 'epoch': 12.0}
{'loss': 0.2293, 'grad_norm': 0.765144407749176, 'learning_rate': 2.3204222371836406e-05, 'epoch': 12.26}
{'loss': 0.224, 'grad_norm': 0.20011042058467865, 'learning_rate': 2.177300037466334e-05, 'epoch': 12.9}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3445494472980499, 'eval_accuracy': 0.9386792452830188, 'eval_f1': 0.9386673923153476, 'eval_precision': 0.9387375057730849, 'eval_recall': 0.9386792452830188, 'eval_runtime': 7.5247, 'eval_samples_per_second': 140.869, 'eval_steps_per_second': 4.518, 'epoch': 12.99}
{'loss': 0.2183, 'grad_norm': 0.73643958568573, 'learning_rate': 2.025956332789132e-05, 'epoch': 13.55}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3305486738681793, 'eval_accuracy': 0.9443396226415094, 'eval_f1': 0.9443455191251242, 'eval_precision': 0.9444337859284077, 'eval_recall': 0.9443396226415094, 'eval_runtime': 7.5164, 'eval_samples_per_second': 141.026, 'eval_steps_per_second': 4.523, 'epoch': 14.0}
{'loss': 0.2193, 'grad_norm': 0.3022664189338684, 'learning_rate': 1.8682282307111988e-05, 'epoch': 14.19}
{'loss': 0.2135, 'grad_norm': 0.21690213680267334, 'learning_rate': 1.7060303367276123e-05, 'epoch': 14.84}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3211290240287781, 'eval_accuracy': 0.9415094339622642, 'eval_f1': 0.9415094339622642, 'eval_precision': 0.9424752360002655, 'eval_recall': 0.9415094339622642, 'eval_runtime': 7.6176, 'eval_samples_per_second': 139.151, 'eval_steps_per_second': 4.463, 'epoch': 14.99}
{'loss': 0.22, 'grad_norm': 1.0165290832519531, 'learning_rate': 1.5413315135522434e-05, 'epoch': 15.48}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.32899004220962524, 'eval_accuracy': 0.9415094339622642, 'eval_f1': 0.9415000562995878, 'eval_precision': 0.9415510733714365, 'eval_recall': 0.9415094339622642, 'eval_runtime': 7.5378, 'eval_samples_per_second': 140.625, 'eval_steps_per_second': 4.511, 'epoch': 16.0}
{'loss': 0.2097, 'grad_norm': 2.428420305252075, 'learning_rate': 1.3761309817915017e-05, 'epoch': 16.13}
{'loss': 0.2082, 'grad_norm': 0.06486884504556656, 'learning_rate': 1.2124340521143929e-05, 'epoch': 16.77}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3280755877494812, 'eval_accuracy': 0.9471698113207547, 'eval_f1': 0.9471283634664617, 'eval_precision': 0.9477636052839573, 'eval_recall': 0.9471698113207547, 'eval_runtime': 7.6149, 'eval_samples_per_second': 139.201, 'eval_steps_per_second': 4.465, 'epoch': 16.99}
{'loss': 0.2107, 'grad_norm': 0.12779253721237183, 'learning_rate': 1.0522277834974586e-05, 'epoch': 17.42}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3247220516204834, 'eval_accuracy': 0.9490566037735849, 'eval_f1': 0.9490449862889477, 'eval_precision': 0.9491416271482864, 'eval_recall': 0.9490566037735849, 'eval_runtime': 7.6191, 'eval_samples_per_second': 139.123, 'eval_steps_per_second': 4.462, 'epoch': 18.0}
{'loss': 0.209, 'grad_norm': 0.24474157392978668, 'learning_rate': 8.974568630205462e-06, 'epoch': 18.06}
{'loss': 0.2058, 'grad_norm': 0.6760526895523071, 'learning_rate': 7.500000000000004e-06, 'epoch': 18.71}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.33308282494544983, 'eval_accuracy': 0.9452830188679245, 'eval_f1': 0.9452705408288697, 'eval_precision': 0.9453658897521272, 'eval_recall': 0.9452830188679245, 'eval_runtime': 7.7781, 'eval_samples_per_second': 136.28, 'eval_steps_per_second': 4.371, 'epoch': 18.99}
{'loss': 0.2053, 'grad_norm': 0.13216090202331543, 'learning_rate': 6.116471210025302e-06, 'epoch': 19.35}
{'loss': 0.2049, 'grad_norm': 0.05667274072766304, 'learning_rate': 4.840776425613887e-06, 'epoch': 20.0}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.33405038714408875, 'eval_accuracy': 0.9471698113207547, 'eval_f1': 0.947153808505335, 'eval_precision': 0.947306669982495, 'eval_recall': 0.9471698113207547, 'eval_runtime': 7.5536, 'eval_samples_per_second': 140.33, 'eval_steps_per_second': 4.501, 'epoch': 20.0}
{'loss': 0.2043, 'grad_norm': 0.8620634078979492, 'learning_rate': 3.688400853346558e-06, 'epoch': 20.65}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3362462520599365, 'eval_accuracy': 0.9433962264150944, 'eval_f1': 0.9433639390576126, 'eval_precision': 0.9437626514298524, 'eval_recall': 0.9433962264150944, 'eval_runtime': 7.714, 'eval_samples_per_second': 137.413, 'eval_steps_per_second': 4.408, 'epoch': 20.99}
{'loss': 0.205, 'grad_norm': 0.0326048843562603, 'learning_rate': 2.673332771621324e-06, 'epoch': 21.29}
{'loss': 0.2047, 'grad_norm': 0.06307166814804077, 'learning_rate': 1.8078937319026655e-06, 'epoch': 21.94}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3365718126296997, 'eval_accuracy': 0.9443396226415094, 'eval_f1': 0.9443288637939308, 'eval_precision': 0.9444005666450573, 'eval_recall': 0.9443396226415094, 'eval_runtime': 7.4782, 'eval_samples_per_second': 141.745, 'eval_steps_per_second': 4.547, 'epoch': 22.0}
{'loss': 0.205, 'grad_norm': 4.0086565017700195, 'learning_rate': 1.1025889917779735e-06, 'epoch': 22.58}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.33672428131103516, 'eval_accuracy': 0.9415094339622642, 'eval_f1': 0.941496095368792, 'eval_precision': 0.941590152355968, 'eval_recall': 0.9415094339622642, 'eval_runtime': 7.6185, 'eval_samples_per_second': 139.135, 'eval_steps_per_second': 4.463, 'epoch': 22.99}
{'loss': 0.2043, 'grad_norm': 0.0563100203871727, 'learning_rate': 5.659799953612438e-07, 'epoch': 23.23}
{'loss': 0.2047, 'grad_norm': 0.06765048950910568, 'learning_rate': 2.0458044895916516e-07, 'epoch': 23.87}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.3357042670249939, 'eval_accuracy': 0.9443396226415094, 'eval_f1': 0.9443157660188449, 'eval_precision': 0.9445765698446439, 'eval_recall': 0.9443396226415094, 'eval_runtime': 7.4954, 'eval_samples_per_second': 141.419, 'eval_steps_per_second': 4.536, 'epoch': 24.0}
{'loss': 0.2031, 'grad_norm': 1.752881646156311, 'learning_rate': 2.2777253500257388e-08, 'epoch': 24.52}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 0.33562329411506653, 'eval_accuracy': 0.9452830188679245, 'eval_f1': 0.9452570782376002, 'eval_precision': 0.9455616638764459, 'eval_recall': 0.9452830188679245, 'eval_runtime': 7.8514, 'eval_samples_per_second': 135.008, 'eval_steps_per_second': 4.33, 'epoch': 24.84}
{'train_runtime': 582.2508, 'train_samples_per_second': 212.366, 'train_steps_per_second': 3.306, 'train_loss': 0.2622764894559786, 'epoch': 24.84}

📊 Validation Results:


  0%|          | 0/34 [00:00<?, ?it/s]

eval_loss: 0.3247
eval_accuracy: 0.9491
eval_f1: 0.9490
eval_precision: 0.9491
eval_recall: 0.9491
eval_runtime: 7.4967
eval_samples_per_second: 141.3960
eval_steps_per_second: 4.5350
epoch: 24.8387

📊 Test Results:


  0%|          | 0/34 [00:00<?, ?it/s]

test_eval_loss: 0.3210
test_eval_accuracy: 0.9509
test_eval_f1: 0.9509
test_eval_precision: 0.9511
test_eval_recall: 0.9509
test_eval_runtime: 7.6044
test_eval_samples_per_second: 139.3940
test_eval_steps_per_second: 4.4710
test_epoch: 24.8387


  0%|          | 0/155 [00:00<?, ?it/s]


🎯 Overfitting Analysis:
Training Accuracy: 0.9992
Test Accuracy: 0.9509
Accuracy Gap (overfitting): 0.0482
F1 Gap (overfitting): 0.0483

✅ Best model saved at ./final_bangla_bert_model

Sample prediction:
Text: এটি একটি পরীক্ষা বাক্য।
Predicted class: 1
Probabilities: [[0.04099333 0.95900667]]


In [31]:
from sklearn.metrics import classification_report

# ✅ Evaluate on validation set
# predict() already returns the metrics next to the raw predictions, so a single
# inference pass per split is enough. A separate evaluate() call would repeat
# the pass and append one more "eval_*" entry to trainer.state.log_history.
# metric_key_prefix="eval" keeps the metric key names unchanged.
print("\n📊 Validation Results:")
val_preds_output = trainer.predict(dataset_splits["validation"], metric_key_prefix="eval")
val_labels = val_preds_output.label_ids
val_preds = val_preds_output.predictions.argmax(-1)

val_metrics = val_preds_output.metrics
for key, value in val_metrics.items():
    print(f"{key}: {value:.4f}")

# ✅ Classification report on validation
print("\n🔎 Classification Report (Validation):")
print(classification_report(val_labels, val_preds, digits=4))

# ✅ Evaluate on test set
print("\n📊 Test Results:")
test_preds_output = trainer.predict(dataset_splits["test"], metric_key_prefix="eval")
test_labels = test_preds_output.label_ids
test_preds = test_preds_output.predictions.argmax(-1)

test_metrics = test_preds_output.metrics
for key, value in test_metrics.items():
    print(f"test_{key}: {value:.4f}")

# ✅ Classification report on test
print("\n🔎 Classification Report (Test):")
print(classification_report(test_labels, test_preds, digits=4))



📊 Validation Results:


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

eval_loss: 0.3247
eval_accuracy: 0.9491
eval_f1: 0.9490
eval_precision: 0.9491
eval_recall: 0.9491
eval_runtime: 7.3961
eval_samples_per_second: 143.3190
eval_steps_per_second: 4.5970
epoch: 24.8387

🔎 Classification Report (Validation):
              precision    recall  f1-score   support

           0     0.9549    0.9402    0.9475       518
           1     0.9436    0.9576    0.9505       542

    accuracy                         0.9491      1060
   macro avg     0.9493    0.9489    0.9490      1060
weighted avg     0.9491    0.9491    0.9490      1060


📊 Test Results:


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

test_eval_loss: 0.3210
test_eval_accuracy: 0.9509
test_eval_f1: 0.9509
test_eval_precision: 0.9511
test_eval_recall: 0.9509
test_eval_runtime: 7.5675
test_eval_samples_per_second: 140.0720
test_eval_steps_per_second: 4.4930
test_epoch: 24.8387

🔎 Classification Report (Test):
              precision    recall  f1-score   support

           0     0.9587    0.9403    0.9494       519
           1     0.9437    0.9612    0.9524       541

    accuracy                         0.9509      1060
   macro avg     0.9512    0.9507    0.9509      1060
weighted avg     0.9511    0.9509    0.9509      1060



In [None]:
df.shape

In [None]:

import matplotlib.pyplot as plt
import torch
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Get predictions on the test set
predictions = trainer.predict(dataset_splits["test"])
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(-1)

# Plot confusion matrix
# labels=[0, 1] pins the row/column order so the tick labels always line up;
# in the source spreadsheet label 0 is Sadhu and label 1 is Cholito.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Sadhu (0)", "Cholito (1)"])
disp.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix (Test Set)")
plt.show()


In [None]:
import pandas as pd

# Load logs from Trainer
log_history = trainer.state.log_history
log_df = pd.DataFrame(log_history)

# Rows carrying a training loss are the periodic logging entries
train_logs = log_df[log_df["loss"].notna()]
# Every trainer.evaluate() call appends an "eval_loss" row, including the calls
# made after training (on validation, test and train data), which all land on
# the final step. Keeping the first row per step retains only the evaluation
# performed during training, so the curve shows validation data only.
eval_logs = log_df[log_df["eval_loss"].notna()].drop_duplicates(subset="step", keep="first")

# Plot Loss
plt.plot(train_logs["step"], train_logs["loss"], label="Train Loss")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.legend()
plt.title("Training Loss")
plt.grid(True)
plt.show()

# Plot Accuracy
if "eval_accuracy" in eval_logs.columns:
    plt.plot(eval_logs["step"], eval_logs["eval_accuracy"], label="Eval Accuracy")
    plt.xlabel("Step")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy")
    plt.grid(True)
    plt.legend()
    plt.show()


In [None]:
import pandas as pd

# Load logs from Trainer
log_history = trainer.state.log_history
log_df = pd.DataFrame(log_history)

# Rows carrying a training loss are the periodic logging entries
train_logs = log_df[log_df["loss"].notna()]
# Every trainer.evaluate() call appends an "eval_loss" row, including the calls
# made after training (on validation, test and train data), which all land on
# the final step. Keeping the first row per step retains only the evaluation
# performed during training, so the curve shows validation data only.
eval_logs = log_df[log_df["eval_loss"].notna()].drop_duplicates(subset="step", keep="first")

# Plot Loss
plt.plot(eval_logs["step"], eval_logs["eval_loss"], label="Eval Loss")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.legend()
plt.title("Validation Loss")
plt.grid(True)
plt.show()

# Plot Accuracy
if "eval_accuracy" in eval_logs.columns:
    plt.plot(eval_logs["step"], eval_logs["eval_accuracy"], label="Eval Accuracy")
    plt.xlabel("Step")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy")
    plt.grid(True)
    plt.legend()
    plt.show()


In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# `probs` holds the raw model outputs from trainer.predict(); softmax over the
# class axis turns them into per-class probabilities
probs = predictions.predictions
probs_softmax = torch.softmax(torch.tensor(probs), dim=1).numpy()

# ROC Curve (for binary classification only)
# The probability of class 1 is the score used for both the AUC and the curve
positive_scores = probs_softmax[:, 1]
auc_score = roc_auc_score(y_true, positive_scores)
fpr, tpr, thresholds = roc_curve(y_true, positive_scores)

plt.plot(fpr, tpr, label=f"AUC = {auc_score:.4f}")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")  # chance diagonal
plt.title("AUC-ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision-recall curve for class 1, computed from the class-1 softmax
# probabilities (`probs_softmax`) and true labels (`y_true`) of earlier cells
precision, recall, _ = precision_recall_curve(y_true, probs_softmax[:, 1])
avg_precision = average_precision_score(y_true, probs_softmax[:, 1])

plt.plot(recall, precision, label=f"AP = {avg_precision:.4f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
import numpy as np
import datasets.formatting.formatting
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------------------------------
# ✅ Monkey patch for NumPy 2.0 error in datasets
def fixed_arrow_array_to_numpy(self, pa_array):
    """Replacement body for ``NumpyArrowExtractor._arrow_array_to_numpy``.

    Converts ``pa_array`` to pandas, takes its ``.values`` and returns an
    independent NumPy copy of them. ``self`` is accepted only because the
    function is installed as a method; it is not used.
    """
    return np.array(pa_array.to_pandas().values, copy=True)

# Install the patched extractor method defined above
datasets.formatting.formatting.NumpyArrowExtractor._arrow_array_to_numpy = fixed_arrow_array_to_numpy
# -----------------------------------------------------

# ✅ Disable wandb
os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ✅ Load dataframe
df = pd.read_excel(r"BanglaBlendCleanedDataWithEnglishTranslation.xlsx")

# ✅ Keep only needed columns, then drop incomplete rows
# Selecting the columns first means dropna() only discards rows whose sentence
# or label is missing, not rows that merely lack a value in an unused column
# (e.g. the English translation).
df = df[["Sentence", "Labels"]].dropna()
df = df.rename(columns={"Sentence": "text", "Labels": "label"})
df["label"] = df["label"].astype(int)

# ✅ Basic text preprocessing
def preprocess_text(text):
    """Normalize one raw cell value into a clean single-line string.

    Missing values (as judged by ``pd.isna``) become ``""``. Anything else is
    cast to ``str``, trimmed, and has every run of whitespace squeezed into a
    single space.
    """
    if not pd.isna(text):
        # str.split() without a separator splits on any run of whitespace
        pieces = str(text).strip().split()
        return ' '.join(pieces)
    return ""

# Normalize whitespace in every sentence
df["text"] = df["text"].apply(preprocess_text)

# ✅ Remove empty texts and duplicates
# Duplicates are judged on "text" alone, so a sentence that occurs with two
# different labels keeps only its first occurrence (pandas default keep="first").
df = df[df["text"].str.len() > 0]
df = df.drop_duplicates(subset=["text"])

# Quick sanity report of what is left after cleaning
print("Columns after clean & rename:", df.columns)
print("Data shape:", df.shape)
print("Label distribution:")
print(df["label"].value_counts())
print(df.head())

# ✅ Load tokenizer and model with built-in regularization
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use standard model with built-in dropout configuration
# NOTE(review): the dropout values here (0.5 / 0.3 / 0.1) differ from the
# earlier training cell (0.3 / 0.3 / 0.3); confirm which setting is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2,
    hidden_dropout_prob=0.5,  # Increase dropout for regularization
    attention_probs_dropout_prob=0.3,  # Attention dropout
    classifier_dropout=0.1  # Classifier dropout
).to(device)

# ✅ Advanced tokenization with dynamic padding
def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the module-level tokenizer.

    Sequences are truncated to at most 256 tokens and are not padded here;
    padding is left to the data collator used at training time.

    Args:
        batch: Mapping with a ``"text"`` entry.

    Returns:
        Whatever the tokenizer returns for those texts (``return_tensors=None``).
    """
    encode_options = {
        "padding": False,
        "truncation": True,
        "max_length": 256,
        "return_tensors": None,
    }
    return tokenizer(batch["text"], **encode_options)

# ✅ Create dataset with stratified split
# preserve_index=False: df has been filtered (dropna / empty-text removal /
# drop_duplicates), so its index is no longer a plain RangeIndex. Without
# this flag from_pandas stores that index as an extra "__index_level_0__"
# column that would travel with every example.
dataset = Dataset.from_pandas(df, preserve_index=False)
# Tokenize in batches and drop the raw text once it has been encoded.
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# ✅ Stratified train-validation-test split
def create_stratified_splits(dataset, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """Split ``dataset`` into label-stratified train / validation / test subsets.

    Two ``StratifiedShuffleSplit`` passes are used, both stratified on the
    ``"label"`` column: the first separates the training part from a held-out
    part of size ``val_size + test_size``; the second divides the held-out
    part into validation and test.

    Args:
        dataset: Object supporting ``dataset["label"]`` and
            ``dataset.select(indices)``.
        train_size: Intended training fraction. It does not drive the split
            (the training part is whatever remains after the hold-out), so a
            warning is emitted when it disagrees with that remainder.
        val_size: Fraction of the data for validation.
        test_size: Fraction of the data for testing.
        random_state: Seed passed to both shuffles.

    Returns:
        dict with keys ``'train'``, ``'validation'`` and ``'test'`` mapping to
        the selected subsets.
    """
    import math
    import warnings

    holdout_size = val_size + test_size
    if not math.isclose(train_size + holdout_size, 1.0, abs_tol=1e-9):
        warnings.warn(
            f"train_size={train_size} is not used directly; the training split is "
            f"the remainder after holding out val_size + test_size = {holdout_size}.",
            stacklevel=2,
        )

    # Read the label column in one go; iterating the dataset row by row would
    # materialise every other column of every example just to get the label.
    labels = list(dataset["label"])

    # First split: train vs. held-out (validation + test)
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=holdout_size, random_state=random_state)
    train_idx, temp_idx = next(sss1.split(range(len(labels)), labels))

    # Second split: validation vs. test, expressed as positions within temp_idx
    temp_labels = [labels[i] for i in temp_idx]
    val_ratio = val_size / holdout_size
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=(1 - val_ratio), random_state=random_state)
    val_pos, test_pos = next(sss2.split(range(len(temp_idx)), temp_labels))

    # Map positions inside the held-out part back to indices of the full dataset
    return {
        'train': dataset.select(train_idx),
        'validation': dataset.select(temp_idx[val_pos]),
        'test': dataset.select(temp_idx[test_pos]),
    }

dataset_splits = create_stratified_splits(dataset)

# ✅ Expose only the model inputs and the label, returned as torch tensors
for split, split_dataset in dataset_splits.items():
    split_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

for split_title, split_key in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{split_title} size: {len(dataset_splits[split_key])}")

# ✅ Collator that pads each batch at collation time (tokenize() leaves
#    sequences unpadded)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ Enhanced metrics function
def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted F1 / precision / recall.

    Args:
        pred: Object with ``label_ids`` (true labels) and ``predictions``
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': f1_score(y_true, y_hat, average="weighted"),
        'precision': precision_score(y_true, y_hat, average="weighted"),
        'recall': recall_score(y_true, y_hat, average="weighted"),
    }



# ✅ Enhanced training arguments with regularization
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,  # Upper bound; the trainer's EarlyStoppingCallback may stop sooner
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # Effective train batch size per device = 16 * 2 = 32
    # Evaluate and checkpoint once per epoch. The two strategies are kept
    # identical because load_best_model_at_end is enabled below.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,

    # ✅ Learning rate scheduling
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,

    # ✅ Regularization techniques
    weight_decay=0.1,  # Weight decay applied by the optimizer
    adam_epsilon=1e-6,
    max_grad_norm=1.0,  # Gradient clipping
    label_smoothing_factor=0.1,  # Built-in label smoothing

    # ✅ Model selection: reload the checkpoint with the highest "f1"
    #    (the key returned by compute_metrics) when training ends
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # ✅ Checkpoint retention
    # NOTE: step-based eval_steps / save_steps are intentionally not set;
    # with the "epoch" strategies above they would have no effect.
    save_total_limit=3,

    # ✅ Other settings
    dataloader_num_workers=2,
    remove_unused_columns=False,
    report_to=[]
)

# ✅ Custom Trainer to track training history
class CustomTrainer(Trainer):
    """Trainer that keeps an in-memory history of losses and validation accuracy.

    History attributes (all plain lists unless noted):
        train_losses: ``"loss"`` values from the periodic training logs.
        val_losses: ``"eval_loss"`` of each evaluation run during training.
        val_accuracies: ``"eval_accuracy"`` of those evaluations (0 if absent).
        epochs: 1-based ordinal of each recorded evaluation.
        final_train_loss: ``"train_loss"`` from the end-of-training summary
            log, or ``None`` if no such log has been seen.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.train_losses = []
        self.val_losses = []
        self.val_accuracies = []
        self.epochs = []
        self.final_train_loss = None

    def log(self, logs, *args, **kwargs):
        """Forward to ``Trainer.log`` and record loss / accuracy history.

        Extra positional and keyword arguments are passed through unchanged so
        the override keeps working with transformers versions whose trainer
        calls ``log`` with more than the ``logs`` dict (e.g. ``start_time``).

        Args:
            logs: Dict of values being logged.
        """
        super().log(logs, *args, **kwargs)

        # The run-level average is reported under "train_loss" in the summary
        # log; keep it separately from the per-log-step curve.
        if "train_loss" in logs:
            self.final_train_loss = logs["train_loss"]

        # Only build the history while train() is running. Stand-alone
        # evaluate() calls made afterwards (validation / test / train sets)
        # also log "eval_loss" and would otherwise be appended as if they were
        # further validation epochs.
        # NOTE(review): relies on Trainer's `is_in_train` flag; if the flag is
        # missing everything is recorded, as before.
        if not getattr(self, "is_in_train", True):
            return

        # Periodic training logs carry the running loss under "loss".
        if "loss" in logs:
            self.train_losses.append(logs["loss"])

        # Track validation metrics
        if "eval_loss" in logs:
            self.val_losses.append(logs["eval_loss"])
            self.val_accuracies.append(logs.get("eval_accuracy", 0))
            self.epochs.append(len(self.val_losses))

# ✅ Standard trainer with built-in label smoothing
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 3
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# ✅ Fine-tune (no fp16/bf16 option is set in training_args)
print("🚀 Starting training...")
trainer.train()

# ✅ Metrics on the validation split (the trainer's default eval_dataset)
print("\n📊 Validation Results:")
val_metrics = trainer.evaluate()
for metric_name in val_metrics:
    print(f"{metric_name}: {val_metrics[metric_name]:.4f}")

# ✅ Metrics on the held-out test split
print("\n📊 Test Results:")
test_metrics = trainer.evaluate(dataset_splits["test"])
for metric_name in test_metrics:
    print(f"test_{metric_name}: {test_metrics[metric_name]:.4f}")

# ✅ Train-vs-test gap as a rough overfitting indicator
train_metrics = trainer.evaluate(dataset_splits["train"])
train_accuracy = train_metrics['eval_accuracy']
test_accuracy = test_metrics['eval_accuracy']
overfitting_accuracy = train_accuracy - test_accuracy
overfitting_f1 = train_metrics['eval_f1'] - test_metrics['eval_f1']

print("\n🎯 Overfitting Analysis:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Accuracy Gap (overfitting): {overfitting_accuracy:.4f}")
print(f"F1 Gap (overfitting): {overfitting_f1:.4f}")

# ✅ Write the model and the tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./final_bangla_bert_model")
print("\n✅ Best model saved at ./final_bangla_bert_model")

# ✅ Generate predictions for confusion matrix
def get_predictions(trainer, dataset):
    """Run ``trainer.predict`` on ``dataset`` and return ``(y_true, y_pred)``.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes
            ``predictions`` and ``label_ids``.
        dataset: Passed straight to ``trainer.predict``.

    Returns:
        Tuple ``(y_true, y_pred)``: the returned ``label_ids`` and the argmax
        of ``predictions`` over its last axis.
    """
    output = trainer.predict(dataset)
    return output.label_ids, output.predictions.argmax(-1)

# ✅ Create confusion matrix
def plot_confusion_matrix(y_true, y_pred, class_names=None, title="Confusion Matrix"):
    """Compute a confusion matrix, show it as an annotated heatmap, and return it.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_names: Optional tick labels for both axes. When omitted,
            seaborn's automatic tick labelling is used.
        title: Figure title.

    Returns:
        The matrix produced by ``confusion_matrix(y_true, y_pred)``.
    """
    cm = confusion_matrix(y_true, y_pred)

    # seaborn.heatmap documents "auto", bool, list-like or int for its tick
    # label arguments; None is not one of them, so translate the "no names
    # given" default into seaborn's own default value.
    tick_labels = "auto" if class_names is None else class_names

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    plt.show()

    return cm

# ✅ Plot training history
def plot_training_history(trainer):
    """Draw loss curves and validation accuracy from a trainer's recorded history.

    Left panel: validation loss per recorded evaluation and, if any were
    recorded, the training losses spread evenly over the same x-range.
    Right panel: validation accuracy per recorded evaluation.

    Args:
        trainer: Object with ``val_losses``, ``val_accuracies`` and
            ``train_losses`` sequences.
    """
    n_evals = len(trainer.val_losses)
    eval_axis = range(1, n_evals + 1)

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Loss panel
    loss_ax.plot(eval_axis, trainer.val_losses, 'b-', label='Validation Loss', linewidth=2)
    if trainer.train_losses:
        # Training losses are not recorded per evaluation, so their x positions
        # are interpolated evenly between 1 and the number of evaluations.
        train_axis = np.linspace(1, n_evals, len(trainer.train_losses))
        loss_ax.plot(train_axis, trainer.train_losses, 'r-', label='Training Loss', linewidth=2)

    # Accuracy panel
    acc_ax.plot(eval_axis, trainer.val_accuracies, 'g-', label='Validation Accuracy', linewidth=2)

    # Shared cosmetics for both panels
    panel_specs = (
        (loss_ax, 'Training & Validation Loss', 'Loss'),
        (acc_ax, 'Validation Accuracy', 'Accuracy'),
    )
    for panel, heading, y_label in panel_specs:
        panel.set_title(heading)
        panel.set_xlabel('Epoch')
        panel.set_ylabel(y_label)
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# ✅ Generate confusion matrices for validation and test sets
print("\n📊 Generating Confusion Matrices...")

# Display names derived from the distinct label values, in sorted order
class_names = ["Class {}".format(label_value) for label_value in sorted(df['label'].unique())]

# Validation set confusion matrix
val_true, val_pred = get_predictions(trainer, dataset_splits["validation"])
print("\nValidation Confusion Matrix:")
val_cm = plot_confusion_matrix(
    val_true, val_pred, class_names=class_names, title="Validation Set Confusion Matrix"
)
print(val_cm)

# Test set confusion matrix
test_true, test_pred = get_predictions(trainer, dataset_splits["test"])
print("\nTest Confusion Matrix:")
test_cm = plot_confusion_matrix(
    test_true, test_pred, class_names=class_names, title="Test Set Confusion Matrix"
)
print(test_cm)

# ✅ Loss / accuracy curves recorded by the trainer
print("\n📈 Training History:")
plot_training_history(trainer)

# ✅ Per-class precision / recall / F1, test set first and then validation
from sklearn.metrics import classification_report

for split_title, truth, guess in (
    ("Test", test_true, test_pred),
    ("Validation", val_true, val_pred),
):
    print(f"\n📊 Detailed Classification Report ({split_title} Set):")
    print(classification_report(truth, guess, target_names=class_names))

# ✅ Optional: Model inference example
def predict_text(text, model, tokenizer, device):
    """Classify one piece of text and return the class index with its probabilities.

    The model is put into eval mode, the text is tokenized (truncated to 256
    tokens) and moved to ``device``, and the logits are turned into
    probabilities with a softmax over the last axis.

    Args:
        text: Input for the tokenizer. The result is reduced with ``.item()``,
            so it must yield exactly one prediction.
        model: Callable model whose output exposes ``logits``.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``.
        device: Device the encoded inputs are moved to.

    Returns:
        Tuple ``(predicted_class, probabilities)`` where ``predicted_class`` is
        a Python scalar and ``probabilities`` is a NumPy array.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.softmax(logits, dim=-1)
        top_class = probabilities.argmax(dim=-1)

    return top_class.item(), probabilities.cpu().numpy()

# ✅ Example usage
sample_text = "এটি একটি পরীক্ষা বাক্য।"
try:
    pred_class, pred_probs = predict_text(sample_text, model, tokenizer, device)
    # Emit the report one line per print call
    report_lines = (
        "\nSample prediction:",
        f"Text: {sample_text}",
        f"Predicted class: {pred_class}",
        f"Probabilities: {pred_probs}",
    )
    for report_line in report_lines:
        print(report_line)
except Exception as e:
    # Best-effort demo: report the failure instead of stopping the cell
    print(f"Prediction error: {e}")