In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
import os
import shutil
import transformers
from packaging import version

# Configuration
BANGLABERT_NAME = "csebuetnlp/banglabert"
XLM_ROBERTA_NAME = "xlm-roberta-base"
MAX_LEN = 256 # Restored for better accuracy
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-5

class MultiModelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer1, tokenizer2, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer1 = tokenizer1
        self.tokenizer2 = tokenizer2
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding1 = self.tokenizer1.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        encoding2 = self.tokenizer2.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids1': encoding1['input_ids'].flatten(),
            'attention_mask1': encoding1['attention_mask'].flatten(),
            'input_ids2': encoding2['input_ids'].flatten(),
            'attention_mask2': encoding2['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class FeatureEnsembleModel(nn.Module):
    def __init__(self, model1_name, model2_name, num_labels, class_weights=None):
        super(FeatureEnsembleModel, self).__init__()
        self.model1 = AutoModel.from_pretrained(model1_name)
        self.model2 = AutoModel.from_pretrained(model2_name)
        
        hidden_size1 = self.model1.config.hidden_size
        hidden_size2 = self.model2.config.hidden_size
        
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size1 + hidden_size2, num_labels)
        self.num_labels = num_labels
        self.class_weights = class_weights

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2, labels=None):
        outputs1 = self.model1(input_ids=input_ids1, attention_mask=attention_mask1)
        features1 = outputs1.last_hidden_state[:, 0, :] 
        
        outputs2 = self.model2(input_ids=input_ids2, attention_mask=attention_mask2)
        features2 = outputs2.last_hidden_state[:, 0, :]
        
        combined_features = torch.cat((features1, features2), dim=1)
        combined_features = self.dropout(combined_features)
        
        logits = self.classifier(combined_features)
        
        loss = None
        if labels is not None:
            if self.class_weights is not None:
                loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
            else:
                loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        return (loss, logits) if loss is not None else logits

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='macro')
    precision = precision_score(labels, predictions, average='macro', zero_division=0)
    recall = recall_score(labels, predictions, average='macro', zero_division=0)
    return {
        'accuracy': acc,
        'f1_macro': f1,
        'precision_macro': precision,
        'recall_macro': recall
    }

def main():
    # --- DATA PATHS ---
    train_path = '/kaggle/input/violence-dataset-2-0/train - Sheet1.csv'
    val_path = '/kaggle/input/violence-dataset-2-0/Validation - Sheet1.csv' 
    test_path = '/kaggle/input/violence-dataset-2-0/test.csv'
    
    # Load data
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)
    
    print(f"Loaded {len(train_df)} train, {len(val_df)} validation, and {len(test_df)} test samples.")

    num_labels = 3 
    
    # Compute class weights to handle imbalance
    weights = compute_class_weight('balanced', classes=np.unique(train_df['label']), y=train_df['label'])
    class_weights = torch.tensor(weights, dtype=torch.float)

    tokenizer1 = AutoTokenizer.from_pretrained(BANGLABERT_NAME)
    tokenizer2 = AutoTokenizer.from_pretrained(XLM_ROBERTA_NAME)

    train_dataset = MultiModelDataset(train_df.text.to_numpy(), train_df.label.to_numpy(), tokenizer1, tokenizer2, MAX_LEN)
    val_dataset = MultiModelDataset(val_df.text.to_numpy(), val_df.label.to_numpy(), tokenizer1, tokenizer2, MAX_LEN)
    test_dataset = MultiModelDataset(test_df.text.to_numpy(), test_df.label.to_numpy(), tokenizer1, tokenizer2, MAX_LEN)

    model = FeatureEnsembleModel(BANGLABERT_NAME, XLM_ROBERTA_NAME, num_labels, class_weights=class_weights)

    # Handle version compatibility
    eval_strat_key = "eval_strategy" if version.parse(transformers.__version__) >= version.parse("4.41.0") else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir='./ensemble_results',
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        **{eval_strat_key: "epoch"},
        save_strategy="no",
        report_to="none"
    )

    class EnsembleTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.get("labels")
            outputs = model(
                input_ids1=inputs.get("input_ids1"),
                attention_mask1=inputs.get("attention_mask1"),
                input_ids2=inputs.get("input_ids2"),
                attention_mask2=inputs.get("attention_mask2"),
                labels=labels
            )
            loss = outputs[0] if isinstance(outputs, tuple) else outputs
            return (loss, outputs) if return_outputs else loss

    trainer = EnsembleTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics)

    print("Starting training...")
    trainer.train()

    print("Evaluating on test set...")
    test_results = trainer.evaluate(test_dataset)
    print(f"Test Results: {test_results}")

    # --- DISK CLEANUP BEFORE SAVE ---
    print("Cleaning up temporary files to free disk space...")
    if os.path.exists('./ensemble_results'):
        shutil.rmtree('./ensemble_results')
    if os.path.exists('./logs'):
        shutil.rmtree('./logs')

    # Save ONLY the final model weights
    print("Saving final model weights...")
    model.to('cpu') # Move to CPU to avoid memory spikes
    torch.save(model.state_dict(), "feature_ensemble_model.pt")
    print("Success! Model saved to feature_ensemble_model.pt")

if __name__ == "__main__":
    main()

2025-12-30 00:59:03.530628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767056343.712459      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767056343.763559      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loaded 8339 train, 1790 validation, and 1790 test samples.


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.5202,1.150099,0.611732,0.613348,0.663882,0.62782
2,0.4088,1.42534,0.637989,0.637718,0.670373,0.649994
3,0.0873,1.594532,0.672067,0.672377,0.688309,0.680799
4,0.0546,2.101224,0.663687,0.661967,0.685702,0.671585
5,0.0028,2.346117,0.660894,0.658744,0.687296,0.669825


Evaluating on test set...


Test Results: {'eval_loss': 1.3967911005020142, 'eval_accuracy': 0.8575418994413407, 'eval_f1_macro': 0.8404588055584856, 'eval_precision_macro': 0.8793587256501164, 'eval_recall_macro': 0.82668849754921, 'eval_runtime': 26.8717, 'eval_samples_per_second': 66.613, 'eval_steps_per_second': 4.168, 'epoch': 5.0}
Cleaning up temporary files to free disk space...
Saving final model weights...
Success! Model saved to feature_ensemble_model.pt
