In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, Adafactor
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
import math, string

from huggingface_hub import login
login(token="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")

# Define if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# lstm_hidden_size: int = 128, lstm_layers: int = 1, dropout_prob: float = 0.3, lr=2e-5
class BilstmXLMRobertaClassifier(nn.Module):

    def __init__(self,
                 xlm_model_name: str,
                 num_labels: int,
                 lstm_hidden_size: int = 512,
                 lstm_layers: int = 3,
                 dropout_prob: float = 0.3):
        super(BilstmXLMRobertaClassifier, self).__init__()

        # Load XLM-RoBERTa model
        self.roberta = AutoModel.from_pretrained(xlm_model_name, cache_dir="/home/rohit/expt/dp_expt/codalab/NAACL-2025/20698/xlm_roberta/")
        #self.roberta = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Unfreeze layers if necessary
        for param in self.roberta.parameters():
            param.requires_grad = True

        # BiLSTM layer with multiple layers
        self.bilstm = nn.LSTM(input_size=768,  # Embeddings from XLM-RoBERTa
                              hidden_size=lstm_hidden_size,
                              num_layers=lstm_layers,
                              bidirectional=True,
                              batch_first=True)

        # Initialize LSTM weights
        for name, param in self.bilstm.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)  # Xavier uniform initialization for input-hidden weights
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)  # Orthogonal initialization for hidden-hidden weights
            elif 'bias' in name:
                torch.nn.init.zeros_(param)  # Initialize biases to zeros

        # Attention mechanism after BiLSTM
        self.attention = nn.Linear(lstm_hidden_size * 2, 1)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_prob)

        # Layer normalization
        self.layer_norm = nn.LayerNorm(lstm_hidden_size * 2)

        # Classification layer
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)  # Multiply by 2 for BiLSTM

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                labels: torch.Tensor = None):

        # Get embeddings from XLM-RoBERTa model
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        #outputs = self.roberta.roberta(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # Get the hidden states (XLM-RoBERTa output)


        # BiLSTM layer
        lstm_output, _ = self.bilstm(embeddings)

        # Attention mechanism
        attention_weights = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(attention_weights, dim=1)
        lstm_output = torch.sum(lstm_output * attention_weights, dim=1)

        # Normalize LSTM output
        lstm_output = self.layer_norm(lstm_output)

        # Dropout
        lstm_output = self.dropout(lstm_output)

        # Classification layer
        logits = self.classifier(lstm_output)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return (loss, logits) if loss is not None else logits


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bytesizedllm/MalayalamXLM_Roberta", cache_dir="/home/rohit/expt/dp_expt/codalab/NAACL-2025/20698/xlm_roberta/")

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))


def remove_html_tags(text):
    return re.sub(r'<.*?>', ' ', text)

punctuation_list = [r'\.', r',', r'\?', r'!', r':', r';', r'"', r'\-', r'–', r'\(', r'\)', r'\[', r'\]', r'\{', r'\}', r'\.\.\.', r'\/', r'\\', r'@', r'&', r'\*', r'#', r'%', r'_', r'~', r'`', r'\^', r'\|', r'=', r'<', r'>', r'\+']
# Create the regex pattern to match any of the punctuation marks
punctuation_pattern = r'(' + '|'.join(punctuation_list) + r')'


# Data preparation function
def load_and_clean_data(trainpath):
    traindf = pd.read_csv(trainpath)
    train_sents = []
    train_labels = []
    labels = list(set([l.strip() for l in traindf["LABEL"]]))
    label2id = {labels[i]:i for i in range(len(labels))}
    for text1, label in zip( traindf["DATA"], traindf["LABEL"]):
        label = label.strip()
        train_sents.append(text1.strip())
        train_labels.append(label2id[label])  # Label as int for multi-class


    return train_sents, train_labels, label2id

# Load and clean data (assumed pre-written)
train_texts, train_labels, label2id = load_and_clean_data("mal_training_data_hum_ai.csv")
    
    
from sklearn.model_selection import train_test_split
train_texts_split, val_texts_split, train_labels_split, val_labels_split = train_test_split(
    train_texts, 
    train_labels, 
    test_size=0.1,  # Specify the percentage for validation (e.g., 20%)
    stratify=train_labels,  # Ensure the split is stratified
    random_state=42  # Set a seed for reproducibility
)


print(len(train_texts), len(train_texts_split),len(train_labels_split),len(val_texts_split), len(val_labels_split))

# Dataset class for PyTorch DataLoader
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize and encode the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

from sklearn.model_selection import train_test_split

train_dataset = HateSpeechDataset(train_texts_split, train_labels_split, tokenizer)
val_dataset = HateSpeechDataset(val_texts_split, val_labels_split, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


# Initialize model
model_name = "bytesizedllm/MalayalamXLM_Roberta"
num_labels = len(list(set(train_labels)))
print("No. of labels: ", num_labels, label2id)
model = BilstmXLMRobertaClassifier(model_name, num_labels).to(device)

# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2.5e-5, weight_decay=0.01)

num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

from torch.nn.utils import clip_grad_norm_
# Training and Evaluation Functions
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    loop = tqdm(dataloader, leave=True, desc="Training")
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        loss, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss.backward()

        optimizer.step()

        scheduler.step()  # Move scheduler step inside the loop

        # Gradient clipping
        clip_grad_norm_(model.parameters(), max_norm=1.0)

        total_loss += loss.item()
        loop.set_description(f"Training Batch Loss: {loss.item():.4f}")

    return total_loss / len(dataloader)


def eval_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    loop = tqdm(dataloader, leave=True, desc="Evaluating")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels)

    return accuracy_score(true_labels, predictions), true_labels, predictions


# # Main Training Loop
best_macro_f1 = 0.0
best_model_path = "mal_best_model.pth"

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, device)
    accuracy, true_labels, predictions = eval_model(model, val_loader, device)

    report = classification_report(true_labels, predictions)
    report1 = classification_report(true_labels, predictions, output_dict=True)
    macro_f1 = report1['macro avg']['f1-score']

    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Macro F1-Score: {macro_f1:.4f}")
    print("Classification Report:\n", report)

    # Save best model
    if macro_f1 >= best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(model.state_dict(), best_model_path)
        print(f"New best Macro F1-Score: {best_macro_f1:.4f}. Saving model...")

print(f"Best Macro F1-Score achieved: {best_macro_f1:.4f}")
# torch.save(model.state_dict(), "mal_7th_epoch.pth")



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rohit/.cache/huggingface/token
Login successful
Using device: cuda




800 720 720 80 80
No. of labels:  2 {'HUMAN': 0, 'AI': 1}


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of XLMRobertaModel were not initialized from the model checkpoint at bytesizedllm/MalayalamXLM_Roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


Training Batch Loss: 0.1775: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:18<00:00,  4.91it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 22.55it/s]


Training Loss: 0.2048
Validation Accuracy: 0.9875
Validation Macro F1-Score: 0.9875
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.97      0.99        40

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

New best Macro F1-Score: 0.9875. Saving model...
Epoch 2/10


Training Batch Loss: 0.0219: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:16<00:00,  5.57it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.23it/s]


Training Loss: 0.1094
Validation Accuracy: 0.9875
Validation Macro F1-Score: 0.9875
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.97      0.99        40

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

New best Macro F1-Score: 0.9875. Saving model...
Epoch 3/10


Training Batch Loss: 0.3211: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.76it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.41it/s]


Training Loss: 0.0544
Validation Accuracy: 0.9625
Validation Macro F1-Score: 0.9624
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        40
           1       1.00      0.93      0.96        40

    accuracy                           0.96        80
   macro avg       0.97      0.96      0.96        80
weighted avg       0.97      0.96      0.96        80

Epoch 4/10


Training Batch Loss: 0.0006: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.72it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 26.45it/s]


Training Loss: 0.0133
Validation Accuracy: 1.0000
Validation Macro F1-Score: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        40

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

New best Macro F1-Score: 1.0000. Saving model...
Epoch 5/10


Training Batch Loss: 0.0003: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.79it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.51it/s]


Training Loss: 0.0340
Validation Accuracy: 0.9750
Validation Macro F1-Score: 0.9750
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98        40
           1       1.00      0.95      0.97        40

    accuracy                           0.97        80
   macro avg       0.98      0.97      0.97        80
weighted avg       0.98      0.97      0.97        80

Epoch 6/10


Training Batch Loss: 0.0001: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.82it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 26.73it/s]


Training Loss: 0.0087
Validation Accuracy: 1.0000
Validation Macro F1-Score: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        40

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

New best Macro F1-Score: 1.0000. Saving model...
Epoch 7/10


Training Batch Loss: 0.0002: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.82it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 26.95it/s]


Training Loss: 0.0005
Validation Accuracy: 1.0000
Validation Macro F1-Score: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        40

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

New best Macro F1-Score: 1.0000. Saving model...
Epoch 8/10


Training Batch Loss: 0.0008: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.77it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 26.38it/s]


Training Loss: 0.0054
Validation Accuracy: 1.0000
Validation Macro F1-Score: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        40

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

New best Macro F1-Score: 1.0000. Saving model...
Epoch 9/10


Training Batch Loss: 0.0005: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.91it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.35it/s]


Training Loss: 0.0010
Validation Accuracy: 1.0000
Validation Macro F1-Score: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        40

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

New best Macro F1-Score: 1.0000. Saving model...
Epoch 10/10


Training Batch Loss: 0.0002: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:14<00:00,  6.01it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.50it/s]


Training Loss: 0.0012
Validation Accuracy: 1.0000
Validation Macro F1-Score: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        40

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

New best Macro F1-Score: 1.0000. Saving model...
Best Macro F1-Score achieved: 1.0000


In [3]:

# Load tokenizer and model
model_name = "bytesizedllm/MalayalamXLM_Roberta"
num_labels = len(label2id)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bytesizedllm/MalayalamXLM_Roberta", cache_dir="/home/rohit/expt/dp_expt/codalab/NAACL-2025/20698/xlm_roberta/")


model = BilstmXLMRobertaClassifier(model_name, num_labels)
best_model_path = "mal_best_model.pth"
model.load_state_dict(torch.load(best_model_path))
model = model.to(device)
model.eval()

def predict_label(text):
    with torch.no_grad():
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        # Move input to device
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Get model predictions
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(logits, dim=1).cpu().item()  # Get the predicted label
    return pred

# Load the test dataset
test_path = "mal_test_data_hum_ai.csv"  # Path to the test dataset
test_df = pd.read_csv(test_path)

id2label = {v:k for k, v in label2id.items()}

data = []
for text, ID in zip(test_df["DATA"],test_df["ID"]):
    pred = predict_label(text.strip())
    pred = id2label[pred]
    data.append([ID, pred])

output_df = pd.DataFrame(data, columns = ["ID", "LABEL"])
    

    
output_df.to_csv("byteSizedLLM_Malayalam_run2.tsv", sep = "\t", index=False)

print("Predictions saved to prediction.csv")

Some weights of XLMRobertaModel were not initialized from the model checkpoint at bytesizedllm/MalayalamXLM_Roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to prediction.csv
