In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, Adafactor
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
import math, string


from huggingface_hub import login
login(token="")

# Define if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# lstm_hidden_size: int = 128, lstm_layers: int = 1, dropout_prob: float = 0.3, lr=2e-5
class BilstmXLMRobertaClassifier(nn.Module):

    def __init__(self,
                 xlm_model_name: str,
                 num_labels: int,
                 lstm_hidden_size: int = 256,
                 lstm_layers: int = 2,
                 dropout_prob: float = 0.3):
        super(BilstmXLMRobertaClassifier, self).__init__()

        # Load XLM-RoBERTa model
        self.roberta = AutoModel.from_pretrained(xlm_model_name, cache_dir="xlm_roberta1/")
        
        # Unfreeze layers if necessary
        for param in self.roberta.parameters():
            param.requires_grad = True
        

        # BiLSTM layer with multiple layers
        self.bilstm = nn.LSTM(input_size=768,  # Embeddings from XLM-RoBERTa
                              hidden_size=lstm_hidden_size,
                              num_layers=lstm_layers,
                              bidirectional=True,
                              batch_first=True)

        # Initialize LSTM weights
        for name, param in self.bilstm.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)  # Xavier uniform initialization for input-hidden weights
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)  # Orthogonal initialization for hidden-hidden weights
            elif 'bias' in name:
                torch.nn.init.zeros_(param)  # Initialize biases to zeros

        # Attention mechanism after BiLSTM
        self.attention = nn.Linear(lstm_hidden_size * 2, 1)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_prob)

        # Layer normalization
        self.layer_norm = nn.LayerNorm(lstm_hidden_size * 2)

        # Classification layer
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)  # Multiply by 2 for BiLSTM

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                labels: torch.Tensor = None):

        # Get embeddings from XLM-RoBERTa model
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        #outputs = self.roberta.roberta(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # Get the hidden states (XLM-RoBERTa output)

        # BiLSTM layer
        lstm_output, _ = self.bilstm(embeddings)

        # Attention mechanism
        attention_weights = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(attention_weights, dim=1)
        lstm_output = torch.sum(lstm_output * attention_weights, dim=1)

        # Normalize LSTM output
        lstm_output = self.layer_norm(lstm_output)

        # Dropout
        lstm_output = self.dropout(lstm_output)

        # Classification layer
        logits = self.classifier(lstm_output)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return (loss, logits) if loss is not None else logits


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bytesizedllm/MalayalamXLM_Roberta", cache_dir="xlm_roberta1/")

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_emojis(text):
    # This regex pattern matches most emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map symbols
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "\U00002500-\U00002BEF"  # Chinese characters
        "\U00002702-\U000027B0"  # Dingbats
        "\U00002702-\U000027B0"  # Additional symbols
        "\U000024C2-\U0001F251"  # Enclosed characters
        "\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
        "\U0001F910-\U0001F9FF"  # Supplemental symbols and pictographs continued
        "]+", flags=re.UNICODE
    )

    return emoji_pattern.sub(r'', text)


def remove_html_tags(text):
    return re.sub(r'<.*?>', ' ', text)

punctuation_list = [r':', r';', r'"', r'\-', r'–', r'\(', r'\)', r'\[', r'\]', r'\{', r'\}', r'\/', r'\\', r'@', r'&', r'\*', r'%', r'_', r'~', r'`', r'\^', r'\|', r'=', r'<', r'>', r'\+']
# Create the regex pattern to match any of the punctuation marks
punctuation_pattern = r'(' + '|'.join(punctuation_list) + r')'

# Function to clean and tokenize input text
def preprocess_text(text):
    text = text.replace("&amp;"," ").replace("<br>"," ").replace("&#39;","'")
    text = remove_html_tags(text)
    text = re.sub(r"http\S+", "", text)
#     text = re.sub(url_pattern, '', text)
#     text = re.sub(r".com$", " ", text)
#     text = re.sub(r"@\S+", "", text)
#     text = re.sub(r'\n+',"\n",text)
#     text = remove_emojis(text)
#     text = re.sub(r'\d+', ' ', text)
#     text = re.sub(r" @ ", " ", text)
#     text = re.sub(r" # ", " ", text)
#     text = re.sub(punctuation_pattern, r' \1 ', text)
#     text = remove_punctuation(text).strip()
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Data preparation function
def load_and_clean_data(trainpath, validpath):
    traindf = pd.read_csv(trainpath)
    train_sents = []
    train_labels = []
    labels = list(set([l for l in traindf["label"]]))
    label2id = {labels[i]:i for i in range(len(labels))}
    for text1, label in zip( traindf["text"], traindf["label"]):
        train_sents.append(preprocess_text(text1))
        train_labels.append(label2id[label])  # Label as int for multi-class

    validdf = pd.read_csv(validpath)

    valid_sents = []
    valid_labels = []
    for sent, label in zip(validdf["text"], validdf["label"]):
        valid_sents.append(preprocess_text(sent))
        valid_labels.append(label2id[label])  # Label as int

    return train_sents, train_labels, valid_sents, valid_labels, label2id

# Load and clean data (assumed pre-written)
train_texts, train_labels, val_texts, val_labels, label2id = load_and_clean_data("Fake_train.csv", "./old_data/Fake_test_with_labels.csv")
print(label2id)

# Dataset class for PyTorch DataLoader
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize and encode the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

from sklearn.model_selection import train_test_split

train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
val_dataset = HateSpeechDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


# Initialize model
model_name = "bytesizedllm/MalayalamXLM_Roberta"
num_labels = 2
model = BilstmXLMRobertaClassifier(model_name, num_labels).to(device)

# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

from torch.nn.utils import clip_grad_norm_
# Training and Evaluation Functions
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    loop = tqdm(dataloader, leave=True, desc="Training")
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        loss, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss.backward()

        optimizer.step()

        scheduler.step()  # Move scheduler step inside the loop

        # Gradient clipping
        clip_grad_norm_(model.parameters(), max_norm=1.0)

        total_loss += loss.item()
        loop.set_description(f"Training Batch Loss: {loss.item():.4f}")

    return total_loss / len(dataloader)


def eval_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    loop = tqdm(dataloader, leave=True, desc="Evaluating")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels)

    return accuracy_score(true_labels, predictions), true_labels, predictions

print("Train text:", train_texts[3])
print("val text:", val_texts[10])

# # Main Training Loop
best_macro_f1 = 0.0
best_model_path = "best_model3.pth"

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, device)
    accuracy, true_labels, predictions = eval_model(model, val_loader, device)

    report = classification_report(true_labels, predictions)
    report1 = classification_report(true_labels, predictions, output_dict=True)
    macro_f1 = report1['macro avg']['f1-score']

    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Macro F1-Score: {macro_f1:.4f}")
    print("Classification Report:\n", report)

    # Save best model
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(model.state_dict(), best_model_path)
        print(f"New best Macro F1-Score: {best_macro_f1:.4f}. Saving model...")

print(f"Best Macro F1-Score achieved: {best_macro_f1:.4f}")




The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rohit/.cache/huggingface/token
Login successful
Using device: cuda




{'Fake': 0, 'original': 1}


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of XLMRobertaModel were not initialized from the model checkpoint at bytesizedllm/MalayalamXLM_Roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train text: illathentha avaru purath vidayittalland verenth
val text: ഈ പാട്ടിനു ആടിയ ചേച്ചീസിന്റെ തൊലിക്കട്ടി..
Epoch 1/10


Training Batch Loss: 0.4183: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:17<00:00,  5.94it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 21.84it/s]


Training Loss: 0.5630
Validation Accuracy: 0.8332
Validation Macro F1-Score: 0.8328
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84       507
           1       0.87      0.78      0.83       512

    accuracy                           0.83      1019
   macro avg       0.84      0.83      0.83      1019
weighted avg       0.84      0.83      0.83      1019

New best Macro F1-Score: 0.8328. Saving model...
Epoch 2/10


Training Batch Loss: 0.2593: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.14it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.93it/s]


Training Loss: 0.3151
Validation Accuracy: 0.8783
Validation Macro F1-Score: 0.8783
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88       507
           1       0.89      0.86      0.88       512

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

New best Macro F1-Score: 0.8783. Saving model...
Epoch 3/10


Training Batch Loss: 0.2185: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.37it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 21.93it/s]


Training Loss: 0.1899
Validation Accuracy: 0.8803
Validation Macro F1-Score: 0.8801
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.88       507
           1       0.86      0.91      0.88       512

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

New best Macro F1-Score: 0.8801. Saving model...
Epoch 4/10


Training Batch Loss: 0.0704: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.37it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.38it/s]


Training Loss: 0.1117
Validation Accuracy: 0.8763
Validation Macro F1-Score: 0.8763
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88       507
           1       0.89      0.87      0.88       512

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

Epoch 5/10


Training Batch Loss: 0.0371: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.33it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 21.89it/s]


Training Loss: 0.0760
Validation Accuracy: 0.8842
Validation Macro F1-Score: 0.8842
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88       507
           1       0.87      0.90      0.89       512

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

New best Macro F1-Score: 0.8842. Saving model...
Epoch 6/10


Training Batch Loss: 0.1442: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.29it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.65it/s]


Training Loss: 0.0422
Validation Accuracy: 0.8763
Validation Macro F1-Score: 0.8763
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88       507
           1       0.88      0.88      0.88       512

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

Epoch 7/10


Training Batch Loss: 0.0027: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:15<00:00,  6.41it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.04it/s]


Training Loss: 0.0360
Validation Accuracy: 0.8921
Validation Macro F1-Score: 0.8920
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       507
           1       0.88      0.91      0.89       512

    accuracy                           0.89      1019
   macro avg       0.89      0.89      0.89      1019
weighted avg       0.89      0.89      0.89      1019

New best Macro F1-Score: 0.8920. Saving model...
Epoch 8/10


Training Batch Loss: 0.0404: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.32it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.78it/s]


Training Loss: 0.0254
Validation Accuracy: 0.8852
Validation Macro F1-Score: 0.8851
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88       507
           1       0.88      0.90      0.89       512

    accuracy                           0.89      1019
   macro avg       0.89      0.89      0.89      1019
weighted avg       0.89      0.89      0.89      1019

Epoch 9/10


Training Batch Loss: 0.0290: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.26it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 21.73it/s]


Training Loss: 0.0201
Validation Accuracy: 0.8881
Validation Macro F1-Score: 0.8881
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89       507
           1       0.89      0.89      0.89       512

    accuracy                           0.89      1019
   macro avg       0.89      0.89      0.89      1019
weighted avg       0.89      0.89      0.89      1019

Epoch 10/10


Training Batch Loss: 0.0069: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:16<00:00,  6.33it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.24it/s]

Training Loss: 0.0141
Validation Accuracy: 0.8881
Validation Macro F1-Score: 0.8881
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       507
           1       0.88      0.90      0.89       512

    accuracy                           0.89      1019
   macro avg       0.89      0.89      0.89      1019
weighted avg       0.89      0.89      0.89      1019

Best Macro F1-Score achieved: 0.8920





In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, Adafactor
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
import math, string

def remove_html_tags(text):
    return re.sub(r'<.*?>', ' ', text)

# Function to clean and tokenize input text
def preprocess_text(text):
    text = text.replace("&amp;"," ").replace("<br>"," ").replace("&#39;","'")
    text = remove_html_tags(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

from huggingface_hub import login
login(token="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")

# Define if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# lstm_hidden_size: int = 128, lstm_layers: int = 1, dropout_prob: float = 0.3, lr=2e-5
class BilstmXLMRobertaClassifier(nn.Module):

    def __init__(self,
                 xlm_model_name: str,
                 num_labels: int,
                 lstm_hidden_size: int = 256,
                 lstm_layers: int = 2,
                 dropout_prob: float = 0.3):
        super(BilstmXLMRobertaClassifier, self).__init__()

        # Load XLM-RoBERTa model
        self.roberta = AutoModel.from_pretrained(xlm_model_name, cache_dir="xlm_roberta1/")
        
        # Unfreeze layers if necessary
        for param in self.roberta.parameters():
            param.requires_grad = True
        

        # BiLSTM layer with multiple layers
        self.bilstm = nn.LSTM(input_size=768,  # Embeddings from XLM-RoBERTa
                              hidden_size=lstm_hidden_size,
                              num_layers=lstm_layers,
                              bidirectional=True,
                              batch_first=True)

        # Initialize LSTM weights
        for name, param in self.bilstm.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)  # Xavier uniform initialization for input-hidden weights
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)  # Orthogonal initialization for hidden-hidden weights
            elif 'bias' in name:
                torch.nn.init.zeros_(param)  # Initialize biases to zeros

        # Attention mechanism after BiLSTM
        self.attention = nn.Linear(lstm_hidden_size * 2, 1)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_prob)

        # Layer normalization
        self.layer_norm = nn.LayerNorm(lstm_hidden_size * 2)

        # Classification layer
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)  # Multiply by 2 for BiLSTM

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                labels: torch.Tensor = None):

        # Get embeddings from XLM-RoBERTa model
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        #outputs = self.roberta.roberta(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # Get the hidden states (XLM-RoBERTa output)

        # BiLSTM layer
        lstm_output, _ = self.bilstm(embeddings)

        # Attention mechanism
        attention_weights = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(attention_weights, dim=1)
        lstm_output = torch.sum(lstm_output * attention_weights, dim=1)

        # Normalize LSTM output
        lstm_output = self.layer_norm(lstm_output)

        # Dropout
        lstm_output = self.dropout(lstm_output)

        # Classification layer
        logits = self.classifier(lstm_output)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return (loss, logits) if loss is not None else logits
    
# Load tokenizer and model
model_name = "bytesizedllm/MalayalamXLM_Roberta"
num_labels = 2

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bytesizedllm/MalayalamXLM_Roberta", cache_dir="xlm_roberta1/")

model = BilstmXLMRobertaClassifier(model_name, num_labels)
best_model_path = "best_model2.pth"
model.load_state_dict(torch.load(best_model_path))
model = model.to(device)
model.eval()

def predict_label(text):
    with torch.no_grad():
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        # Move input to device
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Get model predictions
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(logits, dim=1).cpu().item()  # Get the predicted label
    return pred

# Load the test dataset
test_path = "old_data/Fake_test_with_labels.csv"  # Path to the test dataset
test_df = pd.read_csv(test_path)

#l2i = {'Fake': 0, 'original': 1}

label2id = {'original': 0, 'Fake': 1}
true_predictions = []
predicted = []
for text, label in zip(test_df["text"],test_df["label"]):
    pred = predict_label(preprocess_text(text))
    predicted.append(pred)
    true_predictions.append(label2id[label])


report = classification_report(true_predictions, predicted)
report1 = classification_report(true_predictions, predicted, output_dict=True)
macro_f1 = report1['macro avg']['f1-score']
print(report1)

print(f"Macro F1-Score: {macro_f1:.4f}")
print(f"Accuracy: {macro_f1:.4f}")
print("Classification Report:\n", report)
    
# output_df.to_csv("prediction.csv", index=False)

# print("Predictions saved to prediction.csv")





The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rohit/.cache/huggingface/token
Login successful
Using device: cuda


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of XLMRobertaModel were not initialized from the model checkpoint at bytesizedllm/MalayalamXLM_Roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'0': {'precision': 0.8962818003913894, 'recall': 0.89453125, 'f1-score': 0.8954056695992181, 'support': 512}, '1': {'precision': 0.8937007874015748, 'recall': 0.8954635108481263, 'f1-score': 0.8945812807881773, 'support': 507}, 'accuracy': 0.8949950932286556, 'macro avg': {'precision': 0.8949912938964821, 'recall': 0.8949973804240632, 'f1-score': 0.8949934751936977, 'support': 1019}, 'weighted avg': {'precision': 0.8949976261167711, 'recall': 0.8949950932286556, 'f1-score': 0.894995497737395, 'support': 1019}}
Macro F1-Score: 0.8950
Accuracy: 0.8950
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.90       512
           1       0.89      0.90      0.89       507

    accuracy                           0.89      1019
   macro avg       0.89      0.89      0.89      1019
weighted avg       0.89      0.89      0.89      1019



In [3]:
# Load the test dataset
test_path = "Fake_test_without_labels.csv"  # Path to the test dataset
test_df = pd.read_csv(test_path)

id2label = {v:k for k, v in label2id.items()}

data = []
for text, ID in zip(test_df["text"],test_df["Id"]):
    pred = predict_label(preprocess_text(text))
    pred = id2label[pred]
    data.append([ID, pred])

output_df = pd.DataFrame(data, columns = ["Id", "Labels"])
    

    
output_df.to_csv("byteSizedLLM_Malayalam_task1_run3.csv", index=False)

print("Predictions saved to prediction.csv")


Predictions saved to prediction.csv


In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, Adafactor
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
import math, string


from huggingface_hub import login
login(token="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")

# Define if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class TransformerXLMRobertaClassifier(nn.Module):
    def __init__(self,
                 xlm_model_name: str,
                 num_labels: int,
                 d_model: int = 768,
                 nhead: int = 8,
                 num_encoder_layers: int = 3,
                 num_decoder_layers: int = 3,
                 dim_feedforward: int = 2048,
                 dropout_prob: float = 0.3):
        super(TransformerXLMRobertaClassifier, self).__init__()

        # Load XLM-RoBERTa model
        self.roberta = AutoModel.from_pretrained(xlm_model_name, cache_dir="xlm_roberta1/")

        # Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, 
                                                   nhead=nhead, 
                                                   dim_feedforward=dim_feedforward, 
                                                   dropout=dropout_prob)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Transformer Decoder Layer
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, 
                                                   nhead=nhead, 
                                                   dim_feedforward=dim_feedforward, 
                                                   dropout=dropout_prob)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        # Linear layer to map XLM-RoBERTa output to transformer dimension
        self.input_projection = nn.Linear(768, d_model)

        # Output Classification Layer
        self.classifier = nn.Linear(d_model, num_labels)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_prob)

        # Gradient Clipping
        self.gradient_clip_val = 1.0

    def forward(self, input_ids, attention_mask, labels=None):
        # Get embeddings from XLM-RoBERTa
        roberta_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = roberta_outputs.last_hidden_state

        # Project embeddings to match Transformer dimensions
        embeddings = self.input_projection(embeddings)

        # Create a source mask for the Transformer
        seq_len = embeddings.size(1)
        src_mask = self._generate_square_subsequent_mask(seq_len).to(embeddings.device)

        # Transformer Encoder
        encoder_output = self.encoder(embeddings.permute(1, 0, 2), src_mask)

        # Dummy target input for the Transformer Decoder
        # Here, we use the same encoder output for simplicity
        tgt = encoder_output.clone()
        tgt_mask = self._generate_square_subsequent_mask(seq_len).to(encoder_output.device)

        # Transformer Decoder
        decoder_output = self.decoder(tgt, encoder_output, tgt_mask=tgt_mask, memory_mask=src_mask)

        # Take the output of the last token
        output = decoder_output.permute(1, 0, 2).mean(dim=1)

        # Dropout and Classification
        output = self.dropout(output)
        logits = self.classifier(output)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return (loss, logits) if loss is not None else logits

    def _generate_square_subsequent_mask(self, size):
        """Generate a square mask for the sequence to prevent attention to future tokens."""
        mask = torch.triu(torch.ones(size, size), diagonal=1).bool()
        return mask


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bytesizedllm/MalayalamXLM_Roberta", cache_dir="xlm_roberta1/")

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_emojis(text):
    # This regex pattern matches most emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map symbols
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "\U00002500-\U00002BEF"  # Chinese characters
        "\U00002702-\U000027B0"  # Dingbats
        "\U00002702-\U000027B0"  # Additional symbols
        "\U000024C2-\U0001F251"  # Enclosed characters
        "\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
        "\U0001F910-\U0001F9FF"  # Supplemental symbols and pictographs continued
        "]+", flags=re.UNICODE
    )

    return emoji_pattern.sub(r'', text)


def remove_html_tags(text):
    return re.sub(r'<.*?>', ' ', text)

punctuation_list = [r'\.', r',', r'\?', r'!', r':', r';', r'"', r'\-', r'–', r'\(', r'\)', r'\[', r'\]', r'\{', r'\}', r'\.\.\.', r'\/', r'\\', r'@', r'&', r'\*', r'#', r'%', r'_', r'~', r'`', r'\^', r'\|', r'=', r'<', r'>', r'\+']
# Create the regex pattern to match any of the punctuation marks
punctuation_pattern = r'(' + '|'.join(punctuation_list) + r')'

# Function to clean and tokenize input text
def preprocess_text(text):
    text = text.replace("&amp;"," ").replace("<br>"," ").replace("&#39;","'")
    text = remove_html_tags(text)
#     url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = re.sub(r"http\S+", "", text)
#     text = re.sub(url_pattern, '', text)
#     text = re.sub(r".com$", "", text)
#     text = re.sub(r"@\S+", "", text)
#     text = re.sub(r'\n+',"\n",text)
#     text = remove_emojis(text)
#     text = re.sub(r'\d+', ' ', text)
#     text = re.sub(r" @ ", " ", text)
#     text = re.sub(r" # ", " ", text)
#     text = re.sub(punctuation_pattern, r' \1 ', text)
#     text = remove_punctuation(text).strip()
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Data preparation function
def load_and_clean_data(trainpath, validpath):
    traindf = pd.read_csv(trainpath)
    train_sents = []
    train_labels = []
    labels = list(set([l for l in traindf["label"]]))
    label2id = {labels[i]:i for i in range(len(labels))}
    for text1, label in zip( traindf["text"], traindf["label"]):
        train_sents.append(preprocess_text(text1))
        train_labels.append(label2id[label])  # Label as int for multi-class

    validdf = pd.read_csv(validpath)

    valid_sents = []
    valid_labels = []
    for sent, label in zip(validdf["text"], validdf["label"]):
        valid_sents.append(preprocess_text(sent))
        valid_labels.append(label2id[label])  # Label as int

    return train_sents, train_labels, valid_sents, valid_labels, label2id

# Load and clean data (assumed pre-written)
train_texts, train_labels, val_texts, val_labels, label2id = load_and_clean_data("Fake_train.csv", "./old_data/Fake_test_with_labels.csv")
print(label2id)

# Dataset class for PyTorch DataLoader
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize and encode the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

from sklearn.model_selection import train_test_split

train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
val_dataset = HateSpeechDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


# Initialize model
model_name = "bytesizedllm/MalayalamXLM_Roberta"
num_labels = 2
model = TransformerXLMRobertaClassifier(model_name, num_labels).to(device)

# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2.5e-5, weight_decay=0.01)

num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

from torch.nn.utils import clip_grad_norm_
# Training and Evaluation Functions
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    loop = tqdm(dataloader, leave=True, desc="Training")
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        loss, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss.backward()

        optimizer.step()

        scheduler.step()  # Move scheduler step inside the loop

        # Gradient clipping
        clip_grad_norm_(model.parameters(), max_norm=1.0)

        total_loss += loss.item()
        loop.set_description(f"Training Batch Loss: {loss.item():.4f}")

    return total_loss / len(dataloader)


def eval_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    loop = tqdm(dataloader, leave=True, desc="Evaluating")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels)

    return accuracy_score(true_labels, predictions), true_labels, predictions

print("Train text:", train_texts[3])
print("val text:", val_texts[10])

# # Main Training Loop
best_macro_f1 = 0.0
best_model_path = "best_model2.pth"

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, device)
    accuracy, true_labels, predictions = eval_model(model, val_loader, device)

    report = classification_report(true_labels, predictions)
    report1 = classification_report(true_labels, predictions, output_dict=True)
    macro_f1 = report1['macro avg']['f1-score']

    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Macro F1-Score: {macro_f1:.4f}")
    print("Classification Report:\n", report)

    # Save best model
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(model.state_dict(), best_model_path)
        print(f"New best Macro F1-Score: {best_macro_f1:.4f}. Saving model...")

print(f"Best Macro F1-Score achieved: {best_macro_f1:.4f}")




The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rohit/.cache/huggingface/token
Login successful
Using device: cuda




{'original': 0, 'Fake': 1}


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of XLMRobertaModel were not initialized from the model checkpoint at bytesizedllm/MalayalamXLM_Roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train text: illathentha avaru purath vidayittalland verenth
val text: ഈ പാട്ടിനു ആടിയ ചേച്ചീസിന്റെ തൊലിക്കട്ടി..
Epoch 1/10


Training Batch Loss: 0.3619: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:28<00:00,  3.52it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 14.99it/s]


Training Loss: 0.5298
Validation Accuracy: 0.8567
Validation Macro F1-Score: 0.8566
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       512
           1       0.88      0.83      0.85       507

    accuracy                           0.86      1019
   macro avg       0.86      0.86      0.86      1019
weighted avg       0.86      0.86      0.86      1019

New best Macro F1-Score: 0.8566. Saving model...
Epoch 2/10


Training Batch Loss: 0.1854: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:23<00:00,  4.38it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 16.84it/s]


Training Loss: 0.2782
Validation Accuracy: 0.8518
Validation Macro F1-Score: 0.8513
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.79      0.84       512
           1       0.81      0.92      0.86       507

    accuracy                           0.85      1019
   macro avg       0.86      0.85      0.85      1019
weighted avg       0.86      0.85      0.85      1019

Epoch 3/10


Training Batch Loss: 0.1900: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:24<00:00,  4.21it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 16.26it/s]


Training Loss: 0.1570
Validation Accuracy: 0.8822
Validation Macro F1-Score: 0.8820
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       512
           1       0.92      0.84      0.88       507

    accuracy                           0.88      1019
   macro avg       0.89      0.88      0.88      1019
weighted avg       0.89      0.88      0.88      1019

New best Macro F1-Score: 0.8820. Saving model...
Epoch 4/10


Training Batch Loss: 0.1550: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:27<00:00,  3.73it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 14.35it/s]


Training Loss: 0.0945
Validation Accuracy: 0.8930
Validation Macro F1-Score: 0.8930
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89       512
           1       0.89      0.89      0.89       507

    accuracy                           0.89      1019
   macro avg       0.89      0.89      0.89      1019
weighted avg       0.89      0.89      0.89      1019

New best Macro F1-Score: 0.8930. Saving model...
Epoch 5/10


Training Batch Loss: 0.1664: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:26<00:00,  3.81it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 14.95it/s]


Training Loss: 0.0532
Validation Accuracy: 0.8714
Validation Macro F1-Score: 0.8711
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.82      0.86       512
           1       0.83      0.93      0.88       507

    accuracy                           0.87      1019
   macro avg       0.88      0.87      0.87      1019
weighted avg       0.88      0.87      0.87      1019

Epoch 6/10


Training Batch Loss: 0.0308: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:26<00:00,  3.91it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 14.29it/s]


Training Loss: 0.0357
Validation Accuracy: 0.8862
Validation Macro F1-Score: 0.8859
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       512
           1       0.93      0.84      0.88       507

    accuracy                           0.89      1019
   macro avg       0.89      0.89      0.89      1019
weighted avg       0.89      0.89      0.89      1019

Epoch 7/10


Training Batch Loss: 0.0004: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:25<00:00,  3.99it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 14.04it/s]


Training Loss: 0.0164
Validation Accuracy: 0.8822
Validation Macro F1-Score: 0.8821
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.89       512
           1       0.90      0.85      0.88       507

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

Epoch 8/10


Training Batch Loss: 0.0018: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:26<00:00,  3.90it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 14.34it/s]


Training Loss: 0.0150
Validation Accuracy: 0.8832
Validation Macro F1-Score: 0.8829
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       512
           1       0.92      0.84      0.88       507

    accuracy                           0.88      1019
   macro avg       0.89      0.88      0.88      1019
weighted avg       0.89      0.88      0.88      1019

Epoch 9/10


Training Batch Loss: 0.0091: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:25<00:00,  3.94it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 13.45it/s]


Training Loss: 0.0136
Validation Accuracy: 0.8813
Validation Macro F1-Score: 0.8812
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.88       512
           1       0.90      0.86      0.88       507

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

Epoch 10/10


Training Batch Loss: 0.0018: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:27<00:00,  3.75it/s]
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:02<00:00, 13.69it/s]

Training Loss: 0.0103
Validation Accuracy: 0.8803
Validation Macro F1-Score: 0.8802
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88       512
           1       0.89      0.87      0.88       507

    accuracy                           0.88      1019
   macro avg       0.88      0.88      0.88      1019
weighted avg       0.88      0.88      0.88      1019

Best Macro F1-Score achieved: 0.8930





In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, Adafactor
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
import math, string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_emojis(text):
    # This regex pattern matches most emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map symbols
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "\U00002500-\U00002BEF"  # Chinese characters
        "\U00002702-\U000027B0"  # Dingbats
        "\U00002702-\U000027B0"  # Additional symbols
        "\U000024C2-\U0001F251"  # Enclosed characters
        "\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
        "\U0001F910-\U0001F9FF"  # Supplemental symbols and pictographs continued
        "]+", flags=re.UNICODE
    )

    return emoji_pattern.sub(r'', text)


def remove_html_tags(text):
    return re.sub(r'<.*?>', ' ', text)

punctuation_list = [r'\.', r',', r'\?', r'!', r':', r';', r'"', r'\-', r'–', r'\(', r'\)', r'\[', r'\]', r'\{', r'\}', r'\.\.\.', r'\/', r'\\', r'@', r'&', r'\*', r'#', r'%', r'_', r'~', r'`', r'\^', r'\|', r'=', r'<', r'>', r'\+']
# Create the regex pattern to match any of the punctuation marks
punctuation_pattern = r'(' + '|'.join(punctuation_list) + r')'

# Function to clean and tokenize input text
def preprocess_text(text):
    text = text.replace("&amp;"," ").replace("<br>"," ").replace("&#39;","'")
    text = remove_html_tags(text)
#     url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = re.sub(r"http\S+", "", text)
#     text = re.sub(url_pattern, '', text)
#     text = re.sub(r".com$", "", text)
#     text = re.sub(r"@\S+", "", text)
#     text = re.sub(r'\n+',"\n",text)
#     text = remove_emojis(text)
#     text = re.sub(r'\d+', ' ', text)
#     text = re.sub(r" @ ", " ", text)
#     text = re.sub(r" # ", " ", text)
#     text = re.sub(punctuation_pattern, r' \1 ', text)
#     text = remove_punctuation(text).strip()
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

from huggingface_hub import login
login(token="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")

# Define if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class TransformerXLMRobertaClassifier(nn.Module):
    def __init__(self,
                 xlm_model_name: str,
                 num_labels: int,
                 d_model: int = 768,
                 nhead: int = 8,
                 num_encoder_layers: int = 3,
                 num_decoder_layers: int = 3,
                 dim_feedforward: int = 2048,
                 dropout_prob: float = 0.3):
        super(TransformerXLMRobertaClassifier, self).__init__()

        # Load XLM-RoBERTa model
        self.roberta = AutoModel.from_pretrained(xlm_model_name, cache_dir="xlm_roberta1/")

        # Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, 
                                                   nhead=nhead, 
                                                   dim_feedforward=dim_feedforward, 
                                                   dropout=dropout_prob)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Transformer Decoder Layer
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, 
                                                   nhead=nhead, 
                                                   dim_feedforward=dim_feedforward, 
                                                   dropout=dropout_prob)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        # Linear layer to map XLM-RoBERTa output to transformer dimension
        self.input_projection = nn.Linear(768, d_model)

        # Output Classification Layer
        self.classifier = nn.Linear(d_model, num_labels)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_prob)

        # Gradient Clipping
        self.gradient_clip_val = 1.0

    def forward(self, input_ids, attention_mask, labels=None):
        # Get embeddings from XLM-RoBERTa
        roberta_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = roberta_outputs.last_hidden_state

        # Project embeddings to match Transformer dimensions
        embeddings = self.input_projection(embeddings)

        # Create a source mask for the Transformer
        seq_len = embeddings.size(1)
        src_mask = self._generate_square_subsequent_mask(seq_len).to(embeddings.device)

        # Transformer Encoder
        encoder_output = self.encoder(embeddings.permute(1, 0, 2), src_mask)

        # Dummy target input for the Transformer Decoder
        # Here, we use the same encoder output for simplicity
        tgt = encoder_output.clone()
        tgt_mask = self._generate_square_subsequent_mask(seq_len).to(encoder_output.device)

        # Transformer Decoder
        decoder_output = self.decoder(tgt, encoder_output, tgt_mask=tgt_mask, memory_mask=src_mask)

        # Take the output of the last token
        output = decoder_output.permute(1, 0, 2).mean(dim=1)

        # Dropout and Classification
        output = self.dropout(output)
        logits = self.classifier(output)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return (loss, logits) if loss is not None else logits

    def _generate_square_subsequent_mask(self, size):
        """Generate a square mask for the sequence to prevent attention to future tokens."""
        mask = torch.triu(torch.ones(size, size), diagonal=1).bool()
        return mask
    
# Load tokenizer and model
model_name = "bytesizedllm/MalayalamXLM_Roberta"
num_labels = 2

label2id = {'original': 0, 'Fake': 1}

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bytesizedllm/MalayalamXLM_Roberta", cache_dir="xlm_roberta1/")

model = TransformerXLMRobertaClassifier(model_name, num_labels)
best_model_path = "best_model1.pth"
model.load_state_dict(torch.load(best_model_path))
model = model.to(device)
model.eval()

def predict_label(text):
    with torch.no_grad():
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        # Move input to device
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Get model predictions
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(logits, dim=1).cpu().item()  # Get the predicted label
    return pred

# Load the test dataset
test_path = "old_data/Fake_test_with_labels.csv"  # Path to the test dataset
test_df = pd.read_csv(test_path)

true_predictions = []
predicted = []
for text, label in zip(test_df["text"],test_df["label"]):
    pred = predict_label(preprocess_text(text))
    predicted.append(pred)
    true_predictions.append(label2id[label])


report = classification_report(true_predictions, predicted)
report1 = classification_report(true_predictions, predicted, output_dict=True)
macro_f1 = report1['macro avg']['f1-score']
print(report1)

print(f"Validation Macro F1-Score: {macro_f1:.4f}")
print("Classification Report:\n", report)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rohit/.cache/huggingface/token
Login successful
Using device: cuda


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of XLMRobertaModel were not initialized from the model checkpoint at bytesizedllm/MalayalamXLM_Roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'0': {'precision': 0.8938223938223938, 'recall': 0.904296875, 'f1-score': 0.8990291262135922, 'support': 512}, '1': {'precision': 0.9021956087824351, 'recall': 0.8915187376725838, 'f1-score': 0.8968253968253967, 'support': 507}, 'accuracy': 0.8979391560353287, 'macro avg': {'precision': 0.8980090013024145, 'recall': 0.897907806336292, 'f1-score': 0.8979272615194944, 'support': 1019}, 'weighted avg': {'precision': 0.897988458576801, 'recall': 0.8979391560353287, 'f1-score': 0.8979326681176009, 'support': 1019}}
Validation Macro F1-Score: 0.8979
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.90       512
           1       0.90      0.89      0.90       507

    accuracy                           0.90      1019
   macro avg       0.90      0.90      0.90      1019
weighted avg       0.90      0.90      0.90      1019



In [3]:
# Load the test dataset
test_path = "Fake_test_without_labels.csv"  # Path to the test dataset
test_df = pd.read_csv(test_path)

id2label = {v:k for k, v in label2id.items()}

data = []
for text, ID in zip(test_df["text"],test_df["Id"]):
    pred = predict_label(preprocess_text(text))
    pred = id2label[pred]
    data.append([ID, pred])

output_df = pd.DataFrame(data, columns = ["Id", "Labels"])
    

    
output_df.to_csv("byteSizedLLM_Malayalam_task1_run3.csv", index=False)

print("Predictions saved to prediction.csv")


Predictions saved to prediction.csv
