In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, Adafactor
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
import math, string

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table mapping a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

# Compiled once at import time rather than on every call.
#
# The previous character class contained the range U+24C2..U+1F251, which
# spans most of the Basic Multilingual Plane and therefore deleted ordinary
# text (CJK ideographs, Kana, Hangul, ...) along with emojis. The ranges below
# are a strict subset of what used to be removed, restricted to symbol blocks.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001F000-\U0001F251"  # Mahjong/domino/card symbols, enclosed supplements (incl. flag letters)
    "\U000024C2-\U00002BEF"  # Enclosed alphanumerics, box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # Individual emoji code points located in CJK symbol blocks
    "\U0000FE0F"  # Variation Selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Strip emoji and pictographic symbol characters from ``text``.

    Only symbol/pictograph ranges are removed; letters of any script are kept.

    Args:
        text: input string.

    Returns:
        ``text`` with every run of matched characters deleted (nothing is
        inserted in their place, so surrounding whitespace is left untouched).
    """
    return _EMOJI_PATTERN.sub('', text)


def remove_html_tags(text):
    """Replace each ``<...>`` span in ``text`` with a single space.

    The match is non-greedy and ``.`` does not cross newlines, so a tag that
    is split over several lines is left as it is.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub(' ', text)

# Regex fragments for the punctuation marks to isolate, one alternative per mark.
# Order matters: `re` takes the first alternative that matches at a position, so
# the three-dot ellipsis must come before the single dot or it can never match
# as a unit.
punctuation_list = [
    r'\.\.\.', r'\.', r',', r'\?', r'!', r':', r';', r'"', r'\-',
    '\u2013',  # en dash; this entry used to be a mis-decoded (mojibake) form of the character
    r'\(', r'\)', r'\[', r'\]', r'\{', r'\}', r'\/', r'\\', r'@', r'&', r'\*',
    r'#', r'%', r'_', r'~', r'`', r'\^', r'\|', r'=', r'<', r'>', r'\+',
]
# One capturing group around the alternation so a substitution can refer to the
# matched mark as \1.
punctuation_pattern = r'(' + '|'.join(punctuation_list) + r')'

# Normalise a raw text sample before it is handed to the tokenizer
def preprocess_text(text):
    """Clean one raw text sample.

    Steps, in order:
      1. replace the ``&amp;`` entity and literal ``<br>`` with a space and
         ``&#39;`` with an apostrophe;
      2. replace remaining ``<...>`` tags with a space (``remove_html_tags``);
      3. delete every token that starts with ``http``;
      4. collapse whitespace runs to a single space and trim both ends;
      5. lower-case the result.

    Args:
        text: raw string. ``None`` and float NaN (the value pandas produces
            for an empty CSV cell) are treated as missing text.

    Returns:
        The cleaned, lower-cased string; ``""`` for missing input.
    """
    # Missing cells would otherwise fail on ``.replace`` with AttributeError.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""

    text = text.replace("&amp;", " ").replace("<br>", " ").replace("&#39;", "'")
    text = remove_html_tags(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

import os

from huggingface_hub import login

# SECURITY: access tokens must never be hard-coded in a notebook. The token is
# now read from the HF_TOKEN environment variable. The write-scoped token that
# was previously embedded on this line has been exposed and should be revoked
# in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Without a token the Hub is accessed anonymously; private or gated
    # repositories will then fail to download.
    print("HF_TOKEN is not set; skipping Hugging Face login.")

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


class TransformerXLMRobertaClassifier(nn.Module):
    """Sequence classifier stacking a Transformer encoder/decoder on a pretrained backbone.

    Pipeline implemented by ``forward``: backbone hidden states -> linear
    projection to ``d_model`` -> ``nn.TransformerEncoder`` ->
    ``nn.TransformerDecoder`` (its target is a copy of the encoder output) ->
    mean over sequence positions -> dropout -> linear classification head.

    Args:
        xlm_model_name: model id or path passed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes.
        d_model: feature width of the encoder/decoder stack.
        nhead: attention heads per encoder/decoder layer.
        num_encoder_layers: number of stacked encoder layers.
        num_decoder_layers: number of stacked decoder layers.
        dim_feedforward: hidden width of the feed-forward sublayers.
        dropout_prob: dropout rate used inside the encoder/decoder layers and
            before the classification head.
        cache_dir: directory handed to ``from_pretrained`` as download cache.
    """

    def __init__(self,
                 xlm_model_name: str,
                 num_labels: int,
                 d_model: int = 768,
                 nhead: int = 8,
                 num_encoder_layers: int = 3,
                 num_decoder_layers: int = 3,
                 dim_feedforward: int = 2048,
                 dropout_prob: float = 0.3,
                 cache_dir: str = "xlm_roberta1/"):
        super().__init__()

        # Pretrained backbone. Submodule names and registration order are kept
        # as they were so that previously saved state dicts still load.
        self.roberta = AutoModel.from_pretrained(xlm_model_name, cache_dir=cache_dir)

        # Encoder stack (PyTorch default layout: sequence-first inputs)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model,
                                                   nhead=nhead,
                                                   dim_feedforward=dim_feedforward,
                                                   dropout=dropout_prob)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Decoder stack
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model,
                                                   nhead=nhead,
                                                   dim_feedforward=dim_feedforward,
                                                   dropout=dropout_prob)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        # Maps backbone hidden states to d_model. The input width is read from
        # the backbone config instead of assuming 768, so backbones with other
        # hidden sizes work as well.
        self.input_projection = nn.Linear(self.roberta.config.hidden_size, d_model)

        # Classification head
        self.classifier = nn.Linear(d_model, num_labels)

        # Dropout applied to the pooled representation
        self.dropout = nn.Dropout(dropout_prob)

        # Stored value only: nothing in this class clips gradients. A training
        # loop has to read this attribute and apply the clipping itself.
        self.gradient_clip_val = 1.0

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits and, when ``labels`` is given, the loss.

        Args:
            input_ids: token ids, forwarded to the backbone.
            attention_mask: padding mask, forwarded to the backbone only.
            labels: optional class indices; flattened with ``view(-1)`` before
                the cross-entropy loss is computed.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits``
            alone. ``logits`` has one row per batch element and ``num_labels``
            columns.
        """
        # Contextual token representations from the backbone
        backbone_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden = self.input_projection(backbone_out.last_hidden_state)

        # One causal mask serves as encoder mask, decoder target mask and
        # memory mask; it is created directly on the activations' device.
        seq_len = hidden.size(1)
        causal_mask = self._generate_square_subsequent_mask(seq_len, device=hidden.device)

        # NOTE(review): attention_mask is not passed to the encoder/decoder and
        # the mean below runs over all positions, padded ones included. Changing
        # that would alter the outputs of already-trained checkpoints, so it is
        # deliberately left as is.

        # The encoder/decoder expect (seq, batch, feature), hence the permute.
        encoder_output = self.encoder(hidden.permute(1, 0, 2), causal_mask)

        # The decoder target is simply a copy of the encoder output.
        tgt = encoder_output.clone()
        decoder_output = self.decoder(tgt, encoder_output,
                                      tgt_mask=causal_mask, memory_mask=causal_mask)

        # Mean-pool over the sequence dimension (back in batch-first layout)
        pooled = decoder_output.permute(1, 0, 2).mean(dim=1)

        logits = self.classifier(self.dropout(pooled))

        if labels is None:
            return logits

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return (loss, logits)

    def _generate_square_subsequent_mask(self, size, device=None):
        """Return a ``size x size`` boolean mask that is True strictly above the diagonal.

        True marks positions that may not be attended to, i.e. later positions.

        Args:
            size: sequence length.
            device: optional device to create the mask on (default: PyTorch's
                default device).
        """
        return torch.triu(torch.ones(size, size, device=device), diagonal=1).bool()
    
# Load tokenizer and model
model_name = "bytesizedllm/MalayalamXLM_Roberta"
num_labels = 2

# Maps the label strings of the CSV "label" column to class indices.
# NOTE(review): the keys differ in capitalisation ('original' vs 'Fake'); they
# have to match the CSV values exactly - confirm against the data.
label2id = {'original': 0, 'Fake': 1}

# Tokenizer of the same checkpoint as the backbone (single source: model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="xlm_roberta1/")

model = TransformerXLMRobertaClassifier(model_name, num_labels)
best_model_path = "best_model1.pth"
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted
# source (consider weights_only=True where the installed torch supports it).
model.load_state_dict(torch.load(best_model_path, map_location=device))
model = model.to(device)
# Inference mode: disables dropout
model.eval()

@torch.no_grad()
def predict_label(text):
    """Classify one text with the module-level ``model`` and return the class index.

    The text is tokenized with the module-level ``tokenizer`` (truncated and
    padded to exactly 128 tokens), run through ``model`` on ``device`` without
    gradient tracking, and the index of the largest logit is returned as a
    Python int.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Only these two tensors are consumed by the model; move both to the device
    model_inputs = {
        key: encoded[key].to(device)
        for key in ('input_ids', 'attention_mask')
    }

    # Without labels the model returns the logits tensor directly
    scores = model(**model_inputs)
    return scores.argmax(dim=1).cpu().item()

# Evaluate the model on the labelled test split
test_path = "old_data/Fake_test_with_labels.csv"  # CSV with "text" and "label" columns
test_df = pd.read_csv(test_path)

# Predicted class ids and gold class ids, aligned row by row
predicted = [predict_label(preprocess_text(sample)) for sample in test_df["text"]]
true_predictions = [label2id[gold] for gold in test_df["label"]]

# Dict form is used to pull out the macro F1; text form is printed for reading
report1 = classification_report(true_predictions, predicted, output_dict=True)
report = classification_report(true_predictions, predicted)
macro_f1 = report1['macro avg']['f1-score']

print(report1)

print(f"Validation Macro F1-Score: {macro_f1:.4f}")
print("Classification Report:\n", report)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rohit/.cache/huggingface/token
Login successful
Using device: cuda


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of XLMRobertaModel were not initialized from the model checkpoint at bytesizedllm/MalayalamXLM_Roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'0': {'precision': 0.8938223938223938, 'recall': 0.904296875, 'f1-score': 0.8990291262135922, 'support': 512}, '1': {'precision': 0.9021956087824351, 'recall': 0.8915187376725838, 'f1-score': 0.8968253968253967, 'support': 507}, 'accuracy': 0.8979391560353287, 'macro avg': {'precision': 0.8980090013024145, 'recall': 0.897907806336292, 'f1-score': 0.8979272615194944, 'support': 1019}, 'weighted avg': {'precision': 0.897988458576801, 'recall': 0.8979391560353287, 'f1-score': 0.8979326681176009, 'support': 1019}}
Validation Macro F1-Score: 0.8979
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.90       512
           1       0.90      0.89      0.90       507

    accuracy                           0.90      1019
   macro avg       0.90      0.90      0.90      1019
weighted avg       0.90      0.90      0.90      1019



In [3]:
# Predict labels for the unlabelled test split and write the submission file
test_path = "Fake_test_without_labels.csv"  # CSV with "text" and "Id" columns
test_df = pd.read_csv(test_path)

# Inverse mapping: class index -> label string
id2label = {v: k for k, v in label2id.items()}

# One [Id, predicted label] row per sample
data = [
    [ID, id2label[predict_label(preprocess_text(text))]]
    for text, ID in zip(test_df["text"], test_df["Id"])
]
output_df = pd.DataFrame(data, columns=["Id", "Labels"])

output_path = "byteSizedLLM_Malayalam_task1_run3.csv"
output_df.to_csv(output_path, index=False)

# Report the file that was actually written (the message used to name a
# different file, "prediction.csv").
print(f"Predictions saved to {output_path}")


Predictions saved to prediction.csv
