In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/english-ai-dataset-small/EN_AI_Dec_small.csv
/kaggle/input/bangla-ai-dataset/BD_AI_Dec.csv


## Dataset Importing

In [2]:
bn_df = pd.read_csv('/kaggle/input/bangla-ai-dataset/BD_AI_Dec.csv')
print(bn_df['label'].value_counts())
bn_df.head()

label
1    978
0    768
Name: count, dtype: int64


Unnamed: 0,text,label
0,"""সাহিত্যিক অবদানের জন্য তিনি পাকিস্তান সরকারের...",0
1,"শিকারি পাখি হলো এমন পাখি, যারা ছোট প্রাণী শিকা...",0
2,এ্যানথ্রাক্স সংক্রমণ পশুসম্পদের উপর বিরূপ প্রভ...,0
3,ইলিশ নদী ও সমুদ্রের মিলনস্থলে বেশি পাওয়া যায়...,0
4,তাদের বিবাহ উৎসব ঐতিহ্যগতভাবে হয়। বাবা-মা বিয...,1


In [3]:
en_df = pd.read_csv('/kaggle/input/english-ai-dataset-small/EN_AI_Dec_small.csv')
print(en_df['label'].value_counts())
en_df.head()

label
0    2500
1    2500
Name: count, dtype: int64


Unnamed: 0,label,text
0,0,From Trailville\n\nUpper Des Plaines River Can...
1,0,Coca-Cola Company’s Performance Measurement Es...
2,0,"I can't stop it, you won't like it, you should..."
3,0,"American Sniper, the Clint Eastwood movie abou..."
4,0,eating them.'' His jaw was clenched so tight t...


In [4]:
en_df = en_df.sample(frac=1, random_state=42).reset_index(drop=True)
en_df.head()

Unnamed: 0,label,text
0,0,ine glasses. He ’ s clearly counting on rule n...
1,1,New at SubtleTV! Close\n\nVideo: Video: Misfit...
2,1,\nSome great books to read are To Kill a Mocki...
3,0,Reflection on Neo-Confucian Discourse Essay\n\...
4,0,How Are Notions of Masculinity Represented in ...


In [5]:
bangla_df = bn_df
english_df = en_df

In [6]:
bangla_df['language'] = 'bn'  # Bangla
english_df['language'] = 'en'  # English

In [7]:
combined_df = pd.concat([bangla_df, english_df], ignore_index=True)
print(f"Combined dataset shape: {combined_df.shape}")

Combined dataset shape: (6746, 3)


In [8]:
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

## Import Packages

In [9]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

2025-08-13 03:48:07.013849: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755056887.400020      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755056887.512009      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
class MultilingualTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
class MultilingualAIDetector(nn.Module):
    def __init__(self, model_name='csebuetnlp/banglabert', num_labels=2, dropout_rate=0.1):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False
        )
        
        # Add additional dropout for better generalization
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs

# Initialize model and tokenizer
MODEL_NAME = 'xlm-roberta-base'  # Best for multilingual tasks
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = MultilingualAIDetector(MODEL_NAME)

print(f"Model loaded: {MODEL_NAME}")
print(f"Vocabulary size: {tokenizer.vocab_size}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: xlm-roberta-base
Vocabulary size: 250002


In [12]:
# Prepare the data
texts = combined_df['text'].tolist()
labels = combined_df['label'].tolist()
languages = combined_df['language'].tolist()

# Stratified split to maintain language and label balance
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, 
    test_size=0.3, 
    random_state=42, 
    stratify=labels
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Test samples: {len(test_texts)}")

# Create datasets
train_dataset = MultilingualTextDataset(train_texts, train_labels, tokenizer)
val_dataset = MultilingualTextDataset(val_texts, val_labels, tokenizer)
test_dataset = MultilingualTextDataset(test_texts, test_labels, tokenizer)

Training samples: 4722
Validation samples: 1012
Test samples: 1012


In [13]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, and F1-score"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted'
    )
    accuracy = accuracy_score(labels, predictions)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


training_args = TrainingArguments(
    output_dir='./AITD_Exp_2',
    num_train_epochs=5,  
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=16,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='epoch',
    # eval_steps=200,
    save_strategy='epoch',
    # save_steps=400,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    report_to='none',  
    save_total_limit=2,
    learning_rate=2e-5,
    gradient_accumulation_steps=2,
)

In [14]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
print("Starting training...")
trainer.train()

# Save the best model
trainer.save_model('./best_multilingual_ai_detector')
tokenizer.save_pretrained('./best_multilingual_ai_detector')

print("Training completed and model saved!")

Starting training...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6349,0.568834,0.715415,0.687374,0.814412,0.715415
2,0.4421,0.514124,0.794466,0.784638,0.84758,0.794466
3,0.3089,0.442149,0.80336,0.79441,0.854995,0.80336
4,0.2696,0.407074,0.838933,0.836628,0.853292,0.838933
5,0.2032,0.472234,0.838933,0.834981,0.866484,0.838933




Training completed and model saved!


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

MultilingualAIDetector(
  (model): XLMRobertaForSequenceClassification(
    (roberta): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRobert

In [16]:
# Evaluate on test set
test_results = trainer.evaluate(test_dataset)
print("Test Results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

def evaluate_by_language(texts, labels, languages, model, tokenizer):
    """Evaluate model performance by language"""
    
    # Get the device the model is on
    device = next(model.parameters()).device
    
    # Create test dataset
    test_dataset = MultilingualTextDataset(texts, labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    
    model.eval()
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            # Move batch tensors to the same device as the model
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_batch = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            predictions = torch.argmax(outputs.logits, dim=1)
            
            # Move back to CPU for numpy operations
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels_batch.cpu().numpy())
    
    # Rest of your function remains the same...
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)
    languages = np.array(languages)
    
    # Evaluate by language
    for lang in ['bn', 'en']:
        lang_mask = languages == lang
        if np.sum(lang_mask) > 0:
            lang_preds = all_predictions[lang_mask]
            lang_labels = all_labels[lang_mask]
            
            accuracy = accuracy_score(lang_labels, lang_preds)
            precision, recall, f1, _ = precision_recall_fscore_support(
                lang_labels, lang_preds, average='weighted'
            )
            
            lang_name = 'Bangla' if lang == 'bn' else 'English'
            print(f"\n{lang_name} Performance:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1-Score: {f1:.4f}")


# Get language info for test set (you'll need to track this)
test_languages = []
for text in test_texts:
    # Simple heuristic: if contains Bangla characters, mark as 'bn'
    if any('\u0980' <= char <= '\u09FF' for char in text):
        test_languages.append('bn')
    else:
        test_languages.append('en')

# Evaluate by language
evaluate_by_language(test_texts, test_labels, test_languages, model, tokenizer)



Test Results:
eval_loss: 0.3898
eval_accuracy: 0.8350
eval_f1: 0.8322
eval_precision: 0.8523
eval_recall: 0.8350
eval_runtime: 20.5249
eval_samples_per_second: 49.3060
eval_steps_per_second: 1.5590
epoch: 5.0000

Bangla Performance:
Accuracy: 0.5874
Precision: 0.6008
Recall: 0.5874
F1-Score: 0.5577

English Performance:
Accuracy: 0.9246
Precision: 0.9329
Recall: 0.9246
F1-Score: 0.9241


In [17]:
def predict_text(text, model, tokenizer, device=None):
    """Predict if text is AI-generated or human-written"""
    
    # Auto-detect device if not provided
    if device is None:
        device = next(model.parameters()).device
    
    model.eval()
    
    # Tokenize input and move to correct device
    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )
    
    # Move all input tensors to the same device as model
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(outputs.logits, dim=1)
    
    confidence = probabilities[0][prediction].item()
    pred_label = prediction.item()
    
    result = {
        'prediction': 'AI-generated' if pred_label == 1 else 'Human-written',
        'confidence': confidence,
        'probabilities': {
            'human': probabilities[0][0].item(),
            'ai': probabilities[0][1].item()
        }
    }
    
    return result

# Example usage
sample_bangla = "আর্টিফিশিয়াল ইন্টেলিজেন্স আমাদের জীবনে নতুন সম্ভাবনার দ্বার উন্মোচন করেছে।"
sample_english = "Artificial intelligence has revolutionized the way we interact with technology."

print("Bangla text prediction:")
result_bn = predict_text(sample_bangla, model, tokenizer)
print(f"Prediction: {result_bn['prediction']}")
print(f"Confidence: {result_bn['confidence']:.4f}")

print("\nEnglish text prediction:")
result_en = predict_text(sample_english, model, tokenizer)
print(f"Prediction: {result_en['prediction']}")
print(f"Confidence: {result_en['confidence']:.4f}")

Bangla text prediction:
Prediction: AI-generated
Confidence: 0.6950

English text prediction:
Prediction: AI-generated
Confidence: 0.9969
