# BERT for Multi-Label Classification
This notebook fine-tunes a baseline BERT model (`bert-base-uncased`) for multi-label classification of Quranic verses, where each verse may carry several labels at once.

In [None]:
# Install and import required libraries
!pip install transformers datasets scikit-learn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, hamming_loss
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [None]:
# Load and preprocess dataset
def _split_labels(raw_labels):
    """Break a whitespace-separated label string into a list of label names."""
    return raw_labels.split()


# Keep only rows that have both a verse and a label string.
df = pd.read_csv("QuranDS.csv").dropna(subset=['Verses', 'CommonLabel'])
df['CommonLabel'] = df['CommonLabel'].apply(_split_labels)

# Binarize labels: each row of `labels` is the multi-hot encoding of one verse,
# with columns ordered as in `mlb.classes_`.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['CommonLabel'])

# Train-test split: 20% held out, fixed seed so the split is reproducible.
verses = df['Verses'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    verses,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define Dataset class
# The tokenizer is created once at module level; QuranDataset below reads it
# as a global when it encodes its texts.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
)

class QuranDataset(Dataset):
    """Torch dataset pairing tokenized texts with their label vectors.

    Every text is tokenized once, at construction time, using the
    module-level ``tokenizer``. Samples are turned into tensors lazily,
    one at a time, when they are indexed.
    """

    def __init__(self, texts, labels):
        """Encode ``texts`` up front and keep ``labels`` for indexed access.

        Args:
            texts: Texts handed to the tokenizer in a single call
                (``truncation=True``, ``padding=True``, ``max_length=128``).
            labels: Indexable container with one label vector per text.
        """
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=128,
        )
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per tokenizer output field plus a
        ``'labels'`` entry; labels are cast to float.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each split in a QuranDataset; the training split is built first,
# then the held-out split.
train_dataset, test_dataset = (
    QuranDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

In [None]:
# Load model and prepare Trainer
import inspect

# One output unit per label found by the binarizer. With
# problem_type="multi_label_classification" the model scores each label
# independently instead of picking a single class.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(mlb.classes_), problem_type="multi_label_classification"
)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`,
# and recent ones reject the old keyword. The install above is unpinned, so
# pick whichever name the installed version actually accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",  # no checkpoints are written during training
    **{_eval_strategy_arg: "epoch"},  # evaluate once per epoch
)

# NOTE: the held-out test split doubles as the per-epoch evaluation set here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train and evaluate
trainer.train()

preds = trainer.predict(test_dataset)

# `preds.predictions` holds the classification head's raw logits, not
# probabilities. Squash them through a sigmoid first so that the 0.5 cut-off
# below really is a probability threshold (thresholding logits at 0.5 would
# correspond to a probability of about 0.62).
probabilities = torch.sigmoid(torch.as_tensor(preds.predictions)).numpy()
predictions = np.where(probabilities > 0.5, 1, 0)

# Both metrics compare the multi-hot ground truth with the thresholded output.
print("Hamming Loss:", hamming_loss(test_labels, predictions))
print(classification_report(test_labels, predictions, target_names=mlb.classes_))