# Naïve Bayes Multi-label Text Classification
This notebook demonstrates a multi-label classification pipeline using the Naïve Bayes algorithm on Quranic verses. The pipeline includes preprocessing, vectorization, model training, and evaluation using standard metrics.

In [None]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier

## Load Dataset

In [None]:
# Read the verse corpus (adjust the path if QuranDS.csv lives elsewhere) and
# discard every row that lacks either the verse text or its manual labels
required_columns = ['Verses', 'ManualLabels']
df = pd.read_csv("QuranDS.csv").dropna(subset=required_columns)
df.head()

## Preprocess Text and Labels

In [None]:
# Define input features and multi-label targets
X = df['Verses']

# get_dummies splits on the literal ',' only, so a value such as "a, b" would
# produce a column named " b" that is distinct from "b". Trim the outer
# whitespace and collapse whitespace around each separator before encoding so
# every label maps to exactly one indicator column.
labels = (
    df['ManualLabels']
    .str.strip()
    .str.replace(r'\s*,\s*', ',', regex=True)
)
y = labels.str.get_dummies(sep=',')

# Display label distribution (number of verses per label, most frequent first)
y.sum().sort_values(ascending=False)

## Vectorization using TF-IDF

In [None]:
# Learn the TF-IDF vocabulary and weights from the verses, then encode the
# same verses as a sparse document-term matrix
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X_vec = vectorizer.transform(X)
X_vec.shape

## Split Dataset into Train and Test Sets

In [None]:
# Perform 80/20 train-test split on the raw verses, then vectorize.
# The TF-IDF vocabulary and IDF weights are learned from the training verses
# only: fitting them on the full corpus lets test-set statistics leak into the
# features and makes the reported metrics optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Rebind `vectorizer` to the train-only fit so later cells that reuse it
# encode new text consistently with the model's features.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
# Terms that never occur in the training verses are ignored here.
X_test = vectorizer.transform(X_test_text)

## Train Naïve Bayes Model

In [None]:
# Train a multinomial Naïve Bayes classifier wrapped for multi-label output
# on the training split
model = MultiOutputClassifier(
    estimator=MultinomialNB(),
)
model.fit(X_train, y_train)

## Evaluate Model Performance

In [None]:
# Generate label predictions for the held-out verses
y_pred = model.predict(X_test)

# Report the Hamming loss first, then the micro-averaged scores in a fixed order
print("Hamming Loss:", hamming_loss(y_test, y_pred))
micro_metrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for heading, metric_fn in micro_metrics:
    print(heading, metric_fn(y_test, y_pred, average='micro'))