# **Fine-Tuning DistilBERT for Multi-Label Classification**
### Kaggle Multi-Label Classification Dataset
#### Research Paper Title Categorization

In [None]:

!pip install torch transformers datasets scikit-learn
    

In [None]:

import pandas as pd

# Location of the training CSV inside the Kaggle input directory
dataset_path = "/kaggle/input/multilabel-classification-dataset/train.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows (rendered by the notebook as the cell's last expression)
df.head(n=5)
    

In [None]:

# Display column names
print(df.columns)

# Check label distribution.
# The first column is skipped as before. numeric_only=True additionally leaves out
# any free-text columns: summing strings concatenates them instead of counting, and
# the resulting non-numeric values cannot be drawn as bars.
df.iloc[:, 1:].sum(numeric_only=True).plot(kind="bar", title="Category Distribution")
    

In [None]:

# Take a smaller subset for faster training.
# The sample size is capped at the number of available rows because
# DataFrame.sample raises a ValueError when n exceeds the frame length
# (sampling is done without replacement).
subset_size = min(2000, len(df))
df_subset = df.sample(n=subset_size, random_state=42).reset_index(drop=True)
    

In [None]:

from transformers import AutoTokenizer
from datasets import Dataset

# Name of the checkpoint the DistilBERT tokenizer is loaded from
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenize function
def tokenize_function(examples):
    """Tokenize a batch of paper titles with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping from column name to a list of values, i.e. one batch
            as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the title column when called with
        ``padding="max_length"`` and ``truncation=True``.

    Raises:
        KeyError: If the batch has no column named "title" in any letter case.
    """
    # Prefer the exact "title" key. Otherwise fall back to a case-insensitive
    # match so that a CSV header such as "TITLE" or "Title" still works.
    title_column = "title"
    if title_column not in examples:
        title_column = next(
            (name for name in examples if str(name).lower() == "title"),
            title_column,  # keep the original key so the KeyError names "title"
        )
    return tokenizer(examples[title_column], padding="max_length", truncation=True)

# Build the multi-label targets before converting to Hugging Face format.
# With problem_type="multi_label_classification" the model needs a float multi-hot
# vector per example under the "labels" key to compute its loss during training.
# NOTE(review): assumes every numeric column after the first one is a 0/1 category
# indicator (the same columns the distribution plot sums) - confirm against the CSV.
label_columns = df_subset.iloc[:, 1:].select_dtypes(include="number").columns.tolist()
if not label_columns:
    raise ValueError("No numeric category columns found to build the 'labels' column from.")

df_labeled = df_subset.assign(
    labels=df_subset[label_columns].to_numpy(dtype="float32").tolist()
)

# Convert dataset to Hugging Face format
dataset = Dataset.from_pandas(df_labeled)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
    

In [None]:

import torch
from transformers import AutoModelForSequenceClassification

# Settings for the classification head placed on top of pre-trained DistilBERT
num_labels = 6  # Number of categories
head_config = dict(
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

# Load the pre-trained checkpoint with the head configured above
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    **head_config,
)
    

In [None]:

from transformers import TrainingArguments

import dataclasses

# Newer transformers releases call this option `eval_strategy`; older ones only
# define `evaluation_strategy`. Use whichever field the installed version has, so
# this cell works regardless of which version the unpinned install resolved to.
_argument_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _argument_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},  # run evaluation once per epoch
)
    

In [None]:

from transformers import Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Define compute metrics function
def compute_metrics(pred):
    """Compute accuracy and micro-averaged F1 from logits and multi-hot labels.

    Args:
        pred: A pair ``(logits, labels)``, such as the evaluation output the
            ``Trainer`` passes to ``compute_metrics``. Both items are
            array-likes with one row per example and one column per label.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where accuracy comes from
        ``accuracy_score`` and F1 from ``f1_score(average="micro")``.
    """
    logits, labels = pred
    # A label counts as predicted when its sigmoid probability exceeds 0.5.
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    # The Trainer hands over NumPy arrays, which have no `.numpy()` method;
    # np.asarray accepts arrays and lists alike. Cast to int so float
    # multi-hot targets (1.0/0.0) are compared as a 0/1 indicator matrix.
    labels = np.asarray(labels).astype(int)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="micro")
    return {"accuracy": accuracy, "f1": f1}

# Hold out 20% of the examples for evaluation. Evaluating on the very data the
# model is trained on reports memorisation rather than generalisation.
# The fixed seed keeps the split reproducible across runs.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    compute_metrics=compute_metrics
)
    

In [None]:

# Fine-tune the model using the Trainer built in the previous cell
trainer.train()
    

In [None]:

# Evaluate with the Trainer (uses the eval_dataset and compute_metrics it was
# constructed with) and print the returned metrics
results = trainer.evaluate()
print(results)
    

In [None]:

from sklearn.metrics import classification_report

# Predict on the dataset the Trainer evaluates on.
predictions = trainer.predict(trainer.eval_dataset)

# A label counts as predicted when its sigmoid probability exceeds 0.5.
y_pred = (torch.sigmoid(torch.tensor(predictions.predictions)) > 0.5).int().numpy()

# Take the ground truth from the prediction output itself: it is row-aligned with
# `predictions.predictions`, and no "label" column is created on the dataset in
# this notebook, so indexing the dataset by that name would fail.
if predictions.label_ids is None:
    raise ValueError("The evaluated dataset has no labels, so no report can be computed.")
y_true = predictions.label_ids.astype(int)  # float multi-hot -> 0/1 indicator matrix

print(classification_report(y_true, y_pred))
    

In [None]:

# Persist the fine-tuned model first, then its tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")
    

## **Analysis & Future Improvements**
**Findings:**
- DistilBERT was successfully fine-tuned for multi-label classification.
- Binary cross-entropy loss was used (selected by `problem_type="multi_label_classification"`).
- Micro-averaged F1-score was the main evaluation metric.

**Future Improvements:**
- Fine-tune on a larger dataset, e.g. the full training set instead of the 2,000-row subset used here.
- Experiment with different architectures like `bert-base-uncased`.
- Implement cross-validation for a more reliable estimate of generalization performance.

🚀 **Next Steps:** Test on new research paper titles and evaluate real-world performance!