In [6]:
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, average_precision_score
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset

# --- Load Data ---
X_train = pd.read_csv("../data/processed/train.csv")
y_train = pd.read_csv("../data/processed/train_labels.csv").squeeze()
X_test = pd.read_csv("../data/processed/test.csv")
y_test = pd.read_csv("../data/processed/test_labels.csv").squeeze()

# --- Feature Setup ---
text_cols = ['candidate_skills', 'past_job_titles', 'certifications', 'required_skills', 'job_description']
cat_cols = ['education_level', 'candidate_location', 'job_location', 'job_title']
num_cols = [col for col in X_train.columns if col not in text_cols + cat_cols]

# --- Prompt Engineering Function ---
def create_prompt(row):
    parts = []
    for col in cat_cols + num_cols + text_cols:
        val = str(row[col]) if pd.notna(row[col]) else ""
        parts.append(f"{col.replace('_', ' ').title()}: {val}")
    return " | ".join(parts)

X_train['text'] = X_train.apply(create_prompt, axis=1)
X_test['text'] = X_test.apply(create_prompt, axis=1)

# --- Format for Hugging Face Dataset ---
train_df = pd.DataFrame({'text': X_train['text'], 'label': y_train})
test_df = pd.DataFrame({'text': X_test['text'], 'label': y_test})

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# --- Tokenizer ---
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# --- Compute Class Weights ---
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# --- Custom Loss Function ---
import torch.nn as nn

class WeightedDistilBERT(nn.Module):
    def __init__(self, base_model, class_weights):
        super(WeightedDistilBERT, self).__init__()
        self.model = base_model
        self.loss_fct = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=None)
        logits = outputs.logits
        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels)
        return {"loss": loss, "logits": logits}

# --- Load Base Model and Wrap ---
base_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model = WeightedDistilBERT(base_model, class_weights_tensor)

# --- Training Arguments (v4.5.2 compatible) ---
training_args = TrainingArguments(
    output_dir="./distilbert_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    disable_tqdm=False
)

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds
)

# --- Train ---
trainer.train()

# --- Predict ---
preds = trainer.predict(test_ds)
y_pred = np.argmax(preds.predictions, axis=1)

# --- Evaluate ---
print("\nDistilBERT Classification Report:\n")
print(classification_report(y_test, y_pred))
print("PR AUC (Average Precision):", average_precision_score(y_test, preds.predictions[:, 1]))


Map: 100%|██████████| 4000/4000 [00:00<00:00, 9743.15 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 10792.82 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.7344
100,0.863
150,0.8324
200,0.9609
250,0.916
300,0.9083
350,0.7471
400,0.9589
450,0.6803
500,0.7651





DistilBERT Classification Report:

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       961
           1       0.00      0.00      0.00        39

    accuracy                           0.96      1000
   macro avg       0.48      0.50      0.49      1000
weighted avg       0.92      0.96      0.94      1000

PR AUC (Average Precision): 0.1014730664893092


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



#Conclusion
Although the overall accuracy is high (96%), this is misleading due to severe class imbalance.

The model completely fails to identify the minority class (label 1) — with 0 precision and 0 recall, meaning not a single positive case was predicted.

This results in a macro average F1-score of just 0.49, indicating poor generalization across both classes.

Despite prompt engineering and class weighting, the model defaults to majority class predictions.

This version is not suitable for deployment in scenarios where correctly detecting the minority class (e.g. qualified candidates, fraud, defects) is critical.
