In [24]:
import pandas as pd
import pdfplumber
import torch
import torch.nn as nn
from datasets import (
    Dataset,
    DatasetDict,
)
from sentence_transformers import SentenceTransformer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
)
from torch.utils.data import DataLoader

# 1. Understanding the Problem
- Task: Predict whether a given resume and job description pair is a good fit (binary classification).
- Approach: Fine-tune a pre-trained SBERT model for binary classification instead of semantic similarity regression.
- Why SBERT? SBERT provides powerful sentence embeddings that can be adapted for classification tasks.

# 2. Preparing the Dataset for Binary Classification

In [25]:
def extract_text_from_pdf(file_path):
    """Extracts text from PDF file."""
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

In [26]:
df = pd.read_csv('dataset/job_descriptions.csv')
df['resume'] = extract_text_from_pdf('dataset/resume.pdf')
df['label'] = df['label'].astype(int)
df.dropna(subset=['resume', 'job_description', 'label'], inplace=True)
df.head()

Unnamed: 0,job_description_id,job_description,label,resume
0,4053455340,"Staff Software Engineer - Distributed Systems,...",0,Panos Lin Guohui\n+18042477019 | lghpanos@gmai...
1,4056940532,Machine Learning Engineer - Model Training Inf...,0,Panos Lin Guohui\n+18042477019 | lghpanos@gmai...
2,4049587892,Full Stack Engineer\nBE AT THE FOREFRONT OF AI...,1,Panos Lin Guohui\n+18042477019 | lghpanos@gmai...
3,4046300409,"Software Engineer, Realtime Compute\nTecton he...",0,Panos Lin Guohui\n+18042477019 | lghpanos@gmai...
4,4056216398,Research Engineer - New Grad\nAbout Applied In...,0,Panos Lin Guohui\n+18042477019 | lghpanos@gmai...


In [27]:
dataset = Dataset.from_pandas(df)
# Split into train and temp (which will be split into validation and test)
train_testvalid = dataset.train_test_split(test_size=0.2, seed=42)

# Split the temp set into validation and test
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
dataset_splits = DatasetDict({
    'train':      train_testvalid['train'],
    'validation': test_valid['train'],
    'test':       test_valid['test']
})
print(f"Train size: {len(dataset_splits['train'])}")
print(f"Validation size: {len(dataset_splits['validation'])}")
print(f"Test size: {len(dataset_splits['test'])}")

Train size: 447
Validation size: 56
Test size: 56


In [28]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device")
else:
    device = torch.device("cpu")
    print("Using CPU device")


Using MPS device


In [ ]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
base_model = SentenceTransformer(model_name)

In [29]:
class SBERTClassifier(nn.Module):
    def __init__(self, base_model, num_classes=2):
        super(SBERTClassifier, self).__init__()
        self.base_model = base_model
        self.num_classes = num_classes
        self.classifier = nn.Linear(base_model.get_sentence_embedding_dimension(), num_classes)

    def forward(self, input_pairs):
        # Unpack the input pairs
        resumes, job_descriptions = input_pairs

        # Tokenize the inputs
        encoded_inputs = self.base_model.tokenize(resumes + job_descriptions)

        # Move inputs to the device
        encoded_inputs = {key: val.to(device) for key, val in encoded_inputs.items()}

        # Get embeddings
        model_output = self.base_model(encoded_inputs)
        embeddings = model_output['sentence_embedding']

        # Split embeddings back into resumes and job descriptions
        resume_embeddings = embeddings[:len(resumes)]
        job_description_embeddings = embeddings[len(resumes):]

        # Compute the absolute difference between embeddings
        features = torch.abs(resume_embeddings - job_description_embeddings)

        # Pass through classifier
        logits = self.classifier(features)
        return logits

In [30]:
model = SBERTClassifier(base_model)
model.to(device)

SBERTClassifier(
  (base_model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  )
  (classifier): Linear(in_features=384, out_features=2, bias=True)
)

In [31]:
class ResumeJobDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        resume = item['resume']
        job_description = item['job_description']
        label = item['label']
        return (resume, job_description), label

In [32]:
train_dataset = ResumeJobDataset(dataset_splits['train'])
validation_dataset = ResumeJobDataset(dataset_splits['validation'])
test_dataset = ResumeJobDataset(dataset_splits['test'])

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

In [33]:
criterion = nn.CrossEntropyLoss()

In [34]:
# optimizer = AdamW(model.parameters(), lr=2e-5)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [35]:
def train(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_pairs, labels = batch
        resumes, job_descriptions = input_pairs
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model((resumes, job_descriptions))
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [36]:
def evaluate(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_pairs, labels = batch
            resumes, job_descriptions = input_pairs
            labels = labels.to(device)

            logits = model((resumes, job_descriptions))
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    return accuracy, f1

In [38]:
num_epochs = 10
best_validation_f1 = 0

In [40]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_dataloader, optimizer, criterion)
    val_accuracy, val_f1 = evaluate(model, validation_dataloader)
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}, Validation F1 Score: {val_f1:.4f}")

    # Save the model if it has the best F1 score so far
    if val_f1 > best_validation_f1:
        best_validation_f1 = val_f1
        torch.save(model.state_dict(), 'best_model_2024_11_10.pt')

Epoch 1/10


Exception ignored in: <function Dataset.__del__ at 0x11746a700>
Traceback (most recent call last):
  File "/Users/linguohui/ResumeJDRelevancy/ResumeJDRelevancy/.venv/lib/python3.12/site-packages/datasets/arrow_dataset.py", line 1395, in __del__
KeyboardInterrupt: 


KeyboardInterrupt: 

In [26]:
model.load_state_dict(torch.load('best_model.pt', weights_only=False))

<All keys matched successfully>

In [27]:
test_accuracy, test_f1 = evaluate(model, test_dataloader)
print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}")

RuntimeError: Tensor for argument weight is on cpu but expected on mps