In [3]:
DATA_INPUT = 'resume_screen.csv'

In [4]:
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [5]:
df = pd.read_csv(DATA_INPUT)

In [6]:
df.describe()

Unnamed: 0,advance
count,2000.0
mean,0.5
std,0.500125
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [7]:
df

Unnamed: 0,id,resume_text_256,jd_text_128,job_family,seniority,advance
0,RS00000,"8+ years experience; key skills: linux, contai...",We are hiring a Senior DevOps professional. Mu...,DevOps,Senior,1
1,RS00001,"0+ years experience; key skills: kafka, ci/cd,...",We are hiring a Junior PM professional. Must h...,PM,Junior,0
2,RS00002,"0+ years experience; key skills: bug-tracking,...",We are hiring a Junior QA professional. Must h...,QA,Junior,1
3,RS00003,"2+ years experience; key skills: test-cases, b...",We are hiring a Mid QA professional. Must have...,QA,Mid,1
4,RS00004,"9+ years experience; key skills: testing, bug-...",We are hiring a Senior QA professional. Must h...,QA,Senior,1
...,...,...,...,...,...,...
1995,RS01995,"1+ years experience; key skills: java, databas...",We are hiring a Junior Backend professional. M...,Backend,Junior,1
1996,RS01996,"6+ years experience; key skills: databases, ja...",We are hiring a Senior Backend professional. M...,Backend,Senior,1
1997,RS01997,"4+ years experience; key skills: javascript, h...",We are hiring a Mid Frontend professional. Mus...,Frontend,Mid,1
1998,RS01998,"8+ years experience; key skills: metrics, data...",We are hiring a Senior Backend professional. M...,Backend,Senior,0


In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Lambda
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder
from sentence_transformers import SentenceTransformer

# ----------------------------
# 1. Load BERT model
# ----------------------------
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_dim = bert_model.get_sentence_embedding_dimension()

# ----------------------------
# 2. Extract data from df
# ----------------------------
resume_texts = df['resume_text_256'].tolist()
jd_texts = df['jd_text_128'].tolist()
job_family = df['job_family'].tolist()
seniority = df['seniority'].tolist()
advance = df['advance'].values

# ----------------------------
# 3. Compute BERT embeddings
# ----------------------------
resume_embeddings = bert_model.encode(resume_texts, batch_size=32, show_progress_bar=True)
jd_embeddings = bert_model.encode(jd_texts, batch_size=32, show_progress_bar=True)

# ----------------------------
# 4. One-hot encode categorical features
# ----------------------------
ohe_job = OneHotEncoder(sparse_output=False)
ohe_seniority = OneHotEncoder(sparse_output=False)

job_family_encoded = ohe_job.fit_transform(np.array(job_family).reshape(-1,1))
seniority_encoded = ohe_seniority.fit_transform(np.array(seniority).reshape(-1,1))


# ----------------------------
# 5. Cosine similarity function
# ----------------------------
def cosine_similarity(a, b):
    a_norm = tf.linalg.l2_normalize(a, axis=1)
    b_norm = tf.linalg.l2_normalize(b, axis=1)
    return tf.reduce_sum(a_norm * b_norm, axis=1, keepdims=True)

# ----------------------------
# 6. Build Keras model
# ----------------------------
resume_input = Input(shape=(embedding_dim,), name='resume_input')
jd_input = Input(shape=(embedding_dim,), name='jd_input')
job_family_input = Input(shape=(job_family_encoded.shape[1],), name='job_family_input')
seniority_input = Input(shape=(seniority_encoded.shape[1],), name='seniority_input')

# Cosine similarity
cos_sim = Lambda(lambda x: cosine_similarity(x[0], x[1]))([resume_input, jd_input])

# Concatenate all features
x = Concatenate()([resume_input, jd_input, job_family_input, seniority_input, cos_sim])

# Dense layers
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)

# Output layer
output = Dense(1, activation='sigmoid')(x)

model = Model(
    inputs=[resume_input, jd_input, job_family_input, seniority_input],
    outputs=output
)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# ----------------------------
# 7. Train model
# ----------------------------
model.fit(
    [resume_embeddings, jd_embeddings, job_family_encoded, seniority_encoded],
    advance,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

I0000 00:00:1758166394.340706   50603 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1655 MB memory:  -> device: 0, name: AMD Radeon RX 7900 XTX, pci bus id: 0000:03:00.0


Epoch 1/10


I0000 00:00:1758166397.316219   55547 service.cc:148] XLA service 0x7f101c003a70 initialized for platform ROCM (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1758166397.316258   55547 service.cc:156]   StreamExecutor device (0): AMD Radeon RX 7900 XTX, AMDGPU ISA version: gfx1100
2025-09-18 04:33:17.334579: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m24/50[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.5230 - loss: 0.6927

I0000 00:00:1758166401.813909   55547 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 49ms/step - accuracy: 0.5156 - loss: 0.6920 - val_accuracy: 0.5375 - val_loss: 0.6900
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5444 - loss: 0.6894 - val_accuracy: 0.4950 - val_loss: 0.6902
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5681 - loss: 0.6853 - val_accuracy: 0.4875 - val_loss: 0.6884
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5813 - loss: 0.6806 - val_accuracy: 0.5425 - val_loss: 0.6848
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5906 - loss: 0.6771 - val_accuracy: 0.5425 - val_loss: 0.6835
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m-2s[0m -38680us/step - accuracy: 0.6000 - loss: 0.6723 - val_accuracy: 0.5550 - val_loss: 0.6782
Epoch 7/10
[1m50/50[0m [32m━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f116c07e690>

In [10]:
# Evaluate on the same data (or better: separate test set)
loss, accuracy = model.evaluate(
    [resume_embeddings, jd_embeddings, job_family_encoded, seniority_encoded],
    advance,
    batch_size=32
)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6805 - loss: 0.6087
Test Loss: 0.6087, Test Accuracy: 0.6805


In [11]:
# Predict probabilities
pred_probs = model.predict(
    [resume_embeddings, jd_embeddings, job_family_encoded, seniority_encoded],
    batch_size=32
)

# Convert probabilities to binary predictions
pred_classes = (pred_probs >= 0.5).astype(int)

# Example
for i in range(5):
    print(f"Predicted: {pred_classes[i][0]}, Actual: {advance[i]}")


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Predicted: 0, Actual: 1
Predicted: 0, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1


In [20]:
# ============================================================
# Optimized PyTorch + SentenceTransformer Pipeline
# ============================================================

import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sentence_transformers import SentenceTransformer

# ----------------------------
# 1️⃣ Load dataset
# ----------------------------
df = pd.read_csv(DATA_INPUT)

# ----------------------------
# 2️⃣ Feature engineering
# ----------------------------
def extract_years(resume_text):
    match = re.search(r'(\d+)\+?\s*years', resume_text)
    return int(match.group(1)) if match else 0

df['years_experience'] = df['resume_text_256'].apply(extract_years)

skills_list = ['Python','AWS','DevOps','QA','Frontend','Backend','PM']

def skill_overlap(resume, jd):
    resume_skills = set(s.lower() for s in skills_list if s.lower() in resume.lower())
    jd_skills = set(s.lower() for s in skills_list if s.lower() in jd.lower())
    return len(resume_skills & jd_skills) / (len(jd_skills) + 1e-6)

df['skill_overlap'] = df.apply(lambda row: skill_overlap(row['resume_text_256'], row['jd_text_128']), axis=1)

# ----------------------------
# 3️⃣ Encode categorical features
# ----------------------------
ohe_job = OneHotEncoder(sparse_output=False)
ohe_seniority = OneHotEncoder(sparse_output=False)

job_family_encoded = ohe_job.fit_transform(df[['job_family']])
seniority_encoded = ohe_seniority.fit_transform(df[['seniority']])

numeric_features = np.stack([df['years_experience'], df['skill_overlap']], axis=1)

# ----------------------------
# 4️⃣ Sentence embeddings
# ----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

resume_embeddings = sbert_model.encode(list(df['resume_text_256']), batch_size=32, show_progress_bar=True, convert_to_numpy=True)
jd_embeddings = sbert_model.encode(list(df['jd_text_128']), batch_size=32, show_progress_bar=True, convert_to_numpy=True)

# ----------------------------
# 5️⃣ Dataset class
# ----------------------------
class ResumeDataset(Dataset):
    def __init__(self, resumes, jds, job_family, seniority, numeric_feats, labels):
        self.resumes = torch.tensor(resumes, dtype=torch.float32)
        self.jds = torch.tensor(jds, dtype=torch.float32)
        self.job_family = torch.tensor(job_family, dtype=torch.float32)
        self.seniority = torch.tensor(seniority, dtype=torch.float32)
        self.numeric_feats = torch.tensor(numeric_feats, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.resumes[idx], self.jds[idx], self.job_family[idx],
                self.seniority[idx], self.numeric_feats[idx], self.labels[idx])

# ----------------------------
# 6️⃣ Train/Validation split
# ----------------------------
X_train_res, X_val_res, X_train_jd, X_val_jd, X_train_job, X_val_job, \
X_train_senior, X_val_senior, X_train_num, X_val_num, y_train, y_val = train_test_split(
    resume_embeddings, jd_embeddings, job_family_encoded, seniority_encoded, numeric_features,
    df['advance'].values, test_size=0.2, random_state=42
)

train_dataset = ResumeDataset(X_train_res, X_train_jd, X_train_job, X_train_senior, X_train_num, y_train)
val_dataset = ResumeDataset(X_val_res, X_val_jd, X_val_job, X_val_senior, X_val_num, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# ----------------------------
# 7️⃣ Neural network
# ----------------------------
class AdvancePredictor(nn.Module):
    def __init__(self, resume_dim, jd_dim, job_dim, seniority_dim, numeric_dim):
        super().__init__()
        input_dim = resume_dim + jd_dim + job_dim + seniority_dim + numeric_dim + 1  # cosine similarity
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.3)
        self.out = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, resume, jd, job, seniority, numeric):
        cos_sim = torch.sum(resume*jd, dim=1, keepdim=True) / (
            torch.norm(resume, dim=1, keepdim=True) * torch.norm(jd, dim=1, keepdim=True) + 1e-6)
        x = torch.cat([resume, jd, job, seniority, numeric, cos_sim], dim=1)
        x = self.relu(self.fc1(x))
        x = self.dropout1(x)
        x = self.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.sigmoid(self.out(x))
        return x

model = AdvancePredictor(
    resume_dim=resume_embeddings.shape[1],
    jd_dim=jd_embeddings.shape[1],
    job_dim=job_family_encoded.shape[1],
    seniority_dim=seniority_encoded.shape[1],
    numeric_dim=numeric_features.shape[1]
).to(device)

# ----------------------------
# 8️⃣ Training setup
# ----------------------------
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5, verbose=True)
num_epochs = 100
early_stop_patience = 10
best_val_loss = np.inf
epochs_no_improve = 0

# ----------------------------
# 9️⃣ Training loop with early stopping
# ----------------------------
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for resumes, jds, jobs, seniors, numerics, labels in train_loader:
        resumes, jds, jobs, seniors, numerics, labels = resumes.to(device), jds.to(device), \
                                                        jobs.to(device), seniors.to(device), \
                                                        numerics.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(resumes, jds, jobs, seniors, numerics)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * labels.size(0)

    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for resumes, jds, jobs, seniors, numerics, labels in val_loader:
            resumes, jds, jobs, seniors, numerics, labels = resumes.to(device), jds.to(device), \
                                                            jobs.to(device), seniors.to(device), \
                                                            numerics.to(device), labels.to(device)
            outputs = model(resumes, jds, jobs, seniors, numerics)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            val_preds.append(outputs.cpu())
            val_labels.append(labels.cpu())

    val_loss /= len(val_loader.dataset)
    scheduler.step(val_loss)

    val_preds = torch.cat(val_preds)
    val_labels = torch.cat(val_labels)
    val_pred_classes = (val_preds >= 0.5).int()
    val_accuracy = (val_pred_classes == val_labels.int()).float().mean()

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "advance_predictor_best.pt")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= early_stop_patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

# ----------------------------
# 10️⃣ Evaluation
# ----------------------------
print("\nValidation Results:")
print(confusion_matrix(val_labels, val_pred_classes))
print(classification_report(val_labels, val_pred_classes))

Using device: cuda


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]



Epoch 1/100 | Train Loss: 0.6926 | Val Loss: 0.6921 | Val Acc: 0.4975
Epoch 2/100 | Train Loss: 0.6907 | Val Loss: 0.6887 | Val Acc: 0.5450
Epoch 3/100 | Train Loss: 0.6869 | Val Loss: 0.6856 | Val Acc: 0.5675
Epoch 4/100 | Train Loss: 0.6780 | Val Loss: 0.6734 | Val Acc: 0.6450
Epoch 5/100 | Train Loss: 0.6694 | Val Loss: 0.6596 | Val Acc: 0.6500
Epoch 6/100 | Train Loss: 0.6548 | Val Loss: 0.6420 | Val Acc: 0.6625
Epoch 7/100 | Train Loss: 0.6367 | Val Loss: 0.6196 | Val Acc: 0.6850
Epoch 8/100 | Train Loss: 0.6184 | Val Loss: 0.5992 | Val Acc: 0.7075
Epoch 9/100 | Train Loss: 0.5948 | Val Loss: 0.5956 | Val Acc: 0.6975
Epoch 10/100 | Train Loss: 0.5816 | Val Loss: 0.5555 | Val Acc: 0.7400
Epoch 11/100 | Train Loss: 0.5634 | Val Loss: 0.5320 | Val Acc: 0.7625
Epoch 12/100 | Train Loss: 0.5368 | Val Loss: 0.5174 | Val Acc: 0.7725
Epoch 13/100 | Train Loss: 0.5254 | Val Loss: 0.5075 | Val Acc: 0.7600
Epoch 14/100 | Train Loss: 0.5058 | Val Loss: 0.5241 | Val Acc: 0.7350
Epoch 15/100 | 