In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
from torch import nn, optim
from tqdm import tqdm


In [13]:
# Load datasets
resumes = pd.read_csv('data\Resume Dataset\Resume\Resume_With_Skills_And_Roles.csv')
jobs = pd.read_csv('data/Linkedin Job Postings (2023-2024)/cleaned_JD_with_skills.csv')

# print column names
print(resumes.columns)
print(jobs.columns)




Index(['Resume_str', 'Category', 'skills', 'hard_skills', 'roles'], dtype='object')
Index(['roles', 'combined_skills_desc', 'skills', 'hard_skills'], dtype='object')


In [14]:
# Combine text fields into a single column for TF-IDF processing
resumes["text_features"] = resumes["Resume_str"]
jobs["text_features"] = jobs["combined_skills_desc"]

# Ensure skills and hard skills are in list format
resumes["skills_list"] = resumes["skills"].str.split(", ")
jobs["skills_list"] = jobs["skills"].str.split(", ")

resumes["hard_skills_list"] = resumes["hard_skills"].str.split(", ")
jobs["hard_skills_list"] = jobs["hard_skills"].str.split(", ")

# Get job roles
resumes["job_roles_list"] = resumes["roles"].str.split(", ")
jobs["job_roles_list"] = jobs["roles"].str.split(", ")

# Show the first 5 rows of the dataframe
# print(resumes.head())
# print(jobs.head())

# Remove all unnecessary columns
resumes = resumes[["text_features", "skills_list", "hard_skills_list", "job_roles_list"]]
jobs = jobs[["text_features", "skills_list", "hard_skills_list", "job_roles_list"]]

In [15]:
# Get size of datasets
print("Resumes shape: ", resumes.shape)
print("Jobs shape: ", jobs.shape)

# Remove nan values
resumes = resumes.dropna()
jobs = jobs.dropna()

# Get size of datasets
print("Resumes shape: ", resumes.shape)
print("Jobs shape: ", jobs.shape)


Resumes shape:  (2483, 4)
Jobs shape:  (2439, 4)
Resumes shape:  (2460, 4)
Jobs shape:  (2439, 4)


In [16]:
# Show column names
print(resumes.columns)
print(jobs.columns)

Index(['text_features', 'skills_list', 'hard_skills_list', 'job_roles_list'], dtype='object')
Index(['text_features', 'skills_list', 'hard_skills_list', 'job_roles_list'], dtype='object')


In [17]:
from transformers import AutoTokenizer

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [18]:
# Tokenize resumes
resume_text_features = resumes["text_features"].tolist()
tokenized_resumes = tokenizer(
    resume_text_features,
    padding=True,
    truncation=True,
    max_length=128,  # Adjust max_length as needed
    return_tensors="pt"
)

# Tokenize job descriptions
jd_text_features = jobs["text_features"].tolist()
tokenized_jds = tokenizer(
    jd_text_features,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Print sizes of tokenized resumes and job descriptions
print("Tokenized resumes shape: ", tokenized_resumes["input_ids"].shape)
print("Tokenized job descriptions shape: ", tokenized_jds["input_ids"].shape)


Tokenized resumes shape:  torch.Size([2460, 128])
Tokenized job descriptions shape:  torch.Size([2439, 128])


In [19]:
tokenized_resumes_tensor = tokenized_resumes['input_ids']
tokenized_jds_tensor = tokenized_jds['input_ids']

print(f"Tokenized Resumes shape: {tokenized_resumes_tensor.shape}")  # Expect torch.Size([2460, 128])
print(f"Tokenized Jobs shape: {tokenized_jds_tensor.shape}")   

Tokenized Resumes shape: torch.Size([2460, 128])
Tokenized Jobs shape: torch.Size([2439, 128])


In [20]:
resumes["combined_skills"] = resumes["skills_list"].apply(lambda x: " ".join(x)) + " " + resumes["hard_skills_list"].apply(lambda x: " ".join(x))
jobs["combined_skills"] = jobs["skills_list"].apply(lambda x: " ".join(x)) + " " + jobs["hard_skills_list"].apply(lambda x: " ".join(x))


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=100)
resume_skills_tfidf_sparse = tfidf.fit_transform(resumes["combined_skills"])
resume_skills_tfidf_dense = resume_skills_tfidf_sparse.toarray()  # Converts sparse to dense

jd_skills_tfidf_sparse = tfidf.transform(jobs["combined_skills"])
jd_skills_tfidf_dense = jd_skills_tfidf_sparse.toarray()

In [24]:
from sentence_transformers import SentenceTransformer

# Initialize the model
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and efficient model

# Convert job roles list to sentences
resumes["job_roles_str"] = resumes["job_roles_list"].apply(lambda roles: " ".join(roles))
jobs["job_roles_str"] = jobs["job_roles_list"].apply(lambda roles: " ".join(roles))

# Generate embeddings for job roles
resume_roles_emb = embedder.encode(resumes["job_roles_str"].tolist(), convert_to_tensor=True)
jd_roles_emb = embedder.encode(jobs["job_roles_str"].tolist(), convert_to_tensor=True)

print(f"Resume roles embeddings shape: {resume_roles_emb.shape}")
print(f"Job roles embeddings shape: {jd_roles_emb.shape}")





Resume roles embeddings shape: torch.Size([2460, 384])
Job roles embeddings shape: torch.Size([2439, 384])


In [32]:
resume_skills_tfidf = torch.tensor(resume_skills_tfidf_dense, dtype=torch.float32)
jd_skills_tfidf = torch.tensor(jd_skills_tfidf_dense, dtype=torch.float32)

## Combine Features

In [35]:
import torch

# Convert TF-IDF features to tensors
resume_skills_tfidf_tensor = torch.tensor(resume_skills_tfidf, dtype=torch.float32)
job_skills_tfidf_tensor = torch.tensor(jd_skills_tfidf, dtype=torch.float32)

# Concatenate tokenized text, TF-IDF features, and role embeddings
resume_combined_features = torch.cat([tokenized_resumes_tensor, resume_skills_tfidf_tensor, resume_roles_emb], dim=1)
jd_combined_features = torch.cat([tokenized_jds_tensor, job_skills_tfidf_tensor, jd_roles_emb], dim=1)

print(f"Combined Resume Features shape: {resume_combined_features.shape}")  # Expect (2460, 612)
print(f"Combined JD Features shape: {jd_combined_features.shape}")          # Expect (2439, 612)         # Expected: (2439, 128 + 100 + 384)


Combined Resume Features shape: torch.Size([2460, 612])
Combined JD Features shape: torch.Size([2439, 612])


  resume_skills_tfidf_tensor = torch.tensor(resume_skills_tfidf, dtype=torch.float32)
  job_skills_tfidf_tensor = torch.tensor(jd_skills_tfidf, dtype=torch.float32)


# Build two tower models and combine them with a simple MLP.

In [38]:
import torch
from torch.utils.data import Dataset

class ResumeJDDataset(Dataset):
    def __init__(self, resumes, jobs, num_negatives=1):
        self.resumes = resumes
        self.jobs = jobs
        self.num_negatives = num_negatives  # Number of negative samples per positive pair

    def __len__(self):
        return len(self.resumes)

    def __getitem__(self, idx):
        # Positive pair
        resume = self.resumes[idx]
        jd_positive = self.jobs[idx % len(self.jobs)]  # Matching positive JD
        positive_label = 1

        # Negative pairs
        negative_indices = torch.randint(0, len(self.jobs), (self.num_negatives,))
        jd_negatives = [self.jobs[i] for i in negative_indices]
        negative_labels = [0] * self.num_negatives

        # Combine positive and negative pairs
        resume_batch = [resume] * (1 + self.num_negatives)  # Repeat the resume for all pairs
        jd_batch = [jd_positive] + jd_negatives
        labels = [positive_label] + negative_labels

        return {
            'resume_features': torch.stack(resume_batch),
            'jd_features': torch.stack(jd_batch),
            'label': torch.tensor(labels, dtype=torch.float32)
        }


In [39]:
from torch.utils.data import DataLoader

# Initialize the dataset
num_negatives = 2  # Number of negative samples per positive pair
train_dataset = ResumeJDDataset(
    resumes=resume_combined_features,
    jobs=jd_combined_features,
    num_negatives=num_negatives
)

# Initialize the DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Verify the output of the DataLoader
for batch in train_loader:
    print(f"Resume Features Batch Shape: {batch['resume_features'].shape}")  # Expect (32, 1+num_negatives, 612)
    print(f"Job Features Batch Shape: {batch['jd_features'].shape}")         # Expect (32, 1+num_negatives, 612)
    print(f"Labels Batch Shape: {batch['label'].shape}")                    # Expect (32, 1+num_negatives)
    break


Resume Features Batch Shape: torch.Size([32, 3, 612])
Job Features Batch Shape: torch.Size([32, 3, 612])
Labels Batch Shape: torch.Size([32, 3])


In [36]:
import torch
import torch.nn as nn

class TwoTowerModel(nn.Module):
    def __init__(self, input_dim=612, projection_dim=256):
        super(TwoTowerModel, self).__init__()
        
        # Resume Tower
        self.resume_tower = nn.Sequential(
            nn.Linear(input_dim, projection_dim),
            nn.ReLU(),
            nn.Linear(projection_dim, projection_dim)
        )

        # Job Description Tower
        self.jd_tower = nn.Sequential(
            nn.Linear(input_dim, projection_dim),
            nn.ReLU(),
            nn.Linear(projection_dim, projection_dim)
        )

    def forward(self, resume_features, jd_features):
        # Pass features through respective towers
        resume_proj = self.resume_tower(resume_features)
        jd_proj = self.jd_tower(jd_features)
        
        return resume_proj, jd_proj


In [42]:
# Initialize model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TwoTowerModel(input_dim=612, projection_dim=256).to(device)
criterion = nn.MSELoss()  # Regression loss on cosine similarity
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Training Loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Flatten dynamic pairs into a single batch
        resume_features = batch['resume_features'].view(-1, batch['resume_features'].shape[-1]).to(device)
        jd_features = batch['jd_features'].view(-1, batch['jd_features'].shape[-1]).to(device)
        labels = batch['label'].view(-1).to(device)


        # Forward pass through the model
        resume_proj, jd_proj = model(resume_features, jd_features)

        # Compute cosine similarity
        similarity = torch.nn.functional.cosine_similarity(resume_proj, jd_proj, dim=-1)

        # Calculate loss and backpropagate
        loss = criterion(similarity, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1/5, Loss: 0.0000
Epoch 2/5, Loss: 0.0000
Epoch 3/5, Loss: 0.0000
Epoch 4/5, Loss: 0.0000
Epoch 5/5, Loss: 0.0000


# Rank Jobs