In [1]:
# Cell 1: Import Libraries and Configuration

import pandas as pd
from sentence_transformers import SentenceTransformer, util
import os
import torch

# --- Configuration ---
# Everything a reader may want to tune lives here; later cells only read these names.

# Input: cleaned CSV written by the previous (data preparation) script.
CLEANED_DATA_PATH = "./data/cleaned_resume_job_data.csv"

# Sentence-BERT checkpoint. 'all-MiniLM-L6-v2' is a good general-purpose choice;
# 'all-mpnet-base-v2' is an alternative with higher performance (but slower).
MODEL_NAME = 'all-MiniLM-L6-v2'

# Output: the same DataFrame with the new similarity score column appended.
OUTPUT_CSV_PATH = "./data/resume_job_match_with_similarity.csv"

# Echo the active settings, one per line.
print(
    "Configuration loaded.",
    f"Using cleaned data from: {CLEANED_DATA_PATH}",
    f"Using Sentence-BERT model: {MODEL_NAME}",
    f"Output will be saved to: {OUTPUT_CSV_PATH}",
    sep="\n",
)


  from .autonotebook import tqdm as notebook_tqdm


Configuration loaded.
Using cleaned data from: ./data/cleaned_resume_job_data.csv
Using Sentence-BERT model: all-MiniLM-L6-v2
Output will be saved to: ./data/resume_job_match_with_similarity.csv


In [2]:
# Cell 2: Load Cleaned Data

# Fail fast: every later cell needs `df`, so a missing input must stop execution
# here instead of printing a message and letting the next cell die with a NameError.
if not os.path.exists(CLEANED_DATA_PATH):
    raise FileNotFoundError(
        f"Cleaned data file not found at '{CLEANED_DATA_PATH}'. "
        "Please ensure the previous data preparation script ran successfully and saved the file."
    )

# Read errors are deliberately not caught: the original traceback is more useful
# than a one-line summary, and continuing without `df` is never valid.
df = pd.read_csv(CLEANED_DATA_PATH)

# Columns this notebook reads from the cleaned data; check them up front so a
# schema mismatch is reported clearly at load time.
required_columns = ['job_description_cleaned', 'resume_cleaned', 'match_score']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Cleaned data is missing required column(s): {missing_columns}")

print(f"Loaded cleaned data from {CLEANED_DATA_PATH}. Shape: {df.shape}")
print("\nSample of cleaned data (first 5 rows):")
print(df[required_columns].head())

Loaded cleaned data from ./data/cleaned_resume_job_data.csv. Shape: (10000, 5)

Sample of cleaned data (first 5 rows):
                             job_description_cleaned  \
0  data analyst needed with experience in sql, ex...   
1  data scientist needed with experience in stati...   
2  software engineer needed with experience in sy...   
3  ml engineer needed with experience in python, ...   
4  software engineer needed with experience in re...   

                                      resume_cleaned  match_score  
0  experienced professional skilled in sql, power...            4  
1  experienced professional skilled in python, de...            4  
2  experienced professional skilled in wait, git,...            5  
3  experienced professional skilled in return, de...            4  
4  experienced professional skilled in rest apis,...            5  


In [3]:
# Cell 3: Load Pre-trained Sentence-BERT Model

print(f"Loading Sentence-BERT model: {MODEL_NAME}...")
try:
    # This downloads the model if it's not already cached locally
    model = SentenceTransformer(MODEL_NAME)
    print("Model loaded successfully.")
    # Check if GPU is available and use it
    if torch.cuda.is_available():
        model.to(torch.device("cuda"))
        print("Model moved to GPU for faster processing.")
    else:
        print("GPU not available, using CPU.")
except Exception as e:
    # Show the installation hints, then re-raise: without `model` the embedding
    # cell cannot run, so execution must stop here with the real traceback
    # rather than continue and fail later with an unrelated NameError.
    print(f"Error loading Sentence-BERT model. Make sure you have 'sentence-transformers' and 'torch' installed (`pip install sentence-transformers torch`). Error: {e}")
    print("If you encounter issues, try running `pip install --upgrade transformers sentence-transformers torch`")
    raise


Loading Sentence-BERT model: all-MiniLM-L6-v2...
Model loaded successfully.
GPU not available, using CPU.


In [4]:
# Cell 4: Generate Embeddings for Job Descriptions and Resumes

# pandas reads empty CSV fields back as NaN (a float). Normalise both text columns
# to plain strings so the encoder only ever receives str values; for columns that
# are already all-string this is a no-op.
job_texts = df['job_description_cleaned'].fillna("").astype(str).tolist()
resume_texts = df['resume_cleaned'].fillna("").astype(str).tolist()

print("\nGenerating embeddings for job descriptions...")
# Encode job descriptions into dense vector embeddings
# convert_to_tensor=True ensures the output is a PyTorch tensor, useful for GPU acceleration
# show_progress_bar=True provides visual feedback during encoding
job_embeddings = model.encode(job_texts,
                              convert_to_tensor=True,
                              show_progress_bar=True)

print("\nGenerating embeddings for resumes...")
# Encode resumes into dense vector embeddings (same settings as above)
resume_embeddings = model.encode(resume_texts,
                                 convert_to_tensor=True,
                                 show_progress_bar=True)

print(f"\nGenerated {len(job_embeddings)} job embeddings.")
print(f"Generated {len(resume_embeddings)} resume embeddings.")
print(f"Each embedding has a dimension of: {job_embeddings.shape[1]}") # Should be 384 for MiniLM-L6-v2



Generating embeddings for job descriptions...


Batches: 100%|██████████| 313/313 [00:10<00:00, 31.28it/s]



Generating embeddings for resumes...


Batches: 100%|██████████| 313/313 [00:07<00:00, 43.03it/s]



Generated 10000 job embeddings.
Generated 10000 resume embeddings.
Each embedding has a dimension of: 384


In [5]:
# Cell 5: Calculate Cosine Similarity

print("\nCalculating cosine similarity between job descriptions and resumes...")

# Only the similarity of each row's own (job_description, resume) pair is needed.
# Computing it row-wise (dim=1 reduces over the embedding dimension) yields one
# score per row directly, instead of building the full N x N all-pairs matrix and
# discarding everything except its diagonal.
cosine_scores = torch.nn.functional.cosine_similarity(job_embeddings, resume_embeddings, dim=1)

# Convert the PyTorch tensor to a NumPy array and then to a Python list
# .cpu() moves the tensor to CPU if it was on GPU, .numpy() converts to NumPy array
df['bert_similarity_score'] = cosine_scores.cpu().numpy().tolist()

print("\nSample of data with new BERT similarity scores (first 5 rows):")
print(df[['job_description_cleaned', 'resume_cleaned', 'match_score', 'bert_similarity_score']].head())



Calculating cosine similarity between job descriptions and resumes...

Sample of data with new BERT similarity scores (first 5 rows):
                             job_description_cleaned  \
0  data analyst needed with experience in sql, ex...   
1  data scientist needed with experience in stati...   
2  software engineer needed with experience in sy...   
3  ml engineer needed with experience in python, ...   
4  software engineer needed with experience in re...   

                                      resume_cleaned  match_score  \
0  experienced professional skilled in sql, power...            4   
1  experienced professional skilled in python, de...            4   
2  experienced professional skilled in wait, git,...            5   
3  experienced professional skilled in return, de...            4   
4  experienced professional skilled in rest apis,...            5   

   bert_similarity_score  
0               0.652394  
1               0.372719  
2               0.447902  
3    

In [6]:
# Cell 6: Save Results

# Announce the destination first so the path is visible even if the write fails.
save_banner = f"\nSaving DataFrame with BERT similarity scores to: {OUTPUT_CSV_PATH}"
print(save_banner)

try:
    # index=False: the positional index is not written as a column.
    df.to_csv(OUTPUT_CSV_PATH, index=False)
    print("DataFrame saved successfully.")
except Exception as save_error:
    # Best-effort save: report the failure instead of raising.
    print(f"Error saving data to CSV: {save_error}")


Saving DataFrame with BERT similarity scores to: ./data/resume_job_match_with_similarity.csv
DataFrame saved successfully.
