In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import unicodedata
import joblib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sentence_transformers import SentenceTransformer
import shap

# Step 1: Data Preprocessing & Cleaning
def clean_text(text):
    """Reduce a text value to lowercase ASCII letters, digits, '.', ',' and single spaces.

    Args:
        text: The value to clean. ``None`` and float NaN (how pandas represents
            a missing cell) yield an empty string; any other non-string value
            is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, lowercased, with every run of whitespace
        collapsed to one space and no leading or trailing whitespace.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # NFKD splits accented characters into a base letter plus a combining
    # mark; the mark is then dropped by the character filter below.
    text = unicodedata.normalize("NFKD", text)
    # Turn newlines/tabs into spaces BEFORE filtering. The filter only keeps
    # the literal space character, so other whitespace would otherwise be
    # deleted and the words on either side glued together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9., ]', '', text)  # Remove special characters
    # Removing a symbol that sat between two spaces leaves a double space.
    text = re.sub(r' {2,}', ' ', text)
    return text.lower().strip()

# Load dataset
resume_df = pd.read_csv("UpdatedResumeDataSet.csv")  # Replace with your file path
job_df = pd.read_csv("job_descriptions.csv")  # Replace with your file path

# Combine relevant text fields.
# Each column is filled with "" and cast to str first: in pandas `str + NaN`
# evaluates to NaN, so a single missing cell would otherwise turn the whole
# combined value into NaN and make the cleaning step below fail on a float.
resume_df['Combined_Text'] = (
    resume_df['Category'].fillna("").astype(str)
    + " "
    + resume_df['Resume'].fillna("").astype(str)
)
job_df['Combined_Text'] = (
    job_df['Job Title'].fillna("").astype(str)
    + " "
    + job_df['Job Description'].fillna("").astype(str)
    + " "
    + job_df['skills'].fillna("").astype(str)
)

# Clean text fields
resume_df['Combined_Text'] = resume_df['Combined_Text'].apply(clean_text)
job_df['Combined_Text'] = job_df['Combined_Text'].apply(clean_text)

In [None]:
# Step 2: BERT-based Embedding for Semantic Matching
# Load the sentence-embedding model 'all-MiniLM-L6-v2'.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the cleaned combined text of every resume and every job posting.
resume_embeddings = model.encode(resume_df['Combined_Text'].to_list())
job_embeddings = model.encode(job_df['Combined_Text'].to_list())

# Pairwise cosine similarity: rows follow the resumes, columns follow the jobs.
similarity_matrix = cosine_similarity(X=resume_embeddings, Y=job_embeddings)

# For each resume, the column index of its highest-scoring job.
matches = np.argmax(similarity_matrix, axis=1)