In [1]:
import os
import re

import nltk
import numpy as np
from nltk.corpus import stopwords
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK 'stopwords' corpus is installed locally; clean_text()
# reads the English list from it through nltk.corpus.stopwords.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        file_path: Location of the PDF; handed straight to PyPDF2's PdfReader.

    Returns:
        The extracted text of all pages, in page order, separated by
        newlines. A page for which no text is extracted contributes an
        empty string.
    """
    reader = PdfReader(file_path)
    # Pages are joined with a newline so the last word of one page does not
    # fuse with the first word of the next. `or ""` keeps the join from
    # failing should a page's extraction yield None instead of a string.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Function to clean text
def clean_text(text, stop_words=None):
    """Normalise raw text so it can be fed to the TF-IDF vectorizer.

    Every non-word character (anything other than a letter, digit or
    underscore) is replaced by a space, the text is lower-cased, stop words
    are removed, and the surviving words are joined with single spaces.

    Args:
        text: The raw text to clean. Empty or None input yields ''.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The cleaned text as one space-separated, lower-case string.
    """
    if not text:
        return ''
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Load the stop words once and keep them in a set: evaluating
    # stopwords.words('english') inside the filter below would reload the
    # list and scan it linearly for every single word of the document.
    stop_words = frozenset(stop_words)
    text = re.sub(r'\W', ' ', text).lower()
    # str.split() with no argument already collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Function to match resumes with job description
def match_resumes(job_description, resume_texts):
    """Score resumes against one job description with TF-IDF cosine similarity.

    A fresh TfidfVectorizer is fitted on the job description together with
    all the resumes, and each resume's vector is compared with the job
    description's vector.

    Args:
        job_description: Text of the job description.
        resume_texts: Iterable of resume texts (any iterable is accepted,
            not only a list).

    Returns:
        A 1-D NumPy array holding one score per resume, in input order.
        Each score is the cosine similarity multiplied by 100. The array is
        empty when no resumes are supplied.
    """
    resume_texts = list(resume_texts)
    if not resume_texts:
        # With no resumes there is nothing to compare against; return an
        # empty result rather than handing cosine_similarity a 0-row matrix.
        return np.zeros(0)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([job_description] + resume_texts)
    # Row 0 is the job description; rows 1.. are the resumes.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return similarities[0] * 100


In [3]:
# Input directories, given as relative paths (so they are resolved against
# the kernel's current working directory). PDF files are read from each.
job_description_dir = 'job_descriptions'  # listed when loading job descriptions
resume_dir = 'resumes'  # listed when loading resumes


In [4]:
# Extract and clean job descriptions.
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# to keep the position (and therefore the printed number) of each job
# description the same on every run. The extension test is case-insensitive
# so that files named e.g. 'role.PDF' are not silently skipped.
job_descriptions = []
for file_name in sorted(os.listdir(job_description_dir)):
    if file_name.lower().endswith('.pdf'):
        file_path = os.path.join(job_description_dir, file_name)
        job_description_text = extract_text_from_pdf(file_path)
        cleaned_text = clean_text(job_description_text)
        job_descriptions.append(cleaned_text)


In [5]:
# Extract and clean resumes.
# resume_files[k] is the file name whose cleaned text is resume_texts[k].
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# to make the order of the two lists (and of equally scored resumes in the
# ranking) reproducible. The extension test is case-insensitive so that
# files named e.g. 'cv.PDF' are not silently skipped.
resume_texts = []
resume_files = []
for file_name in sorted(os.listdir(resume_dir)):
    if file_name.lower().endswith('.pdf'):
        file_path = os.path.join(resume_dir, file_name)
        resume_text = extract_text_from_pdf(file_path)
        cleaned_text = clean_text(resume_text)
        resume_texts.append(cleaned_text)
        resume_files.append(file_name)


In [6]:
# For every job description, score all resumes and list them best match first
for i, job_description in enumerate(job_descriptions):
    print(f"\nJob Description {i+1} Matches:\n")
    scores = match_resumes(job_description, resume_texts)
    # Pair each file name with its score, then order the pairs by score, descending
    ranked_resumes = list(zip(resume_files, scores))
    ranked_resumes.sort(key=lambda pair: pair[1], reverse=True)
    for resume, score in ranked_resumes:
        print(f"Resume: {resume}, Score: {score:.2f}")



Job Description 1 Matches:

Resume: barry_allen_fe.pdf, Score: 28.57
Resume: bruce_wayne_fullstack.pdf, Score: 16.18
Resume: alfred_pennyworth_pm.pdf, Score: 14.11
Resume: john_doe.pdf, Score: 12.36
Resume: harvey_dent_mle.pdf, Score: 6.51

Job Description 2 Matches:

Resume: bruce_wayne_fullstack.pdf, Score: 16.91
Resume: alfred_pennyworth_pm.pdf, Score: 16.36
Resume: john_doe.pdf, Score: 15.58
Resume: barry_allen_fe.pdf, Score: 15.06
Resume: harvey_dent_mle.pdf, Score: 7.18

Job Description 3 Matches:

Resume: john_doe.pdf, Score: 30.43
Resume: alfred_pennyworth_pm.pdf, Score: 20.39
Resume: bruce_wayne_fullstack.pdf, Score: 15.61
Resume: barry_allen_fe.pdf, Score: 9.57
Resume: harvey_dent_mle.pdf, Score: 6.16

Job Description 4 Matches:

Resume: alfred_pennyworth_pm.pdf, Score: 23.19
Resume: bruce_wayne_fullstack.pdf, Score: 8.10
Resume: john_doe.pdf, Score: 7.93
Resume: harvey_dent_mle.pdf, Score: 5.71
Resume: barry_allen_fe.pdf, Score: 4.03
