# In [None]:

# Author : Manoj G
# Date : 15-07-2024
# Description : AI-based resume screening tool that automates evaluating and shortlisting resumes against a job description

import os
import pandas as pd
import re
from PyPDF2 import PdfReader
import docx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

from nltk.corpus import stopwords

# English stop words, loaded once at import time; clean_text() drops any
# token found in this set.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # NLTK raises LookupError when the 'stopwords' corpus is not installed
    # locally. Download it once and retry instead of failing on import.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize text for matching.

    Lowercases the input, replaces every character other than a-z, 0-9 and
    whitespace with a space, drops tokens present in ``stop_words`` and
    returns the remaining tokens joined by single spaces.
    """
    normalized = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    kept = filter(lambda token: token not in stop_words, normalized.split())
    return " ".join(kept)

def extract_text_pdf(path):
    """Return the text of the PDF at ``path``, page by page.

    Every page that yields non-empty text contributes that text followed by
    a single space. If anything raises while opening or reading the file,
    the error is printed and an empty string is returned.
    """
    try:
        page_texts = (page.extract_text() for page in PdfReader(path).pages)
        # Generator is consumed inside the try so per-page failures are
        # handled the same way as a failure to open the file.
        return "".join(chunk + " " for chunk in page_texts if chunk)
    except Exception as err:
        print(f"Error reading PDF {path}: {err}")
        return ""

def extract_text_docx(path):
    """Return the paragraph text of the DOCX at ``path``, one paragraph per line.

    If anything raises while opening or reading the document, the error is
    printed and an empty string is returned.
    """
    try:
        paragraphs = docx.Document(path).paragraphs
        lines = (paragraph.text for paragraph in paragraphs)
        return "\n".join(lines)
    except Exception as err:
        print(f"Error reading DOCX {path}: {err}")
        return ""

def extract_resume_text(path):
    """Extract text from a resume, choosing the reader by file extension.

    ``.pdf`` paths go to ``extract_text_pdf`` and ``.docx`` paths go to
    ``extract_text_docx`` (extension match is case-insensitive). Any other
    path is reported as unsupported and yields an empty string.
    """
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        return extract_text_pdf(path)
    if lowered.endswith('.docx'):
        return extract_text_docx(path)
    print(f"Unsupported file type: {path}")
    return ""

def _prompt_existing_path(prompt, exists, error_message):
    """Ask for a path until ``exists(path)`` is true; return the stripped path."""
    answer = input(prompt).strip()
    while not exists(answer):
        print(error_message)
        answer = input(prompt).strip()
    return answer


def _prompt_job_index(job_count):
    """Ask for a job index until the answer is an integer in ``range(job_count)``."""
    # The advertised range is derived from the dataset instead of being
    # hard-coded, so it stays correct for any number of jobs.
    prompt = f"Enter job index to match resumes against (0-{job_count - 1}): "
    while True:
        answer = input(prompt).strip()
        try:
            # str.isdigit() also accepts characters such as superscript
            # digits, which int() rejects with ValueError.
            index = int(answer) if answer.isdigit() else -1
        except ValueError:
            index = -1
        if 0 <= index < job_count:
            return index
        print("Invalid input, enter a valid job index.")


def main():
    """Run the interactive resume screening flow.

    Steps:
      1. Prompt for a job CSV that must contain the columns 'job title',
         'skills required' and 'experience required' (matched after
         stripping and lowercasing the header names); 'qualifications' is
         optional.
      2. Prompt for a folder and extract text from every PDF/DOCX in it.
      3. Fit a TF-IDF vectorizer on the cleaned job descriptions and resumes.
      4. Prompt for a job index and print the resumes ranked by cosine
         similarity to that job, highest score first.

    Returns None. Prints a message and returns early when the CSV cannot be
    parsed, lacks required columns, has no rows, or when no resume yields
    any text.
    """
    # 1. Load the job dataset CSV
    job_csv = _prompt_existing_path(
        "Enter path to job description CSV dataset: ",
        os.path.isfile,
        "Invalid path. Try again.",
    )
    try:
        df_jobs = pd.read_csv(job_csv)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
        print(f"Could not read CSV {job_csv}: {err}")
        return

    # Normalize column names so header case/whitespace does not matter
    df_jobs.columns = df_jobs.columns.str.strip().str.lower()

    required_cols = ['job title', 'skills required', 'experience required']
    if not all(col in df_jobs.columns for col in required_cols):
        print(f"CSV missing required columns: {required_cols}")
        return

    # Without rows there is no valid job index and the index prompt below
    # could never be satisfied.
    if df_jobs.empty:
        print("CSV contains no job rows.")
        return

    # Add qualifications if missing
    if 'qualifications' not in df_jobs.columns:
        df_jobs['qualifications'] = ''

    # Combine text columns for job description. astype(str) is required
    # because ' '.join raises TypeError on non-string cells (for example a
    # numeric 'experience required' column).
    df_jobs['job_description'] = (
        df_jobs[required_cols + ['qualifications']]
        .fillna('')
        .astype(str)
        .agg(' '.join, axis=1)
    )

    # List available jobs for user
    print("\nAvailable Job Titles:")
    for idx, title in enumerate(df_jobs['job title']):
        print(f"{idx}: {title}")

    # 2. Load resumes folder
    resumes_folder = _prompt_existing_path(
        "Enter path to folder containing resumes (PDF/DOCX): ",
        os.path.isdir,
        "Invalid folder. Try again.",
    )

    # Sorted so that processing order (and the order of equal-score resumes
    # in the final ranking) does not depend on os.listdir's arbitrary order.
    resume_files = sorted(
        os.path.join(resumes_folder, name)
        for name in os.listdir(resumes_folder)
        if name.lower().endswith(('.pdf', '.docx'))
    )
    if not resume_files:
        print("No PDF or DOCX resumes found in folder.")
        return

    # Extract and clean resume text; files that yield no text are skipped
    resumes_texts = []
    resume_names = []
    for path in resume_files:
        name = os.path.basename(path)
        print(f"Extracting text from {name}...")
        text = extract_resume_text(path)
        if text:
            resumes_texts.append(clean_text(text))
            resume_names.append(name)

    if not resumes_texts:
        print("No readable resumes found.")
        return

    # 3. Clean job descriptions and vectorize jobs and resumes with a
    # vocabulary fitted on both
    jobs_corpus = df_jobs['job_description'].apply(clean_text).tolist()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(jobs_corpus + resumes_texts)

    job_vectors = vectorizer.transform(jobs_corpus)
    resume_vectors = vectorizer.transform(resumes_texts)

    # 4. Let user select one job by index (shown above)
    job_index = _prompt_job_index(len(df_jobs))

    # Compute similarity of the chosen job against every resume
    scores = cosine_similarity(job_vectors[job_index], resume_vectors)[0]

    # Rank and display results, best match first
    ranked = sorted(zip(resume_names, scores), key=lambda pair: pair[1], reverse=True)
    print("\nResume Screening Results:")
    for rank, (name, score) in enumerate(ranked, 1):
        print(f"{rank}. {name} - Match Score: {score:.4f}")

# Start the interactive flow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()