In [15]:
# =======================================
# SMART RESUME ANALYZER - EVALUATION NOTEBOOK
# =======================================

import os
import random
import pandas as pd
%pip install fpdf2
from fpdf import FPDF
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.stats import pearsonr, spearmanr
import tempfile

# ---- import analyzer pieces ----
import fitz
import re
from sentence_transformers import SentenceTransformer, util


Note: you may need to restart the kernel to use updated packages.


ImportError: cannot import name 'is_fsdp_enabled' from 'transformers.integrations' (/opt/homebrew/Caskroom/miniconda/base/envs/aiml/lib/python3.12/site-packages/transformers/integrations/__init__.py)

In [None]:
# ----------------- YOUR ANALYZER FUNCTIONS -----------------

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page joined with no separator.
    """
    # The context manager closes the document even if text extraction raises.
    # Without it, one open file handle is leaked for every resume processed.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Maps a resume section name to the regular expression used to detect it.
# Every pattern is case-insensitive ((?i)) and wrapped in word boundaries (\b).
# analyze_sections applies re.search to the whole resume text, so a keyword
# counts wherever it appears in the text, not only when it is a heading.
# "Contact" is detected through contact-field labels (phone, email, ...) rather
# than through a section title.
SECTION_PATTERNS = {
    "Education": r"(?i)\b(education|academic background)\b",
    "Experience": r"(?i)\b(work experience|professional experience|employment history|experience)\b",
    "Projects": r"(?i)\b(projects|personal projects)\b",
    "Skills": r"(?i)\b(skills|technical skills|key skills)\b",
    "Certifications": r"(?i)\b(certifications|licenses)\b",
    "Summary": r"(?i)\b(career objective|summary|professional summary|objective)\b",
    "Contact": r"(?i)\b(phone|email|linkedin|github)\b",
    "Achievements": r"(?i)\b(achievements|awards|honors)\b",
    "Languages": r"(?i)\b(languages spoken|languages)\b",
    "Tools": r"(?i)\b(technologies|tools|software)\b"
}
# A bullet is either a hyphen that opens a line (optionally indented) and is
# followed by a space/tab, or a dedicated bullet glyph anywhere in the text.
_BULLET_RE = re.compile(r"(?m)^[ \t]*-[ \t]|[•‣▪▶●]")


def analyze_sections(text, patterns=None, window=1000):
    """Detect which resume sections occur in ``text`` and count their bullets.

    For each section the first regex match marks the section start; bullets are
    counted inside the ``window`` characters that follow that position.

    Bullets are counted with ``_BULLET_RE``. The earlier character-class pattern
    also counted every newline and every hyphen inside words or phone numbers,
    so almost any multi-line text appeared to contain many bullets.

    Args:
        text: Resume text. ``None`` or ``""`` yields all sections absent.
        patterns: Optional mapping of section name -> regex. Defaults to the
            module-level ``SECTION_PATTERNS``.
        window: Number of characters after the section start that are scanned
            for bullets (default 1000).

    Returns:
        dict: ``{section: {"present": bool, "bullet_count": int}}`` with one
        entry per key of ``patterns``.
    """
    if patterns is None:
        patterns = SECTION_PATTERNS
    text = text or ""

    section_stats = {}
    for section, pattern in patterns.items():
        match = re.search(pattern, text)
        bullet_count = 0
        if match:
            # The slice starts at the matched keyword, so the window may run
            # into the sections that follow; that overlap is intentional here.
            section_text = text[match.start():match.start() + window]
            bullet_count = len(_BULLET_RE.findall(section_text))
        section_stats[section] = {"present": bool(match), "bullet_count": bullet_count}
    return section_stats

def calculate_ats_score(text):
    """Score a resume's text on a 0-100 heuristic "ATS friendliness" scale.

    Points are awarded for detected sections, contact details, a four-digit
    number, overall length and bullet usage, then scaled against 12 and capped
    at 100. Empty or missing text scores 0.

    NOTE(review): the checks below can award up to 14 points while the divisor
    is 12, so the cap is what keeps the result at 100 - confirm this leniency
    is intended.
    """
    if not text:
        return 0

    max_points = 12
    section_info = analyze_sections(text)
    lowered = text.lower()

    # Core sections are worth two points each, optional ones a single point.
    core_sections = ("Experience", "Education", "Skills")
    optional_sections = ("Projects", "Certifications", "Achievements")
    points = 2 * sum(1 for name in core_sections if section_info[name]["present"])
    points += sum(1 for name in optional_sections if section_info[name]["present"])

    # One point each for an e-mail address, a phone-like number and a 4-digit number.
    signal_patterns = (
        r"[\w\.-]+@[\w\.-]+\.\w+",
        r'(\+?\d[\d\-\s]{8,}\d)',
        r"\b\d{4}\b",
    )
    points += sum(1 for signal in signal_patterns if re.search(signal, lowered))

    # Length bonus: more than 250 whitespace-separated words.
    word_count = len(text.split())
    if word_count > 250:
        points += 1

    # Formatting bonus: at least five bullets summed over all sections.
    bullets_overall = sum(entry["bullet_count"] for entry in section_info.values())
    if bullets_overall >= 5:
        points += 1

    return min(int((points / max_points) * 100), 100)

# --- Load your skills dataset ---
skills_df = pd.read_csv("skills_dataset_top50000.csv")
# Normalise each skill (string, stripped, lower-case), then drop blanks and
# duplicates: a blank skill would compile to a pattern that matches any text,
# and duplicates only repeat the same regex scan for every resume.
# Longest skills come first; ties are ordered alphabetically so that the list
# is deterministic from run to run.
skills_list = sorted(
    {
        skill
        for skill in (str(raw).strip().lower() for raw in skills_df["Skill"].dropna())
        if skill
    },
    key=lambda skill: (-len(skill), skill),
)

def extract_skills_from_text(text, skills=None):
    """Return the set of known skills that occur in ``text`` as whole terms.

    Matching is case-insensitive on the text side: the text is lower-cased and
    compared against the (already lower-case) skill vocabulary.

    Args:
        text: Free text such as a resume or a job description. ``None`` or
            ``""`` yields an empty set.
        skills: Optional iterable of lower-case skill strings. Defaults to the
            module-level ``skills_list``.

    Returns:
        set[str]: Every skill found in the text.
    """
    if not text:
        return set()
    if skills is None:
        skills = skills_list

    text_lower = text.lower()
    found_skills = set()
    for skill in skills:
        # An empty skill would match everywhere, and a skill that is not even a
        # substring cannot match. Checking first avoids compiling one regex per
        # vocabulary entry for every document.
        if not skill or skill not in text_lower:
            continue
        # Lookarounds instead of \b: \b requires a word character on one side,
        # so skills that start or end with punctuation ("c++", ".net", "c#")
        # could never match with \b.
        if re.search(rf"(?<!\w){re.escape(skill)}(?!\w)", text_lower):
            found_skills.add(skill)
    return found_skills

# --- JD Ranking using your actual logic ---
# Sentence-embedding model used by rank_resumes for both the skill-to-skill and
# the whole-text similarity. It is created once here, at module level, rather
# than inside rank_resumes.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def rank_resumes(jd_text, resume_paths):
    """Rank resume PDFs against a job description.

    Each resume gets three component scores, each clamped to [0, 1]:
    the mean best cosine similarity between every JD skill and the resume's
    skills, the cosine similarity between the full resume text and the JD text,
    and the ATS score divided by 100. They are blended with weights
    0.45 / 0.35 / 0.2.

    Args:
        jd_text: Job-description text.
        resume_paths: Iterable of PDF paths.

    Returns:
        list[dict]: One dict per resume with ``filename`` and the four scores
        on a 0-100 scale rounded to one decimal, sorted by ``final_score``
        in descending order.
    """
    def unit_interval(value):
        # Clamp a raw score into [0, 1].
        return max(0.0, min(1.0, value))

    jd_skill_names = [name.lower() for name in extract_skills_from_text(jd_text)]
    jd_skill_vectors = None
    if jd_skill_names:
        jd_skill_vectors = embedding_model.encode(jd_skill_names, convert_to_tensor=True)
    jd_vector = embedding_model.encode(jd_text, convert_to_tensor=True)

    results = []
    for pdf_path in resume_paths:
        resume_text = extract_text_from_pdf(pdf_path)
        resume_skill_names = [name.lower() for name in extract_skills_from_text(resume_text)]

        # Skill match: for every JD skill take its closest resume skill, then average.
        if resume_skill_names and jd_skill_vectors is not None:
            resume_skill_vectors = embedding_model.encode(resume_skill_names, convert_to_tensor=True)
            similarity = util.cos_sim(resume_skill_vectors, jd_skill_vectors)
            per_jd_skill_best = similarity.max(dim=0).values
            raw_skill_match = float(per_jd_skill_best.mean().item())
        else:
            raw_skill_match = 0.0

        # Semantic similarity of the whole documents.
        resume_vector = embedding_model.encode(resume_text, convert_to_tensor=True)
        raw_semantic = float(util.cos_sim(resume_vector, jd_vector).item())

        skill_match = unit_interval(raw_skill_match)
        semantic_score = unit_interval(raw_semantic)
        ats_score = unit_interval(calculate_ats_score(resume_text) / 100)

        final_score = 0.45*skill_match + 0.35*semantic_score + 0.2*ats_score
        results.append({
            "filename": os.path.basename(pdf_path),
            "skill_match": round(skill_match*100,1),
            "semantic_score": round(semantic_score*100,1),
            "ats_score": round(ats_score*100,1),
            "final_score": round(final_score*100,1)
        })

    # Stable descending sort: equal final scores keep their input order.
    results.sort(key=lambda entry: entry['final_score'], reverse=True)
    return results





In [None]:
os.makedirs("test_resumes", exist_ok=True)
NUM_RESUMES = 20

# Seed the global RNG so that a fresh Restart & Run All regenerates the same
# resumes, placeholder recruiter scores and shuffles, and therefore the same
# metrics. Change the seed to draw a different synthetic sample.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Role -> skills a synthetic resume for that role may list. Every pool holds at
# least six entries, which covers the 4-6 skills sampled per resume.
skills_pool = {
    "Software Engineer": ["Python", "Flask", "Django", "SQL", "Git", "REST API", "Docker", "Linux"],
    "Data Scientist": ["Python", "Pandas", "NumPy", "Machine Learning", "TensorFlow", "Scikit-learn", "SQL", "Data Visualization"],
    "Web Developer": ["HTML", "CSS", "JavaScript", "React", "Node.js", "MongoDB", "Git", "Bootstrap"],
    "Mobile Developer": ["Java", "Kotlin", "Android", "Firebase", "Git", "REST API"],
    "Cloud Engineer": ["AWS", "Azure", "Docker", "Kubernetes", "Linux", "Terraform", "CI/CD"],
    "Cybersecurity Analyst": ["Network Security", "Penetration Testing", "SIEM", "Python", "Linux", "Incident Response"],
    "Product Manager": ["Agile", "Scrum", "JIRA", "Roadmapping", "Market Research", "Stakeholder Management"],
    "AI Engineer": ["Deep Learning", "PyTorch", "Computer Vision", "NLP", "Transformers", "Python", "MLOps"],
}
roles = list(skills_pool.keys())
universities = ["IIT Bombay", "IIT Delhi", "MIT Pune", "VIT Vellore", "BITS Pilani", "Stanford University", "Harvard University"]
companies = ["Google", "Microsoft", "Amazon", "Adobe", "Tesla", "Flipkart", "TCS", "Infosys", "IBM", "Meta"]

def generate_resume(role):
    """Build one synthetic plain-text resume for ``role``.

    Name, university, company, degree, years of experience and 4-6 skills are
    drawn with the global ``random`` module. The text is kept within Latin-1
    (an ASCII hyphen is used where a dash is needed) because PDF core fonts
    cannot encode characters outside that range.

    Args:
        role: A key of ``skills_pool``; it selects the skills to sample from
            and is used as the job title in the text.

    Returns:
        tuple[str, list[str]]: The stripped resume text and the sampled skills
        (original casing), which serve as the ground truth for skill
        extraction.
    """
    name = random.choice(["John", "Jane", "Rahul", "Aisha", "Mark", "Priya", "Siddharth", "Emily", "Karan", "Meera"]) \
           + " " + random.choice(["Doe", "Smith", "Kumar", "Khan", "Lee", "Patel", "Gupta", "Sharma"])
    uni = random.choice(universities)
    company = random.choice(companies)
    # Never request more skills than the role's pool contains.
    skills = random.sample(skills_pool[role], k=min(len(skills_pool[role]), random.randint(4,6)))
    degree = random.choice(["B.Tech", "B.E.", "M.Tech", "M.Sc.", "MBA"])
    exp_years = random.randint(1,6)
    resume_text = f"""
{name}
Email: {name.lower().replace(' ','.')}@example.com | Phone: +91-9876543210 | LinkedIn: linkedin.com/in/{name.lower().replace(' ','')}
Education:
- {degree} in Computer Science from {uni}, Graduation Year {2017+exp_years}

Experience:
- {role} at {company} ({exp_years} years)
  Worked on multiple projects involving {', '.join(skills)}.
  Designed and implemented solutions improving performance and scalability.

Projects:
- {role} Portfolio Project - Built using {', '.join(skills[:3])}
- Data analysis and visualization on large datasets to derive actionable insights.

Skills:
{', '.join(skills)}

Certifications:
- {role} Certification from Coursera
- Cloud Fundamentals (AWS)

Achievements:
- Awarded best {role} intern at {company}
    """
    return resume_text.strip(), skills

# Render NUM_RESUMES synthetic resumes to PDF. For each one, keep the file path
# and the lower-cased skills that were written into it (the ground truth used
# to evaluate skill extraction).
pdf_paths = []
ground_truth_skills = []
for i in range(NUM_RESUMES):
    role = random.choice(roles)
    text, skills = generate_resume(role)
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # "Arial" is only a deprecated alias in fpdf2; Helvetica is the core font
    # it is substituted with.
    pdf.set_font("Helvetica", size=12)
    for line in text.split("\n"):
        # Core fonts can only encode Latin-1. Replace anything else instead of
        # letting a single stray character abort the whole generation run.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        # Return to the left margin after every cell. With fpdf2's default
        # (new_x=RIGHT) the cursor stays at the right margin, so the next
        # full-width cell has no horizontal space left and raises.
        pdf.multi_cell(0, 8, safe_line, new_x="LMARGIN", new_y="NEXT")
    file_path = f"test_resumes/resume_{i+1}.pdf"
    pdf.output(file_path)
    pdf_paths.append(file_path)
    ground_truth_skills.append(set(map(str.lower, skills)))

print(f"✅ Generated {NUM_RESUMES} synthetic resumes in 'test_resumes/'")

In [None]:
# Run the analyzer over every generated PDF.
predicted_skills = []
ats_scores = []
for path in pdf_paths:
    txt = extract_text_from_pdf(path)
    predicted_skills.append(extract_skills_from_text(txt))
    ats_scores.append(calculate_ats_score(txt))

# ---- Skill Metrics ----
# Macro-averaged over resumes: each resume's precision / recall / F1 is computed
# from set overlap and then averaged. The divisor is the number of pairs that
# were actually evaluated, so the averages stay correct even if that differs
# from NUM_RESUMES.
skill_pairs = list(zip(ground_truth_skills, predicted_skills))
n_pairs = len(skill_pairs) or 1
p = sum(len(gt & pred)/(len(pred) if pred else 1) for gt, pred in skill_pairs)/n_pairs
r = sum(len(gt & pred)/(len(gt) if gt else 1) for gt, pred in skill_pairs)/n_pairs
f1 = sum((2*len(gt & pred)/(len(gt)+len(pred)) if gt or pred else 0) for gt, pred in skill_pairs)/n_pairs

print(f"Skill Extraction Precision: {p:.2f}")
print(f"Skill Extraction Recall:    {r:.2f}")
print(f"Skill Extraction F1:        {f1:.2f}")

# ---- ATS vs Recruiter Correlation ----
# The recruiter scores are random placeholders, so the correlation shown here
# only exercises the pipeline; substitute real ratings for a meaningful result.
recruiter_scores = [random.randint(50,95) for _ in range(len(ats_scores))]
# Pearson's r is undefined for fewer than two points or when either list is
# constant (likely here: all synthetic resumes share one layout). Report NaN
# explicitly in that case instead of relying on a library warning.
if len(ats_scores) >= 2 and len(set(ats_scores)) > 1 and len(set(recruiter_scores)) > 1:
    corr, _ = pearsonr(recruiter_scores, ats_scores)
else:
    corr = float("nan")
    print("Pearson correlation is undefined: fewer than two resumes or a constant score list.")
print(f"Pearson correlation (ATS vs Recruiter): {corr:.2f}")

plt.scatter(recruiter_scores, ats_scores)
plt.xlabel("Recruiter Scores")
plt.ylabel("ATS Scores")
plt.title("ATS vs Recruiter")
plt.show()


In [None]:
# ---- JD Ranking Real Test ----
# Rank the generated resumes against one sample job description and show the
# five best matches.
jd_text = "We are looking for a Python developer with Flask, SQL, and Docker experience."
ranking = rank_resumes(jd_text, pdf_paths)
print("\nTop 5 Ranked Resumes:")
for row in ranking[:5]:
    print(row)

# Placeholder recruiter ranking: a random shuffle of the model's own ordering
# (replace with real recruiter rankings if available).
model_ranks = [entry['filename'] for entry in ranking]
recruiter_ranks = list(model_ranks)
random.shuffle(recruiter_ranks)

# Spearman correlation between each resume's recruiter position and its
# 1-based position in the model ranking.
rho, _ = spearmanr(
    range(len(recruiter_ranks)),
    [1 + model_ranks.index(name) for name in recruiter_ranks],
)
print(f"Spearman Rank Correlation: {rho:.2f}")

# Top-3 accuracy: share of the model's top three that the recruiter also
# placed in the top three.
top3_model, top3_recruiter = set(model_ranks[:3]), set(recruiter_ranks[:3])
top3_acc = len(top3_model.intersection(top3_recruiter)) / 3
print(f"Top-3 accuracy JD ranking: {top3_acc*100:.1f}%")
