In [2]:
import os
import re
import docx2txt
import fitz  # PyMuPDF
from collections import defaultdict
from typing import List, Dict, Optional
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Suppress tqdm warning
os.environ["TQDM_DISABLE"] = "1"

# ------------ Load Resume NER Model ------------ #
model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Use GPU if available
device = 0 if os.environ.get("CUDA_VISIBLE_DEVICES", None) else -1
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# ------------ Text Extraction ------------ #
def extract_text(file_path: str) -> str:
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([page.get_text() for page in doc])
    elif ext == ".docx":
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type. Use PDF or DOCX.")

# ------------ Enhanced Basic Info Extraction ------------ #
def extract_basic_info(text: str) -> Dict[str, str]:
    # Improved name extraction - looks near contact info
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    phone_match = re.search(r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b', text)
    
    # Look for name in the lines above email/phone
    name = None
    if email_match or phone_match:
        contact_pos = email_match.start() if email_match else phone_match.start()
        preceding_text = text[:contact_pos]
        # Look for the most name-like text in the preceding lines
        name_candidates = re.findall(r'^(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})(?=\s*[\n\r])', preceding_text, re.MULTILINE)
        if name_candidates:
            name = name_candidates[-1]  # Take the last one before contact info

    return {
        "name": name.strip() if name else None,
        "email": email_match.group(0).strip() if email_match else None,
        "phone": phone_match.group(0).strip() if phone_match else None
    }

def clean_output(obj: Dict[str, str]) -> Dict[str, str]:
    cleaned = {}
    for k, v in obj.items():
        if isinstance(v, str):
            v = v.strip()
            if k == "date":
                # Clean up date formats
                v = re.sub(r'(\d{4})\s*–\s*(\d{4})\s*-\s*(\d{4})\s*–\s*(\d{4})', r'\1–\2', v)
                v = re.sub(r'(\b[A-Za-z]{3}\b \d{4})[\s-]+([A-Za-z]{3}\b \d{4})', r'\1 – \2', v)
            elif k in ["degree", "designation"]:
                v = v.title()
        cleaned[k] = v if v else None
    return cleaned

# ------------ Enhanced Structured Resume Info ------------ #
def extract_resume_info_structured(text: str) -> Dict[str, List[Dict[str, str]]]:
    # First try to identify resume sections
    section_pattern = r'(?:^|\n)\s*(Education|Work Experience|Experience|Employment|Academic Background)\s*(?:\n|$)'
    sections = re.split(section_pattern, text, flags=re.IGNORECASE)
    
    education = []
    experience = []
    
    # Process each section with context
    for i in range(1, len(sections), 2):
        section_title = sections[i].lower()
        section_content = sections[i+1]
        
        sentences = re.split(r'(?<=[\.\?\!\n])\s+', section_content)
        
        for sent in sentences:
            if not sent.strip():
                continue
                
            ents = ner_pipeline(sent)
            if not ents:
                continue

            tags = defaultdict(str)
            date_tokens = []
            org_tokens = []
            degree_tokens = []

            for ent in ents:
                clean_word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
                entity_type = ent['entity_group']
                
                if entity_type == 'DATE':
                    if re.match(r'^(?:19|20)\d{2}|[A-Za-z]{3}', clean_word):  # Basic date validation
                        date_tokens.append(clean_word)
                elif entity_type in ['ORG', 'INSTITUTE', 'COMPANY']:
                    org_tokens.append(clean_word)
                elif entity_type == 'EducationDegree':
                    degree_tokens.append(clean_word)
                
                tags[entity_type] += clean_word + " "

            for key in tags:
                tags[key] = tags[key].strip()

            # Improved date parsing
            date = None
            if len(date_tokens) >= 2:
                date = f"{date_tokens[0]} – {date_tokens[1]}"
            elif len(date_tokens) == 1:
                date = date_tokens[0]

            # === Enhanced Education Detection ===
            degree_keywords = r'\b(b\.?\s*tech|m\.?\s*tech|bachelor|master|b\.?\s*[es]|m\.?\s*[es]|ph\.?\s*d|diploma|school|college|university|institute)\b'
            is_edu_sentence = (
                'EducationDegree' in tags or
                re.search(degree_keywords, sent, re.IGNORECASE) or
                any(t in section_title for t in ['education', 'academic'])
            )
            
            if is_edu_sentence:
                # Get the most specific degree first
                degree = " ".join(degree_tokens) if degree_tokens else None
                if not degree:
                    degree_match = re.search(degree_keywords, sent, re.IGNORECASE)
                    degree = degree_match.group(0).strip() if degree_match else None
                
                # Use the first org token if available
                university = org_tokens[0] if org_tokens else None
                
                if degree or university:  # Only add if we have meaningful data
                    education.append(clean_output({
                        "degree": degree,
                        "university": university,
                        "date": date
                    }))
                    continue  # Skip work exp detection for education sentences

            # === Enhanced Work Experience Detection ===
            if 'Designation' in tags or any(t in section_title for t in ['experience', 'employment']):
                role = tags.get('Designation')
                if not role:
                    # Try to extract role from sentence start
                    role_match = re.match(r'^([A-Z][a-z]+(?:\s+[A-Za-z]+)*)', sent)
                    role = role_match.group(1) if role_match else None
                
                # Use the first org token if available
                company = org_tokens[0] if org_tokens else None
                
                # Skip if it's clearly not a work experience (e.g., education org)
                if company and any(edu.get('university') == company for edu in education):
                    continue
                    
                if role:  # Only add if we have at least a role
                    experience.append(clean_output({
                        "designation": role,
                        "organization": company,
                        "date": date
                    }))

    return {
        "education": education,
        "work_experience": experience
    }

# ------------ Batch Folder Parser ------------ #
def process_resumes_in_folder(folder_path: str):
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(('.pdf', '.docx')):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"\n📄 Processing: {file_name}")

        try:
            text = extract_text(file_path)
            basic_info = extract_basic_info(text)
            structured = extract_resume_info_structured(text)

            result = {
                "name": basic_info["name"],
                "email": basic_info["email"],
                "phone": basic_info["phone"],
                "education": structured["education"],
                "work_experience": structured["work_experience"]
            }
            
            # Post-processing to clean up results
            if not result["name"]:
                # Fallback to look for name at document start
                name_match = re.search(r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})(?=\s*[\n\r])', text, re.MULTILINE)
                if name_match:
                    result["name"] = name_match.group(1).strip()
            
            print("\nRESULT:")
            print(result)

        except Exception as e:
            print(f"⚠️ Error processing {file_name}: {e}")

# ------------ Run Parser ------------ #
process_resumes_in_folder("../test-resume")


📄 Processing: Achyuth_Resume.docx

RESULT:
{'name': 'Configuration Management Tools', 'email': 'rsda.sada-8@gmail.com', 'phone': '+1 (124)-456-1232', 'education': [], 'work_experience': []}

📄 Processing: K Nishanth Java.docx

RESULT:
{'name': 'Java Developer', 'email': 'asdasfadaf@gmail.com', 'phone': '(424) 324-3454', 'education': [], 'work_experience': []}

📄 Processing: Manoj Resume.pdf

RESULT:
{'name': 'Manoj Parasuram \nSadanala', 'email': 'manojparasuram.sadhanala@gmail.com', 'phone': '9000830070', 'education': [{'degree': 'College', 'university': 'amrita vishwa vidyapeetham', 'date': '2021 – present – 2019 – 2021'}, {'degree': 'College', 'university': None, 'date': 'december 2021 – may 2024'}], 'work_experience': [{'designation': 'Flutter Developer Intern', 'organization': 'amrita mind and brain center', 'date': 'november 2024 – present'}, {'designation': 'Hardware Security Researcher', 'organization': 'redant', 'date': 'july 2023 – october 202'}, {'designation': 'Flutter Dev