In [10]:
import os
import re
import docx2txt
import fitz  # PyMuPDF
from collections import defaultdict
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from dateutil import parser

# ------------ Load Resume NER Model ------------ #
model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Use GPU if available
device = 0 if os.environ.get("CUDA_VISIBLE_DEVICES", None) else -1
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# ------------ Text Extraction ------------ #
def extract_text(file_path: str) -> str:
    """Extracts text from PDF or DOCX files."""
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([page.get_text() for page in doc])
    elif ext == ".docx":
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type. Use PDF or DOCX.")

# ------------ Helper Functions ------------ #
def parse_date(date_string):
    """Parses a date string and returns a consistent format."""
    try:
        date_obj = parser.parse(date_string, fuzzy=True)
        return date_obj.strftime('%Y-%m-%d')  # Consistent format
    except (ValueError, TypeError):
        return None

def clean_output(obj: Dict[str, str]) -> Dict[str, str]:
    """Cleans and formats the output data."""
    return {k: v.strip().title() if isinstance(v, str) and v else v for k, v in obj.items()}

# ------------ Resume Class ------------ #
class Resume:
    """Represents a resume and its extracted information."""

    def __init__(self, text: str, ner_results: List[Dict]):
        self.text = text
        self.ner_results = ner_results
        self.name = self.extract_name()
        self.email = self.extract_email()
        self.phone = self.extract_phone()
        self.education = self.extract_education()
        self.experience = self.extract_work_experience()

    def extract_name(self) -> str:
        """Extracts the name from the text, prioritizing NER."""
        name_from_ner = None
        for ent in self.ner_results:
            if ent['entity_group'] == 'NAME':
                name_from_ner = ent['word'].strip()
                break

        if name_from_ner:
            return name_from_ner

        name_match = re.search(r'^(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,})(?:\s+[A-Z][a-z]+)?', self.text, re.MULTILINE)
        return name_match.group(0).strip() if name_match else None

    def extract_email(self) -> str:
        """Extracts the email address from the text."""
        email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', self.text)
        return email_match.group(0).strip() if email_match else None

    def extract_phone(self) -> str:
        """Extracts the phone number from the text."""
        phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{8,})', self.text)
        return phone_match.group(0).strip() if phone_match else None

    def extract_education(self) -> List[Dict[str, str]]:
        """Extracts education information from sentences."""
        sentences = re.split(r'(?<=[\.\?\!\n])\s+', self.text)
        education = []
        for sent in sentences:
            ents = ner_pipeline(sent)
            if not ents:
                continue

            tags = defaultdict(str)
            date_tokens = []

            for ent in ents:
                clean_word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
                tags[ent['entity_group']] += clean_word + " "
                if ent['entity_group'] == 'DATE':
                    date_tokens.append(clean_word)

            for key in tags:
                tags[key] = tags[key].strip()

            is_edu_sentence = (
                'EducationDegree' in tags or
                re.search(r'\b(b\.?tech|m\.?tech|bachelor|master|engineering|school|college|university)\b', sent, re.IGNORECASE)
            )
            if is_edu_sentence and ('ORG' in tags or 'INSTITUTE' in tags):
                degree = tags.get('EducationDegree', None)
                university = tags.get('ORG') or tags.get('INSTITUTE', None)

                # Enhanced degree extraction
                if not degree:
                    degree_match = re.search(r'(?:(?:B\.?Tech\.?|M\.?Tech\.?|B\.?E\.?|M\.?E\.?|B\.?Sc\.?|M\.?Sc\.?|B\.?A\.?|M\.?A\.?|Bachelor(?: of)?|Master(?: of)?)\s+(?:in\s+)?([\w\s&]+))', sent, re.IGNORECASE)
                    degree = degree_match.group(1).strip() if degree_match else None

                # Enhanced university extraction
                if not university:
                    university_match = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:University|College|Institute|School))?)', sent)
                    university = university_match.group(1).strip() if university_match else None

                # Date extraction for education
                date_match = re.search(r'(\d{4})[–-]?(\d{4})?', sent)
                date = f"{date_match.group(1)} - {date_match.group(2)}" if date_match and date_match.group(2) else date_match.group(1) if date_match else None

                # Filter out noise
                if degree and len(degree) < 3:
                    degree = None
                if university and len(university) < 3:
                    university = None

                if degree or university:
                  education.append(clean_output({
                      "degree": degree,
                      "university": university,
                      "date": date
                  }))
        return education

    def extract_work_experience(self) -> List[Dict[str, str]]:
        """Extracts work experience information from sentences."""
        sentences = re.split(r'(?<=[\.\?\!\n])\s+', self.text)
        experience = []
        for sent in sentences:
            ents = ner_pipeline(sent)
            if not ents:
                continue

            tags = defaultdict(str)
            date_tokens = []

            for ent in ents:
                clean_word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
                tags[ent['entity_group']] += clean_word + " "
                if ent['entity_group'] == 'DATE':
                    date_tokens.append(clean_word)

            for key in tags:
                tags[key] = tags[key].strip()

            if 'Designation' in tags:
                role = tags.get('Designation')

                # Enhanced company extraction
                company = tags.get('ORG') or tags.get('COMPANY')
                if not company:
                    company_match = re.search(r'(?:at\s+)?([A-Z][a-z]+(?:[A-Z][a-z]+|\s&\s[A-Z][a-z]+)*(?:\s+(?:Ltd\.?|Inc\.?|Corp\.?|LLC))?)', sent)
                    company = company_match.group(1).strip() if company_match else None

                # Date extraction for work experience
                date_match = re.search(r'((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}|\d{4})', sent)
                date_start, date_end = None, None
                if date_match:
                  date_range = date_match.group(0)
                  dates = date_range.split('-')
                  if len(dates) == 2:
                    date_start = parse_date(dates[0].strip())
                    date_end = parse_date(dates[1].strip())
                  else:
                    date_start = parse_date(dates[0].strip())
                date = f"{date_start} - {date_end}" if date_start and date_end else date_start

                # Filter out noise
                if role and len(role) < 3:
                    role = None
                if company and len(company) < 3:
                    company = None

                if role or company:
                  experience.append(clean_output({
                      "designation": role,
                      "organization": company if company else None,
                      "date": date
                  }))
        return experience

# ------------ Batch Folder Parser ------------ #
def process_resumes_in_folder(folder_path: str):
    """Processes all resume files in a folder."""
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(('.pdf', '.docx')):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"\n📄 Processing: {file_name}")

        try:
            text = extract_text(file_path)
            ner_results = ner_pipeline(text)
            resume = Resume(text, ner_results)

            print("\nBASIC INFO:")
            print(clean_output({
                "Name": resume.name,
                "Email": resume.email,
                "Phone Number": resume.phone
            }))

            print("\nEducation:")
            for edu in resume.education:
                print("-", edu)

            print("\nWork Experience:")
            for work in resume.experience:
                print("-", work)

        except Exception as e:
            print(f"⚠️ Error processing {file_name}: {e}")

# ------------ Run Parser ------------ #
# Example usage:
process_resumes_in_folder("../test-resume")


📄 Processing: Achyuth_Resume.docx

BASIC INFO:
{'Name': 'Achyuth Rao', 'Email': 'Rsda.Sada-8@Gmail.Com', 'Phone Number': '+1 (124)-456-1232'}

Education:

Work Experience:
- {'designation': '/ Cloud Engineer', 'organization': 'Devops', 'date': None}
- {'designation': 'Build Engineer', 'organization': 'Over', 'date': None}
- {'designation': 'Engineer', 'organization': 'Cloud', 'date': '2021-03-29'}
- {'designation': 'Engineer', 'organization': 'Devops', 'date': '2020-08-29'}
- {'designation': 'Engineer', 'organization': 'Devops', 'date': '2019-01-29'}
- {'designation': 'Build Engineer', 'organization': 'Build', 'date': '2018-01-29'}
- {'designation': 'Linux System Administrator', 'organization': 'Linux', 'date': '2016-05-29'}

📄 Processing: K Nishanth Java.docx

BASIC INFO:
{'Name': 'Strong Experience', 'Email': 'Asdasfadaf@Gmail.Com', 'Phone Number': '424) 324-3454'}

Education:
- {'degree': 'Bachelor Of Technology', 'university': 'Indian Institute Of Technology, Kharagpur', 'date': '