In [None]:
import os
import fitz  # PyMuPDF
import docx2txt
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import re

def extract_text(file_path):
    ext = os.path.splitext(file_path)[-1].lower()
    
    if ext == ".pdf":
        doc = fitz.open(file_path)
        full_text = ""
        for page in doc:
            page_text = page.get_text()
            full_text += page_text + "\n"
        
        # Fallback to OCR if page is mostly empty
        if len(full_text.strip()) < 100:
            images = convert_from_path(file_path)
            full_text = ""
            for image in images:
                full_text += pytesseract.image_to_string(image) + "\n"

        return full_text

    elif ext == ".docx":
        return docx2txt.process(file_path)

    else:
        raise ValueError("Unsupported file type. Use PDF or DOCX.")


In [None]:
def detect_sections(text):
    lines = text.splitlines()
    sections = {}
    current_section = None

    # Map normalized headers to canonical keys
    section_keywords = {
        "education": ["education", "academic background", "qualifications"],
        "work_experience": ["experience", "employment", "work history", "professional experience"],
        "projects": ["projects", "project experience"],
        "skills": ["skills", "technical skills", "technologies"],
        "certifications": ["certifications", "courses", "achievements"],
        "awards": ["awards", "honors"],
        "publications": ["publications"],
        "profile": ["profile", "summary", "objective"]
    }

    # Precompile pattern
    header_patterns = {key: re.compile(rf'^\s*({"|".join(vals)})\s*$', re.IGNORECASE) 
                       for key, vals in section_keywords.items()}

    # Iterate over lines and capture section blocks
    for i, line in enumerate(lines):
        line_clean = line.strip().lower()

        matched = None
        for key, pattern in header_patterns.items():
            if pattern.match(line_clean):
                matched = key
                break

        if matched:
            current_section = matched
            sections[current_section] = []
            continue

        if current_section:
            sections[current_section].append(line.strip())

    # Join the section lines
    return {k: "\n".join(v).strip() for k, v in sections.items() if v}


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from collections import defaultdict
import re

model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def extract_entities_from_section(section_text):
    results = {"education": [], "experience": []}
    lines = section_text.splitlines()
    buffer = ""

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Accumulate lines into sentence-like blocks
        buffer += " " + line
        if line.endswith(".") or len(buffer) > 120:
            ents = ner_pipeline(buffer.strip())
            tags = defaultdict(str)
            date_tokens = []

            for ent in ents:
                word = ent['word'].replace("##", "")
                tags[ent['entity_group']] += word + " "
                if ent['entity_group'] == 'DATE':
                    date_tokens.append(word)

            for key in tags:
                tags[key] = tags[key].strip()

            date = None
            if len(date_tokens) >= 2:
                date = f"{date_tokens[0]} - {date_tokens[1]}"
            elif len(date_tokens) == 1:
                date = date_tokens[0]

            if 'EducationDegree' in tags or 'INSTITUTE' in tags:
                results["education"].append({
                    "degree": tags.get("EducationDegree", None),
                    "university": tags.get("INSTITUTE") or tags.get("ORG", None),
                    "date": date
                })

            if 'Designation' in tags:
                results["experience"].append({
                    "designation": tags.get("Designation"),
                    "organization": tags.get("ORG") or tags.get("COMPANY") or tags.get("INSTITUTE", None),
                    "date": date
                })

            buffer = ""  # Reset buffer

    return results


In [14]:
def fallback_email(text):
    match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return match.group(0) if match else None

def fallback_phone(text):
    match = re.search(r'(\+?\d[\d\s\-\(\)]{8,}\d)', text)
    return re.sub(r'[^\d+]', '', match.group(0)) if match else None

def fallback_name(text):
    # Naive first-line name detection
    lines = text.strip().splitlines()
    first_line = lines[0] if lines else ""
    return first_line.strip() if 3 <= len(first_line.split()) <= 5 else None


In [15]:
import json

def parse_resume(file_path):
    text = extract_text(file_path)
    print(f"Extracted Text: {text}...")  # Print first 500 chars for debugging
    sections = detect_sections(text)
    print(f"Detected Sections: {sections}")  # Print detected sections for debugging
    entities = {}
    print(f"Processing Sections: {sections.items()}")  # Print section keys for debugging

    for key, section_text in sections.items():
        print(f"Processing Section: {key}")
        print(f"Section Text: {section_text}")
        # Skip empty sections
        ner_data = extract_entities_from_section(section_text)
        entities.update(ner_data)
    print(f"Extracted Entities: {entities}")  # Print extracted entities for debugging

    return {
        "name": fallback_name(text),
        "email": fallback_email(text),
        "phone": fallback_phone(text),
        "education": entities.get("education", []),
        "work_experience": entities.get("experience", [])
    }

# Example usage
resume_path = "../test-resume/Manoj Resume.pdf"
parsed = parse_resume(resume_path)

# Save to JSON
with open("parsed_resume.json", "w") as f:
    json.dump(parsed, f, indent=2)


Extracted Text: Manoj Parasuram 
Sadanala
manojparasuram.sadhanala@gmail.com
9000830070
Eluru, India
manoj1749.github.io
manoj1749
Manoj Parasuram Sadanala
Education
Electronics and Communication 
Engineering, Amrita Vishwa Vidyapeetham
2021 – present | Kollam, India
CURRENT CGPA: 7.28/10
Intermediate, 
Sri Chaitanya College Of Education
2019 – 2021 | Vijayawada, India
Percentage: 93.7%
DR.K.K.R's Gowtham School
2016 – 2019 | Eluru, India
CGPA: 9.7/10
Technical Skills
Machine Learning and Deep Learning
Python
SQL
Computer Vision
Flutter
PyTorch
TensorFlow
Continuous Integration and Continuous 
Delivery (CI/CD)
Git, Github, Gitlab
Springboot
MATLAB
Profile
Resourceful individual with practical focus and 
valuable project experiences. Experienced in 
delivering dozen projects in deversified domains. 
Adept at time management and leadership skills, 
demonstrating proficiency in industrial skills for 
effective project contributions.
Experience
Flutter Developer Intern, Go Eleventh Mile
No