In [1]:
!pip3 install pdfplumber



In [None]:
import os
import re
import json
import pdfplumber
import docx

# ----------- Utility Functions -----------

def extract_text_from_file(filepath):
    if filepath.endswith('.pdf'):
        with pdfplumber.open(filepath) as pdf:
            return '\n'.join(page.extract_text() for page in pdf.pages if page.extract_text())
    elif filepath.endswith('.docx'):
        doc = docx.Document(filepath)
        return '\n'.join([para.text for para in doc.paragraphs])
    return ""

def extract_email(text):
    emails = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)
    return emails[0] if emails else ""

def extract_phone(text):
    phones = re.findall(r"\+?\d[\d\s().-]{7,}\d", text)
    return phones[0] if phones else ""

def extract_name(text):
    lines = text.split('\n')
    for line in lines[:10]:
        if len(line.split()) <= 4 and not any(char.isdigit() for char in line):
            return line.strip()
    return ""

def extract_section(text, section_keywords):
    lines = text.split('\n')
    capture = False
    section = []
    for line in lines:
        if any(keyword.lower() in line.lower() for keyword in section_keywords):
            capture = True
        if capture:
            if line.strip() == "" or any(kw.lower() in line.lower() for kw in ['skills', 'projects', 'summary', 'certifications']):
                break
            section.append(line)
    return "\n".join(section).strip()

def parse_resume(filepath):
    text = extract_text_from_file(filepath)
    return {
        "Filename": os.path.basename(filepath),
        "Name": extract_name(text),
        "Email": extract_email(text),
        "Phone": extract_phone(text),
        "Education": extract_section(text, ['Education']),
        "Work Experience": extract_section(text, ['Experience', 'Work Experience', 'Professional Experience'])
    }

# ----------- Run on a Single Resume -----------

# Replace this with your actual resume file path
resume_path = "../resume-parser-NLP/test-resume/Manoj Resume.pdf"  # or .docx

parsed_resume = parse_resume(resume_path)

print(json.dumps(parsed_resume, indent=4))


{
    "Filename": "Manoj Resume.pdf",
    "Name": "Manoj Parasuram",
    "Email": "manojparasuram.sadhanala@gmail.com",
    "Phone": "9000830070",
    "Education": "Education\nResearch Intern, Amrita Mind and Brain Center\nElectronics and Communication\nJune 2024 \u2013 present\nEngineering, Amrita Vishwa Vidyapeetham\nWokring on Research based on the Human GAIT\n2021 \u2013 present | Kollam, India\nCURRENT CGPA: 7.28/10\nFreelancer\nFebruary 2024 \u2013 present\nIntermediate,\nDeveloped websites and Flutter apps for clients,\nSri Chaitanya College Of Education\nserving both business and community needs.\n2019 \u2013 2021 | Vijayawada, India\nPercentage: 93.7%\nMentor & Member, amFOSS\nDecember 2021 \u2013 May 2024\nDR.K.K.R's Gowtham School",
    "Work Experience": "valuable project experiences. Experienced in"
}
