In [1]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def extract_email(text):
    match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return match.group(0) if match else None

def extract_phone(text):
    match = re.search(r'\b\d{10}\b', text)
    return match.group(0) if match else None

def extract_name(text):
    lines = text.strip().split('\n')
    for line in lines:
        # Heuristic: name is likely to be in the first few lines
        if re.match(r'^[A-Z][a-z]+\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)?$', line.strip()):
            return line.strip()
    return None

def extract_section(text, header):
    pattern = re.compile(rf"{header}\n(.*?)\n(?:\w|\Z)", re.DOTALL | re.IGNORECASE)
    match = pattern.search(text)
    return match.group(1).strip() if match else None

def extract_education(text):
    if "Education" in text:
        edu_section = text.split("Education")[1]
        if "Technical Skills" in edu_section:
            edu_section = edu_section.split("Technical Skills")[0]
        return edu_section.strip()
    return None

def extract_experience(text):
    if "Experience" in text:
        exp_section = text.split("Experience")[1]
        # Truncate before next section
        for end_marker in ["Community Outreach", "Awards", "Projects", "Publications", "Workshops"]:
            if end_marker in exp_section:
                exp_section = exp_section.split(end_marker)[0]
                break
        return exp_section.strip()
    return None

# -------------------------------
# Example Usage
# -------------------------------
pdf_path = "../test-resume/Manoj Resume.pdf"  # Replace with your file path
text = extract_text_from_pdf(pdf_path)

info = {
    "Name": extract_name(text),
    "Email": extract_email(text),
    "Phone Number": extract_phone(text),
    "Education": extract_education(text),
    "Work Experience": extract_experience(text)
}

for key, value in info.items():
    print(f"\n=== {key} ===")
    print(value)



=== Name ===
Manoj Parasuram

=== Email ===
manojparasuram.sadhanala@gmail.com

=== Phone Number ===
9000830070

=== Education ===
Electronics and Communication 
Engineering, Amrita Vishwa Vidyapeetham
2021 – present | Kollam, India
CURRENT CGPA: 7.28/10
Intermediate, 
Sri Chaitanya College Of

=== Work Experience ===
d in 
delivering dozen projects in deversified domains. 
Adept at time management and leadership skills, 
demonstrating proficiency in industrial skills for 
effective project contributions.


In [15]:
import os
import re
import fitz  # PyMuPDF
import docx2txt
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join(page.get_text() for page in doc)
    elif ext in [".docx", ".doc"]:
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file format")

def extract_email(text):
    match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return match.group(0) if match else None

def extract_phone(text):
    match = re.search(r'\+?\d[\d\s\-\(\)]{8,}', text)
    return match.group(0) if match else None

def extract_name(text):
    doc = nlp(text[:1000])
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text.strip()
    return None

def extract_entities_by_label(text, label):
    doc = nlp(text)
    return list(set(ent.text.strip() for ent in doc.ents if ent.label_ == label))

def extract_section_between_keywords(text, start_keywords, stop_keywords):
    lines = text.splitlines()
    collecting = False
    collected = []

    for line in lines:
        lower = line.strip().lower()
        if any(start in lower for start in start_keywords):
            collecting = True
            continue
        if collecting and any(stop in lower for stop in stop_keywords):
            break
        if collecting:
            collected.append(line.strip())
    return "\n".join(collected).strip() if collected else None

def extract_education(text):
    return extract_section_between_keywords(
        text,
        start_keywords=['education', 'academic background'],
        stop_keywords=['skills', 'experience', 'projects', 'certifications', 'awards', 'workshops']
    )

def extract_experience(text):
    return extract_section_between_keywords(
        text,
        start_keywords=['experience', 'professional experience', 'work experience'],
        stop_keywords=['skills', 'projects', 'education', 'certifications', 'publications', 'workshops']
    )


def parse_resume(file_path):
    text = extract_text(file_path)

    return {
        "Name": extract_name(text),
        "Email": extract_email(text),
        "Phone Number": extract_phone(text),
        "Education": extract_education(text),
        "Work Experience": extract_experience(text)
    }

# -----------------------------
# Example: Run this on a file
# -----------------------------
if __name__ == "__main__":
    # Provide the path to the resume file here
    file_path = "../test-resume/Siddharth_Reddy_Resume.pdf"  # Update with your actual file name

    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' not found.")
    else:
        print(f"\n--- Parsing: {os.path.basename(file_path)} ---")
        result = parse_resume(file_path)
        for key, val in result.items():
            print(f"\n{key}:\n{val if val else 'Not found'}")




--- Parsing: Siddharth_Reddy_Resume.pdf ---

Name:
Anthireddy Siddharth Reddy

Email:
siddharthreddy2812@gmail.com

Phone Number:
+916281287188


Education:
B.Tech in Electrical and Computer Engineering
2021 – 2025
Amrita Vishwa Vidhyapeetham
CGPA: 9/10
2019 – 2021
Sri Chaitanya Group of Institutions
Percentage: 96.6%
2018 – 2019
Thushara High School
CGPA: 9.3/10

Work Experience:
AI Full Stack Developer at SymboSystems
Oct 2024 - Present
• Led the development of an advanced AI system leveraging large language models (LLMs) and
multimodal interactions, integrating cutting-edge AI technologies to deliver seamless user
• Engineered distributed training and inference pipelines, incorporating Claude and GPT models
with a custom Retrieval-Augmented Generation (RAG) architecture utilizing FAISS for efficient
vector storage and retrieval.
• Designed and implemented a real-time audio-visual processing pipeline by integrating Deepgram
for speech-to-text and LiveKit for low-latency streaming, e