In [3]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Page texts are concatenated in page order with no separator between
    pages.
    """
    # Open through a context manager so the document handle is released even
    # if extracting one of the pages raises.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def extract_email(text):
    """Return the first email-like substring found in *text*, or None."""
    found = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    if found is None:
        return None
    return found.group(0)

def extract_phone(text):
    """Return the first standalone run of exactly ten digits in *text*, or None."""
    ten_digits = re.search(r'\b\d{10}\b', text)
    if ten_digits is None:
        return None
    return ten_digits.group(0)

def extract_name(text):
    """Return the first line of *text* that looks like a person's name, or None.

    Every line is examined in order. A line qualifies when, once stripped, it
    is made up of exactly two or three words, each an uppercase ASCII letter
    followed by one or more lowercase ASCII letters, with a single whitespace
    character between words.
    """
    name_pattern = re.compile(r'^[A-Z][a-z]+\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)?$')
    candidates = (raw.strip() for raw in text.strip().split('\n'))
    return next((cand for cand in candidates if name_pattern.match(cand)), None)

def extract_section(text, header):
    """Return the body that follows the line ending in *header*, or None.

    *header* is matched literally and case-insensitively and must be
    immediately followed by a newline. The body runs until the first
    newline that is followed by a word character, or to the end of *text*
    if there is none. The returned body is stripped of surrounding
    whitespace.
    """
    # re.escape: headers such as "Skills (Core)" or "C++" contain regex
    # metacharacters and must not be interpreted as pattern syntax.
    # The trailing alternative lets a section that runs to the very end of
    # the text match even when the text has no final newline.
    pattern = re.compile(
        rf"{re.escape(header)}\n(.*?)(?:\n(?=\w)|\Z)",
        re.DOTALL | re.IGNORECASE,
    )
    match = pattern.search(text)
    return match.group(1).strip() if match else None

def extract_education(text):
    """Return the text after the first "Education" in *text*, or None.

    The section ends at the first following "Technical Skills" when that
    marker is present; otherwise it runs to the end of *text*. Matching is
    case-sensitive. The result is stripped of surrounding whitespace.
    """
    # partition splits on the FIRST occurrence only, so a later mention of
    # the word (e.g. "Bachelor of Education") no longer cuts the section short.
    _, found, after = text.partition("Education")
    if not found:
        return None
    section = after.partition("Technical Skills")[0]
    return section.strip()

def extract_experience(text):
    """Return the text after the first "Experience" in *text*, or None.

    The section ends at whichever of the known following-section markers
    appears earliest after "Experience"; with no marker it runs to the end
    of *text*. Matching is case-sensitive. The result is stripped of
    surrounding whitespace.
    """
    # partition splits on the FIRST occurrence only, so the word appearing
    # again inside the section does not truncate it.
    _, found, after = text.partition("Experience")
    if not found:
        return None

    end_markers = ("Community Outreach", "Awards", "Projects", "Publications", "Workshops")
    # Cut at the earliest marker in the text, not the first one in the tuple:
    # otherwise a "Projects" section that precedes "Awards" would leak into
    # the result.
    positions = [pos for pos in (after.find(marker) for marker in end_markers) if pos != -1]
    if positions:
        after = after[:min(positions)]
    return after.strip()

import os

def process_resumes_in_folder(folder_path: str) -> None:
    """Print the fields extracted from every PDF resume in *folder_path*.

    Files are visited in sorted name order. For each one the name, email,
    phone number, education and work-experience fields are extracted and
    printed under a heading per field.
    """
    for file_name in sorted(os.listdir(folder_path)):
        # Only PDFs are handled: the text extractor used below is
        # extract_text_from_pdf, so other formats (including .docx, which
        # was previously let through) are skipped.
        if not file_name.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"\n📄 Processing: {file_name}")
        text = extract_text_from_pdf(file_path)
        info = {
            "Name": extract_name(text),
            "Email": extract_email(text),
            "Phone Number": extract_phone(text),
            "Education": extract_education(text),
            "Work Experience": extract_experience(text),
        }
        for key, value in info.items():
            print(f"\n=== {key} ===")
            print(value)
            
# process_resumes_in_folder("../test-resume")

In [6]:
import os
import re
import fitz  # PyMuPDF
import docx2txt
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at module level; extract_name
# and extract_entities_by_label call this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

def extract_text(file_path):
    """Return the plain text of the resume at *file_path*.

    The format is chosen from the file extension (case-insensitive):
    ".pdf" is read page by page and the page texts are joined with
    newlines; ".docx" and ".doc" are handed to docx2txt.

    Raises:
        ValueError: if the extension is none of the above.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        # Context manager releases the document handle even if a page fails
        # to extract.
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    if ext in (".docx", ".doc"):
        # NOTE(review): legacy binary .doc files are presumably not readable
        # by docx2txt -- confirm before relying on this branch for them.
        return docx2txt.process(file_path)
    raise ValueError("Unsupported file format")

def extract_email(text):
    """Return the first email-like substring of *text*, or None if absent."""
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    hit = email_pattern.search(text)
    return None if hit is None else hit.group(0)

def extract_phone(text):
    """Return the first phone-number-like substring of *text*, or None.

    A candidate is an optional "+", a digit, then at least eight more
    characters drawn from digits, "-", "(", ")" and whitespace other than
    line breaks. The result is stripped of surrounding whitespace.
    """
    # [^\S\r\n] is "whitespace except CR/LF": the number must sit on a single
    # line, so the match can neither swallow trailing blank lines nor run on
    # into digits at the start of the next line.
    match = re.search(r'\+?\d(?:[\d\-\(\)]|[^\S\r\n]){8,}', text)
    return match.group(0).strip() if match else None

def extract_name(text):
    """Return the first PERSON entity in the first 1000 characters, or None.

    Only the leading 1000 characters of *text* are passed to the `nlp`
    pipeline; the entity text is returned stripped of surrounding whitespace.
    """
    head = nlp(text[:1000])
    people = (ent.text.strip() for ent in head.ents if ent.label_ == "PERSON")
    return next(people, None)

def extract_entities_by_label(text, label):
    """Return the distinct entity strings in *text* whose label equals *label*.

    Entity texts are stripped before de-duplication. The list is built from
    a set, so its order is not defined.
    """
    parsed = nlp(text)
    unique_texts = set()
    for ent in parsed.ents:
        if ent.label_ == label:
            unique_texts.add(ent.text.strip())
    return list(unique_texts)

def extract_section_between_keywords(text, start_keywords, stop_keywords):
    """Return the lines between a start-keyword line and a stop-keyword line.

    Lines are scanned in order. The first line whose lowercased, stripped
    text contains any of *start_keywords* opens the section (that line itself
    is not included). Following lines are collected, stripped, until one
    contains any of *stop_keywords* or the text ends.

    Keywords are matched as plain substrings and are expected in lowercase.

    Returns:
        The collected lines joined with newlines and stripped, or None when
        no start line was found or the section contains only whitespace.
    """
    collecting = False
    collected = []

    for line in text.splitlines():
        stripped = line.strip()
        lower = stripped.lower()
        if not collecting:
            # Start keywords are only consulted until the section opens;
            # checking them again afterwards would silently drop content
            # lines that merely mention the keyword (e.g. "5 years of
            # experience with Python" inside an experience section).
            if any(start in lower for start in start_keywords):
                collecting = True
            continue
        if any(stop in lower for stop in stop_keywords):
            break
        collected.append(stripped)

    # A section made up solely of blank lines counts as "not found".
    section = "\n".join(collected).strip()
    return section or None

def extract_education(text):
    """Return the education section of *text*, or whatever the helper yields.

    Delegates to extract_section_between_keywords with the education start
    keywords and the keywords of the sections expected to follow it.
    """
    section_starts = ['education', 'academic background']
    section_stops = ['skills', 'experience', 'projects', 'certifications', 'awards', 'workshops']
    return extract_section_between_keywords(text, section_starts, section_stops)

def extract_experience(text):
    """Return the work-experience section of *text*, or whatever the helper yields.

    Delegates to extract_section_between_keywords with the experience start
    keywords and the keywords of the sections expected to follow it.
    """
    opening = ['experience', 'professional experience', 'work experience']
    closing = ['skills', 'projects', 'education', 'certifications', 'publications', 'workshops']
    return extract_section_between_keywords(
        text, start_keywords=opening, stop_keywords=closing
    )


def parse_resume(file_path):
    """Extract the text of the resume at *file_path* and return its fields.

    Returns a dict with the keys "Name", "Email", "Phone Number",
    "Education" and "Work Experience", in that order, each holding the
    result of the corresponding extractor.
    """
    text = extract_text(file_path)

    # (label, extractor) pairs, applied in this order.
    extractors = (
        ("Name", extract_name),
        ("Email", extract_email),
        ("Phone Number", extract_phone),
        ("Education", extract_education),
        ("Work Experience", extract_experience),
    )
    return {label: extract(text) for label, extract in extractors}

# -----------------------------
# Example: Run this on a file
# -----------------------------
# if __name__ == "__main__":
#     # Provide the path to the resume file here
#     file_path = "../test-resume/Siddharth_Reddy_Resume.pdf"  # Update with your actual file name

#     if not os.path.exists(file_path):
#         print(f"Error: File '{file_path}' not found.")
#     else:
#         print(f"\n--- Parsing: {os.path.basename(file_path)} ---")
#         result = parse_resume(file_path)
#         for key, val in result.items():
#             print(f"\n{key}:\n{val if val else 'Not found'}")


import os

def process_resumes_in_folder(folder_path: str):
    """Parse every .pdf / .docx file in *folder_path* and print its fields.

    Files are handled in sorted name order; the extension check is
    case-insensitive. A field with a falsy value is printed as 'Not found'.
    """
    resume_names = [
        entry
        for entry in sorted(os.listdir(folder_path))
        if entry.lower().endswith(('.pdf', '.docx'))
    ]
    for file_name in resume_names:
        print(f"\n📄 Processing: {file_name}")
        fields = parse_resume(os.path.join(folder_path, file_name))
        for key, val in fields.items():
            print(f"\n{key}:\n{val if val else 'Not found'}")
            
# Run the batch parser over the "../test-resume" folder (a relative path, so
# it is resolved against the current working directory).
process_resumes_in_folder("../test-resume")


📄 Processing: Achyuth_Resume.docx

Name:
Achyuth Rao

Email:
rsda.sada-8@gmail.com

Phone Number:
+1 (124)-456-1232 





Education:
Not found

Work Experience:
Expertise in Amazon AWS Cloud services like EC2, S3, EBS, VPC, ELB, AMI, SNS, RDS, IAM, Route 53, Glacier, Kinesis, Auto scaling, CloudFront, CloudWatch, CloudTrail, Cloud Formation, Elastic Beanstalk, OPS Work, Amazon Lambda, Security Groups.

Using Clover ETL migrated data to AWS Redshift and used AWS Beanstalk for fast deploying, scaling, and load balancing of web applications and services developed with Java, PHP, Node.js, Python, Ruby, and Docker on familiar web servers such as Apache, and IIS.


Expert in Orchestration & Migration of CI/CD processes using CloudFormation, Terraform Templates & Containerization of Infrastructure using Docker, which was set up in, AWS, and VPC.

Expertise in working with Terraform key features such as Infrastructure as a code (IaaS), Execution plans, Resource Graphs, Change Automation and e