In [2]:
pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.0


In [5]:
import fitz  # PyMuPDF
import re
import os
import pandas as pd

# Step 1: Extract raw text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# Step 2: Normalize extracted text
def normalize_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    return text

# Step 3: Extract fields using regex and keyword rules
def extract_fields(text):
    fields = {
        "Name": "",
        "Email": "",
        "Education": [],
        "Skills": []
    }

    lines = text.split('\n')
    if lines:
        fields["Name"] = lines[0].strip()

    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    email_match = re.search(email_pattern, text)
    if email_match:
        fields["Email"] = email_match.group()

    education_keywords = ['Education', 'Academic', 'Degree', 'University', 'College']
    education_lines = []
    for line in lines:
        if any(keyword in line for keyword in education_keywords):
            education_lines.append(line.strip())
        elif education_lines and re.match(r'^\d{4}\s*-?\s*\d{4}?$', line):
            education_lines[-1] += f" ({line.strip()})"
    fields["Education"] = education_lines if education_lines else ["Not found"]

    skills_keywords = ['Skills', 'Technical Skills', 'Proficiencies']
    skills_lines = []
    capture_skills = False
    for line in lines:
        if any(keyword in line for keyword in skills_keywords):
            capture_skills = True
            continue
        if capture_skills:
            if any(keyword in line for keyword in ['Experience', 'Work', 'Education']):
                break
            skills = [skill.strip() for skill in line.split(',') if skill.strip()]
            skills_lines.extend(skills)
    fields["Skills"] = skills_lines if skills_lines else ["Not found"]

    return fields

# Step 4: Structure output as dictionary
def structure_output(fields):
    return {
        "Name": fields["Name"],
        "Email": fields["Email"],
        "Education": fields["Education"],
        "Skills": fields["Skills"]
    }

# Step 5: Process multiple resumes and save to CSV
def process_resumes(pdf_folder, output_csv):
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    results = []
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            print(f"Processing {pdf_file}...")

            raw_text = extract_text_from_pdf(pdf_path)
            normalized_text = normalize_text(raw_text)

            fields = extract_fields(normalized_text)
            structured_data = structure_output(fields)

            structured_data["File"] = pdf_file
            results.append(structured_data)

    df = pd.DataFrame(results)
    df['Education'] = df['Education'].apply(lambda x: '; '.join(x) if isinstance(x, list) else x)
    df['Skills'] = df['Skills'].apply(lambda x: '; '.join(x) if isinstance(x, list) else x)
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

# Run the parser
pdf_folder = "data/input"
output_csv = "data/output/resume_data.csv"
process_resumes(pdf_folder, output_csv)

Processing Resume6.pdf...
Processing Resume1.pdf...
Processing Resume4.pdf...
Processing Resume2.pdf...
Processing Resume3.pdf...
Processing Resume5.pdf...
Processing Resume7.pdf...
Results saved to data/output/resume_data.csv
