In [3]:
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from tkinter import Tk
from tkinter.filedialog import askopenfilenames
import os

def extract_text_from_pdf(pdf_path, poppler_path=None):
    """OCR every page of a PDF and return the recognised text.

    The pages are rasterised with ``convert_from_path`` and each resulting
    PIL image is handed straight to ``pytesseract.image_to_string``. This
    function therefore no longer writes (and has to clean up) ``page_N.png``
    files in the current working directory.

    Args:
        pdf_path: Path of the PDF file to read.
        poppler_path: Optional directory containing the Poppler binaries;
            forwarded unchanged to ``convert_from_path``.

    Returns:
        The OCR text of all pages in page order, with each page's text
        preceded by a newline. An empty string if no pages are produced.
    """
    pages = convert_from_path(pdf_path, poppler_path=poppler_path)
    # join() instead of repeated "+=" keeps concatenation linear in the
    # total amount of text.
    return "".join(f"\n{pytesseract.image_to_string(page)}" for page in pages)

# E-mail: the local part may contain "+" (sub-addressing); the domain is a
# run of dot-separated labels, so a sentence-ending "." is not captured.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)*')

# Phone: optional "+20" / "0020" / "0" prefix followed by "1" and nine more
# digits. Up to three separator characters (space, tab, parentheses, dot,
# hyphen) may sit between any two digits, so groupings such as
# "010 0443 2183" and "0100-443-2183" are both accepted. The look-arounds
# stop the pattern from matching inside a longer run of digits, and the
# separators after the prefix are only consumed when a prefix is present so
# the match never starts with whitespace or "(".
_PHONE_RE = re.compile(
    r'(?<!\d)'
    r'(?:(?:(?:\+|00)20|0)[ \t.(-]{0,3})?'
    r'1(?:[ \t().-]{0,3}\d){9}'
    r'(?!\d)'
)

# LinkedIn profile URL: scheme and sub-domain ("www.", "eg.", ...) are
# optional, matching is case-insensitive, and the profile slug is limited to
# word characters, "-" and "%" so trailing punctuation is left out.
_LINKEDIN_RE = re.compile(
    r'(?:https?://)?(?:[a-z]{2,3}\.)?linkedin\.com/in/[\w%-]+/?',
    re.IGNORECASE,
)


def parse_resume_text(text):
    """Extract basic contact details from the plain text of a resume.

    Args:
        text: The resume text (for example OCR output).

    Returns:
        A dict with the keys ``"Name"``, ``"Email"``, ``"Phone"`` and
        ``"LinkedIn"``. ``"Name"`` is the first non-blank line of ``text``
        with surrounding whitespace removed; the other values are the first
        substring of ``text`` matching the corresponding pattern, returned
        exactly as written in the text. Any field that cannot be found is
        ``None``.
    """
    # Heuristic: the first non-empty line is taken to be the candidate name.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    name = lines[0] if lines else None

    email_match = _EMAIL_RE.search(text)
    phone_match = _PHONE_RE.search(text)
    linkedin_match = _LINKEDIN_RE.search(text)

    return {
        "Name": name,
        "Email": email_match.group() if email_match else None,
        "Phone": phone_match.group() if phone_match else None,
        "LinkedIn": linkedin_match.group() if linkedin_match else None,
    }

if __name__ == "__main__":
    # Poppler location: honour the POPPLER_PATH environment variable and
    # fall back to the original Windows install directory. If that directory
    # does not exist on this machine, pass None instead so pdf2image can
    # look for the Poppler binaries on PATH.
    poppler_dir = os.environ.get("POPPLER_PATH", r"C:\poppler\Library\bin")
    if not os.path.isdir(poppler_dir):
        poppler_dir = None

    # The file dialog needs a Tk root window; keep it hidden and destroy it
    # once the dialog has returned so it does not linger for the rest of the
    # session.
    root = Tk()
    root.withdraw()
    try:
        pdf_files = askopenfilenames(filetypes=[("PDF files", "*.pdf")])
    finally:
        root.destroy()

    if pdf_files:
        for pdf_file in pdf_files:
            file_name = os.path.basename(pdf_file)
            print(f"\n📄 Processing: {file_name}")
            # Top-level boundary: report a failing file and carry on with
            # the rest of the batch instead of aborting on the first error.
            try:
                text = extract_text_from_pdf(pdf_file, poppler_path=poppler_dir)
                structured_info = parse_resume_text(text)
            except Exception as exc:
                print(f"Failed to process {file_name}: {exc}")
                continue

            print("--- STRUCTURED INFO ---")
            for key, value in structured_info.items():
                print(f"{key}: {value}")
    else:
        print("No files selected.")



📄 Processing: 5- Kyrillos Ragaey (RMEHRO).pdf
--- STRUCTURED INFO ---
Name: Kyrillos Ragaey Fawzy
Email: engkyroles@gmail.com
Phone: None
LinkedIn: None

📄 Processing: 6- Hosny F Alqattan (RMEHRO).pdf
--- STRUCTURED INFO ---
Name: Hosny Fathallah Algattan, Ph.D., P.E-co, PMP
Email: hosnyfh.alqattan@gmail.com
Phone: +201005012510
LinkedIn: https://www.linkedin.com/in/hosny-fathalla-ph-d-p-e-pmp-8528b79b

📄 Processing: 8- Mohamed  Waheeb, Resume.pdf
--- STRUCTURED INFO ---
Name: Mohamed Adel Waheeb
Email: waheeb.mohamed@yahoo.com
Phone: None
LinkedIn: https://www.linkedin.com/in/mohamed-waheeb-pmp-8980a392/

📄 Processing: 9- Hossam Ahmed Abd Elghany (RMEHRO).pdf
--- STRUCTURED INFO ---
Name: Hossam Ahmed Abd Elghany
Email: hossamabdelghanyO60@gmail.com
Phone: 01004432183
LinkedIn: https://www.linkedin.com/in/hossam-ahmed-b0027686/

📄 Processing: Abdelrahman-Ibrahim (1).pdf
--- STRUCTURED INFO ---
Name: Personal Information:
Email: Abdelrahman3597@Gmail.com
Phone: 01111001598
LinkedIn: h