In [None]:
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import os

# OCR function
def extract_text_from_pdf(pdf_path, poppler_path=None):
    """Run OCR over every page of a PDF and return the combined text.

    Args:
        pdf_path: Path of the PDF file to rasterize and read.
        poppler_path: Optional directory holding the Poppler binaries;
            forwarded unchanged to ``convert_from_path``.

    Returns:
        The recognized text of all pages in page order, each page's text
        preceded by a newline. An empty string if the PDF yields no pages.
    """
    pages = convert_from_path(pdf_path, poppler_path=poppler_path)
    # The rendered pages are already PIL images, which pytesseract accepts
    # directly. Passing them as-is avoids writing fixed-name "page_N.png"
    # files into the working directory (which could overwrite user files,
    # leak on error, and left the reopened image handle unclosed).
    return "".join(
        f"\n{pytesseract.image_to_string(page)}" for page in pages
    )

# Parser function
def parse_resume_text(text):
    """Extract basic contact details from the raw text of a resume.

    All fields are found with simple heuristics / regular expressions and
    are ``None`` when nothing matches.

    Args:
        text: Full resume text (e.g. OCR output). ``None`` is treated as
            an empty string.

    Returns:
        A dict with the keys ``"Name"``, ``"Email"``, ``"Phone"``,
        ``"LinkedIn"`` and ``"GitHub"``, each mapping to the matched
        string or ``None``.
    """
    text = text or ""
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Characters that commonly trail a URL in running text ("...,", "...)")
    # but are not part of it.
    trailing_punct = '.,;:!?)]}>|\'"'

    # 1. Name = first non-empty line (rough heuristic)
    name = lines[0] if lines else None

    # 2. Email: the domain must end in a dotted label, so a sentence-ending
    #    "." or "," after the address is not captured.
    email_match = re.search(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', text)
    email = email_match.group() if email_match else None

    # 3. Phone (Egyptian mobile: optional +20 / 0020 / 0 prefix, then 1 and
    #    nine digits). The digit look-arounds stop the pattern from matching
    #    a slice of a longer number such as an ID or account number.
    phone_match = re.search(
        r'(?<!\d)(?:\+20|0020|0)?1[0-9]{9}(?!\d)', text
    )
    phone = phone_match.group() if phone_match else None

    # 4. LinkedIn (host name is case-insensitive)
    linkedin_match = re.search(
        r'(?:https?://)?(?:www\.)?linkedin\.com/in/\S+', text, re.IGNORECASE
    )
    linkedin = (
        linkedin_match.group().rstrip(trailing_punct)
        if linkedin_match else None
    )

    # 5. GitHub (host name is case-insensitive)
    github_match = re.search(
        r'(?:https?://)?(?:www\.)?github\.com/\S+', text, re.IGNORECASE
    )
    github = (
        github_match.group().rstrip(trailing_punct)
        if github_match else None
    )

    return {
        "Name": name,
        "Email": email,
        "Phone": phone,
        "LinkedIn": linkedin,
        "GitHub": github
    }

# Main
if __name__ == "__main__":
    # Create the Tk root only to host the file dialog: hide it while the
    # dialog is open and destroy it afterwards so no window is left behind.
    root = Tk()
    root.withdraw()
    pdf_file = askopenfilename(filetypes=[("PDF files", "*.pdf")])
    root.destroy()

    if pdf_file:
        # Poppler location, in order of preference: the POPPLER_PATH
        # environment variable, the conventional Windows install directory
        # if it exists, otherwise None so pdf2image looks on the system PATH.
        default_poppler = r"C:\poppler\Library\bin"  # adjust if needed
        poppler_path = os.environ.get("POPPLER_PATH") or (
            default_poppler if os.path.isdir(default_poppler) else None
        )

        text = extract_text_from_pdf(pdf_file, poppler_path=poppler_path)
        structured_info = parse_resume_text(text)

        print("\n--- STRUCTURED INFO ---")
        for key, value in structured_info.items():
            print(f"{key}: {value}")
    else:
        print("No file selected.")



--- STRUCTURED INFO ---
Name: Hazem Omar Mohamed Atwa
Email: hazematwwa@gmai.com
Phone: 01118559305
LinkedIn: linkedin.com/in/hazem-omar
GitHub: None
