<a href="https://colab.research.google.com/github/murathan9165/resume-parser-extern/blob/main/MK_Resume_Parser_with_PyMuPDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Resume Parser with PyMuPDF

## Step 1: Setup Environment

In [None]:
!pip install pymupdf

Upload the resume file (PDF)

In [None]:
# Colab-specific helper module; it is only available inside Google Colab.
from google.colab import files
# Start the upload and keep whatever the call returns for later inspection.
# NOTE(review): presumably a mapping of uploaded filename -> file contents;
# confirm against the Colab documentation.
uploaded = files.upload()

In [None]:
import os

# List the current working directory so the uploaded file can be confirmed
# by eye before it is opened.
cwd_entries = os.listdir(".")
print(cwd_entries)


## Step 2: Inspecting the PDF

In [None]:
import fitz

In [None]:
# File name of the resume PDF, resolved relative to the working directory.
resume_path = "MurathanKocaman_Resume_Sep10_wFin.pdf"

# Open the document; `doc` is the handle the following cells read from.
doc = fitz.open(resume_path)

# Report how many pages the document contains.
print(f"Total Pages: {doc.page_count}")

# Show the document metadata. It is left as the last expression of the cell
# so the notebook renders the value below the printed header.
print("PDF Metadata:")
doc.metadata


## Step 3: Extracting Text from a PDF

### Extracting Raw Text from a Page

In [None]:
# Load page 1 of the document (page indices are zero-based).
page = doc.load_page(0)

# Pull the page content out as a single plain-text string and show it.
text = page.get_text("text")
print(text)

### Extracting Text with Bounding Box Positions

In [None]:
# "words" mode yields one entry per word, each carrying its bounding box.
words = page.get_text("words")

# Preview only the first few entries to keep the output short.
preview_count = 10
for word in words[:preview_count]:
    print(word)

### Extracting Text Block-by-Block (Paragraph-Level)

In [None]:
# Extract the content of `page` in "blocks" mode, i.e. grouped at a coarser,
# paragraph-like level instead of word by word.
blocks = page.get_text("blocks")

# Print every extracted block on its own line, prefixed with "Block: ".
for block in blocks:
    print(f"Block: {block}")

## Step 4: Extracting Text from All Pages in a PDF

In [None]:
# Walk every page in document order and print its plain text under a header.
# Loop-local names are used on purpose: reusing `page` / `text` here would
# rebind the first-page variables from Step 3 to the last page's values, so
# re-running those cells afterwards would silently inspect the wrong page.
for page_num, current_page in enumerate(doc):
    page_text = current_page.get_text("text")
    print(f"--- Page {page_num + 1} ---")  # headers are 1-based for readers
    print(page_text)

## Step 5: Extracting Key Information from a PDF

In [None]:
import re

# The plain text of the first page is the search space.
text = doc[0].get_text("text")

# Pattern: three digits optionally wrapped in parentheses, then three digits,
# then four digits; each group may be separated by "-", "." or whitespace.
phone_pattern = re.compile(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}")
phone_match = phone_pattern.search(text)

# Only the first match on the page is reported.
if phone_match is None:
    print("Phone number not found.")
else:
    phone_number = phone_match.group()
    print(f"Candidate Phone Number: {phone_number}")

In [None]:
# Extract raw text from the first page
text = doc[0].get_text("text")

# Match a conventional address: local part, "@", dot-separated domain labels
# and an alphabetic top-level domain. A bare \S+@\S+\.\S+ would also swallow
# any punctuation or neighbouring text glued to the address (e.g. a trailing
# "," or a "|linkedin.com/..." suffix with no space in between).
email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)

# Always bind `email_address` (None when nothing matched) so a stale value
# from an earlier run against a different PDF can never leak into later cells.
email_address = email_match.group() if email_match else None

if email_address:
    print(f"Candidate Email Address: {email_address}")
else:
    print("Email address not found.")


## Step 6: Extracting Bounding Boxes for Specific Fields

In [None]:
# Words of the first page; each entry unpacks as
# (x0, y0, x1, y1, word, block_no, line_no, word_no).
words = doc[0].get_text("words")

# The field to locate: the email address extracted in Step 5.
target_word = email_address

if not target_word:
    # Guard against an empty/None target so the loop below never calls
    # .lower() on a non-string value.
    print("No target word to search for.")
else:
    needle = target_word.lower()  # hoisted: case-folded once, not per word
    match_found = False

    # Only the box and the word string are needed; the trailing index fields
    # are discarded. Distinct names keep the notebook-level `text`, `block`
    # and `line` variables from being overwritten by this loop.
    for x0, y0, x1, y1, word_text, *_ in words:
        if needle in word_text.lower():  # case-insensitive substring match
            print(f"Found '{target_word}' at: ({x0}, {y0}, {x1}, {y1})")
            match_found = True

    # Make a miss visible instead of ending the cell silently.
    if not match_found:
        print(f"'{target_word}' not found in document.")

## Step 7: Displaying the Bounding Boxes on the Page Image

In [None]:
import cv2
import numpy as np
from PIL import Image

# Render the first page to a pixmap and wrap it as an RGB numpy array.
# alpha=False guarantees exactly three bytes per pixel, which is what the
# "RGB" mode passed to Image.frombytes expects.
pix = doc[0].get_pixmap(alpha=False)
img = np.array(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))

# OpenCV drawing functions work in BGR channel order.
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

# Words of the first page; each entry unpacks as
# (x0, y0, x1, y1, word, block_no, line_no, word_no).
words = doc[0].get_text("words")

# Define the field we are looking for
target_word = "murathankocaman@gmail.com"  # Replace with actual email

# Tracks whether at least one word matched, for the message below.
word_found = False

# Distinct loop names keep the notebook-level `text`, `block` and `line`
# variables from being overwritten; the trailing index fields are discarded.
for wx0, wy0, wx1, wy1, word_text, *_ in words:
    if target_word in word_text:  # Case-sensitive match (modify if needed)
        # PyMuPDF reports page coordinates with the origin at the top-left
        # and y growing downwards, the same convention OpenCV uses, so no
        # vertical flip is needed. The pixmap above is rendered without a
        # zoom matrix, so page units map to pixels one-to-one; scale the
        # coordinates here if a zoom matrix is ever passed to get_pixmap.
        left, top, right, bottom = map(int, (wx0, wy0, wx1, wy1))

        # Outline the word in green with a 2-pixel border.
        cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)

        print(f"Found '{target_word}' at: ({left}, {top}, {right}, {bottom})")
        word_found = True

# Report a miss; the (unannotated) page image is still displayed below.
if not word_found:
    print(f"'{target_word}' not found in document.")

# Convert back to RGB, the channel order PIL expects.
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Display the image using PIL (works in Jupyter/Colab)
display(Image.fromarray(img_rgb))