In [14]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

def extract_text_from_pdf(pdf_path):
    """
    Extract text directly from a PDF without Poppler.
    Uses PyMuPDF (fitz) for text extraction.
    """
    text = ""
    try:
        # Open the PDF file
        doc = fitz.open(pdf_path)
        
        for page in doc:
            text += page.get_text("text")  # Extracts text directly

        if not text.strip():
            print("⚠️ No text found! Trying OCR on images...")

            for page_num in range(len(doc)):
                img = page_to_image(doc, page_num)
                extracted_text = pytesseract.image_to_string(img)
                text += extracted_text

        return text.strip() if text else "OCR Failed"
    
    except Exception as e:
        print(f"❌ Error extracting text: {e}")
        return "OCR Failed"

def page_to_image(doc, page_num):
    """
    Converts a PDF page to an image for OCR.
    """
    page = doc[page_num]
    pix = page.get_pixmap()  # Get pixel map of the page
    img = Image.open(io.BytesIO(pix.tobytes("ppm")))
    return img


In [15]:
import re

import re

def clean_name(text):
    """
    Extracts the full name correctly by removing unnecessary words.
    """
    match = re.search(r"Name[:\s]+([A-Za-z\s]+)", text)
    if match:
        name = match.group(1).strip()
        return " ".join(name.split("\n"))  # Remove newlines
    return "OCR Failed"

def extract_date(text):
    """
    Extracts date using multiple formats.
    Supports:
    - DD-MM-YYYY
    - MM/DD/YYYY
    - YYYY-MM-DD
    """
    match = re.search(r'(\d{2}-\d{2}-\d{4}|\d{2}/\d{2}/\d{4}|\d{4}-\d{2}-\d{2})', text)
    return match.group(1) if match else "OCR Failed"



def extract_ielts_details(text):
    """
    Extract IELTS details from OCR text.
    """
    name_match = re.search(r'Name[:\s]+([A-Za-z\s]+)', text)
    dob_match = re.search(r'Date of Birth[:\s]+(\d{2}-\d{2}-\d{4})', text)
    listening_match = re.search(r'Listening[:\s]+(\d+\.\d+)', text)
    reading_match = re.search(r'Reading[:\s]+(\d+\.\d+)', text)
    writing_match = re.search(r'Writing[:\s]+(\d+\.\d+)', text)
    speaking_match = re.search(r'Speaking[:\s]+(\d+\.\d+)', text)

    return {
        "name": clean_name(text),
        "dob": extract_date(text),
        "listening": float(re.search(r'Listening[:\s]+(\d+\.\d+)', text).group(1)) if re.search(r'Listening[:\s]+(\d+\.\d+)', text) else "OCR Failed",
        "reading": float(re.search(r'Reading[:\s]+(\d+\.\d+)', text).group(1)) if re.search(r'Reading[:\s]+(\d+\.\d+)', text) else "OCR Failed",
        "writing": float(re.search(r'Writing[:\s]+(\d+\.\d+)', text).group(1)) if re.search(r'Writing[:\s]+(\d+\.\d+)', text) else "OCR Failed",
        "speaking": float(re.search(r'Speaking[:\s]+(\d+\.\d+)', text).group(1)) if re.search(r'Speaking[:\s]+(\d+\.\d+)', text) else "OCR Failed",
    }

def extract_gre_details(text):
    """
    Extract GRE details from OCR text.
    """
    test_date_match = re.search(r'Test Date[:\s]+(\d{2}-\d{2}-\d{4})', text)
    verbal_match = re.search(r'Verbal Reasoning[:\s]+(\d+)', text)
    verbal_percentile_match = re.search(r'Verbal Reasoning Percentile[:\s]+(\d+)', text)
    quant_match = re.search(r'Quantitative Reasoning[:\s]+(\d+)', text)
    quant_percentile_match = re.search(r'Quantitative Reasoning Percentile[:\s]+(\d+)', text)
    awa_match = re.search(r'Analytical Writing[:\s]+(\d+\.\d+)', text)
    awa_percentile_match = re.search(r'Analytical Writing Percentile[:\s]+(\d+)', text)

    return {
        "test_date": test_date_match.group(1) if test_date_match else "OCR Failed",
        "verbal_score": int(verbal_match.group(1)) if verbal_match else "OCR Failed",
        "verbal_percentile": int(verbal_percentile_match.group(1)) if verbal_percentile_match else "OCR Failed",
        "quant_score": int(quant_match.group(1)) if quant_match else "OCR Failed",
        "quant_percentile": int(quant_percentile_match.group(1)) if quant_percentile_match else "OCR Failed",
        "awa_score": float(awa_match.group(1)) if awa_match else "OCR Failed",
        "awa_percentile": int(awa_percentile_match.group(1)) if awa_percentile_match else "OCR Failed",
    }


In [16]:
# File Paths (Update with your actual file paths)
ielts_pdf_path = "IELTS.pdf"
gre_pdf_path = "gre.pdf"

# Extract Text
ielts_text = extract_text_from_pdf(ielts_pdf_path)
gre_text = extract_text_from_pdf(gre_pdf_path)

# Extract Details
ielts_details = extract_ielts_details(ielts_text)
gre_details = extract_gre_details(gre_text)

print("\n📄 **Extracted IELTS Details:**")
print(ielts_details)

print("\n📄 **Extracted GRE Details:**")
print(gre_details)


⚠️ No text found! Trying OCR on images...

📄 **Extracted IELTS Details:**
{'name': 'SYED\nFirst Name\nNOORE RASUL\nCandidate ID\nX', 'dob': 'OCR Failed', 'listening': 8.5, 'reading': 9.0, 'writing': 7.0, 'speaking': 7.0}

📄 **Extracted GRE Details:**
{'test_date': 'OCR Failed', 'verbal_score': 'OCR Failed', 'verbal_percentile': 'OCR Failed', 'quant_score': 'OCR Failed', 'quant_percentile': 'OCR Failed', 'awa_score': 'OCR Failed', 'awa_percentile': 'OCR Failed'}
