In [4]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import re

# ✅ Extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, falling back to OCR when it has no text layer.

    The embedded text of every page is read with PyMuPDF first. If that
    yields nothing but whitespace, each page is rendered to an image via
    ``page_to_image`` and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text with surrounding whitespace removed, or the
        sentinel string ``"OCR Failed"`` when no text could be obtained or
        any error occurred (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even if extraction fails.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text("text") for page in doc)

            if not text.strip():
                print(f"⚠️ No text found in {pdf_path}! Trying OCR on images...")
                text = "".join(
                    pytesseract.image_to_string(page_to_image(doc, page_num))
                    for page_num in range(len(doc))
                )
    except Exception as e:
        # Best-effort by design: report the problem and return the sentinel.
        print(f"❌ Error extracting text: {e}")
        return "OCR Failed"

    # Strip before testing so whitespace-only output also maps to the sentinel.
    text = text.strip()
    return text if text else "OCR Failed"

# ✅ Convert PDF page to image for OCR
def page_to_image(doc, page_num, dpi=300):
    """Render one page of an open PyMuPDF document as a PIL image for OCR.

    Args:
        doc: An open document, as returned by ``fitz.open``.
        page_num: Zero-based index of the page to render.
        dpi: Target rendering resolution. The default of 300 is used because
            the 72 DPI produced by a bare ``get_pixmap()`` is too coarse for
            reliable OCR.

    Returns:
        A ``PIL.Image.Image`` holding the rendered page.
    """
    page = doc[page_num]
    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    zoom = dpi / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    img = Image.open(io.BytesIO(pix.tobytes("ppm")))
    return img

# ✅ Extract only required IELTS fields
def extract_ielts_details(text):
    """Pull the IELTS fields of interest out of raw report text.

    Each field is located with a case-insensitive regular expression and
    stored only when found, so the result contains just the fields that were
    actually present in ``text``.

    Args:
        text: Raw text extracted from the report. ``None`` or an empty string
            yields an empty result.

    Returns:
        A dict with any of the keys ``candidate_id``, ``dob``,
        ``center_number``, ``test_date``, ``listening``, ``reading``,
        ``writing`` and ``speaking``. All values are strings exactly as they
        appear in the text.
    """
    if not text:
        return {}

    date = r'(\d{2}/\d{2}/\d{4})'
    identifier = r'([A-Za-z0-9]+)'
    # The fractional part is optional so whole-number bands ("7") are kept.
    band = r'(\d+(?:\.\d+)?)'

    field_patterns = {
        "candidate_id": r'Candidate ID[:\s]*' + identifier,
        "dob": r'Date of Birth[:\s]*' + date,
        # Accept both the British and the American spelling of the label.
        "center_number": r'Cent(?:re|er) Number[:\s]*' + identifier,
        "test_date": r'Test Date[:\s]*' + date,
        "listening": r'Listening[:\s]+' + band,
        "reading": r'Reading[:\s]+' + band,
        "writing": r'Writing[:\s]+' + band,
        "speaking": r'Speaking[:\s]+' + band,
    }

    extracted_data = {}
    for key, pattern in field_patterns.items():
        # Every label is matched case-insensitively for consistency.
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            extracted_data[key] = match.group(1)  # Keep as string

    return extracted_data  # ✅ Return only extracted fields

# ✅ File Path for IELTS PDF
ielts_pdf_path = "IELTS.pdf"

# Read the raw text from the PDF, then parse the IELTS fields out of it
ielts_text = extract_text_from_pdf(ielts_pdf_path)
ielts_details = extract_ielts_details(ielts_text)

# Show the header followed by the parsed fields on the next line
print("\n📄 **Extracted IELTS Details:**", ielts_details, sep="\n")


ModuleNotFoundError: No module named 'frontend'

In [2]:
pip install fitz

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting httplib2 (from fitz)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nibabel (from fitz)
  Downloading nibabel-5.3.2-py3-none-any.whl.metadata (9.1 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting importlib-resources>=5.12 (from nibabel->fitz)
  Using cached importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.1.3-py3-none-any.whl.metadata (11 kB)
Collecting simplejson>=3.8.0 (from nipype->fitz)
  Downloading simplejson-3.20.1-cp311-cp


[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: C:\Users\NIHARIKA\anaconda3\python.exe -m pip install --upgrade pip
