In [1]:
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
import matplotlib.pyplot as plt
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance
import numpy as np

In [2]:
from pdf2image import convert_from_path
import os

pdfs = r"Sample Problem.pdf"

output_dir = 'pages'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

pages = convert_from_path(pdfs, 350)

for i, page in enumerate(pages, start=1):
    image_name = os.path.join(output_dir, f'Page_{i}.png')
    page.save(image_name, "PNG")

In [4]:
import os
from PIL import Image
import cv2
import numpy as np
from pytesseract import pytesseract
from PIL import ImageEnhance

def enhance_image_before_extracting_data(image_path):
    pil_image = Image.open(image_path)
    opencv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    
    height, width = opencv_image.shape[:2]
    opencv_image = cv2.resize(opencv_image, (width*2, height*2), interpolation=cv2.INTER_CUBIC)
    
    kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    sharpened = cv2.filter2D(opencv_image, -1, kernel)
    
    pil_image = Image.fromarray(cv2.cvtColor(sharpened, cv2.COLOR_BGR2RGB))
    
    enhancer = ImageEnhance.Contrast(pil_image)
    enhanced_image = enhancer.enhance(2.0)
    
    return enhanced_image

def extract_data_from_image(pil_image):
    image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3,3), 0)
    thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate = cv2.dilate(thresh, kernel, iterations=2)
    contours, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def sort_contours(cnts):
        sorted_cnts = sorted(cnts, key=lambda c: cv2.boundingRect(c)[1])
        tolerance_factor = 10
        row_groups = []
        current_row = [sorted_cnts[0]]
        for contour in sorted_cnts[1:]:
            y = cv2.boundingRect(contour)[1]
            if abs(y - cv2.boundingRect(current_row[0])[1]) < tolerance_factor:
                current_row.append(contour)
            else:
                row_groups.append(current_row)
                current_row = [contour]
        row_groups.append(current_row)
        for row in row_groups:
            row.sort(key=lambda c: cv2.boundingRect(c)[0])
        return [c for row in row_groups for c in row]

    sorted_contours = sort_contours(contours)
    extracted_data = []

    for contour in sorted_contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 100 and h > 20:
            roi = image[y:y+h, x:x+w]
            inner_rois = [
                        (w * 0.02, h * 0.05, w * 0.30, h * 0.13), # number - top-left 
                        (w * 0.7, h * 0.05, w * 0.29, h * 0.14),  # epic number - top-right
                        (w * 0.02, h * 0.22, w * 0.7, h * 0.12), # name
                        (w * 0.02, h * 0.34, w * 0.7, h * 0.12), # relative name
                        (w * 0.02, h * 0.44, w * 0.7, h * 0.21), # house number - age - gender
                        ]
            roi_data = []
            for i, (rx, ry, rw, rh) in enumerate(inner_rois):
                inner_roi = roi[int(ry):int(ry+rh), int(rx):int(rx+rw)]
                inner_roi_gray = cv2.cvtColor(inner_roi, cv2.COLOR_BGR2GRAY)
                _, inner_roi_thresh = cv2.threshold(inner_roi_gray, 120, 255, cv2.THRESH_BINARY)
                text = pytesseract.image_to_string(inner_roi_thresh, config='--psm 6')
                roi_data.append(f"ROI {i+1}: {text.strip()}")
            extracted_data.append(roi_data)

    return extracted_data

pages_dir = 'pages'

all_extracted_data = []

for image_file in os.listdir(pages_dir):
    image_path = os.path.join(pages_dir, image_file)
    
    if image_file.endswith('.png'):
        print(f"Processing {image_file}...")
        enhanced_image = enhance_image_before_extracting_data(image_path)
        extracted_text_data = extract_data_from_image(enhanced_image)
        all_extracted_data.extend(extracted_text_data)

Processing Page_1.png...
Processing Page_2.png...


In [5]:
all_extracted_data

[['ROI 1: 1',
  'ROI 2: TXK6940811',
  'ROI 3: Name : Lina Shaktikurmmar Chavan',
  'ROI 4: Husband s Name: Shaktikumar Chavan',
  'ROI 5: House Number :\nAge : 3? Gender : Female'],
 ['ROI 1: 2?',
  'ROI 2: TXK6467708',
  'ROI 3: Name :‘sagar shashikant jadhav',
  'ROI 4: Father s Name: shashikant jadhav',
  'ROI 5: House Number :\nAge : 33 Gender : Male'],
 ['ROI 1: | 3',
  'ROI 2: TXK6467740',
  'ROI 3: Name : suresh mahadry kurade',
  'ROI 4: Father s Name: mahadry kurade',
  'ROI 5: House Number :\nAge :- 59 Gender : Male'],
 ['ROI 1: 4',
  'ROI 2: TXK6467690',
  'ROI 3: Name : siddharth anil mahajan',
  "ROI 4: Father's Name: anil mahajan",
  'ROI 5: House Number :\nAge : 30 Gender : Mala'],
 ['ROI 1: §',
  'ROI 2: TAK6894930',
  'ROI 3: Name : Savita Sambha) Nanavare',
  'ROI 4: Father s Name: Sambhaji Nanavare',
  'ROI 5: House Number :\nAge : 25 Gender ! Female'],
 ['ROI 1: | 6',
  'ROI 2: TXKO467724',
  'ROI 3: Name : karan subhas oswal',
  'ROI 4: Father s Name: subhas oswal

In [8]:
import pandas as pd
import re

def preprocess_extracted_data(data):

    processed_data = []

    for item in data:

        part_s_no = item[0].replace('ROI 1:', '').strip()

        if re.match(r'^\s*\|?\s*[ES]\s+\d+', part_s_no):
            continue

        epic_no = item[1].replace('ROI 2:', '').strip()

        voter_name_details = item[2].replace('ROI 3:', '').strip()
        voter_name_details = re.sub(r"[‘’]", '', voter_name_details)
        voter_name_details = re.sub(r'=:', '', voter_name_details)
        voter_name_match = re.search(r"Name\s*[:=+\-*>]?\s*['‘]?([a-zA-Z]+(?:\s[a-zA-Z]+)*)['’]?", voter_name_details, re.IGNORECASE)
        voter_full_name = voter_name_match.group(1).upper() if voter_name_match else ''

        relative_name_details = item[3].replace('ROI 4:', '').strip()
        relative_name_details = re.sub(r"[‘’]", '', relative_name_details)
        relative_match = re.search(r"(Father|Hus[bh]and|Others)\s*(?:[''`$]?\s*[sS]\s*Name)?\s*[:$]*\s*(.+?)(?=\n|$)", relative_name_details, re.IGNORECASE)
        relative_name = relative_match.group(2).upper() if relative_match else ''

        if re.search(r"Hus[bh]and", relative_name_details, re.IGNORECASE):
            relation_type = 'HSBN'
        elif re.search(r"Father", relative_name_details, re.IGNORECASE):
            relation_type = 'FTHR'
        else:
            relation_type = 'OTHR'

        other_details = item[4].replace('ROI 5:', '').strip()
        other_details = other_details.replace('Gander', 'Gender')  
        other_details = other_details.replace('Nurmber', 'Number') 
        other_details = re.sub(r'\s*-\s*', ': ', other_details)  
        other_details = re.sub(r'[\n]+', '\n', other_details)

        age_gender_match = re.search(r'Age\s*[:=\-+]?\s*(\d+|\?+)\s*(?:Gender|Gander|Gente)\s*[:=\-+\'!>]?\s*(Male|Female|Mala)', other_details, re.IGNORECASE)
        age = age_gender_match.group(1).strip() if age_gender_match else ''
        gender = age_gender_match.group(2).strip() if age_gender_match else ''

        if "Fem" in other_details:
            gender = 'F'
        else:
            gender = 'M'

        age = age.replace('?', '7').strip()
        gender = gender.replace('?', '').strip()

        def extract_house_number(text):
            house_number_match = re.search(r'House\s*Nu(?:mber|rmber)\s*[:=]?\s*(.*?)(?=\n|$)', text, re.IGNORECASE)
    
            if house_number_match:
                house_number = house_number_match.group(1).strip()
        
                if house_number.startswith('+') or house_number.startswith('-'):
                    return '-'  
                if re.match(r'^\d+$|^[a-zA-Z\s]+$', house_number):
                    return house_number
        
                return ''  
            return ''
        
        
        processed_data.append([part_s_no, voter_full_name, relative_name, relation_type, age, gender, extract_house_number(other_details), epic_no])

    return processed_data


cleaned_data = preprocess_extracted_data(all_extracted_data)

columns = ["Part S.No", "Voter Full Name", "Relative's Name", "Relation Type", "Age", "Gender", "House No", "EPIC No"]

df = pd.DataFrame(cleaned_data, columns=columns)

output_file = 'final_output_file.xlsx'
df.to_excel(output_file, index=False)

output_file

'final_output_file.xlsx'