In [None]:
!pip install paddlepaddle
!pip install paddleocr
!pip install opencv-python

Collecting paddlepaddle
  Downloading paddlepaddle-3.1.0-cp311-cp311-manylinux1_x86_64.whl.metadata (8.8 kB)
Collecting opt_einsum==3.3.0 (from paddlepaddle)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading paddlepaddle-3.1.0-cp311-cp311-manylinux1_x86_64.whl (195.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.0/195.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opt_einsum, paddlepaddle
  Attempting uninstall: opt_einsum
    Found existing installation: opt_einsum 3.4.0
    Uninstalling opt_einsum-3.4.0:
      Successfully uninstalled opt_einsum-3.4.0
Successfully installed opt_einsum-3.3.0 paddlepaddle-3.1.0
Collecting paddleocr
  Downloading paddleocr-3.1.0-py3-none-any.whl.metadata (22 kB)
Collecting paddlex>=3.1.0 (f

In [19]:
import json
import cv2
from paddleocr import PaddleOCR
import re
from datetime import datetime

class IDCardOCR:
    """Read an ID card photo with PaddleOCR and pull out labelled fields.

    The parsing heuristics cover what this notebook exercises: an identity
    number shaped like ``12345-1234567-1``, numeric dates separated by ``.`` or
    ``/``, and English field labels ("Name", "Father Name", ...).
    """

    # Identity number layout: 5 digits, 7 digits, 1 digit, dash separated.
    _ID_NUMBER_RE = re.compile(r'\d{5}-\d{7}-\d{1}')
    # A text line counts as a date when it *starts* with DD.MM.YYYY / DD/MM/YYYY.
    _DATE_TOKEN_RE = re.compile(r'\d{1,2}[./]\d{1,2}[./]\d{4}')
    # Layouts understood by _convert_to_iso_date, tried in this order.
    _DMY_RE = re.compile(r'(\d{1,2})[./](\d{1,2})[./](\d{4})')
    _YMD_RE = re.compile(r'(\d{4})[./](\d{1,2})[./](\d{1,2})')

    # Keywords looked up (as substrings) in the text around a date, in
    # priority order: the first field whose keyword appears wins.
    _DATE_LABELS = (
        ('date_of_birth', ('birth', 'born', 'b.')),
        ('date_of_issue', ('issue', 'issued', 'grant', 'doi')),
        ('date_of_expiry', ('expiry', 'expire', 'valid', 'until', 'doe')),
    )
    # Order in which dates without a recognisable label fill the empty slots.
    _DATE_FIELDS = ('date_of_birth', 'date_of_issue', 'date_of_expiry')

    # Words that mark a text as a label/header rather than a person's name.
    _NON_NAME_WORDS = ('card', 'identity', 'republic', 'pakistan', 'name',
                       'father', 'husband', 'gender', 'date')

    def __init__(self, lang='en'):
        """Create the OCR engine.

        Args:
            lang: Language code forwarded to ``PaddleOCR(lang=...)``.
        """
        self.ocr = PaddleOCR(use_textline_orientation=True, lang=lang)

    def extract_text_from_image(self, image_path):
        """Run OCR on ``image_path`` and return the recognised text lines.

        Returns:
            A list of ``{'text': str, 'confidence': float}`` dicts in the order
            the OCR engine reported them. An empty list is returned when the
            image cannot be read or OCR fails (the reason is printed).
        """
        try:
            # cv2 is used only as a readability probe; OCR re-reads the path.
            if cv2.imread(image_path) is None:
                print(f"Could not read image: {image_path}")
                return []

            result = self.ocr.predict(image_path)
            extracted_texts = []

            if result and isinstance(result, list):
                for item in result:
                    if not (isinstance(item, dict)
                            and 'rec_texts' in item and 'rec_scores' in item):
                        continue

                    for text, confidence in zip(item['rec_texts'], item['rec_scores']):
                        if text is None:
                            continue
                        # Convert before stripping so one non-string entry
                        # cannot abort the whole extraction.
                        cleaned = str(text).strip()
                        if cleaned:
                            extracted_texts.append({
                                'text': cleaned,
                                'confidence': float(confidence)
                            })

            return extracted_texts

        except Exception as e:
            # Best effort: report the failure and let the caller see "no text".
            print(f"Error processing image: {e}")
            return []

    def _convert_to_iso_date(self, date_str):
        """Convert ``DD.MM.YYYY`` / ``YYYY.MM.DD`` (``.`` or ``/``) to ``YYYY-MM-DD``.

        Returns:
            ``None`` for empty input, the ISO string when the text starts with
            a recognised layout that is a real calendar date, otherwise the
            input unchanged (so callers can detect "not converted").
        """
        if not date_str:
            return None

        candidate = date_str.strip()
        for pattern, order in ((self._DMY_RE, 'dmy'), (self._YMD_RE, 'ymd')):
            match = pattern.match(candidate)
            if not match:
                continue

            if order == 'dmy':
                day, month, year = match.groups()
            else:
                year, month, day = match.groups()

            try:
                # Reject impossible dates (e.g. 31.02.2020, month 13).
                datetime(int(year), int(month), int(day))
            except ValueError:
                continue
            return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

        return date_str

    def _date_field_from_context(self, texts, index):
        """Return the date field named near ``texts[index]``, or ``None``.

        The two texts before and the two texts after the date are searched for
        the keywords in ``_DATE_LABELS``.
        """
        neighbours = texts[max(0, index - 2):index] + texts[index + 1:index + 3]
        context = ' '.join(neighbour.lower() for neighbour in neighbours)

        for field, keywords in self._DATE_LABELS:
            if any(keyword in context for keyword in keywords):
                return field
        return None

    def parse_id_card_info(self, extracted_texts):
        """Map OCR text lines onto ID card fields.

        Args:
            extracted_texts: List of dicts with a ``'text'`` key, as returned
                by ``extract_text_from_image``.

        Returns:
            Dict with keys ``name``, ``father_name``, ``husband_name``,
            ``id_number``, ``date_of_birth``, ``date_of_issue``,
            ``date_of_expiry``, ``gender`` (``None`` when not found) and
            ``raw_text`` (all input texts).
        """
        texts = [item['text'] for item in extracted_texts]

        id_info = {
            'name': None,
            'father_name': None,
            'husband_name': None,
            'id_number': None,
            'date_of_birth': None,
            'date_of_issue': None,
            'date_of_expiry': None,
            'gender': None,
            'raw_text': texts
        }

        unlabelled_dates = []

        for i, text in enumerate(texts):
            text_lower = text.lower().strip()

            if self._ID_NUMBER_RE.match(text):
                id_info['id_number'] = text

            if text_lower in ('m', 'male'):
                id_info['gender'] = 'Male'
            elif text_lower in ('f', 'female'):
                id_info['gender'] = 'Female'

            if not self._DATE_TOKEN_RE.match(text):
                continue

            field = self._date_field_from_context(texts, i)
            if field:
                id_info[field] = self._convert_to_iso_date(text)
            else:
                unlabelled_dates.append(text)

        # Dates with no recognisable label fill whatever is still empty, in
        # reading order: birth first, then issue, then expiry.
        for raw_date in unlabelled_dates:
            iso_date = self._convert_to_iso_date(raw_date)
            for field in self._DATE_FIELDS:
                if not id_info[field]:
                    id_info[field] = iso_date
                    break

        # A name is the text line that directly follows its label.
        for i in range(len(texts) - 1):
            label = texts[i].lower().strip()
            next_text = texts[i + 1]
            if not self._is_valid_name(next_text):
                continue

            if label == 'name':
                id_info['name'] = next_text
            if 'father' in label:
                id_info['father_name'] = next_text
            if 'husband' in label:
                id_info['husband_name'] = next_text

        return id_info

    def _is_valid_name(self, text):
        """Check if text looks like a valid name"""
        if not text or len(text.strip()) < 2:
            return False

        # Letters plus whitespace, hyphen, apostrophe and period only.
        if not re.match(r'^[A-Za-z\s\-\'\.]+$', text):
            return False

        if not re.search(r'[A-Za-z]', text):
            return False

        # Exclude common non-name words (substring match).
        lowered = text.lower()
        return not any(word in lowered for word in self._NON_NAME_WORDS)

    def process_id_card(self, image_path, output_json_path=None):
        """OCR an ID card image and return the structured result.

        Args:
            image_path: Path of the image to read.
            output_json_path: When given, the result is also written there as
                UTF-8 JSON.

        Returns:
            A dict with ``personal_information``, ``document_details``,
            ``extraction_timestamp`` and ``raw_text``, or ``None`` when no text
            could be extracted.
        """
        print(f"Processing ID card: {image_path}")

        extracted_texts = self.extract_text_from_image(image_path)

        if not extracted_texts:
            print("No text extracted from the image")
            return None

        id_info = self.parse_id_card_info(extracted_texts)

        result = {
            "personal_information": {
                "name": id_info['name'],
                "father_name": id_info['father_name'],
                "husband_name": id_info['husband_name'],
                "gender": id_info['gender'],
                "date_of_birth": id_info['date_of_birth']
            },
            "document_details": {
                "id_number": id_info['id_number'],
                "date_of_issue": id_info['date_of_issue'],
                "date_of_expiry": id_info['date_of_expiry']
            },
            "extraction_timestamp": datetime.now().isoformat(),
            "raw_text": id_info['raw_text']
        }

        if output_json_path:
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Results saved to: {output_json_path}")

        return result


def main(image_path="saadID.jpeg", output_path="id_card_data2.json"):
    """Process one ID card image and print the extracted fields.

    Args:
        image_path: Image to read. Defaults to the notebook's sample file.
        output_path: Where the JSON result is written. Defaults to the
            notebook's sample output file.
    """
    ocr_processor = IDCardOCR()

    try:
        result = ocr_processor.process_id_card(image_path, output_path)

        if result:
            print("\n=== Extracted Information ===")

            sections = (
                ("Personal Information", "personal_information"),
                ("Document Details", "document_details"),
            )
            for heading, section_key in sections:
                print(f"\n{heading}:")
                for key, value in result.get(section_key, {}).items():
                    if value:  # skip fields the parser could not fill
                        print(f"  {key.replace('_', ' ').title()}: {value}")

    except FileNotFoundError as e:
        # An unreadable image does not raise (process_id_card reports it and
        # returns None), so this can only be the output path; show the path
        # from the exception instead of blaming the image.
        print(f"File not found: {e}")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Processing ID card: saadID.jpeg
Results saved to: id_card_data2.json

=== Extracted Information ===

Personal Information:
  Name: Muhammad Saad Nadeem
  Father Name: Muhammad Nadeem Aqdas
  Gender: Male
  Date Of Birth: 2003-05-15

Document Details:
  Id Number: 35202-7690257-1
  Date Of Issue: 2023-08-27
  Date Of Expiry: 2033-08-27


In [16]:
import json
import cv2
from paddleocr import PaddleOCR
import re
from datetime import datetime

class PassportOCR:
    """Read a passport data page with PaddleOCR and pull out its fields.

    Values come from two places: the machine readable zone (the two long
    ``<``-padded lines at the bottom of the page) and labelled text in the
    visual zone. MRZ values are applied first; visual-zone values fill in what
    the MRZ did not provide, except labelled dates, which overwrite.
    """

    _MONTH_NUMBERS = {
        'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
        'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
        'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'
    }

    # How far into the future a two-digit year may land before it is read as
    # belonging to the previous century (covers passport validity periods).
    _MAX_YEARS_AHEAD = 20

    def __init__(self, lang='chinese_cht'):
        """Create the OCR engine.

        Args:
            lang: Language code forwarded to ``PaddleOCR(lang=...)``.
        """
        self.ocr = PaddleOCR(use_textline_orientation=True, lang=lang)

    def extract_text_from_image(self, image_path):
        """Run OCR on ``image_path`` and return the recognised text lines.

        Returns:
            A list of ``{'text': str, 'confidence': float}`` dicts in the order
            the OCR engine reported them. An empty list is returned when the
            image cannot be read or OCR fails (the reason is printed).
        """
        try:
            # cv2 is used only as a readability probe; OCR re-reads the path.
            if cv2.imread(image_path) is None:
                print(f"Could not read image: {image_path}")
                return []

            result = self.ocr.predict(image_path)
            extracted_texts = []

            if result and isinstance(result, list):
                for item in result:
                    if not (isinstance(item, dict)
                            and 'rec_texts' in item and 'rec_scores' in item):
                        continue

                    for text, confidence in zip(item['rec_texts'], item['rec_scores']):
                        if text is None:
                            continue
                        # Convert before stripping so one non-string entry
                        # cannot abort the whole extraction.
                        cleaned = str(text).strip()
                        if cleaned:
                            extracted_texts.append({
                                'text': cleaned,
                                'confidence': float(confidence)
                            })

            return extracted_texts

        except Exception as e:
            # Best effort: report the failure and let the caller see "no text".
            print(f"Error processing image: {e}")
            return []

    @staticmethod
    def _expand_two_digit_year(two_digits, max_years_ahead):
        """Pick the century for a two-digit year.

        The year is read as 20YY unless that lies more than
        ``max_years_ahead`` years after the current year, in which case it is
        read as 19YY. Use ``0`` for dates that cannot be in the future (birth).
        """
        year = 2000 + int(two_digits)
        if year > datetime.now().year + max_years_ahead:
            year -= 100
        return year

    @staticmethod
    def _format_iso(year, month, day):
        """Return ``YYYY-MM-DD``, or ``None`` if the parts are not a real date."""
        try:
            checked = datetime(int(year), int(month), int(day))
        except ValueError:
            return None
        return f"{checked.year:04d}-{checked.month:02d}-{checked.day:02d}"

    def _convert_to_iso_date(self, date_str):
        """Convert a printed date to ``YYYY-MM-DD``.

        Understood layouts (matched at the start of the text, separators
        ``.``, ``/`` or whitespace): ``DD MM YYYY``, ``DD MON YYYY``,
        ``DD MON YY`` and ``YYYY MM DD``.

        Returns:
            ``None`` for empty input, the ISO string on success, otherwise the
            input unchanged (unknown layout, unknown month abbreviation, or an
            impossible calendar date).
        """
        if not date_str:
            return None

        candidate = date_str.strip()

        match = re.match(r'(\d{1,2})[./\s](\d{1,2})[./\s](\d{4})', candidate)
        if match:
            day, month, year = match.groups()
            iso_date = self._format_iso(year, month, day)
            if iso_date:
                return iso_date

        match = re.match(r'(\d{1,2})\s*([A-Z]{3})[./\s]*(\d{4}|\d{2})',
                         candidate, re.IGNORECASE)
        if match:
            day, month_abbr, year = match.groups()
            # An unrecognised abbreviation is left unconverted instead of
            # being silently turned into January.
            month = self._MONTH_NUMBERS.get(month_abbr.upper())
            if month:
                if len(year) == 2:
                    # Ambiguous by nature: a printed "30" may still be read as
                    # 2030 even for a birth date.
                    year = self._expand_two_digit_year(year, self._MAX_YEARS_AHEAD)
                iso_date = self._format_iso(year, month, day)
                if iso_date:
                    return iso_date

        match = re.match(r'(\d{4})[./\s](\d{1,2})[./\s](\d{1,2})', candidate)
        if match:
            year, month, day = match.groups()
            iso_date = self._format_iso(year, month, day)
            if iso_date:
                return iso_date

        return date_str

    def _extract_mrz_info(self, texts):
        """Parse the two machine-readable-zone lines found in ``texts``.

        A text is considered an MRZ line when it is longer than 40 characters
        and consists only of alphanumerics and ``<``.

        Returns:
            Dict with any of ``country_code``, ``surname``, ``given_names``,
            ``passport_number``, ``nationality``, ``date_of_birth``,
            ``gender``, ``date_of_expiry`` that could be read.
        """
        mrz_info = {}

        for text in texts:
            if len(text) <= 40 or not all(c.isalnum() or c == '<' for c in text):
                continue

            if text.startswith('P<'):
                # Line 1: "P<" + issuing state + SURNAME<<GIVEN<NAMES<<<...
                country_code = text[2:5].replace('<', '')
                if country_code:
                    mrz_info['country_code'] = country_code

                # "<<" separates surname from given names; a single "<"
                # separates the words inside each part.
                surname_part, _, given_part = text[5:].strip('<').partition('<<')
                surname = ' '.join(surname_part.replace('<', ' ').split())
                given_names = ' '.join(given_part.replace('<', ' ').split())
                if surname:
                    mrz_info['surname'] = surname
                if given_names:
                    mrz_info['given_names'] = given_names

            elif re.match(r'^[A-Z0-9<]{9}[0-9][A-Z<]{3}[0-9]{6}[0-9][MF<][0-9]{6}[0-9]', text):
                # Line 2: number(9) check nationality(3) birth(6) check
                #         sex(1) expiry(6) check ...
                passport_number = text[:9].replace('<', '')
                mrz_info['passport_number'] = passport_number

                nationality = text[10:13].replace('<', '')
                if nationality:
                    mrz_info['nationality'] = nationality

                birth_date = text[13:19]
                # A birth date cannot lie in the future.
                birth_year = self._expand_two_digit_year(birth_date[:2], 0)
                birth_iso = self._format_iso(birth_year, birth_date[2:4], birth_date[4:6])
                if birth_iso:
                    mrz_info['date_of_birth'] = birth_iso

                gender = text[20]
                if gender in ('M', 'F'):
                    mrz_info['gender'] = 'Male' if gender == 'M' else 'Female'

                expiry_date = text[21:27]
                # Expiry dates are usually in the future, so allow a window.
                expiry_year = self._expand_two_digit_year(expiry_date[:2], self._MAX_YEARS_AHEAD)
                expiry_iso = self._format_iso(expiry_year, expiry_date[2:4], expiry_date[4:6])
                if expiry_iso:
                    mrz_info['date_of_expiry'] = expiry_iso

        return mrz_info

    def parse_passport_info(self, extracted_texts):
        """Map OCR text lines onto passport fields.

        Args:
            extracted_texts: List of dicts with a ``'text'`` key, as returned
                by ``extract_text_from_image``.

        Returns:
            Dict with the passport fields (``None`` when not found) plus
            ``type`` and ``raw_text``. ``issuing_authority`` is never filled by
            this parser.
        """
        texts = [item['text'] for item in extracted_texts]

        passport_info = {
            'type': 'PASSPORT',
            'passport_number': None,
            'surname': None,
            'given_names': None,
            'nationality': None,
            'country_code': None,
            'date_of_birth': None,
            'date_of_issue': None,
            'date_of_expiry': None,
            'place_of_birth': None,
            'gender': None,
            'issuing_authority': None,
            'raw_text': texts
        }

        mrz_info = self._extract_mrz_info(texts)
        passport_info.update(mrz_info)
        # A stray "M"/"F" token must not override the sex read from the MRZ.
        gender_from_mrz = 'gender' in mrz_info

        date_pattern = r'\d{1,2}[./\s](\d{1,2}|[A-Z]{3})[./\s]*\d{2,4}'
        date_labels = (
            ('date_of_birth', ('birth', 'born', 'b.', 'date of birth')),
            ('date_of_issue', ('issue', 'issued', 'grant', 'doi', 'date of issue')),
            ('date_of_expiry', ('expiry', 'expire', 'valid', 'until', 'doe', 'date of expiry')),
        )
        unlabelled_dates = []

        for i, text in enumerate(texts):
            text_upper = text.upper().strip()

            if not passport_info['passport_number']:
                looks_like_number = (
                    re.match(r'^[A-Z]{1,2}\d{6,9}$', text)
                    # The looser form must contain a digit, otherwise plain
                    # words such as "PAKISTAN" would be taken as the number.
                    or re.match(r'^(?=.*\d)[A-Z0-9]{8,9}$', text)
                )
                if looks_like_number:
                    passport_info['passport_number'] = text

            if not gender_from_mrz:
                if text_upper in ('M', 'MALE'):
                    passport_info['gender'] = 'Male'
                elif text_upper in ('F', 'FEMALE'):
                    passport_info['gender'] = 'Female'

            if not re.match(date_pattern, text, re.IGNORECASE):
                continue

            # Two texts before and two after the date are searched for labels.
            neighbours = texts[max(0, i - 2):i] + texts[i + 1:i + 3]
            context = ' '.join(neighbour.lower() for neighbour in neighbours)

            for field, keywords in date_labels:
                if any(keyword in context for keyword in keywords):
                    passport_info[field] = self._convert_to_iso_date(text)
                    break
            else:
                unlabelled_dates.append(text)

        # Dates with no recognisable label fill whatever is still empty, in
        # reading order: birth first, then issue, then expiry.
        for raw_date in unlabelled_dates:
            iso_date = self._convert_to_iso_date(raw_date)
            for field in ('date_of_birth', 'date_of_issue', 'date_of_expiry'):
                if not passport_info[field]:
                    passport_info[field] = iso_date
                    break

        # Labelled values: the value is the text line right after its label,
        # and never replaces something already known (e.g. from the MRZ).
        for i in range(len(texts) - 1):
            label = texts[i].lower().strip()
            next_text = texts[i + 1]

            if any(phrase in label for phrase in ('surname', 'family name', 'last name')):
                if self._is_valid_name(next_text) and not passport_info['surname']:
                    passport_info['surname'] = next_text

            if any(phrase in label for phrase in ('given name', 'first name', 'given names')):
                if self._is_valid_name(next_text) and not passport_info['given_names']:
                    passport_info['given_names'] = next_text

            if any(phrase in label for phrase in ('nationality', 'citizen')):
                if len(next_text) >= 3 and not passport_info['nationality']:
                    passport_info['nationality'] = next_text

            if any(phrase in label for phrase in ('place of birth', 'born in', 'birth place')):
                if len(next_text) >= 3 and not passport_info['place_of_birth']:
                    passport_info['place_of_birth'] = next_text

            if 'passport' in label and 'number' in label:
                if (re.match(r'^(?=.*\d)[A-Z0-9]{6,12}$', next_text)
                        and not passport_info['passport_number']):
                    passport_info['passport_number'] = next_text

        return passport_info

    def _is_valid_name(self, text):
        """Return True when ``text`` looks like a person's name.

        Requires at least two characters, only letters / whitespace / ``-`` /
        ``'`` / ``.``, at least one letter, and none of the known header words.
        """
        if not text or len(text.strip()) < 2:
            return False

        if not re.match(r'^[A-Za-z\s\-\'\.]+$', text):
            return False

        if not re.search(r'[A-Za-z]', text):
            return False

        excluded_words = ('passport', 'republic', 'pakistan', 'islamic',
                          'government', 'specimen', 'type', 'code')
        lowered = text.lower()
        return not any(word in lowered for word in excluded_words)

    def process_passport(self, image_path, output_json_path=None):
        """OCR a passport image and return the structured result.

        Args:
            image_path: Path of the image to read.
            output_json_path: When given, the result is also written there as
                UTF-8 JSON.

        Returns:
            A dict with ``document_type``, ``personal_information``,
            ``document_details``, ``extraction_timestamp`` and ``raw_text``,
            or ``None`` when no text could be extracted.
        """
        print(f"Processing passport: {image_path}")

        extracted_texts = self.extract_text_from_image(image_path)

        if not extracted_texts:
            print("No text extracted from the image")
            return None

        passport_info = self.parse_passport_info(extracted_texts)

        result = {
            "document_type": "PASSPORT",
            "personal_information": {
                "surname": passport_info['surname'],
                "given_names": passport_info['given_names'],
                "nationality": passport_info['nationality'],
                "gender": passport_info['gender'],
                "date_of_birth": passport_info['date_of_birth'],
                "place_of_birth": passport_info['place_of_birth']
            },
            "document_details": {
                "passport_number": passport_info['passport_number'],
                "country_code": passport_info['country_code'],
                "date_of_issue": passport_info['date_of_issue'],
                "date_of_expiry": passport_info['date_of_expiry'],
                "issuing_authority": passport_info['issuing_authority']
            },
            "extraction_timestamp": datetime.now().isoformat(),
            "raw_text": passport_info['raw_text']
        }

        if output_json_path:
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Results saved to: {output_json_path}")

        return result


def main(image_path="pakPass.jpg", output_path="passport_data.json"):
    """Process one passport image and print the extracted fields.

    Args:
        image_path: Image to read. Defaults to the notebook's sample file.
        output_path: Where the JSON result is written. Defaults to the
            notebook's sample output file.
    """
    ocr_processor = PassportOCR()

    try:
        result = ocr_processor.process_passport(image_path, output_path)

        if result:
            print("\n=== Extracted Information ===")

            sections = (
                ("Personal Information", "personal_information"),
                ("Document Details", "document_details"),
            )
            for heading, section_key in sections:
                print(f"\n{heading}:")
                for key, value in result.get(section_key, {}).items():
                    if value:  # skip fields the parser could not fill
                        print(f"  {key.replace('_', ' ').title()}: {value}")

    except FileNotFoundError as e:
        # An unreadable image does not raise (process_passport reports it and
        # returns None), so this can only be the output path; show the path
        # from the exception instead of blaming the image.
        print(f"File not found: {e}")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Processing passport: pakPass.jpg
Results saved to: passport_data.json

=== Extracted Information ===

Personal Information:
  Surname: SPECIMEN
  Given Names: MARTINA
  Nationality: PAK
  Gender: Male
  Date Of Birth: 1977-12-29

Document Details:
  Passport Number: AS5528252
  Country Code: PAK
  Date Of Expiry: 2018-11-05


In [15]:
import json
import cv2
from paddleocr import PaddleOCR
import re
from datetime import datetime
from difflib import SequenceMatcher

class IDCardOCR:
    def __init__(self, lang='chinese_cht'):
        """Create the PaddleOCR engine and the table of field-label keywords.

        Args:
            lang: Language code forwarded to ``PaddleOCR(lang=...)``.
        """
        self.ocr = PaddleOCR(use_textline_orientation=True, lang=lang)

        # Output field name -> label strings that may announce that field on a
        # card. find_key_value_pairs compares each OCR text against every
        # keyword with _similarity and treats a ratio above 0.6 as a match.
        self.field_mappings = {
            'name': ['name', '氏名', 'nom', 'nombre', 'नाम', 'اسم'],
            'father_name': ['father', "father's name", 'father name', 'پدر کا نام', 'पिता का नाम'],
            'husband_name': ['husband', "husband's name", 'husband name', 'شوہر کا نام', 'पति का नाम'],
            'mother_name': ['mother', "mother's name", 'mother name', 'والدہ کا نام', 'माता का नाम'],
            'date_of_birth': ['birth', 'born', 'date of birth', 'dob', 'b.', '生年月日', 'تاریخ پیدائش', 'जन्म तिथि'],
            'date_of_issue': ['issue', 'issued', 'date of issue', 'doi', '交付', 'تاریخ اجراء', 'जारी तिथि'],
            'date_of_expiry': ['expiry', 'expire', 'valid until', 'doe', '有効期限', 'تاریخ اختتام', 'समाप्ति तिथि'],
            'gender': ['gender', 'sex', '性別', 'جنس', 'लिंग'],
            'address': ['address', '住所', 'پتہ', 'पता'],
            'id_number': ['id', 'number', 'card number', 'identity', 'my number', 'شناختی نمبر', 'पहचान संख्या'],
            'nationality': ['nationality', '国籍', 'قومیت', 'राष्ट्रीयता'],
            'occupation': ['occupation', 'profession', '職業', 'پیشہ', 'व्यवसाय']
        }

    def extract_text_from_image(self, image_path):
        """Extract all text from the ID card image with coordinates.

        Returns:
            A list of dicts in OCR order, each with ``text`` (stripped str),
            ``confidence`` (float), ``box`` (coordinates of the text line as
            plain lists, or ``None`` when the engine reported none) and
            ``index`` (position in the returned list). An empty list is
            returned when the image cannot be read or OCR fails (the reason is
            printed).
        """
        try:
            # cv2 is used only as a readability probe; OCR re-reads the path.
            if cv2.imread(image_path) is None:
                print(f"Could not read image: {image_path}")
                return []

            result = self.ocr.predict(image_path)
            extracted_data = []

            if result and isinstance(result, list):
                for item in result:
                    if not (isinstance(item, dict)
                            and 'rec_texts' in item and 'rec_scores' in item):
                        continue

                    # PaddleOCR 3.x reports boxes aligned with rec_texts under
                    # 'rec_boxes' / 'rec_polys'; 'dt_boxes' is kept as a last
                    # resort for results that still carry that key.
                    boxes = None
                    for key in ('rec_boxes', 'rec_polys', 'dt_boxes'):
                        if item.get(key) is not None:
                            boxes = item[key]
                            break
                    box_count = len(boxes) if boxes is not None else 0

                    for i, (text, confidence) in enumerate(zip(item['rec_texts'], item['rec_scores'])):
                        if text is None:
                            continue
                        # Convert before stripping so one non-string entry
                        # cannot abort the whole extraction.
                        cleaned = str(text).strip()
                        if not cleaned:
                            continue

                        box = boxes[i] if i < box_count else None
                        if hasattr(box, 'tolist'):
                            # numpy arrays -> plain lists (JSON friendly).
                            box = box.tolist()

                        extracted_data.append({
                            'text': cleaned,
                            'confidence': float(confidence),
                            'box': box,
                            'index': len(extracted_data)
                        })

            return extracted_data

        except Exception as e:
            # Best effort: report the failure and let the caller see "no text".
            print(f"Error processing image: {e}")
            return []

    def _similarity(self, a, b):
        """Calculate similarity between two strings"""
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def _is_date(self, text):
        """Check if text looks like a date"""
        date_patterns = [
            r'\d{1,2}[./\-]\d{1,2}[./\-]\d{4}',
            r'\d{4}[./\-]\d{1,2}[./\-]\d{1,2}',
            r'\d{4}[年]\d{1,2}[月]\d{1,2}[日]?',
            r'[HhSsRr]\d{1,2}[年/.-]\d{1,2}[月/.-]\d{1,2}[日]?',
        ]
        return any(re.search(pattern, text) for pattern in date_patterns)

    def _is_id_number(self, text):
        """Check if text looks like an ID number"""
        id_patterns = [
            r'\d{5}-\d{7}-\d{1}',
            r'\d{4}\s?\d{4}\s?\d{4}',
            r'\d{9,15}',
            r'[A-Z]{2}\d{6,8}',
        ]
        return any(re.search(pattern, text) for pattern in id_patterns)

    def _convert_to_iso_date(self, date_str):
        """Convert date string to ISO format (YYYY-MM-DD)"""
        if not date_str:
            return None

        # Multiple date format patterns
        patterns = [
            (r'(\d{1,2})[./\-](\d{1,2})[./\-](\d{4})', 'dmy'),
            (r'(\d{4})[./\-](\d{1,2})[./\-](\d{1,2})', 'ymd'),
            (r'(\d{4})[年](\d{1,2})[月](\d{1,2})[日]?', 'ymd'),
            (r'[HhSsRr](\d{1,2})[年/.-](\d{1,2})[月/.-](\d{1,2})[日]?', 'era'),
        ]

        for pattern, format_type in patterns:
            match = re.search(pattern, date_str.strip())
            if match:
                try:
                    if format_type == 'dmy':
                        day, month, year = match.groups()
                        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
                    elif format_type == 'ymd':
                        year, month, day = match.groups()
                        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
                    elif format_type == 'era':
                        era_year, month, day = match.groups()
                        # Simple era conversion
                        era_char = date_str[0].upper()
                        base_year = {'H': 1988, 'R': 2018, 'S': 1925}.get(era_char, 2018)
                        year = int(era_year) + base_year
                        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
                except:
                    pass

        return date_str

    def find_key_value_pairs(self, extracted_data):
        """Pair recognised field labels with their values.

        First pass: every text is compared against all keywords in
        ``self.field_mappings``; when the best similarity exceeds 0.6 and the
        field is still empty, ``_find_value_for_key`` is asked for its value.
        Second pass: ID-number-shaped and date-shaped texts fill the fields
        that are still missing; such dates are classified by their year
        relative to the current year.

        Args:
            extracted_data: List of dicts with at least a ``'text'`` key.

        Returns:
            Dict mapping field name to the value found.
        """
        found = {}

        # Pass 1: label-driven lookup.
        for position, entry in enumerate(extracted_data):
            label = entry['text'].lower().strip()
            if len(label) < 2 or label.isdigit():
                continue

            # Score every (field, keyword) pair; max() keeps the first of
            # several equal top scores.
            scored = (
                (self._similarity(label, keyword), field_name)
                for field_name, keywords in self.field_mappings.items()
                for keyword in keywords
            )
            top_score, top_field = max(scored, key=lambda pair: pair[0], default=(0.0, None))
            if not top_score > 0.6:
                continue
            if not top_field or top_field in found:
                continue

            value = self._find_value_for_key(extracted_data, position, top_field)
            if value:
                found[top_field] = value

        # Pass 2: shape-driven fallback for values without a usable label.
        for entry in extracted_data:
            raw = entry['text']

            if self._is_id_number(raw) and 'id_number' not in found:
                found['id_number'] = raw

            # Check for dates without clear context
            if not self._is_date(raw):
                continue
            iso_date = self._convert_to_iso_date(raw)
            if not iso_date or iso_date == raw:
                continue

            year = int(iso_date.split('-')[0])
            this_year = datetime.now().year

            if year < this_year - 10 and 'date_of_birth' not in found:
                target = 'date_of_birth'
            elif year > this_year and 'date_of_expiry' not in found:
                target = 'date_of_expiry'
            elif 'date_of_issue' not in found:
                target = 'date_of_issue'
            else:
                target = None

            if target is not None:
                found[target] = iso_date

        return found

    def _find_value_for_key(self, extracted_data, key_index, field_type):
        """Find the value associated with a key"""
        key_item = extracted_data[key_index]

        for i in range(key_index + 1, min(len(extracted_data), key_index + 4)):
            candidate = extracted_data[i]
            candidate_text = candidate['text'].strip()

            if len(candidate_text) < 2:
                continue

            # Skip if it looks like another key
            if any(self._similarity(candidate_text.lower(), keyword) > 0.6
                   for keywords in self.field_mappings.values()
                   for keyword in keywords):
                continue


            if field_type in ['date_of_birth', 'date_of_issue', 'date_of_expiry']:
                if self._is_date(candidate_text):
                    return self._convert_to_iso_date(candidate_text)
            elif field_type == 'id_number':
                if self._is_id_number(candidate_text):
                    return candidate_text
            elif field_type == 'gender':
                gender_indicators = {
                    'male': ['m', 'male', '男', 'مرد'],
                    'female': ['f', 'female', '女', 'عورت']
                }
                text_lower = candidate_text.lower()
                for gender, indicators in gender_indicators.items():
                    if any(indicator in text_lower for indicator in indicators):
                        return gender.title()
            elif field_type in ['name', 'father_name', 'husband_name', 'mother_name']:

                if re.match(r'^[A-Za-z\u0080-\uFFFF\s\-\'\.]+$', candidate_text) and len(candidate_text) > 2:
                    return candidate_text
            else:
                return candidate_text

        return None

    def process_id_card(self, image_path, output_json_path=None):
        """Complete process: extract text and find key-value pairs"""
        print(f"Processing ID card: {image_path}")

        extracted_data = self.extract_text_from_image(image_path)

        if not extracted_data:
            print("No text extracted from the image")
            return None


        key_value_pairs = self.find_key_value_pairs(extracted_data)


        result = {
            "personal_information": {
                "name": key_value_pairs.get('name'),
                "father_name": key_value_pairs.get('father_name'),
                "husband_name": key_value_pairs.get('husband_name'),
                "mother_name": key_value_pairs.get('mother_name'),
                "gender": key_value_pairs.get('gender'),
                "date_of_birth": key_value_pairs.get('date_of_birth'),
                "nationality": key_value_pairs.get('nationality'),
                "address": key_value_pairs.get('address'),
                "occupation": key_value_pairs.get('occupation')
            },
            "document_details": {
                "id_number": key_value_pairs.get('id_number'),
                "date_of_issue": key_value_pairs.get('date_of_issue'),
                "date_of_expiry": key_value_pairs.get('date_of_expiry')
            },
            "extraction_timestamp": datetime.now().isoformat(),
            "all_detected_fields": key_value_pairs,
            "raw_text": [item['text'] for item in extracted_data]
        }

        if output_json_path:
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"Results saved to: {output_json_path}")

        return result

def main():
    """Process the sample ID-card image and print the extracted fields.

    Builds an ``IDCardOCR`` for traditional Chinese, processes
    ``chineseID.png``, saves the result to ``id_card_data.json`` and prints
    a summary. Errors are reported on stdout rather than raised.
    """
    ocr_processor = IDCardOCR(lang='chinese_cht')  # 'en', 'japan', 'chinese_cht'

    image_path = "chineseID.png"
    output_path = "id_card_data.json"

    # (heading, key in the result dict) for sections that list only
    # non-empty values with prettified labels.
    labelled_sections = (
        ("\nPersonal Information:", "personal_information"),
        ("\nDocument Details:", "document_details"),
    )

    try:
        result = ocr_processor.process_id_card(image_path, output_path)
        if not result:
            return

        print("\n=== Extracted Information ===")

        for heading, section_key in labelled_sections:
            section = result.get(section_key, {})
            print(heading)
            for field, value in section.items():
                if value:
                    print(f"  {field.replace('_', ' ').title()}: {value}")

        # The raw field dump lists every detected entry with its original key.
        detected = result.get("all_detected_fields", {})
        print(f"\nAll Detected Fields ({len(detected)} found):")
        for field, value in detected.items():
            print(f"  {field}: {value}")

    except FileNotFoundError:
        print(f"Image file not found: {image_path}")
    except Exception as e:
        print(f"Error: {e}")

# Entry point: run the demo only when this code executes as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Processing ID card: chineseID.png
Results saved to: id_card_data.json

=== Extracted Information ===

Personal Information:
  Name: 证件样本
  Gender: Female
  Date Of Birth: 1981-08-03
  Nationality: 加拿大/CAN

Document Details:
  Id Number: 911124198108030024
  Date Of Issue: 1981-08-03
  Date Of Expiry: 2033-15-09

All Detected Fields (7 found):
  name: 证件样本
  gender: Female
  date_of_birth: 1981-08-03
  nationality: 加拿大/CAN
  date_of_issue: 1981-08-03
  date_of_expiry: 2033-15-09
  id_number: 911124198108030024
