In [None]:
# Install dependencies (pin NumPy < 2 to avoid binary-compat errors)
pip install "numpy<2" "rembg[cpu]" easyocr pytesseract

INFO: pip is looking at multiple versions of rembg[cpu] to determine which version is compatible with other requirements. This could take a while.
Collecting rembg[cpu]
  Downloading rembg-2.0.71-py3-none-any.whl.metadata (16 kB)
  Downloading rembg-2.0.70-py3-none-any.whl.metadata (17 kB)
  Downloading rembg-2.0.69-py3-none-any.whl.metadata (17 kB)
INFO: pip is looking at multiple versions of opencv-python-headless to determine which version is compatible with other requirements. This could take a while.
Collecting opencv-python-headless (from rembg[cpu])
  Downloading opencv_python_headless-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (20 kB)
  Downloading opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading rembg-2.0.69-py3-none-any.whl (43 kB)
Downloading opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl (39.4 MB)
   ---------------------------------------- 0.0/39.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/39.4 MB ? eta 


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: C:\Users\Omar\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [58]:
from tkinter import *
import cv2
import re
import os
from tkinter import filedialog
from rembg import remove
from PIL import Image
from easyocr import easyocr
import numpy as np
import pytesseract
import string
from datetime import datetime
# from OCR_Endpoint import settings

# Set Tesseract path and tessdata directory
pytesseract.pytesseract.tesseract_cmd = r'D:\ocr\tesseract\tesseract.exe'
os.environ['TESSDATA_PREFIX'] = r'D:\ocr\tessdata'

data = {"first name": "0",
        "seconed name": "0",
        "address": "0",
        "id": "0",
        "birthdate": "0",
        "error": 0}

# OCR configuration
TESS_LANG_TEXT = "ara"
TESS_LANG_ID = "ara"
TESS_CONFIG_TEXT = "--psm 11 --oem 3"
TESS_CONFIG_ID = "--psm 7 --oem 3"

ARABIC_DIGITS = ["٠", "١", "٢", "٣", "٤", "٥", "٦", "٧", "٨", "٩"]
PUN = set(string.punctuation)
_ARABIC_TO_WESTERN = str.maketrans('٠١٢٣٤٥٦٧٨٩', '0123456789')
_WESTERN_TO_ARABIC = str.maketrans('0123456789', '٠١٢٣٤٥٦٧٨٩')

def _to_western_digits(sval: str) -> str:
    return (sval or "").translate(_ARABIC_TO_WESTERN)

def _to_arabic_digits(sval: str) -> str:
    return (sval or "").translate(_WESTERN_TO_ARABIC)

def _count_arabic_letters(sval: str) -> int:
    return len(re.findall(r'[\u0600-\u06FF]', sval or ""))

def _arabic_words(sval: str) -> list[str]:
    return re.findall(r'[\u0600-\u06FF]{2,}', sval or "")

def _clean_name(sval: str) -> str:
    sval = re.sub(r'[^\u0600-\u06FF\s]', ' ', sval or "")
    return ' '.join(sval.split())

def _choose_names(t_lines: list[str], easy_tokens: list[str]) -> tuple[str, str, list[str]]:
    t_first = _clean_name(t_lines[0]) if len(t_lines) > 0 else ""
    t_second = _clean_name(t_lines[2]) if len(t_lines) > 2 else ""
    easy_words: list[str] = []
    for tok in easy_tokens:
        easy_words.extend(_arabic_words(tok))
    e_first = easy_words[0] if len(easy_words) > 0 else ""
    e_second = easy_words[1] if len(easy_words) > 1 else ""
    def _best(a: str, b: str) -> str:
        if len(a) >= 2:
            return a
        return b
    first = _best(t_first, e_first) or t_first or e_first
    second = _best(t_second, e_second) or t_second or e_second
    if len(first) < 2 and e_first:
        first = e_first
    if len(second) < 2 and e_second:
        second = e_second
    return first or "0", second or "0", easy_words

def _best_text(text_a: str, text_b: str) -> str:
    a = text_a.strip() if text_a else ""
    b = text_b.strip() if text_b else ""
    if not a and not b:
        return ""
    if _count_arabic_letters(a) != _count_arabic_letters(b):
        return a if _count_arabic_letters(a) > _count_arabic_letters(b) else b
    return a if len(a) >= len(b) else b

def _best_id_list(list_a: list[str], list_b: list[str]) -> list[str]:
    def score(lst: list[str]) -> tuple[int, int]:
        if not lst:
            return (0, 0)
        best = max(lst, key=len)
        digits = len(re.findall(r'[0-9٠-٩]', best))
        return (digits, len(best))
    return list_a if score(list_a) >= score(list_b) else list_b

def _sanitize_addr(sval: str) -> str:
    """Keep Arabic letters, digits and common separators; drop OCR garbage like > ؟ etc."""
    sval = (sval or "").replace('؟', ' ').replace('?', ' ').replace('>', ' ').replace('<', ' ')
    # Remove English letters (a-z, A-Z)
    sval = re.sub(r'[a-zA-Z]', ' ', sval)
    # Remove common OCR artifacts and special characters, keep only Arabic, digits, spaces, and separators
    sval = re.sub(r'[^\u0600-\u06FF0-9٠-٩\s\-ـ]', ' ', sval)
    # Remove excessive spaces
    sval = ' '.join(sval.split())
    return sval

def _extract_locality_prefix(sval: str) -> str:
    """Return the part before numbers/markers (often the area name)."""
    sval = _sanitize_addr(sval)
    m = re.search(r'[مق]|[0-9٠-٩]', sval)
    prefix = sval[:m.start()] if m else sval
    prefix = re.sub(r'[^\u0600-\u06FF\s]', ' ', prefix)
    prefix = ' '.join(prefix.split())
    return prefix

def _extract_longest_arabic_phrase(sval: str) -> str:
    """Pick the longest phrase of Arabic words (ignoring markers/digits)."""
    sval = _sanitize_addr(sval)
    if not sval:
        return ""
    tmp = re.sub(r'[0-9٠-٩]', ' ', sval)
    tmp = re.sub(r'\b[مق]\b', ' ', tmp)
    tmp = re.sub(r'[\-ـ]', ' ', tmp)
    tmp = ' '.join(tmp.split())
    phrases = re.findall(r'[\u0600-\u06FF]{2,}(?:\s+[\u0600-\u06FF]{2,}){0,3}', tmp)
    if not phrases:
        return ""
    phrases = [' '.join(p.split()) for p in phrases]
    phrases.sort(key=lambda p: (_count_arabic_letters(p), len(p.split()), len(p)), reverse=True)
    return phrases[0]

def _extract_all_locality_parts(sval: str) -> str:
    """Extract all Arabic locality parts (area, district, governorate) excluding standalone markers."""
    sval = _sanitize_addr(sval)
    # Remove standalone markers (م, ق, ك) when they're followed by numbers
    cleaned = re.sub(r'\b[مقك]\s*[\-ـ:]?\s*[0-9٠-٩]+', ' ', sval)
    # Remove all remaining standalone numbers
    cleaned = re.sub(r'\b[0-9٠-٩]+\b', ' ', cleaned)
    # Remove standalone single letter markers that remain
    cleaned = re.sub(r'\b[مقك]\b', ' ', cleaned)
    # Clean up extra separators
    cleaned = re.sub(r'[\-ـ]+', ' ', cleaned)
    # Keep only Arabic letters and spaces
    cleaned = re.sub(r'[^\u0600-\u06FF\s]', ' ', cleaned)
    cleaned = ' '.join(cleaned.split())
    return cleaned.strip()

def _pick_locality(addr_t: str, addr_e: str) -> str:
    cands = []
    for s0 in [addr_t, addr_e]:
        # Try to extract all locality parts (including governorate)
        full_locality = _extract_all_locality_parts(s0)
        if full_locality:
            cands.append(full_locality)
        # Also try traditional methods as fallback
        cands.append(_extract_locality_prefix(s0))
        cands.append(_extract_longest_arabic_phrase(s0))
    cands = [c.strip() for c in cands if c and c.strip()]
    if not cands:
        return ""
    cands = list(dict.fromkeys(cands))
    cands.sort(key=lambda p: (_count_arabic_letters(p), len(p.split()), len(p)), reverse=True)
    best = cands[0]
    if _count_arabic_letters(best) < 3:
        raw = _sanitize_addr(addr_t)
        raw2 = _sanitize_addr(addr_e)
        best = raw if _count_arabic_letters(raw) >= _count_arabic_letters(raw2) else raw2
    return best.strip()

def _extract_marker_number(sval: str, marker: str) -> str:
    sval = _sanitize_addr(sval)
    # Match marker only when it's standalone (word boundary before it)
    # This prevents matching م in "مركز" or ق in "القنطرة"
    m = re.search(rf'(?:^|[\s\-ـ]){marker}\s*[\-ـ:]?\s*([0-9٠-٩]{{1,3}})', sval)
    return m.group(1) if m else ""

def _closest_number_after_marker(sval: str, marker: str) -> str:
    """Pick the nearest 2-3 digit group to the marker, if it exists."""
    sval = _sanitize_addr(sval)
    # Find standalone marker (not part of a word)
    marker_idx = -1
    for match in re.finditer(rf'(?:^|[\s\-ـ]){marker}(?:[\s\-ـ]|$)', sval):
        marker_idx = match.start() + (1 if match.group(0)[0] in ' \-ـ' else 0)
        break
    
    if marker_idx == -1:
        return ""
    best = None
    for m in re.finditer(r'[0-9٠-٩]{2,3}', sval):
        dist = abs(m.start() - marker_idx)
        cand = m.group(0)
        if best is None or dist < best[0]:
            best = (dist, cand)
    return best[1] if best else ""

def _best_number(num_t: str, num_e: str, all_digits_t: list[str], all_digits_e: list[str]) -> str:
    """Prefer longer numbers (2-3 digits). If only 1 digit exists, try to build 2 digits from the other OCR digits."""
    candidates = [x for x in [num_t, num_e] if x]
    if not candidates:
        return ""
    candidates_sorted = sorted(candidates, key=len, reverse=True)
    best = candidates_sorted[0]
    if len(best) >= 2:
        return best
    singles = [d for d in (all_digits_t + all_digits_e) if len(d) == 1 and d != best]
    if singles:
        return best + singles[0]
    return best

def _extract_city_district(sval: str) -> str:
    """Extract city/district/governorate name from address."""
    known_cities = [
        'اكتوبر', '6 اكتوبر', 'القاهرة', 'الجيزة', 'الاسكندرية', 'الاسماعيلية',
        'بورسعيد', 'السويس', 'المنصورة', 'طنطا', 'الزقازيق', 'اسيوط', 'الفيوم',
        'بنها', 'دمياط', 'اسوان', 'الاقصر', 'قنا', 'سوهاج', 'المنيا', 'كفر الشيخ',
        'الدقهلية', 'الشرقية', 'الغربية', 'القليوبية', 'البحيرة', 'مطروح'
    ]
    sval = _sanitize_addr(sval)
    words = sval.split()
    
    # Look for known city names
    for i, word in enumerate(words):
        for city in known_cities:
            if city in word or word in city:
                # Return from this position to end
                return ' '.join(words[i:])
    
    # If no known city found, return last 1-2 words
    if len(words) >= 2:
        return ' '.join(words[-2:])
    elif len(words) == 1:
        return words[0]
    return ""

def _extract_area_name(sval: str, city: str) -> str:
    """Extract area name (first part before city/markers)."""
    sval = _sanitize_addr(sval)
    # Remove city name
    if city:
        sval = sval.replace(city, ' ')
    # Remove markers and numbers
    sval = re.sub(r'\b[مقك]\s*[\-ـ:]?\s*[0-9٠-٩]+', ' ', sval)
    sval = re.sub(r'\b[0-9٠-٩]+\b', ' ', sval)
    sval = re.sub(r'\b[مقك]\b', ' ', sval)
    sval = re.sub(r'[\-ـ]+', ' ', sval)
    sval = ' '.join(sval.split())
    return sval.strip()

def choose_address(addr_tesseract: str, addr_easyocr: str) -> str:
    """Build a clean address using BOTH OCR outputs with multiple marker types."""
    addr_t = _sanitize_addr(addr_tesseract)
    addr_e = _sanitize_addr(addr_easyocr)
    if not addr_t and not addr_e:
        return "0"
    
    # Extract city/district from both sources
    city_t = _extract_city_district(addr_t)
    city_e = _extract_city_district(addr_e)
    city = city_t if _count_arabic_letters(city_t) >= _count_arabic_letters(city_e) else city_e
    
    # Extract area name (without city)
    area_t = _extract_area_name(addr_t, city)
    area_e = _extract_area_name(addr_e, city)
    area = area_t if _count_arabic_letters(area_t) >= _count_arabic_letters(area_e) else area_e
    
    # Support multiple marker types: م (meem), ق (qaf), ك (kaf)
    markers = {}
    possible_markers = ['م', 'ق', 'ك']
    
    for marker in possible_markers:
        m_t = _extract_marker_number(addr_t, marker)
        m_e = _extract_marker_number(addr_e, marker)
        
        if not m_t and not m_e:
            continue
            
        m2_t = _closest_number_after_marker(addr_t, marker)
        m2_e = _closest_number_after_marker(addr_e, marker)
        
        all_t = [_to_western_digits(x) for x in re.findall(r'[0-9٠-٩]+', addr_t)]
        all_e = [_to_western_digits(x) for x in re.findall(r'[0-9٠-٩]+', addr_e)]
        
        best = _best_number(_to_western_digits(m2_t or m_t), _to_western_digits(m2_e or m_e), all_t, all_e)
        
        if len(best) == 1:
            twos = [d for d in (all_t + all_e) if len(d) == 2]
            if twos:
                best = twos[-1]
        
        if best:
            markers[marker] = _to_arabic_digits(best)
    
    result = ""
    if len(markers) == 0:
        result = f"{area} {city}".strip() or addr_e or addr_t
    elif len(markers) == 1:
        marker, number = list(markers.items())[0]
        result = f"{area} {marker} {number} {city}".strip()
    elif len(markers) == 2:
        items = list(markers.items())
        result = f"{area} {items[0][0]} {items[0][1]} -{items[1][0]} {items[1][1]} {city}".strip()
    else:
        marker_str = ' -'.join([f"{k} {v}" for k, v in markers.items()])
        result = f"{area} {marker_str} {city}".strip()
    
    # Final cleanup: remove any remaining English letters or special characters
    result = re.sub(r'[a-zA-Z]', '', result)
    result = re.sub(r'[^\u0600-\u06FF0-9٠-٩\s\-ـ]', ' ', result)
    result = ' '.join(result.split())
    return result

def _remove_cross_line_duplicates(address: str) -> str:
    """Remove duplicate words that appear in multiple parts of the address.
    If a word appears in multiple parts, keep it only in the last occurrence."""
    if not address or address == "0":
        return address
    
    # Normalize: separate dashes and markers to ensure consistent tokenization
    # Replace "-ق" with "- ق", etc.
    normalized = address
    for marker in ['م', 'ق', 'ك']:
        normalized = normalized.replace(f'-{marker}', f'- {marker}')
        normalized = normalized.replace(f'ـ{marker}', f'ـ {marker}')
    
    # Split by spaces
    parts = re.split(r'\s+', normalized)
    if len(parts) <= 1:
        return address
    
    # Track words and marker+number combinations
    word_positions = {}
    i = 0
    while i < len(parts):
        word = parts[i]
        
        # Check if this token contains a marker+number combo (e.g., "م٢٦" or "ق٢٦")
        marker_num_match = re.match(r'^([مقك])([0-9٠-٩]+)$', word)
        if marker_num_match:
            marker = marker_num_match.group(1)
            number = marker_num_match.group(2)
            combo = f"{marker} {number}"
            if combo not in word_positions:
                word_positions[combo] = []
            word_positions[combo].append(i)
            i += 1
            continue
        
        # Check if this is a marker followed by a number in the next token
        if word in ['م', 'ق', 'ك'] and i + 1 < len(parts) and re.match(r'^[0-9٠-٩]+$', parts[i + 1]):
            combo = f"{word} {parts[i + 1]}"
            if combo not in word_positions:
                word_positions[combo] = []
            word_positions[combo].append((i, i + 1))
            i += 2
            continue
        
        # Skip standalone markers, numbers, and separators
        if word in ['م', 'ق', 'ك', '-', 'ـ'] or re.match(r'^[0-9٠-٩]+$', word):
            i += 1
            continue
        
        # Regular word
        if word not in word_positions:
            word_positions[word] = []
        word_positions[word].append(i)
        i += 1
    
    # Mark positions to remove (keep only the last occurrence of duplicates)
    positions_to_remove = set()
    for item, positions in word_positions.items():
        if len(positions) > 1:
            for pos in positions[:-1]:
                if isinstance(pos, tuple):
                    positions_to_remove.add(pos[0])
                    positions_to_remove.add(pos[1])
                else:
                    positions_to_remove.add(pos)
    
    # Rebuild address without removed positions
    cleaned_parts = [parts[i] for i in range(len(parts)) if i not in positions_to_remove]
    return ' '.join(cleaned_parts)

def _pick_file() -> str:
    try:
        wi = Tk()
        wi.withdraw()
        wi.attributes('-topmost', True)
        wi.update()
        print("Opening file picker...")
        file_path = filedialog.askopenfilename(parent=wi, title="choose image")
        wi.destroy()
        return file_path
    except Exception:
        return ""

def _preprocess_for_ocr_light(img: np.ndarray) -> np.ndarray:
    if img is None:
        return img
    if len(img.shape) == 3:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        gray = img
    gray = cv2.medianBlur(gray, 3)
    _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return th

def _extract_birthdate_from_id(id_value: str) -> str:
    digits = re.sub(r'\D', '', _to_western_digits(str(id_value)))
    if len(digits) < 7:
        return "0"
    # Egyptian-style 14-digit ID: CYYMMDD... where C=2 (1900s) or 3 (2000s)
    if len(digits) >= 7 and digits[0] in ('2', '3') and len(digits) >= 7:
        century = 1900 if digits[0] == '2' else 2000
        yy = int(digits[1:3])
        mm = int(digits[3:5])
        dd = int(digits[5:7])
        try:
            dt = datetime(century + yy, mm, dd)
            return dt.strftime('%Y-%m-%d')
        except Exception:
            return "0"
    # Fallback: if 6 digits found (YYMMDD) at start, assume 19xx/20xx based on year
    if len(digits) >= 6:
        yy = int(digits[0:2])
        mm = int(digits[2:4])
        dd = int(digits[4:6])
        century = 2000 if yy <= (datetime.now().year % 100) else 1900
        try:
            dt = datetime(century + yy, mm, dd)
            return dt.strftime('%Y-%m-%d')
        except Exception:
            return "0"
    return "0"

try:
    wi = Tk()
    wi.withdraw()  # Hide the main window
    wi.attributes('-topmost', True)  # Bring dialog to front (Jupyter sometimes hides it)
    wi.update()

    print("Opening file picker...")
    file = filedialog.askopenfilename(parent=wi, title="choose image")
    wi.destroy()  # Close window after selection

    if not file:
        raise Exception("No file selected")

    name = file
    print("Selected:", name)

    input_path = name
    input = Image.open(input_path)
    output = remove(input)
    img_array = np.array(output)
    img = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
    blurred = cv2.blur(img, (5,5))
    kernel = np.array([[-1,-1,-1] ,[-1,9,-1],[-1,-1,-1]])
    sharpened = cv2.filter2D(blurred, -1, kernel)
    canny = cv2.Canny(sharpened, 50, 200)
    pts = np.argwhere(canny>0)
    if pts.size == 0:
        cropped = img
    else:
        y1,x1 = pts.min(axis=0)
        y2,x2 = pts.max(axis=0)
        cropped = img[y1:y2, x1:x2]
    w,h,c=cropped.shape
    o=int(w/2)
    i=int(h/2.5)
    n=int(h/6)
    cr=cropped[n-13:i+15,o:]  # Extended bottom boundary to capture more address text
    cropped_img=cropped[i+8:,o+10:]
    
    # Split the text region into names and address
    cr_height = cr.shape[0]
    split_point = int(cr_height * 0.52)  # Split at 52% (names in top, address in bottom)
    names_region = cr[0:split_point, :]
    address_region_raw = cr[split_point:, :]
    
    # Apply preprocessing to improve address region quality
    address_region = cv2.GaussianBlur(address_region_raw, (3, 3), 0)
    address_region = cv2.convertScaleAbs(address_region, alpha=1.3, beta=10)  # Increase contrast and brightness
    
    # Save the names region to newimg.png
    cv2.imwrite("newimg.png", names_region)
    # Save the address region to address.png
    cv2.imwrite("address.png", address_region)
    # Save the ID number region to id_card.png
    cv2.imwrite("id_card.png",cropped_img)

    # Read NAMES region using Tesseract (path 1)
    text_names=pytesseract.image_to_string(names_region,lang='ara',config='--psm 11 --oem 3')
    splited_names=text_names.split('\n')

    arabic_digits = ["٠", "١", "٢", "٣", "٤", "٥", "٦", "٧", "٨", "٩"]
    pun=set(string.punctuation)

    # One EasyOCR reader for address + ID (and fallback names)
    s = easyocr.Reader(['ar','ar'])
    d_names = s.readtext(names_region, detail=0, text_threshold=0.18, width_ths=0.9, low_text=0.17)
    # Optimized parameters for better address reading
    d_address = s.readtext(address_region, detail=0, text_threshold=0.15, width_ths=0.7, low_text=0.15, paragraph=True)

    state=0
    if len(text_names.split('\n'))==4:
        state=1
        print(state)
        firstname=splited_names[0] if len(splited_names) > 0 else "0"
        secondname=splited_names[2] if len(splited_names) > 2 else "0"

        # Address from EasyOCR (address region)
        address_easyocr = ' '.join(d_address) if len(d_address) > 0 else ""

        # Use EasyOCR result for address
        address = choose_address("", address_easyocr)
        # Remove cross-line duplicates
        address = _remove_cross_line_duplicates(address)

        data["first name"] = firstname
        data["seconed name"] = secondname
        data["address"] = address

        for i in data:
            if i == None:
                data["error"] = "1"
                break
            else:
                imgs = cv2.imread('id_card.png',0)
                gauss = cv2.GaussianBlur(imgs, (7,7), 0)
                unsharp_image = cv2.addWeighted(imgs, 2, gauss, -1, 0)
                o=s.readtext(unsharp_image, detail = 0,text_threshold = 0.27
                ,width_ths = 0.8,low_text= .008)
                if len(o) == 1:
                    data["id"] = o[0]
                elif len(o) == 0:
                    data["id"] = "0"
                elif len(o) > 1:
                    data["id"]=max(o, key=len)

            break

    elif state==0:
        state=2
        imgs = cv2.imread('id_card.png',0)
        # Enhanced preprocessing for better digit recognition
        imgs = cv2.medianBlur(imgs, 3)
        gauss = cv2.GaussianBlur(imgs, (5,5), 0)
        unsharp_image = cv2.addWeighted(imgs, 2.2, gauss, -1.2, 0)
        # Additional contrast enhancement
        unsharp_image = cv2.convertScaleAbs(unsharp_image, alpha=1.4, beta=5)

        # Use already-read EasyOCR result for the text region
        d = d_names

        # Address from EasyOCR (address region) - keep all tokens to capture leading numbers
        address_easyocr = ' '.join(d_address) if len(d_address) > 0 else ""

        for i in d:
            if i in arabic_digits:
                break
            else:
                my_data = ','.join(d)
                split_list = my_data.split(',')
                data["first name"] = split_list[0] if len(split_list) > 0 else "0"
                # Second name should include all remaining name parts (not just one word)
                data["seconed name"] = ','.join(split_list[1:]) if len(split_list) > 1 else "0"
                # Clean up commas in second name
                data["seconed name"] = data["seconed name"].replace(",", " ").strip()

                # Use EasyOCR result for address
                data["address"] = choose_address("", address_easyocr)
                # Remove cross-line duplicates
                data["address"] = _remove_cross_line_duplicates(data["address"])

                # Clean up any remaining brackets/quotes
                data["address"] = data["address"].replace("[", "").replace("]", "").replace("'","")

        # Enhanced OCR parameters for better digit recognition
        o=s.readtext(unsharp_image, detail = 0, text_threshold = 0.25,
                     width_ths = 0.7, low_text= 0.01, paragraph=False)
        print(state)
        if o == None or d == None:
            state = 4
            data["error"] = "1"
        else:
            if len(o) == 1:
                data["id"] = o[0]
            elif len(o) == 0:
                data["id"] = "0"
            elif len(o) > 1:
                    data["id"]=max(o, key=len)

    elif state == 4:
        data["error"] = "1"
    if len(str(data["id"]))<20:
        data["error"]="1"
    for i in data["first name"]:
        if i in arabic_digits or i in pun:
            data["first name"] = data["first name"].replace(i, "")
    for i in data["seconed name"]:
        if i in arabic_digits or i in pun:
            data["seconed name"] = data["seconed name"].replace(i, "")

    # Validation: second name should be longer than first name
    if len(data["seconed name"]) <= len(data["first name"]):
        data["first name"], data["seconed name"] = data["seconed name"], data["first name"]

    def _word_parts(value: str) -> list[str]:
        return [part for part in (value or "").split() if part]

    first_parts = _word_parts(data["first name"])
    second_parts = _word_parts(data["seconed name"])

    if len(first_parts) > 3:
        data["error"] = "1"
    if len(second_parts) <= 1:
        data["error"] = "1"

    ar=['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي']
    for i in str(data["id"]):
        if i in ar or i in pun:
            data["id"]=str(data["id"]).replace(i, "")

    arabic_string = str(data["id"])

    # Use regular expression to extract individual numbers
    regex = r'[٠-٩]+'
    matches = re.findall(regex, arabic_string)

    # Reverse the order of the matches
    matches.reverse()

    # Concatenate the matches into a single string
    concatenated_string = ''.join(matches)

    # Convert the concatenated string to an integer
    if concatenated_string:
        integer_value = int(concatenated_string.translate(_ARABIC_TO_WESTERN))
        data["id"]=integer_value

    data["birthdate"] = _extract_birthdate_from_id(data["id"])
    
    # Validate birthdate - if invalid, set error flag
    if data["birthdate"] == "0":
        data["error"] = "1"

except Exception as e:
    print("Error:", e)
    data['id'] = 0
    data['error'] = 1
    data["birthdate"] = "0"

for key, value in data.items():
    print(key, value)

Opening file picker...
Selected: C:/Users/Omar/Downloads/WhatsApp Image 2026-02-02 at 1.07.40 PM.jpeg


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


1
first name ابراهيم
seconed name سليمان ابراهيم محمد
address ك ١٤ ابوخليفه مركز القنطره غرب - الاسماعيلية
id 29610121900051
birthdate 1996-10-12
error 0
