In [1]:
# Step 1: Install & Authenticate
!pip install --upgrade google-cloud-vision
!pip install opencv-python deepface




In [43]:
import threading
import json
import cv2
import os
import re
import io
import matplotlib.pyplot as plt
from datetime import datetime
from google.cloud import vision
from google.cloud.vision_v1 import types
from deepface import DeepFace

doc_image_path = "/content/drive/MyDrive/proj/id.jpg"
ref_image_path = "/content/drive/MyDrive/proj/target1.jpg"

In [44]:
# Shared variable
shared_result = {}

# -----------------------------
# Thread 1: Face Comparison
# -----------------------------
def face_thread():
    global shared_result

    def extract_face_opencv(image_path, output_path):
        img = cv2.imread(image_path)
        if img is None:
            print(f"❌ Could not load image: {image_path}")
            return None

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)

        if len(faces) == 0:
            print(f"❌ No face detected in {image_path}")
            return None

        for (x, y, w, h) in faces[:1]:
            face = img[y:y+h, x:x+w]
            cv2.imwrite(output_path, face)
            return output_path

        return None

    def compare_faces(face1_path, face2_path):
        result = DeepFace.verify(
            img1_path=face1_path,
            img2_path=face2_path,
            model_name="ArcFace",
            detector_backend="retinaface",
            enforce_detection=False
        )
        distance = round(result["distance"], 4)
        return {
            "match": result["verified"],
            "distance": distance,
            "similarity": round((1 - distance), 4)
        }

    def show_faces_side_by_side(img1_path, img2_path, title1="Doc Face", title2="Ref Face"):
        img1 = cv2.imread(img1_path)
        img2 = cv2.imread(img2_path)
        img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
        img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)

        plt.figure(figsize=(8, 4))
        plt.subplot(1, 2, 1)
        plt.imshow(img1)
        plt.title(title1)
        plt.axis("off")

        plt.subplot(1, 2, 2)
        plt.imshow(img2)
        plt.title(title2)
        plt.axis("off")
        plt.tight_layout()
        plt.show()

    # Perform extraction
    doc_face = extract_face_opencv(doc_image_path, "/content/drive/MyDrive/proj/doc_face.jpg")
    ref_face = extract_face_opencv(ref_image_path, "/content/drive/MyDrive/proj/ref_face.jpg")

    # Perform comparison and display
    if doc_face and ref_face:
        shared_result = compare_faces(doc_face, ref_face)
        print("\n[Face Thread Result]")
        print(shared_result)

        # ✅ Display faces after comparison
        show_faces_side_by_side(doc_face, ref_face)

    else:
        shared_result = {"match": False, "distance": None, "similarity": None}
        print("❌ Face extraction failed.")


In [45]:
# -----------------------------
# Thread 2: OCR + Field Extraction
# -----------------------------
def ocr_thread():
    global shared_result

    # Wait for face_thread to complete
    face_thread_handle.join()

    key_path = "/content/drive/MyDrive/proj/ocr-proj-464311-962bfb48b8cc.json"
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path

    client = vision.ImageAnnotatorClient()
    with io.open(doc_image_path, 'rb') as image_file:
        content = image_file.read()
    image = types.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations
    full_text = texts[0].description if texts else ""

    # ✅ Save and print raw OCR extracted text for debugging
    raw_text_path = os.path.splitext(doc_image_path)[0] + "_ocr_raw.txt"
    with open(raw_text_path, "w", encoding="utf-8") as f:
        f.write(full_text)
    print(f"\n📝 Raw OCR Text saved to: {raw_text_path}")
    print("\n---- OCR Extracted Text ----")
    print(full_text)


    def detect_language(text):
        if re.search(r'[\u4e00-\u9fff]', text): return "chinese"
        elif re.search(r'[\u0600-\u06FF]', text): return "urdu"
        return "english_or_default"

    def extract_pakistani_fields(text):
      data = {}
      lines = [line.strip() for line in text.split('\n') if line.strip()]
      joined = "\n".join(lines)

      if "passport" in joined.lower():
          data["document_type"] = "Passport"

          # --- Passport Number (Robust Logic) ---
          for i in range(len(lines)):
              current = lines[i].lower()

              # Case 1: line is exactly "passport number"
              if "passport number" in current and i + 1 < len(lines):
                  candidate = lines[i + 1].strip()
                  if re.match(r'^[A-Z]{1,2}\d{6,9}$', candidate):
                      data["passport_number"] = candidate
                      break

              # Case 2: line is "passport", next is "number", next is actual number
              if "passport" in current and i + 2 < len(lines):
                  if "number" in lines[i + 1].lower():
                      candidate = lines[i + 2].strip()
                      if re.match(r'^[A-Z]{1,2}\d{6,9}$', candidate):
                          data["passport_number"] = candidate
                          break

          # --- Fallback: Match anywhere in the full text ---
          if "passport_number" not in data:
              fallback = re.search(r'\b[A-Z]{1,2}\d{6,9}\b', joined)
              if fallback:
                  data["passport_number"] = fallback.group()


          # --- CNIC inside passport ---
          cnic_match = re.search(r'\b\d{5}-\d{7}-\d\b', joined.replace("O", "0"))
          if cnic_match:
              data["id_number"] = cnic_match.group()

          # --- Given & Surname ---
          for i in range(len(lines)):
              if "surname" in lines[i].lower():
                  for j in range(i + 1, min(i + 4, len(lines))):
                      candidate = lines[j].strip()
                      if "code" in candidate.lower() or candidate == "PAK":
                          continue
                      if re.match(r'^[A-Z ]+$', candidate) and len(candidate) > 2:
                          data["last_name"] = candidate.title()
                          break

              if "given names" in lines[i].lower() and i + 1 < len(lines):
                  if re.match(r'^[A-Z ]+$', lines[i + 1]):
                      data["first_name"] = lines[i + 1].title()

          # --- MRZ Name Fallback ---
          mrz_lines = [line for line in lines if re.match(r'^<PAK[A-Z<]+$', line)]
          if mrz_lines:
              name_parts = mrz_lines[0].replace("<PAK", "").split("<<")
              if len(name_parts) >= 2:
                  surname = name_parts[0].replace("<", "").strip().title()
                  given_name = name_parts[1].replace("<", "").strip().title()
                  if "last_name" not in data:
                      data["last_name"] = surname
                  if "first_name" not in data:
                      data["first_name"] = given_name

          # --- Gender ---
          gender_match = re.search(r'\bSex[\s:\n]*(M|F)\b', joined, re.IGNORECASE)
          if gender_match:
              data["gender"] = gender_match.group(1).upper()

          # --- DOB ---
          dob_match = re.search(r'Date of Birth[\s:\n]*([0-9]{2} [A-Z]{3} [0-9]{4})', joined, re.IGNORECASE)
          if dob_match:
              data["date_of_birth"] = dob_match.group(1).strip()

          # --- Issue & Expiry Dates ---
          issue_match = re.search(r'Date of Issue[\s:\n]*([0-9]{2} [A-Z]{3} [0-9]{4})', joined, re.IGNORECASE)
          expiry_match = re.search(r'Date of Expiry[\s:\n]*([0-9]{2} [A-Z]{3} [0-9]{4})', joined, re.IGNORECASE)
          if issue_match:
              data["issue_date"] = issue_match.group(1).strip()
          if expiry_match:
              data["expiry_date"] = expiry_match.group(1).strip()

          # --- Husband / Father Name ---
          for i in range(len(lines)):
              if "place of birth" in lines[i].lower():
                  for j in range(i - 1, max(0, i - 4), -1):
                      if re.match(r'^[A-Z ]{5,}$', lines[j]):
                          data["father_or_husband_name"] = lines[j].title()
                          break

      elif "identity card" in joined.lower():
          data["document_type"] = "ID Card"

          if "pakistan" in joined.lower():
              data["nationality"] = "Pakistan"

          id_match = re.search(r'\b\d{5}-\d{7}-\d\b', joined)
          if id_match:
              data["id_number"] = id_match.group()

          gender_match = re.search(r'\b(M|F|X)\b', joined)
          if gender_match:
              data["gender"] = gender_match.group()

          dob_match = re.search(r'Date of Birth[\s:\n]*([\d./-]{8,10})', joined)
          if dob_match:
              data["date_of_birth"] = dob_match.group(1)

          issue_match = re.search(r'Date of Issue[\s:\n]*([\d./-]{8,10})', joined)
          if issue_match:
              data["issue_date"] = issue_match.group(1)

          expiry_match = re.search(r'Date of Expiry[\s:\n]*([\d./-]{8,10})', joined)
          if expiry_match:
              data["expiry_date"] = expiry_match.group(1)

          name_lines = []
          for i, line in enumerate(lines):
              if line.lower().startswith("name") and i + 1 < len(lines):
                  name_lines.append(lines[i + 1])
          if name_lines:
              name_parts = name_lines[0].split()
              if len(name_parts) == 1:
                  data["first_name"] = name_parts[0]
              elif len(name_parts) == 2:
                  data["first_name"], data["last_name"] = name_parts
              elif len(name_parts) >= 3:
                  data["first_name"] = name_parts[0]
                  data["middle_name"] = " ".join(name_parts[1:-1])
                  data["last_name"] = name_parts[-1]

          urdu_lines = [line for line in lines if re.search(r'[\u0600-\u06FF]', line)]
          if urdu_lines:
              if len(urdu_lines) == 1:
                  data["urdu_name"] = urdu_lines[0]
              elif len(urdu_lines) >= 2:
                  data["urdu_name"] = urdu_lines[0]
                  data["urdu_father_name"] = urdu_lines[1]

      elif "driving license" in joined.lower() or "driver" in joined.lower():
          data["document_type"] = "Driving License"

      else:
          data["document_type"] = "Unknown"

      if "pakistan" in joined.lower() and "nationality" not in data:
          data["nationality"] = "Pakistan"

      return data


    def extract_chinese_fields(text):
      data = {"document_type": "ID Card"}
      lines = [line.strip() for line in text.split('\n') if line.strip()]
      joined = "\n".join(lines)

      # ✅ Detect if it's a Chinese Passport
      if "PASSPORT" in joined.upper() and "CHINA" in joined.upper():
          data["document_type"] = "Passport"
          data["country"] = "China"

          # Passport Number
          match = re.search(r'\bE\d{8,9}\b', joined)
          if match:
              data["passport_number"] = match.group()

          # English Name
          name_match = re.search(r'\n([A-Z]{2,})\s+([A-Z]{2,})\n', joined)
          if name_match:
              data["last_name"] = name_match.group(1).title()
              data["first_name"] = name_match.group(2).title()

          # Date of Birth
          dob_match = re.search(r'Date of birth[^\d]*(\d{2} \w{3} \d{4})', joined, re.IGNORECASE)
          if dob_match:
              data["date_of_birth"] = dob_match.group(1)

          # Issue Date
          issue_match = re.search(r'Date of issue[^\d]*(\d{2} \w{3} \d{4})', joined, re.IGNORECASE)
          if issue_match:
              data["issue_date"] = issue_match.group(1)

          # Expiry Date
          expiry_match = re.search(r'expir[^\d]*(\d{2} \w{3} \d{4})', joined, re.IGNORECASE)
          if expiry_match:
              data["expiry_date"] = expiry_match.group(1)

          # Gender
          if "女" in joined or "/F" in joined:
              data["gender"] = "F"
          elif "男" in joined or "/M" in joined:
              data["gender"] = "M"

          # Chinese name
          for i, line in enumerate(lines):
              if "姓名" in line and i + 1 < len(lines):
                  if re.search(r'[\u4e00-\u9fff]', lines[i + 1]):
                      data["chinese_name"] = lines[i + 1].strip()

          # Nationality
          nat_match = re.search(r'(国籍|Nationality)[^\n]*\n*([^\n]+)', joined)
          if nat_match:
              raw_nat = nat_match.group(2)
              eng_nat = re.search(r'\b[A-Z]{3}\b', raw_nat)
              if eng_nat:
                  data["nationality"] = eng_nat.group()
              else:
                  data["nationality"] = raw_nat.strip()

          for k, v in data.items():
              if isinstance(v, str):
                  data[k] = v.replace('\n', ' ').strip()

          return data  # ✅ Exit early for passport

      # ---------------------
      # Existing Chinese ID Card Logic (unchanged)
      # ---------------------
      if "中国" in joined or "CHINA" in joined.upper():
          data["country"] = "China"

      id_match = re.search(r'\b\d{17}[\dXx]\b', joined)
      if id_match:
          data["id_number"] = id_match.group()

      dob_match = re.search(r'(出生日期|Date of Birth)[^\d]*(\d{4}[./-]\d{2}[./-]\d{2})', joined)
      if dob_match:
          data["date_of_birth"] = dob_match.group(2)

      validity_match = re.search(r'(\d{4}[./-]\d{2}[./-]\d{2})-(\d{4}[./-]\d{2}[./-]\d{2})', joined)
      if validity_match:
          data["issue_date"] = validity_match.group(1)
          data["expiry_date"] = validity_match.group(2)

      gender_match = re.search(r'(性别|Sex)[^\n]*\n*([男女MF])/?([MF男女]?)', joined, re.IGNORECASE)
      if gender_match:
          g1 = gender_match.group(2).upper()
          g2 = gender_match.group(3).upper()
          gender_char = g1 if g1 in ['M', 'F'] else g2
          if gender_char in ['M', '男']:
              data["gender"] = "M"
          elif gender_char in ['F', '女']:
              data["gender"] = "F"

      eng_name_match = re.search(r'[A-Z]{2,},\s*[A-Z\s]{2,}', joined)
      if eng_name_match:
          full_name = eng_name_match.group().splitlines()[0].strip()
          parts = full_name.split(',')
          if len(parts) == 2:
              data["last_name"] = parts[0].strip().title()
              data["first_name"] = parts[1].strip().title()

      for i, line in enumerate(lines):
          if re.search(r'(姓名|姓名/Name)', line):
              if i + 1 < len(lines):
                  chinese_name = lines[i + 1].strip()
                  if re.search(r'[\u4e00-\u9fff]', chinese_name):
                      data["chinese_name"] = chinese_name

      nat_match = re.search(r'(国籍|Nationality)[^\n]*\n*([^\n]+)', joined)
      if nat_match:
          raw_nat = nat_match.group(2)
          eng_nat = re.search(r'\b[A-Z]{3}\b', raw_nat)
          if eng_nat:
              data["nationality"] = eng_nat.group()
          else:
              data["nationality"] = raw_nat.strip()

      for k, v in data.items():
          if isinstance(v, str):
              data[k] = v.replace('\n', ' ').strip()

      return data

    def extract_id_fields(text):
        return extract_chinese_fields(text) if detect_language(text) == "chinese" else extract_pakistani_fields(text)

    fields = extract_id_fields(full_text)
    for k, v in fields.items():
        if isinstance(v, str): fields[k] = v.replace('\n', ' ').strip()

    from datetime import datetime

    # ---------------------------
    # Convert Dates to ISO Format
    # ---------------------------
    def to_iso(date_str):
        if not isinstance(date_str, str):
            return date_str

        cleaned = date_str.upper().strip().replace("\n", " ").replace("  ", " ")

        # Fix common OCR issues in month and day
        replacements = {
            "10A": "10",   # common OCR confusion
            "1O": "10",    # 'O' instead of zero
            "OCTOBER": "OCT", "0CT": "OCT", "OCT": "OCT",
            "JANUARY": "JAN", "JAN": "JAN",
            "FEBRUARY": "FEB", "FEB": "FEB",
            "MARCH": "MAR", "MAR": "MAR",
            "APRIL": "APR", "APR": "APR",
            "MAY": "MAY",
            "JUNE": "JUN", "JUN": "JUN",
            "JULY": "JUL", "JUL": "JUL",
            "AUGUST": "AUG", "AUG": "AUG",
            "SEPTEMBER": "SEP", "SEP": "SEP",
            "NOVEMBER": "NOV", "NOV": "NOV",
            "DECEMBER": "DEC", "DEC": "DEC"
        }

        for wrong, right in replacements.items():
            cleaned = cleaned.replace(wrong, right)

        # Remove symbols like [ ] or stray punctuation
        cleaned = re.sub(r"[^\w\s:/-]", "", cleaned)
        cleaned = re.sub(r"\s+/", "/", cleaned)

        formats = [
            "%d %b %Y", "%d %B %Y",
            "%d/%b/%Y", "%d/%B/%Y",
            "%d %b/%Y", "%d %B/%Y",
            "%d-%b-%Y", "%d-%B-%Y",
            "%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y",
            "%Y/%m/%d", "%Y-%m-%d"
        ]

        for fmt in formats:
            try:
                return datetime.strptime(cleaned, fmt).strftime("%Y-%m-%d")
            except:
                continue

        return date_str


    # ---------------------------
    # Apply to Extracted Fields
    # ---------------------------
    for key in ["date_of_birth", "issue_date", "expiry_date"]:
        if key in fields:
            fields[key] = to_iso(fields[key])


    # 🔗 Inject similarity result
    if shared_result:
        fields["face_match"] = shared_result.get("match")
        fields["face_distance"] = shared_result.get("distance")
        fields["face_similarity"] = shared_result.get("similarity")

    print("\n---- Final Extracted Fields ----")
    for k, v in fields.items():
        print(f"{k}: {v}")

    # Save JSON
    json_filename = os.path.splitext(os.path.basename(doc_image_path))[0] + ".json"
    output_path = os.path.join("/content/drive/MyDrive/proj", json_filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(fields, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Saved JSON to: {output_path}")


In [46]:
# -----------------------------
# Run both threads
# -----------------------------
face_thread_handle = threading.Thread(target=face_thread)
ocr_thread_handle = threading.Thread(target=ocr_thread)

face_thread_handle.start()
ocr_thread_handle.start()

# Wait for both to finish
face_thread_handle.join()
ocr_thread_handle.join()


❌ Could not load image: /content/drive/MyDrive/proj/target1.jpg
❌ Face extraction failed.

📝 Raw OCR Text saved to: /content/drive/MyDrive/proj/id_ocr_raw.txt

---- OCR Extracted Text ----
PAKISTAN National Identity Card
ISLAMIC REPUBLIC OF PAKISTAN
Name
Hassaan Mustafa
Father Name
حسان مصطفے
Muhammad Shafiq Rafiq
23880
Gender Country of Stay
M
Pakistan
Identity Number
35202-5836303-9
Date of Issue
02.12.2021
محمد شفیق رفیق
Date of Birth
06.10.2003
Hass
Date of Expiry
02.12.2031
Holder's Signature

---- Final Extracted Fields ----
document_type: ID Card
nationality: Pakistan
id_number: 35202-5836303-9
gender: M
date_of_birth: 06.10.2003
issue_date: 02.12.2021
expiry_date: 02.12.2031
first_name: Hassaan
last_name: Mustafa
urdu_name: حسان مصطفے
urdu_father_name: محمد شفیق رفیق
face_match: False
face_distance: None
face_similarity: None

✅ Saved JSON to: /content/drive/MyDrive/proj/id.json
