In [1]:
!pip install --quiet easyocr opencv-python-headless Pillow numpy gradio pytesseract

# Install Tesseract (Linux -> works in Colab)
!apt-get update -qq
!apt-get install -y -qq tesseract-ocr libtesseract-dev


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/2.9 MB[0m [31m36.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m963.8/963.8 kB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.6/300.6 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libarchive-dev:amd64.
(Reading database

In [2]:
import easyocr
import cv2
import numpy as np
from PIL import Image
import pytesseract
import re, unicodedata, json
import gradio as gr


In [3]:
def preprocess_image(img_bgr, min_side=400, scale=2, block_size=11, c=2):
    """Binarize an image for OCR.

    Pipeline: grayscale -> 3x3 median blur -> optional upscale of small
    images -> Gaussian adaptive threshold.

    Args:
        img_bgr: OpenCV-style numpy image. A colour (BGR) array is converted
            to grayscale; a 2-D array is treated as already grayscale
            (previously this case raised inside ``cv2.cvtColor``).
        min_side: images whose shorter side is below this many pixels are
            upscaled before thresholding.
        scale: integer upscale factor applied to both dimensions.
        block_size: neighbourhood size passed to ``cv2.adaptiveThreshold``
            (OpenCV requires an odd value greater than 1).
        c: constant subtracted from the local weighted mean by the
            adaptive threshold.

    Returns:
        The single-channel thresholded image produced by
        ``cv2.adaptiveThreshold`` with ``THRESH_BINARY`` and max value 255.
    """
    if img_bgr.ndim == 2:
        # Already single-channel: BGR2GRAY would fail on a 2-D array.
        gray = img_bgr
    else:
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    gray = cv2.medianBlur(gray, 3)

    # Upscale small images so glyphs cover more pixels before thresholding.
    h, w = gray.shape
    if min(h, w) < min_side:
        gray = cv2.resize(gray, (w * scale, h * scale))

    return cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, block_size, c
    )


In [4]:
# Build the EasyOCR reader once at module level (English only, gpu=False) so
# extract_text_from_image below can reuse the same instance on every call.
reader = easyocr.Reader(['en'], gpu=False)

def clean_text(text):
    """Normalize OCR output to single-spaced, ASCII-only text.

    Steps: NFKC normalization, replace every run of non-ASCII characters
    with a space, collapse whitespace runs to one space, strip the ends.

    Args:
        text: raw string; ``None`` or ``""`` is accepted.

    Returns:
        The cleaned string (``""`` for empty/None input).
    """
    if not text:
        return ""
    text = unicodedata.normalize("NFKC", text)
    # Strip non-ASCII BEFORE collapsing whitespace: each replaced run leaves a
    # space behind, and doing it afterwards left runs of spaces in the output.
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def extract_text_from_image(image_pil, engine='easyocr'):
    """Run OCR on a PIL image and pull simple fields out of the text.

    The image is converted to BGR, passed through ``preprocess_image`` and
    then read with either the module-level EasyOCR ``reader`` or Tesseract.

    Args:
        image_pil: a ``PIL.Image.Image``.
        engine: ``'easyocr'`` (compared case-insensitively) selects EasyOCR;
            any other value selects Tesseract with ``--psm 6``.

    Returns:
        dict with keys:
          - ``raw_text``: OCR text before cleaning.
          - ``clean_text``: ``clean_text(raw_text)``.
          - ``lines``: per-box dicts (``text``, ``bbox``, ``conf``) for
            EasyOCR; an empty list for Tesseract.
          - ``extracted_fields``: ``phones``, ``emails``, ``urls``,
            ``amounts`` found by regex in the cleaned text.
    """
    img = np.array(image_pil.convert("RGB"))[:, :, ::-1]  # PIL RGB -> OpenCV BGR
    # NOTE(review): both engines receive the adaptive-thresholded image; the
    # recorded notebook outputs suggest EasyOCR may read the unthresholded
    # image better - confirm before changing.
    pre = preprocess_image(img)

    if str(engine).strip().lower() == 'easyocr':
        results = reader.readtext(pre, detail=1)
        raw = " ".join(txt for _, txt, _ in results)
        lines = [{"text": txt, "bbox": bbox, "conf": float(conf)}
                 for bbox, txt, conf in results]
    else:
        raw = pytesseract.image_to_string(Image.fromarray(pre), config="--psm 6")
        lines = []

    cleaned = clean_text(raw)

    # Extract simple fields. The phone pattern's tail class includes spaces
    # and hyphens, so trailing separators are trimmed from each match.
    phones = [m.rstrip(" -") for m in re.findall(r"\+?\d[\d\s\-]{5,}", cleaned)]
    emails = re.findall(r"[a-zA-Z0-9._+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", cleaned)
    urls = re.findall(r"(https?://\S+|www\.\S+)", cleaned)
    amounts = re.findall(r"(?:Rs\.?|INR|\$)\s?\d[\d,]*", cleaned)

    return {
        "raw_text": raw,
        "clean_text": cleaned,
        "lines": lines,
        "extracted_fields": {
            "phones": phones,
            "emails": emails,
            "urls": urls,
            "amounts": amounts
        }
    }




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [5]:
from google.colab import files

# Ask the user to upload one or more files through the Colab upload helper.
uploaded = files.upload()

# Iterate over the keys of `uploaded`; each key is used as a file path for
# Image.open, and the image is read with the EasyOCR engine.
for name in uploaded:
    print("Processing:", name)
    out = extract_text_from_image(Image.open(name), engine='easyocr')
    print("CLEAN TEXT:", out["clean_text"])
    print("FIELDS:", out["extracted_fields"])


Saving testing 1.png to testing 1.png
Processing: testing 1.png
CLEAN TEXT: Mum, Ivc chngcd Iom prcviscr Lts + my 0x QUIIIt YOU Cn Cololo My ol numtor 0k rx 112 Wiw @c U #J0 J Su0s0s NiuM 1831 did u ctwnac: 18 }i Y EE 19.J1 Ac ycu busy #0J1 Odu gC: n# phonc No rm #6 busy: why
FIELDS: {'phones': [], 'emails': [], 'urls': [], 'amounts': []}


In [6]:
def gradio_run(img, engine):
    """Gradio callback: OCR the uploaded image and format the result as text.

    Args:
        img: PIL image from the ``gr.Image`` input, or ``None`` when the user
            submits without uploading anything.
        engine: engine name from the radio input; ``None``/empty falls back
            to ``"easyocr"``.

    Returns:
        A string with the cleaned text followed by the extracted fields as
        indented JSON, or a short prompt when no image was provided.
    """
    if img is None:
        # Without this guard the callback failed on `None.convert(...)`.
        return "No image provided. Please upload an image first."

    engine_name = (engine or "easyocr").lower()
    res = extract_text_from_image(img, engine_name)
    return (
        "CLEAN TEXT:\n" + res["clean_text"] +
        "\n\nFIELDS:\n" + json.dumps(res["extracted_fields"], indent=2)
    )

# Gradio UI: two inputs (the screenshot as a PIL image and the OCR engine
# choice, defaulting to "easyocr") are passed to gradio_run; its returned
# string is shown in a plain text output.
demo = gr.Interface(
    fn=gradio_run,
    inputs=[gr.Image(type="pil"), gr.Radio(["easyocr", "tesseract"], value="easyocr")],
    outputs="text",
    title="Scam Screenshot OCR",
    description="Upload any scam image or screenshot"
)

# Start the app. In the recorded Colab run Gradio switched to share=True on
# its own and served the app on a public *.gradio.live URL.
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e53bbc6c70d816eae3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [7]:
# Install extra libs: langdetect for language detection
!pip install --quiet langdetect


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m30.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [8]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # deterministic langdetect

# keep previously imported libs:
# import cv2, numpy as np, easyocr, pytesseract, PIL.Image, re, unicodedata...


In [9]:
import cv2
import numpy as np

def detect_text_regions(img_bgr, min_area=400, max_area_ratio=0.9):
    """
    Locate text-like blocks in an OpenCV BGR image and return them as crops.

    Method: grayscale -> 3x3 morphological gradient (text edges) -> Gaussian
    blur -> Otsu threshold -> two passes of a wide (25x3) dilation to fuse
    letters into lines/blocks -> external contours -> bounding boxes.
    Boxes with area below `min_area`, or above `max_area_ratio` of the image
    area, are discarded; the rest are padded, clipped to the image and
    returned as BGR crops ordered by their top edge.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    img_h, img_w = gray.shape
    area_ceiling = img_w * img_h * max_area_ratio

    # Highlight stroke edges, then binarize.
    edge_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    edges = cv2.morphologyEx(gray, cv2.MORPH_GRADIENT, edge_kernel)
    smoothed = cv2.GaussianBlur(edges, (3, 3), 0)
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Smear horizontally so characters on one line merge into one blob.
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
    merged = cv2.dilate(binary, line_kernel, iterations=2)
    contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def padded_box(contour):
        """Return (top, left, bottom, right) for a contour, or None if filtered out."""
        bx, by, bw, bh = cv2.boundingRect(contour)
        box_area = bw * bh
        if box_area < min_area or box_area > area_ceiling:
            return None
        margin_x = int(bw * 0.02) + 2
        margin_y = int(bh * 0.05) + 2
        top = max(0, by - margin_y)
        left = max(0, bx - margin_x)
        bottom = min(img_h, by + bh + margin_y)
        right = min(img_w, bx + bw + margin_x)
        return (top, left, bottom, right)

    boxes = [box for box in map(padded_box, contours) if box is not None]
    boxes.sort(key=lambda box: box[0])  # top -> bottom
    return [img_bgr[top:bottom, left:right] for (top, left, bottom, right) in boxes]


In [10]:
import easyocr
from langdetect import detect
import unicodedata, re
from PIL import Image
import numpy as np

# Shared EasyOCR reader built once with the default language list.
# extract_text_whatsapp_aware uses it for the initial pass and creates a
# different reader only when the detected language is not in DEFAULT_LANGS.
DEFAULT_LANGS = ['en']  # start with english
OCR_READER = easyocr.Reader(DEFAULT_LANGS, gpu=False)  # instantiate once

# Lookup from a detected language code to the code handed to easyocr.Reader.
# A detected code that is missing here makes the caller keep OCR_READER.
SUPPORTED_LANGS_FOR_EASYOCR = {
    # add the shorthand names you'd likely need for your dataset
    'en':'en', 'hi':'hi', 'ur':'ur', 'es':'es', 'pt':'pt', 'fr':'fr',
    # extend as needed; easyocr uses these two-letter codes / language names
}

def detect_language_from_text(sample_text, fallback='en'):
    """Guess the language of `sample_text` via langdetect's `detect`.

    Returns the detected code, or `fallback` when detection raises or
    yields an empty result.
    """
    try:
        guessed = detect(sample_text)
    except Exception:
        return fallback
    return guessed or fallback

def clean_text_basic(text):
    """NFKC-normalize `text`, blank out control characters, squeeze whitespace.

    Returns "" for None; otherwise the normalized string with C0/C1 control
    characters turned into spaces, whitespace runs collapsed to one space,
    and both ends stripped. Non-ASCII text is kept.
    """
    if text is None:
        return ""
    result = unicodedata.normalize("NFKC", text)
    # First pass removes control characters, second pass collapses whitespace.
    for pattern in (r"[\x00-\x1f\x7f-\x9f]", r"\s+"):
        result = re.sub(pattern, " ", result)
    return result.strip()

# Readers created for non-default languages, keyed by their language tuple, so
# a given language combination is instantiated once instead of on every call.
_WHATSAPP_READER_CACHE = {}


def extract_text_whatsapp_aware(image_input,
                                engine='easyocr',
                                preprocess_func=None,
                                min_confidence=0.3,
                                supported_langs=SUPPORTED_LANGS_FOR_EASYOCR):
    """
    Region-wise EasyOCR with automatic language detection.

    Steps:
      - convert the input to an OpenCV BGR array
      - optionally run `preprocess_func` on it (best effort)
      - quick OCR on the whole image with OCR_READER -> detect language
      - if the detected language maps to an EasyOCR code outside
        DEFAULT_LANGS, use a reader for ['en', <lang>] (cached per language)
      - detect text regions, OCR each one, keep boxes with conf >= min_confidence
      - assemble the text and extract phones/emails/urls/amounts by regex

    Args:
        image_input: file path (str), PIL.Image, or numpy array (BGR, or
            2-D grayscale which is expanded to BGR).
        engine: accepted for interface compatibility; this function always
            uses EasyOCR and does not read the value.
        preprocess_func: optional callable taking the BGR array and returning
            an image; a 2-D result is expanded to BGR. If it raises, a
            warning is issued and the unprocessed image is used.
        min_confidence: minimum EasyOCR confidence for a box to be kept.
        supported_langs: mapping from detected language code to EasyOCR code.

    Returns:
        dict with `detected_language`, `raw_text`, `clean_text`, `lines`
        (dicts with `bbox`, `text`, `conf`; bbox coordinates are relative to
        the region crop that was read) and `extracted_fields`.

    Raises:
        ValueError: if `image_input` is not a str, PIL image or ndarray.
    """
    import warnings

    # Load to OpenCV BGR. The context manager releases the file handle.
    if isinstance(image_input, str):
        with Image.open(image_input) as pil:
            img_bgr = np.array(pil.convert('RGB'))[:, :, ::-1]
    elif isinstance(image_input, Image.Image):
        img_bgr = np.array(image_input.convert('RGB'))[:, :, ::-1]
    elif isinstance(image_input, np.ndarray):
        img_bgr = image_input.copy()
        if img_bgr.ndim == 2:
            img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_GRAY2BGR)
    else:
        raise ValueError("Unsupported image_input type")

    # Optional preprocessing; failure is reported but not fatal.
    proc_bgr = img_bgr
    if preprocess_func:
        try:
            processed = preprocess_func(img_bgr)
            if processed.ndim == 2:
                proc_bgr = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)
            else:
                proc_bgr = processed
        except Exception as exc:
            warnings.warn(f"preprocess_func failed ({exc!r}); using the unprocessed image")
            proc_bgr = img_bgr

    # Quick whole-image OCR, only used as a text sample for language detection.
    try:
        quick_res = OCR_READER.readtext(proc_bgr, detail=0)
        quick_clean = clean_text_basic(" ".join(quick_res))
    except Exception:
        quick_clean = ""

    detected_lang = detect_language_from_text(quick_clean, fallback='en')
    easyocr_lang = supported_langs.get(detected_lang, None)

    # Switch readers only when the detected language is supported and is not
    # already covered by the default reader.
    reader = OCR_READER
    if easyocr_lang and easyocr_lang not in DEFAULT_LANGS:
        new_langs = [easyocr_lang] if easyocr_lang == 'en' else ['en', easyocr_lang]
        cache_key = tuple(new_langs)
        reader = _WHATSAPP_READER_CACHE.get(cache_key)
        if reader is None:
            try:
                reader = easyocr.Reader(new_langs, gpu=False)
                _WHATSAPP_READER_CACHE[cache_key] = reader
            except Exception:
                reader = OCR_READER

    # Region detection; fall back to the whole image when nothing is found.
    regions = detect_text_regions(proc_bgr)
    if not regions:
        regions = [proc_bgr]

    lines = []
    assembled_text = []

    for reg in regions:
        try:
            res = reader.readtext(reg, detail=1)
            filtered = []
            for bbox, txt, conf in res:
                confv = float(conf) if conf is not None else 0.0
                if confv >= min_confidence:
                    filtered.append({"bbox": bbox, "text": txt, "conf": confv})
            if filtered:
                # Order boxes inside the region by the top-left corner: y, then x.
                filtered_sorted = sorted(filtered, key=lambda r: (r['bbox'][0][1], r['bbox'][0][0]))
                lines.extend(filtered_sorted)
                assembled_text.append(" ".join(f['text'] for f in filtered_sorted))
        except Exception:
            # Fallback: text-only read of the same region (no boxes/confidence).
            try:
                single = reader.readtext(reg, detail=0)
                if single:
                    joined = " ".join(single)
                    lines.append({"bbox": None, "text": joined, "conf": None})
                    assembled_text.append(joined)
            except Exception:
                continue

    raw_text = " ".join(assembled_text)
    clean = clean_text_basic(raw_text)

    # Simple field extraction. The phone pattern's tail class includes spaces
    # and hyphens, so trailing separators are trimmed from each match.
    phones = [m.rstrip(" -") for m in re.findall(r"\+?\d[\d\s\-]{5,}", clean)]
    emails = re.findall(r"[a-zA-Z0-9._+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", clean)
    urls = re.findall(r"(https?://\S+|www\.\S+)", clean)
    amounts = re.findall(r"(?:Rs\.?|INR|\$)\s?\d[\d,]*", clean)

    return {
        "detected_language": detected_lang,
        "raw_text": raw_text,
        "clean_text": clean,
        "lines": lines,
        "extracted_fields": {"phones": phones, "emails": emails, "urls": urls, "amounts": amounts}
    }




In [18]:
# Path of an image uploaded earlier in the session; this cell fails if the
# file is not present in the working directory.
test_path = "testing 1.png"

# Run whatsapp-aware OCR & auto-language detection (no preprocessing function).
result = extract_text_whatsapp_aware(test_path, engine='easyocr', preprocess_func=None)
print("Detected language:", result['detected_language'])
print("CLEAN TEXT (first 400 chars):")
print(result['clean_text'][:400])
print("\nEXTRACTED FIELDS:", result['extracted_fields'])
# result['lines'] holds one entry per OCR box that was kept.
print("\nNumber of lines/blocks detected:", len(result['lines']))


Detected language: en
CLEAN TEXT (first 400 chars):
Mum Ive changed from provider this is my new number you can delete my old number Ok XX Who are U 18.30 guesses mum Why did change_ Are you busy? Did get new phone_ No Im busy: 12.77

EXTRACTED FIELDS: {'phones': [], 'emails': [], 'urls': [], 'amounts': []}

Number of lines/blocks detected: 15


In [13]:
from google.colab import files

# Same upload-and-OCR loop as the earlier upload cell: prompt for files, then
# read each one with the basic (non region-aware) EasyOCR pipeline.
uploaded = files.upload()

# Each key of `uploaded` is used as a file path for Image.open.
for name in uploaded:
    print("Processing:", name)
    out = extract_text_from_image(Image.open(name), engine='easyocr')
    print("CLEAN TEXT:", out["clean_text"])
    print("FIELDS:", out["extracted_fields"])


Saving ChatGPT Image Nov 22, 2025, 11_46_24 AM.png to ChatGPT Image Nov 22, 2025, 11_46_24 AM (1).png
Processing: ChatGPT Image Nov 22, 2025, 11_46_24 AM (1).png
CLEAN TEXT: 022. Congratulations! Please update your PAN- Hellog Card information to avoid Your email has won $100,000 account suspension: in our promotional draw: Earn r5,0000 daily Update now at from home. Apply now: To claim your: prize,reply with http:Ilworxcardhunpdate.info http:Ilworkfromhome your full details. offer.in .122 PM INVOICE SCAM BankAlerit Your CV has been selected COMPANY PLC Your A/c no. 82XX1547 is for JOB VACANCY Contact Balance S432.50 about to be blocked. HR on 91234 56789_ YOUR PAYMENT IS PENDING: Verify details immecliately: to schedule your interview: Avold penalttes by paying now: http: Ilsecure-bank.com https:Ilfake-involce-pay.com Jan 18 2024 Pay Jurn In" Aietanad 2held riJuloi_ Ihl You have been chosen for Package Update CONFIDENTIAL a special Rs. 10,000 bonus. 8' Shipping Dept 2,000,000 USD Acti

In [19]:
# Install Tesseract Urdu language pack and shaping/bidi libs
!apt-get update -qq
!apt-get install -y -qq tesseract-ocr-urd

# Python deps for Urdu text shaping and detection
!pip install --quiet arabic_reshaper python-bidi langdetect


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package tesseract-ocr-urd.
(Reading database ... 121846 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-urd_1%3a4.00~git30-7274cfa-1.1_all.deb ...
Unpacking tesseract-ocr-urd (1:4.00~git30-7274cfa-1.1) ...
Setting up tesseract-ocr-urd (1:4.00~git30-7274cfa-1.1) ...


In [20]:
# Imports (add to existing ones)
import arabic_reshaper
from bidi.algorithm import get_display
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

# Map langdetect -> EasyOCR/Tesseract codes.
# Keyed by the code returned from langdetect's detect(); each value gives the
# code to use per OCR engine. extract_text_with_urdu_support reads only the
# 'easyocr' entry and falls back to LANG_MAP['en'] for unmapped languages.
LANG_MAP = {
    'en': {'easyocr':'en', 'tesseract':'eng'},
    'ur': {'easyocr':'ur', 'tesseract':'urd'},
    'hi': {'easyocr':'hi', 'tesseract':'hin'},
    # add more mappings as needed
}

# Translation table (code point -> ASCII digit string) covering the
# Arabic-Indic block U+0660..U+0669 and the Extended Arabic-Indic block
# U+06F0..U+06F9, in that order.
ARABIC_INDIC_DIGITS = {
    block_start + value: str(value)
    for block_start in (0x0660, 0x06F0)
    for value in range(10)
}

def normalize_digits(text):
    """Return `text` with Arabic-Indic / Extended Arabic-Indic digits replaced by 0-9."""
    converted = text.translate(ARABIC_INDIC_DIGITS)
    return converted

def shape_and_bidi(urdu_text):
    """Prepare Urdu/Arabic-script text for display.

    Applies, in order: arabic_reshaper.reshape, bidi get_display, and
    normalize_digits, returning the final string.
    """
    return normalize_digits(get_display(arabic_reshaper.reshape(urdu_text)))


In [21]:
import easyocr
from PIL import Image
import numpy as np
import cv2
import re, unicodedata

# Reader for English + Urdu. This rebinds the OCR_READER name that an earlier
# cell bound to an English-only reader, so every function that reads the
# global OCR_READER uses this instance once this cell has run.
OCR_READER = easyocr.Reader(['en','ur'], gpu=False)  # instantiate once

def detect_language_from_text_safe(sample_text, fallback='en'):
    """Return langdetect's guess for `sample_text`, or `fallback` if detection raises."""
    try:
        return detect(sample_text)
    except Exception:
        return fallback

def clean_text_basic(text):
    """Normalize text: NFKC, control characters -> spaces, whitespace squeezed and trimmed.

    Any falsy input (None, "") yields "". Non-ASCII text is kept.
    """
    if text:
        nfkc = unicodedata.normalize("NFKC", text)
        no_controls = re.sub(r"[\x00-\x1f\x7f-\x9f]", " ", nfkc)
        return re.sub(r"\s+", " ", no_controls).strip()
    return ""

# Readers created for languages the shared reader does not cover, keyed by
# language tuple, so each combination is instantiated once rather than per call.
_URDU_READER_CACHE = {}


def extract_text_with_urdu_support(image_input,
                                   min_confidence=0.3,
                                   preprocess_func=None):
    """
    Region-wise EasyOCR with language detection and Urdu post-processing.

    - Quick whole-image OCR with OCR_READER gives a text sample for language
      detection; the result is mapped through LANG_MAP (default: English).
    - Text regions are OCR'd one by one; boxes below `min_confidence` are dropped.
    - When the detected language is 'ur', the clean text and each line are
      passed through shape_and_bidi.
    - Digits are normalized before phone/email/url/amount extraction.

    Args:
        image_input: file path (str), PIL.Image, or numpy array (BGR, or
            2-D grayscale which is expanded to BGR).
        min_confidence: minimum EasyOCR confidence for a box to be kept.
        preprocess_func: optional callable taking the BGR array and returning
            an image; a 2-D result is expanded to BGR. If it raises, the
            unprocessed image is used.

    Returns:
        dict: {detected_language, raw_text, clean_text, lines, extracted_fields}

    Raises:
        ValueError: if `image_input` is not a str, PIL image or ndarray.
    """
    # Load image -> OpenCV BGR. The context manager releases the file handle.
    if isinstance(image_input, str):
        with Image.open(image_input) as pil:
            img_bgr = np.array(pil.convert('RGB'))[:, :, ::-1]
    elif isinstance(image_input, Image.Image):
        img_bgr = np.array(image_input.convert('RGB'))[:, :, ::-1]
    elif isinstance(image_input, np.ndarray):
        img_bgr = image_input.copy()
        if img_bgr.ndim == 2:
            # Region detection calls BGR2GRAY, which needs a 3-channel image.
            img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_GRAY2BGR)
    else:
        raise ValueError("Unsupported image_input type")

    # Optional preprocessing (best effort).
    proc_bgr = img_bgr
    if preprocess_func:
        try:
            processed = preprocess_func(img_bgr)
            if processed.ndim == 2:
                proc_bgr = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)
            else:
                proc_bgr = processed
        except Exception:
            proc_bgr = img_bgr

    # Quick OCR to detect language.
    try:
        quick = OCR_READER.readtext(proc_bgr, detail=0)
        quick_clean = clean_text_basic(" ".join(quick))
    except Exception:
        quick_clean = ""

    detected = detect_language_from_text_safe(quick_clean, fallback='en')
    mapped = LANG_MAP.get(detected, LANG_MAP['en'])
    easyocr_lang_code = mapped['easyocr']

    # Use a different reader only when the shared one does not cover the
    # detected language.
    # NOTE(review): this relies on the Reader object exposing `lang_list`;
    # when the attribute is missing the shared reader is kept - confirm
    # against the installed EasyOCR version.
    reader = OCR_READER
    try:
        loaded_langs = getattr(OCR_READER, "lang_list", None)
        if loaded_langs is not None and easyocr_lang_code not in loaded_langs:
            new_langs = ['en', easyocr_lang_code] if easyocr_lang_code != 'en' else ['en']
            cache_key = tuple(new_langs)
            reader = _URDU_READER_CACHE.get(cache_key)
            if reader is None:
                reader = easyocr.Reader(new_langs, gpu=False)
                _URDU_READER_CACHE[cache_key] = reader
    except Exception:
        reader = OCR_READER

    # Detect regions; use the whole image if detection fails or is unavailable.
    try:
        regions = detect_text_regions(proc_bgr)
    except Exception:
        regions = [proc_bgr]
    # An empty region list previously meant no OCR was run at all.
    if not regions:
        regions = [proc_bgr]

    lines = []
    assembled = []
    for reg in regions:
        try:
            res = reader.readtext(reg, detail=1)
            filtered = []
            for bbox, txt, conf in res:
                confv = float(conf) if conf is not None else 0.0
                if confv >= min_confidence:
                    filtered.append({"bbox": bbox, "text": txt, "conf": confv})
            if filtered:
                lines.extend(filtered)
                assembled.append(" ".join(f['text'] for f in filtered))
        except Exception:
            # Fallback: text-only read of the same region (no boxes/confidence).
            try:
                single = reader.readtext(reg, detail=0)
                if single:
                    joined = " ".join(single)
                    lines.append({"bbox": None, "text": joined, "conf": None})
                    assembled.append(joined)
            except Exception:
                continue

    raw_text = " ".join(assembled)
    clean = clean_text_basic(raw_text)

    # If detected Urdu, shape & bidi the clean text and each line.
    if detected == 'ur':
        clean = shape_and_bidi(clean)
        for line in lines:
            if line.get('text'):
                line['text'] = shape_and_bidi(line['text'])

    # Normalize digits in clean text (handles Urdu digits).
    clean = normalize_digits(clean)

    # Field extraction from the normalized text. The phone pattern's tail
    # class includes spaces and hyphens, so trailing separators are trimmed.
    phones = [m.rstrip(" -") for m in re.findall(r"\+?\d[\d\s\-]{5,}", clean)]
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", clean)
    urls = re.findall(r"(https?://\S+|www\.\S+)", clean)
    amounts = re.findall(r"(?:Rs\.?|INR|\$)\s?\d[\d,]*", clean)

    return {
        "detected_language": detected,
        "raw_text": raw_text,
        "clean_text": clean,
        "lines": lines,
        "extracted_fields": {"phones": phones, "emails": emails, "urls": urls, "amounts": amounts}
    }




Progress: |██████████████████████████████████████████████████| 100.0% Complete

In [25]:
# Image expected in the working directory; this cell fails if it is missing.
test_path = "images 2.jpeg"

# Urdu-aware OCR with a lower confidence cut-off and the thresholding
# preprocess_image step applied before OCR.
res = extract_text_with_urdu_support(test_path, min_confidence=0.25, preprocess_func=preprocess_image)
print("Detected language:", res['detected_language'])
print("\nCLEAN TEXT (first 600 chars):\n", res['clean_text'][:600])
print("\nEXTRACTED FIELDS:", res['extracted_fields'])
print("\nLINES detected:", len(res['lines']))
# show first 6 lines
for i,l in enumerate(res['lines'][:6]):
    print(i+1, l.get('conf'), "->", l.get('text'))


Detected language: ur

CLEAN TEXT (first 600 chars):
 

EXTRACTED FIELDS: {'phones': [], 'emails': [], 'urls': [], 'amounts': []}

LINES detected: 0
