In [None]:
!pip install paddlepaddle paddleocr

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp311-cp311-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.10.0-py3-none-any.whl.metadata (12 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.6.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Download

In [None]:
import re
import threading
from concurrent.futures import ThreadPoolExecutor

import cv2
import numpy as np
import pandas as pd
import requests
from google.colab import files
from paddleocr import PaddleOCR

# Initialize PaddleOCR once at module level; extract_text() reuses this shared
# instance for every image instead of constructing a new engine per call.
# Configured for English (lang='en') with use_angle_cls=True, which presumably
# loads the angle classifier that the cls=True flag in extract_text() relies
# on — TODO confirm against the installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def download_image(image_url, timeout=30):
    """Download an image from a URL and decode it with OpenCV.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole batch.

    Returns:
        The value returned by ``cv2.imdecode`` (a colour image array, or None
        if OpenCV cannot decode the payload), or None when the request fails,
        the response status is not 200, or the response body is empty.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
    except requests.RequestException:
        # Treat network-level failures (DNS, connection, timeout, bad URL) the
        # same way as a non-200 response so one bad link cannot abort a batch.
        return None

    if response.status_code != 200 or not response.content:
        # An empty body cannot be decoded; report it as "no image".
        return None

    # frombuffer views the downloaded bytes directly instead of copying them
    # through an intermediate bytearray.
    image_array = np.frombuffer(response.content, dtype=np.uint8)
    return cv2.imdecode(image_array, cv2.IMREAD_COLOR)

def extract_text(image):
    """Run the shared PaddleOCR instance on an image and return its text.

    Args:
        image: Image to recognise, as accepted by ``ocr.ocr`` (for example the
            array produced by ``download_image``).

    Returns:
        The recognised text fragments joined with newlines, in the order
        PaddleOCR reports them. An empty string when nothing was recognised.
    """
    results = ocr.ocr(image, cls=True)
    fragments = [
        line[1][0]  # each line is [box, (text, confidence)]
        for page in (results or [])
        # PaddleOCR reports a page with no detected text as None (or an empty
        # list); iterating over it unguarded raises TypeError.
        if page
        for line in page
    ]
    return "\n".join(fragments)

def extract_transaction_details(text):
    """Find a payment transaction ID in OCR text.

    Lines are scanned top to bottom. On each line the patterns are tried in a
    fixed priority order and the first hit is returned:

    1. PhonePe transaction ID: ``T`` followed by 21 digits.
    2. Google Pay TXN ID: 5 capital letters followed by 10 digits.
    3. ``UPI transaction ID`` label followed by 9 or more digits.
    4. Paytm-style ID: a run of 12-15 digits.

    If nothing on the line matches, the next line is inspected for a UPI ID,
    including the layout where the ``UPI transaction ID`` label is alone on
    the current line and the digits appear at the start of the next one.

    Args:
        text: Newline-separated OCR text. May be empty or None.

    Returns:
        The matched ID as a string, or None if no ID was found.
    """
    if not text:
        return None

    upi_with_id = re.compile(r"UPI transaction ID[:\s]*(\d{9,})")
    # (pattern, group to return), tried in this order on every line.
    same_line_patterns = (
        (re.compile(r"T\d{21}"), 0),         # PhonePe Transaction ID
        (re.compile(r"[A-Z]{5}\d{10}"), 0),  # Google Pay TXN ID
        (upi_with_id, 1),                    # Generic UPI Transaction ID
        (re.compile(r"(\d{12,15})"), 0),     # Paytm Transaction ID
    )
    # Label with nothing but separators after it: the ID was wrapped by OCR.
    bare_upi_label = re.compile(r"UPI transaction ID[:\s]*$")
    leading_id = re.compile(r"\s*(\d{9,})")

    lines = text.split('\n')
    for i, line in enumerate(lines):
        for pattern, group in same_line_patterns:
            found = pattern.search(line)
            if found:
                return found.group(group)

        if i + 1 < len(lines):
            next_line = lines[i + 1]

            # Label and digits both on the next line (look-ahead kept from the
            # previous implementation so its result priority is unchanged).
            found = upi_with_id.search(next_line)
            if found:
                return found.group(1)

            # Label on this line, digits on the next. Previously this layout
            # was never matched, so IDs shorter than 12 digits were lost.
            if bare_upi_label.search(line):
                found = leading_id.match(next_line)
                if found:
                    return found.group(1)

    return None  # No transaction ID found

def process_image_url(image_url):
    """Download a screenshot, OCR it, and return the transaction ID it shows.

    Returns None when the URL is empty/missing, when the image cannot be
    obtained, or when no transaction ID is recognised in the OCR text.
    """
    # Guard: nothing to fetch.
    if not image_url:
        return None

    decoded = download_image(image_url)
    # Guard: download or decode produced no image.
    if decoded is None:
        return None

    return extract_transaction_details(extract_text(decoded))

def process_images_multithreaded(image_urls, max_workers=8):
    """Process several image URLs concurrently with a bounded thread pool.

    Args:
        image_urls: Iterable of image URLs. Duplicates are processed once.
        max_workers: Upper bound on concurrent worker threads. The previous
            implementation started one thread per URL, which does not scale
            to large batches.

    Returns:
        A dict mapping each distinct URL (in first-seen order) to the value
        returned by ``process_image_url``. A URL whose processing raised is
        mapped to None and the error is printed, instead of silently going
        missing from the result.
    """
    # dict.fromkeys de-duplicates while keeping input order.
    unique_urls = list(dict.fromkeys(image_urls))
    if not unique_urls:
        return {}

    results = {}
    pool_size = max(1, min(max_workers, len(unique_urls)))

    # NOTE(review): every worker goes through process_image_url and therefore
    # the shared module-level `ocr` object; confirm PaddleOCR tolerates
    # concurrent calls, or lower max_workers to 1.
    with ThreadPoolExecutor(max_workers=pool_size) as executor:
        futures = {url: executor.submit(process_image_url, url) for url in unique_urls}
        # Results are collected on the calling thread, so `results` is never
        # written from more than one thread.
        for url, future in futures.items():
            try:
                results[url] = future.result()
            except Exception as exc:
                print(f"Failed to process {url}: {exc}")
                results[url] = None

    return results

def process_transactions(file_path, reg_path, drive1_path, drive2_path):
    """Match registration transaction IDs against successful drive payments.

    Args:
        file_path: Excel file with a "screenshot" column of image URLs.
        reg_path: CSV of registrations; must contain an
            "extracted_transaction_id" column.
        drive1_path: First Excel export of payment records; must contain
            "transaction state" and "rrn no" columns.
        drive2_path: Second Excel export with the same columns.

    Returns:
        The registration rows left-joined with the successful payment records
        (state "SaleSuccess") on transaction ID == RRN, with an added "status"
        column ("verified" / "not verified") and without the "rrn no" and
        "transaction state" columns.
    """
    reg = pd.read_csv(reg_path, dtype=str)
    drive = pd.concat(
        [pd.read_excel(drive1_path), pd.read_excel(drive2_path)],
        axis=0,
        ignore_index=True,
    )

    # NOTE(review): the IDs extracted from the screenshots are stored in `df`,
    # but `df` is not used below — verification runs on the
    # "extracted_transaction_id" column already present in the registration
    # CSV. Confirm whether these results should feed `reg` or this OCR pass
    # can be removed.
    df = pd.read_excel(file_path)
    df["extracted_transaction_id"] = df["screenshot"].dropna().apply(process_image_url)

    reg["extracted_transaction_id"] = reg["extracted_transaction_id"].astype(str)

    successful = drive.loc[drive["transaction state"] == "SaleSuccess"]
    # Rows without a usable RRN cannot verify anything; converting them with
    # astype(int) would raise, so they are dropped first.
    rrn_numeric = pd.to_numeric(successful["rrn no"], errors="coerce")
    has_rrn = rrn_numeric.notna()
    drive_verified = (
        successful.loc[has_rrn]
        # assign() builds a new frame, avoiding SettingWithCopyWarning from
        # writing into a filtered slice. int64 is explicit so wide RRNs do not
        # overflow where the default int is 32-bit.
        .assign(**{"rrn no": rrn_numeric[has_rrn].astype("int64").astype(str)})
        # The same payment may appear in both exports; keep one record per RRN
        # so the left merge cannot duplicate registration rows.
        .drop_duplicates(subset="rrn no")
    )

    merged_df = reg.merge(
        drive_verified,
        left_on="extracted_transaction_id",
        right_on="rrn no",
        how="left",
    )
    merged_df["status"] = merged_df["rrn no"].notna().map(
        {True: "verified", False: "not verified"}
    )
    final_df = merged_df.drop(columns=["rrn no", "transaction state"])

    return final_df

def save_and_download(df, output_filename="processed_transactions.xlsx"):
    """Write ``df`` to an Excel file (no index column) and hand that file to
    ``files.download``."""
    target = output_filename
    df.to_excel(target, index=False)  # write the workbook first ...
    files.download(target)            # ... then pass the same file to the downloader

def main():
    """Prompt for the input files, run the verification, and offer the result
    for download.

    The first four uploaded file names are passed, in upload-dict order, to
    ``process_transactions`` as (file_path, reg_path, drive1_path,
    drive2_path). If fewer than four files are uploaded an error is printed
    and nothing is processed.
    """
    print("Upload the transaction file, registration data, and drive transaction files.")
    uploaded_names = list(files.upload())

    if len(uploaded_names) >= 4:
        transactions_path, reg_path, drive1_path, drive2_path = uploaded_names[:4]
        result = process_transactions(transactions_path, reg_path, drive1_path, drive2_path)
        save_and_download(result)
    else:
        print("Error: Please upload all required files.")

# Run the upload -> process -> download workflow only when this code is executed
# directly (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'paddleocr'