# 🚀 MOHRE Attested Contract OCR System - Google Colab
## Enhanced OCR with Gemini 2.5 Pro + Backup Detection Methods

### Features:
- 🎯 Precise zone detection with 30% horizontal shrinkage + 5% height reduction
- 🔧 Enhanced backup methods (image processing + grey boundary detection)
- 🧠 Gemini 2.5 Pro with thinking capabilities
- 📊 CSV reporting and visualizations
- 🛡️ Robust fallback detection methods


## 📦 Step 1: Install Dependencies


In [None]:
# Install system dependencies
!apt-get update
!apt-get install -y tesseract-ocr tesseract-ocr-ara tesseract-ocr-eng
!apt-get install -y libtesseract-dev

# Install Python packages
%pip install opencv-python-headless>=4.8.0
%pip install pytesseract>=0.3.10
%pip install Pillow>=10.0.0
%pip install PyMuPDF>=1.23.0
%pip install numpy>=1.24.0
%pip install google-generativeai>=0.3.0
%pip install matplotlib>=3.7.0

print("✅ All dependencies installed successfully!")


Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [32.9 kB]
Get:12 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,923 kB]
Get:13 https://ppa.la

## 🔑 Step 2: Configure API Key


In [None]:
# Provide the Gemini API key at runtime instead of hardcoding it in the notebook.
# A key already exported as GEMINI_API_KEY is reused; otherwise it is requested
# through a hidden prompt so it never ends up in the saved notebook or its output.
# SECURITY: never commit a real key; revoke any key that was ever stored in a cell.
import os
from getpass import getpass

GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', '').strip()
if not GEMINI_API_KEY:
    GEMINI_API_KEY = getpass("Enter your Gemini API key: ").strip()
if not GEMINI_API_KEY:
    raise ValueError("A Gemini API key is required to continue.")

# Later cells read the key from the environment
os.environ['GEMINI_API_KEY'] = GEMINI_API_KEY

print("🔑 API key configured successfully!")


🔑 API key configured successfully!


## Gemini Zone Classifier

In [None]:
#!/usr/bin/env python3
"""
Gemini-Enhanced Signature Zone Classifier
Combines precise zone detection with Gemini 2.5 Pro classification
"""

import os
import sys
import json
import logging
# import argparse # Removed argparse
import tempfile
from typing import Dict, Optional, Tuple, NamedTuple
from dataclasses import dataclass
from pathlib import Path

import cv2
import fitz  # PyMuPDF
import numpy as np
import pytesseract
import google.generativeai as genai
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Rectangle

# Configure logging
# Root logger: INFO and above, each record rendered as "timestamp - LEVEL - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by the classifier code below.
logger = logging.getLogger(__name__)

@dataclass
class DetectedZone:
    """Rectangle in which a signature is expected, plus how it was found.

    Coordinates are pixel offsets into the image that was handed to the
    detection method which produced the zone.
    """
    x: int  # left edge of the zone, in pixels
    y: int  # top edge of the zone, in pixels
    width: int  # horizontal extent, in pixels
    height: int  # vertical extent, in pixels
    confidence: float  # OCR confidence of the anchoring keyword, or a fixed value set by grey-boundary detection
    text_above: str  # text of the keyword detection the zone sits below, or a label naming the fallback method

@dataclass
class GeminiClassificationResult:
    """Outcome of asking Gemini what a signature zone contains."""
    classification: str  # "signature", "contract", or "blank"; "error" when the Gemini call itself failed
    confidence: float  # clamped to the range 0.0-1.0 by the classifier
    reasoning: str  # Gemini's explanation, or a note describing the fallback/error path taken
    gemini_response: str  # raw response text; empty string when the call failed

class GeminiZoneClassifier:
    """Enhanced signature zone classifier using Gemini for classification"""

    def __init__(self, api_key: str, verbose: bool = False):
        """Create a classifier: connect to Gemini, then look for a Tesseract binary.

        Args:
            api_key: Gemini API key, passed on to ``setup_gemini``.
            verbose: When true, methods emit additional diagnostic log lines.

        Raises:
            Exception: Whatever ``setup_gemini`` re-raises when Gemini cannot
                be initialised. Tesseract problems are only logged.
        """
        self.verbose = verbose
        self.setup_gemini(api_key)

        # Tesseract lookup is best effort: a failure here must not abort construction.
        try:
            self.tesseract_path = self._find_tesseract()
            if not self.tesseract_path:
                logger.warning("Tesseract not found. OCR functionality may be limited.")
            else:
                # Point pytesseract at the binary that was located
                pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
                logger.info(f"Tesseract found at: {self.tesseract_path}")
        except Exception as e:
            logger.error(f"Error setting up Tesseract: {e}")

    def setup_gemini(self, api_key: str):
        """Configure the Gemini SDK and store the model handle on ``self.model``.

        Args:
            api_key: Gemini API key handed to ``genai.configure``.

        Raises:
            Exception: Any error raised during configuration or model creation
                is logged and then re-raised unchanged.
        """
        try:
            genai.configure(api_key=api_key)
            # Model used for every zone classification request
            model_name = 'gemini-2.5-pro'
            self.model = genai.GenerativeModel(model_name)
            logger.info("Gemini 2.5 Pro with thinking initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Gemini: {e}")
            raise

    def _find_tesseract(self) -> Optional[str]:
        """Find tesseract executable"""
        common_paths = [
            '/opt/homebrew/bin/tesseract',  # macOS Homebrew
            '/usr/local/bin/tesseract',     # macOS/Linux
            '/usr/bin/tesseract',           # Linux
            'tesseract'                     # System PATH
        ]

        for path in common_paths:
            if os.path.exists(path) or (path == 'tesseract' and os.system('which tesseract > /dev/null 2>&1') == 0):
                return path
        return None

    def extract_second_page(self, pdf_path: str) -> np.ndarray:
        """Render the second page of a PDF and return it as an OpenCV BGR image.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            The rendered page as decoded by ``cv2.imdecode`` with
            ``cv2.IMREAD_COLOR``.

        Raises:
            ValueError: If the PDF has fewer than two pages, or the rendered
                page cannot be decoded into an image.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            # The context manager closes the document on every path, including errors
            with fitz.open(pdf_path) as pdf_document:
                page_count = len(pdf_document)
                if page_count < 2:
                    raise ValueError(f"PDF has only {page_count} page(s), need at least 2")

                # Get second page (index 1)
                page = pdf_document[1]

                # Render at twice the default resolution for better OCR
                mat = fitz.Matrix(2.0, 2.0)  # 2x zoom = 144 DPI
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("ppm")

            # Decode the PPM bytes into a numpy image
            nparr = np.frombuffer(img_data, np.uint8)
            image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            if image is None:
                # imdecode signals failure with None; do not hand that to callers
                raise ValueError("Rendered page could not be decoded into an image")

            if self.verbose:
                logger.info(f"Extracted second page: {image.shape}")

            return image

        except Exception as e:
            logger.error(f"Error extracting second page: {e}")
            raise

    def get_bottom_two_thirds(self, image: np.ndarray) -> Tuple[np.ndarray, int]:
        """Extract bottom 2/3 of the image"""
        height = image.shape[0]
        start_y = height // 3  # Start from 1/3 down

        cropped_image = image[start_y:, :]

        if self.verbose:
            logger.info(f"Bottom 2/3 extracted: {cropped_image.shape}")

        return cropped_image, start_y

    def detect_text_with_positions(self, image: np.ndarray) -> list:
        """Run Tesseract on *image* and return the confidently recognised text boxes.

        Args:
            image: Image array handed straight to ``pytesseract.image_to_data``.

        Returns:
            A list of dicts with the keys ``text``, ``x``, ``y``, ``width``,
            ``height`` and ``confidence`` (an int), one per entry whose
            confidence is above 30 and whose stripped text is non-empty.
            An empty list is returned when OCR fails entirely; the error is
            logged rather than raised.
        """
        try:
            # OEM 3 = default engine, PSM 6 = treat the image as one uniform block of text.
            # NOTE(review): no `lang` is passed, so Tesseract runs with its default
            # language; the Arabic keywords searched for elsewhere in this class
            # presumably need lang='eng+ara' - confirm before changing.
            config = '--oem 3 --psm 6'

            # Get detailed text information with fallback
            try:
                data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
            except Exception as e:
                if self.verbose:
                    logger.warning(f"Basic OCR config failed: {e}, trying fallback...")
                # Fallback to even simpler config
                config = '--psm 6'
                data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)

            detections = []
            columns = zip(data['conf'], data['text'], data['left'], data['top'],
                          data['width'], data['height'])

            for raw_conf, raw_text, x, y, w, h in columns:
                # Confidence may arrive as int, float or numeric string depending on
                # the pytesseract version. Skip only the entry that cannot be parsed
                # instead of letting one bad value discard the whole page.
                try:
                    confidence = int(float(raw_conf))
                except (TypeError, ValueError, OverflowError):
                    continue

                text = raw_text.strip()

                # Filter low confidence and empty text
                if confidence > 30 and text:
                    detections.append({
                        'text': text,
                        'x': x,
                        'y': y,
                        'width': w,
                        'height': h,
                        'confidence': confidence
                    })

            if self.verbose:
                logger.info(f"Detected {len(detections)} text elements")

            return detections

        except Exception as e:
            logger.error(f"Text detection failed: {e}")
            return []

    def enhance_image_for_text_detection(self, image: np.ndarray) -> np.ndarray:
        """Enhance image for better text detection - backup method"""
        try:
            # Convert to grayscale
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Apply sharpening filter
            kernel = np.array([[-1,-1,-1], [-1, 9,-1], [-1,-1,-1]])
            sharpened = cv2.filter2D(gray, -1, kernel)

            # Enhance contrast
            enhanced = cv2.convertScaleAbs(sharpened, alpha=1.5, beta=20)

            # Apply morphological operations to clean up text
            kernel_morph = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
            cleaned = cv2.morphologyEx(enhanced, cv2.MORPH_CLOSE, kernel_morph)

            # Convert back to BGR for consistency
            enhanced_bgr = cv2.cvtColor(cleaned, cv2.COLOR_GRAY2BGR)

            if self.verbose:
                logger.info("Applied image enhancement: sharpening + contrast + morphological cleaning")

            return enhanced_bgr

        except Exception as e:
            if self.verbose:
                logger.warning(f"Image enhancement failed: {e}")
            return image

    def backup_zone_detection_method1(self, image: np.ndarray) -> Optional[DetectedZone]:
        """Backup method 1: Enhanced image processing + relaxed keyword matching"""
        try:
            # Enhance image for better text detection
            enhanced_image = self.enhance_image_for_text_detection(image)

            # Detect text with enhanced image
            enhanced_detections = self.detect_text_with_positions(enhanced_image)

            if self.verbose:
                logger.info(f"Backup method 1: Enhanced detection found {len(enhanced_detections)} text elements")

            # Try broader keyword matching
            relaxed_keywords = [
                'signature', 'sign', 'party', 'second', 'الطرف', 'توقيع',
                'name', 'اسم', 'below', 'above', 'here'
            ]

            height, width = image.shape[:2]
            candidates = []

            for detection in enhanced_detections:
                text_lower = detection['text'].lower()

                # More flexible keyword matching
                for keyword in relaxed_keywords:
                    if keyword in text_lower and detection['confidence'] > 30:  # Lower confidence threshold
                        zone_start_y = detection['y'] + detection['height'] + 5
                        base_zone_width = min(350, width - 80)
                        zone_width = int(base_zone_width * 0.7)
                        zone_x = (width - zone_width) // 2

                        candidates.append({
                            'text': detection['text'],
                            'zone_start_y': zone_start_y,
                            'zone_x': zone_x,
                            'zone_width': zone_width,
                            'confidence': detection['confidence']
                        })

                        if self.verbose:
                            logger.info(f"Backup method 1: Found relaxed keyword '{keyword}' in '{detection['text']}'")
                        break

            if candidates:
                # Select best candidate
                best_candidate = max(candidates, key=lambda x: x['confidence'])
                return self.build_signature_zone_from_candidate(image, best_candidate)

        except Exception as e:
            if self.verbose:
                logger.warning(f"Backup detection method 1 failed: {e}")

        return None

    def backup_zone_detection_method2(self, image: np.ndarray) -> Optional[DetectedZone]:
        """Backup method 2: Grey boundary detection using specific RGB color (237, 237, 237)"""
        try:
            height, width = image.shape[:2]

            # Convert to RGB for color matching
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Define target grey color (237, 237, 237)
            target_color = np.array([237, 237, 237])
            color_tolerance = 10  # Allow some variation

            # Create mask for the target grey color
            lower_bound = target_color - color_tolerance
            upper_bound = target_color + color_tolerance
            grey_mask = cv2.inRange(rgb_image, lower_bound, upper_bound)

            if self.verbose:
                grey_pixel_count = np.sum(grey_mask > 0)
                logger.info(f"Backup method 2: Found {grey_pixel_count} pixels matching RGB(237,237,237)")

            # Find horizontal strips by analyzing rows
            grey_strips = []
            min_strip_width = int(width * 0.3)  # Strip must be at least 30% of page width

            for y in range(height):
                row = grey_mask[y, :]
                grey_pixels_in_row = np.sum(row > 0)

                if grey_pixels_in_row > min_strip_width:
                    # Find the span of grey pixels in this row
                    grey_positions = np.where(row > 0)[0]
                    if len(grey_positions) > 0:
                        strip_start = grey_positions[0]
                        strip_end = grey_positions[-1]
                        strip_width = strip_end - strip_start

                        # Only consider substantial horizontal strips
                        if strip_width > min_strip_width:
                            grey_strips.append({
                                'y': y,
                                'start_x': strip_start,
                                'end_x': strip_end,
                                'width': strip_width,
                                'grey_density': grey_pixels_in_row / width
                            })

            if len(grey_strips) < 2:
                if self.verbose:
                    logger.warning(f"Backup method 2: Found only {len(grey_strips)} grey strips, need at least 2")
                return None

            # Sort strips by Y position
            grey_strips.sort(key=lambda x: x['y'])

            if self.verbose:
                logger.info(f"Backup method 2: Found {len(grey_strips)} horizontal grey strips")
                for i, strip in enumerate(grey_strips[:5]):  # Log first 5
                    logger.info(f"  Strip {i}: y={strip['y']}, width={strip['width']}, density={strip['grey_density']:.2f}")

            # Find the signature zone between two prominent grey strips
            # Look for strips in the lower portion of the document
            lower_half_start = int(height * 0.4)
            candidate_pairs = []

            for i in range(len(grey_strips) - 1):
                upper_strip = grey_strips[i]
                lower_strip = grey_strips[i + 1]

                # Both strips should be in the lower portion and have good density
                if (upper_strip['y'] >= lower_half_start and
                    lower_strip['y'] >= lower_half_start and
                    upper_strip['grey_density'] > 0.2 and
                    lower_strip['grey_density'] > 0.2):

                    zone_height = lower_strip['y'] - upper_strip['y']

                    # Zone should be reasonable size (between 50 and 200 pixels)
                    if 50 <= zone_height <= 200:
                        candidate_pairs.append({
                            'upper_strip': upper_strip,
                            'lower_strip': lower_strip,
                            'zone_start_y': upper_strip['y'] + 5,  # Small offset from strip
                            'zone_height': zone_height - 10,  # Small margins
                            'confidence': min(upper_strip['grey_density'], lower_strip['grey_density'])
                        })

            if not candidate_pairs:
                if self.verbose:
                    logger.warning("Backup method 2: No suitable grey strip pairs found for signature zone")
                return None

            # Select best candidate (highest confidence)
            best_pair = max(candidate_pairs, key=lambda x: x['confidence'])

            # Use original method for horizontal positioning
            base_zone_width = min(400, width - 100)  # Max 400px, leave margins
            zone_width = int(base_zone_width * 0.7)  # 30% shrinkage
            zone_x = (width - zone_width) // 2  # Centered

            # Apply 5% height reduction as in original method
            zone_height = best_pair['zone_height']
            height_reduction = int(zone_height * 0.05)
            zone_height = zone_height - height_reduction

            # Ensure minimum height
            if zone_height < 50:
                zone_height = 80

            # Bounds checking
            zone_start_y = max(0, min(best_pair['zone_start_y'], height - zone_height))
            zone_x = max(0, min(zone_x, width - zone_width))
            zone_height = min(zone_height, height - zone_start_y)
            zone_width = min(zone_width, width - zone_x)

            if self.verbose:
                logger.info(f"Backup method 2: Grey boundary detection successful")
                logger.info(f"  Zone: ({zone_x}, {zone_start_y}) size {zone_width}x{zone_height}")
                logger.info(f"  Upper strip at y={best_pair['upper_strip']['y']}, Lower strip at y={best_pair['lower_strip']['y']}")

            return DetectedZone(
                x=zone_x,
                y=zone_start_y,
                width=zone_width,
                height=zone_height,
                confidence=0.7,  # Higher confidence since it's based on actual document structure
                text_above="Grey Boundary Detection"
            )

        except Exception as e:
            if self.verbose:
                logger.warning(f"Backup detection method 2 failed: {e}")

        return None

    def build_signature_zone_from_candidate(self, image: np.ndarray, candidate: dict) -> Optional[DetectedZone]:
        """Build a signature zone from a candidate with all refinements applied"""
        try:
            zone_start_y = candidate['zone_start_y']
            zone_x = candidate['zone_x']
            zone_width = candidate['zone_width']

            # Apply the same refinement logic as the main detection
            initial_zone_end_y = self.detect_grey_boundary(image, zone_start_y, zone_x, zone_width)
            zone_end_y = self.refine_white_background_boundary(
                image, zone_x, zone_width, initial_zone_end_y, zone_start_y
            )
            zone_height = zone_end_y - zone_start_y

            # 5% height reduction
            height_reduction = int(zone_height * 0.05)
            zone_height = zone_height - height_reduction

            # Ensure minimum height
            if zone_height < 50:
                zone_height = min(100, image.shape[0] - zone_start_y - 20)

            # Bounds checking
            height, width = image.shape[:2]
            zone_start_y = max(0, min(zone_start_y, height - zone_height))
            zone_x = max(0, min(zone_x, width - zone_width))
            zone_height = min(zone_height, height - zone_start_y)
            zone_width = min(zone_width, width - zone_x)

            return DetectedZone(
                x=zone_x,
                y=zone_start_y,
                width=zone_width,
                height=zone_height,
                confidence=candidate['confidence'],
                text_above=candidate['text']
            )

        except Exception as e:
            if self.verbose:
                logger.warning(f"Failed to build zone from candidate: {e}")
            return None

    def find_signature_zone(self, image: np.ndarray, text_detections: list) -> Optional[DetectedZone]:
        """Find the signature zone based on text detection and visual cues (original method)"""
        height, width = image.shape[:2]

        # Look for signature-related keywords
        signature_keywords = [
            'signature', 'second party', 'party\'s signature', 'sign',
            'توقيع', 'الطرف الثاني'  # Arabic keywords
        ]

        candidates = []

        for detection in text_detections:
            text_lower = detection['text'].lower()

            # Check if text contains signature keywords
            for keyword in signature_keywords:
                if keyword in text_lower:
                    # Calculate potential zone below this text
                    zone_start_y = detection['y'] + detection['height'] + 10

                    # Estimate zone width and position (centered)
                    base_zone_width = min(400, width - 100)  # Max 400px, leave margins
                    zone_x = (width - base_zone_width) // 2

                    # **HORIZONTAL SHRINKAGE: Reduce width by 30%**
                    zone_width = int(base_zone_width * 0.7)  # 30% reduction
                    zone_x = (width - zone_width) // 2  # Re-center after shrinkage

                    candidates.append({
                        'text': detection['text'],
                        'zone_start_y': zone_start_y,
                        'zone_x': zone_x,
                        'zone_width': zone_width,
                        'confidence': detection['confidence']
                    })

                    if self.verbose:
                        logger.info(f"Found signature keyword: '{detection['text']}' at y={detection['y']}")

        if not candidates:
            if self.verbose:
                logger.warning("No signature keywords found")
            return None

        # Select best candidate (highest confidence)
        best_candidate = max(candidates, key=lambda x: x['confidence'])
        return self.build_signature_zone_from_candidate(image, best_candidate)

    def find_signature_zone_with_backup(self, image: np.ndarray, text_detections: list) -> Optional[DetectedZone]:
        """Enhanced zone detection with backup methods when initial detection fails"""

        # **PRIMARY METHOD: Original detection logic**
        signature_zone = self.find_signature_zone(image, text_detections)

        if signature_zone:
            if self.verbose:
                logger.info("✅ Primary zone detection successful")
            return signature_zone

        if self.verbose:
            logger.warning("❌ Primary zone detection failed. Trying backup methods...")

        # **BACKUP METHOD 1: Enhanced image processing**
        signature_zone = self.backup_zone_detection_method1(image)
        if signature_zone:
            if self.verbose:
                logger.info("✅ Backup method 1 (enhanced processing) successful")
            return signature_zone

        # **BACKUP METHOD 2: Geometric detection**
        signature_zone = self.backup_zone_detection_method2(image)
        if signature_zone:
            if self.verbose:
                logger.info("✅ Backup method 2 (geometric detection) successful")
            return signature_zone

        # **ALL METHODS FAILED**
        if self.verbose:
            logger.error("❌ All zone detection methods failed. Zone not found.")

        return None

    def detect_grey_boundary(self, image: np.ndarray, start_y: int, zone_x: int, zone_width: int) -> int:
        """Scan downward from *start_y* for the first mostly-grey horizontal stripe.

        A stripe is five rows tall and spans the zone's columns. It counts as
        grey when more than 60% of its pixels have an intensity of 150-220.

        Args:
            image: BGR image to scan.
            start_y: Row at which scanning begins.
            zone_x: Left column of the zone.
            zone_width: Width of the zone in columns.

        Returns:
            The row of the first grey stripe. When none is found, returns
            ``start_y`` plus the smaller of 150 and the rows remaining above a
            20-row bottom margin.
        """
        height = image.shape[0]
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        right_edge = zone_x + zone_width

        for y in range(start_y, height - 10):
            horizontal_stripe = gray_image[y:y+5, zone_x:right_edge]
            total_pixels = horizontal_stripe.size
            if total_pixels == 0:
                continue

            # Share of pixels inside the grey intensity band
            in_grey_band = (horizontal_stripe >= 150) & (horizontal_stripe <= 220)
            grey_percentage = np.sum(in_grey_band) / total_pixels

            if grey_percentage > 0.6:
                if self.verbose:
                    logger.info(f"Grey boundary detected at y={y}, grey%={grey_percentage:.2f}")
                return y

        # No boundary found: fall back to a capped default height
        return start_y + min(150, height - start_y - 20)

    def refine_white_background_boundary(self, image: np.ndarray, zone_x: int, zone_width: int,
                                       initial_lower_y: int, start_y: int) -> int:
        """Move the zone's lower edge upward until it rests on a white background.

        Starting at *initial_lower_y*, a six-row stripe is sampled every five
        rows while the row stays above *start_y*. The first row whose stripe
        is more than 70% white (intensity >= 230) is returned immediately.

        Args:
            image: BGR image to scan.
            zone_x: Left column of the zone.
            zone_width: Width of the zone in columns.
            initial_lower_y: Row at which the upward scan begins.
            start_y: Top row of the zone; scanning stops before reaching it.

        Returns:
            The accepted lower-edge row. When no white stripe is found, the
            shrunken edge is returned, pushed down if needed so the zone is at
            least 50 rows tall.
        """
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        right_edge = zone_x + zone_width
        min_height = 50

        current_lower_y = initial_lower_y
        y = initial_lower_y
        while y > start_y:
            horizontal_stripe = gray_image[y-3:y+3, zone_x:right_edge]
            total_pixels = horizontal_stripe.size

            if total_pixels > 0:
                # Share of pixels bright enough to count as white
                white_percentage = np.sum(horizontal_stripe >= 230) / total_pixels

                if white_percentage > 0.7:
                    if self.verbose:
                        logger.info(f"White background confirmed at y={y}, white%={white_percentage:.2f}")
                    return y

                # Not white yet: tentatively move the edge one step up
                current_lower_y = y - 5
                if self.verbose:
                    logger.info(f"Non-white background at y={y}, white%={white_percentage:.2f}, shrinking")

            y -= 5

        # Never let the zone collapse below the minimum height
        if current_lower_y - start_y < min_height:
            current_lower_y = start_y + min_height
            if self.verbose:
                logger.info(f"Zone shrunk to minimum height: {min_height}")

        return current_lower_y

    def classify_with_gemini(self, zone_image: np.ndarray) -> GeminiClassificationResult:
        """Ask Gemini whether the zone holds a signature, an embedded document, or nothing.

        The reply is expected to contain a JSON object with ``classification``,
        ``confidence`` and ``reasoning``. Missing or malformed fields inside
        valid JSON fall back to defaults ("blank" / 0.5). When no JSON can be
        parsed at all, the reply text is scanned for keywords instead.

        Args:
            zone_image: BGR crop of the signature zone.

        Returns:
            A ``GeminiClassificationResult``. ``classification`` is one of
            "signature", "contract" or "blank", or "error" (with confidence
            0.0 and an empty raw response) when the request itself fails.
        """
        import re

        try:
            # Convert the OpenCV BGR array to a PIL image
            zone_image_rgb = cv2.cvtColor(zone_image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(zone_image_rgb)

            # Create the prompt for classification
            prompt = """
            You are an expert document analyzer. Please analyze this image which represents a signature zone from an official contract document.

            Your task is to classify this zone into one of exactly three categories:

            1. **"signature"** - If the zone contains handwritten signatures, initials, or any form of handwriting/pen marks
            2. **"contract"** - If the zone contains a document image, labor card, ID card, passport page, or any rectangular/page-like document or photo of a person
            3. **"blank"** - If the zone is mostly empty/white with no significant content

            **IMPORTANT CLASSIFICATION RULES:**
            - If you see ANY handwriting, signatures, or pen marks → classify as "signature"
            - If you see a rectangular document, card, photo, or any embedded page/image → classify as "contract"
            - Only classify as "blank" if the area is genuinely empty or contains only printed text/borders
            - Be very careful to distinguish between handwritten content (signature) and printed/embedded documents (contract)

            Please respond in this exact JSON format:
            {
                "classification": "signature|contract|blank",
                "confidence": 0.0-1.0,
                "reasoning": "Detailed explanation of what you see and why you classified it this way"
            }

            Analyze the image carefully and provide your classification.
            """

            response = self.model.generate_content([prompt, pil_image])

            # Read the text once. NOTE(review): the SDK presumably raises here when
            # the reply carries no text (e.g. a blocked response); that case is
            # reported as "error" by the outer handler - confirm against the SDK docs.
            response_text = response.text

            if self.verbose:
                logger.info(f"Gemini raw response: {response_text}")

            # Parse the JSON response
            try:
                # Grab everything from the first "{" to the last "}"
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if not json_match:
                    raise ValueError("No valid JSON found in response")
                result_data = json.loads(json_match.group())

                # Validate the classification; tolerate null / non-string values
                valid_classifications = ["signature", "contract", "blank"]
                classification = str(result_data.get("classification") or "").strip().lower()

                if classification not in valid_classifications:
                    logger.warning(f"Invalid classification '{classification}', defaulting to 'blank'")
                    classification = "blank"

                # A null or non-numeric confidence must not discard a valid classification
                try:
                    confidence = float(result_data.get("confidence", 0.5))
                except (TypeError, ValueError):
                    confidence = 0.5
                confidence = max(0.0, min(1.0, confidence))  # Clamp between 0 and 1

                reasoning = result_data.get("reasoning", "Gemini analysis completed")

                return GeminiClassificationResult(
                    classification=classification,
                    confidence=confidence,
                    reasoning=reasoning,
                    gemini_response=response_text
                )

            except (json.JSONDecodeError, ValueError, KeyError) as e:
                logger.error(f"Error parsing Gemini response: {e}")
                logger.error(f"Raw response: {response_text}")

                # Fallback classification based on keywords in the free text
                response_lower = response_text.lower()
                if "signature" in response_lower or "handwriting" in response_lower:
                    classification = "signature"
                    confidence = 0.6
                elif "contract" in response_lower or "document" in response_lower:
                    classification = "contract"
                    confidence = 0.6
                else:
                    classification = "blank"
                    confidence = 0.5

                return GeminiClassificationResult(
                    classification=classification,
                    confidence=confidence,
                    reasoning=f"Fallback analysis due to parsing error: {e}",
                    gemini_response=response_text
                )

        except Exception as e:
            logger.error(f"Gemini classification failed: {e}")
            return GeminiClassificationResult(
                classification="error",
                confidence=0.0,
                reasoning=f"Classification failed: {e}",
                gemini_response=""
            )
    def visualize_results(self, original_image: np.ndarray, cropped_image: np.ndarray,
                         text_detections: list, zone: Optional[DetectedZone],
                         classification: GeminiClassificationResult) -> None:
        """Visualize the analysis results"""
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Gemini-Enhanced Signature Zone Analysis', fontsize=16, fontweight='bold')

        # 1. Original second page
        axes[0, 0].imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
        axes[0, 0].set_title('Original Second Page')
        axes[0, 0].axis('off')

        # 2. Bottom 2/3 with text detections
        axes[0, 1].imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
        axes[0, 1].set_title('Bottom 2/3 with Text Detections')
        axes[0, 1].axis('off')

        # Draw text bounding boxes
        for detection in text_detections:
            rect = Rectangle((detection['x'], detection['y']), detection['width'], detection['height'],
                           linewidth=1, edgecolor='blue', facecolor='none', alpha=0.7)
            axes[0, 1].add_patch(rect)

            # Add text labels for significant detections
            if detection['confidence'] > 50 and len(detection['text']) > 3:
                axes[0, 1].text(detection['x'], detection['y'] - 5, detection['text'][:15],
                              color='blue', fontsize=8, alpha=0.8)

        if zone:
            # Highlight the signature zone
            rect = Rectangle((zone.x, zone.y), zone.width, zone.height,
                           linewidth=3, edgecolor='red', facecolor='none', alpha=0.8)
            axes[0, 1].add_patch(rect)

        # 3. Detected signature zone (cropped)
        if zone:
            zone_image = cropped_image[zone.y:zone.y+zone.height, zone.x:zone.x+zone.width]
            if zone_image.size > 0:
                axes[1, 0].imshow(cv2.cvtColor(zone_image, cv2.COLOR_BGR2RGB))
                axes[1, 0].set_title(f'Detected Signature Zone\nGemini Classification: {classification.classification.upper()}')
            else:
                axes[1, 0].text(0.5, 0.5, 'Zone extraction failed', ha='center', va='center')
                axes[1, 0].set_title('Signature Zone (Error)')
        else:
            axes[1, 0].text(0.5, 0.5, 'No signature zone detected', ha='center', va='center')
            axes[1, 0].set_title('Signature Zone (Not Found)')
        axes[1, 0].axis('off')

        # 4. Gemini classification results
        axes[1, 1].axis('off')

        # Create text summary
        summary_text = f"""GEMINI CLASSIFICATION RESULTS
{'='*35}

Classification: {classification.classification.upper()}
Confidence: {classification.confidence:.2f}
Reasoning: {classification.reasoning}

ZONE DETECTION
{'='*35}
"""

        if zone:
            summary_text += f"""Zone Location: ({zone.x}, {zone.y})
Zone Size: {zone.width}x{zone.height}
Text Above: "{zone.text_above}"
Detection Confidence: {zone.confidence:.0f}%

GEMINI ANALYSIS
{'='*35}
Model: Gemini 2.5 Pro (Thinking)
Response Length: {len(classification.gemini_response)} chars
Classification Method: AI Vision Analysis
"""
        else:
            summary_text += "Zone not detected"

        axes[1, 1].text(0.05, 0.95, summary_text, transform=axes[1, 1].transAxes,
                        fontsize=10, verticalalignment='top', fontfamily='monospace')

        plt.tight_layout()
        plt.show()

    def save_results(self, result_data: Dict, output_path: str) -> None:
        """Write ``result_data`` to ``output_path`` as indented UTF-8 JSON.

        Non-ASCII characters are written as-is (``ensure_ascii=False``).
        Any failure is logged and swallowed; nothing is raised to the caller.
        """
        dump_options = {"indent": 2, "ensure_ascii": False}
        try:
            with open(output_path, 'w', encoding='utf-8') as json_file:
                json.dump(result_data, json_file, **dump_options)
            logger.info(f"Results saved to: {output_path}")
        except Exception as exc:
            logger.error(f"Error saving results: {exc}")

    def process_contract(self, pdf_path: str, show_visualization: bool = True,
                        save_results: bool = True) -> Dict:
        """Run the full signature-zone pipeline on a single contract PDF.

        Steps: extract the second page, crop it with ``get_bottom_two_thirds``,
        detect text in the crop, locate the signature zone (primary method with
        backups), classify the zone crop with Gemini, then optionally visualise
        the outcome and persist it as JSON. A summary is printed on success.

        Args:
            pdf_path: Path of the PDF to analyse.
            show_visualization: When true, call ``visualize_results``.
            save_results: When true, write
                ``results/<pdf stem>_gemini_analysis.json``. The name shadows
                the ``save_results`` method only as a local; the method is
                still reached through ``self``.

        Returns:
            A result dict. Success and error results expose the same keys, so
            callers can index either without checking ``processing_status``
            first. On failure ``classification`` is ``"error"`` and the
            zone-related values are ``None``.
        """

        logger.info(f"Processing contract: {pdf_path}")

        try:
            # Step 1: Extract second page
            original_image = self.extract_second_page(pdf_path)

            # Step 2: Get bottom 2/3 (the crop's vertical offset is not needed here)
            cropped_image, _ = self.get_bottom_two_thirds(original_image)

            # Step 3: Detect text with positions
            text_detections = self.detect_text_with_positions(cropped_image)

            # Step 4: Find signature zone
            signature_zone = self.find_signature_zone_with_backup(cropped_image, text_detections)

            # Step 5: Classify with Gemini
            if signature_zone:
                zone_image = cropped_image[signature_zone.y:signature_zone.y+signature_zone.height,
                                         signature_zone.x:signature_zone.x+signature_zone.width]
                classification = self.classify_with_gemini(zone_image)
                if self.verbose:
                    logger.info("✅ Zone successfully sent to Gemini for classification")
            else:
                # Zone detection failed completely - mark as "not found"
                # without spending a Gemini call.
                classification = GeminiClassificationResult(
                    classification="not found",
                    confidence=0.0,
                    reasoning="Zone detection failed: Primary method and all backup methods (enhanced processing + geometric detection) could not locate a signature zone",
                    gemini_response=""
                )
                if self.verbose:
                    logger.error("❌ Zone not found - will not send to Gemini. Marking as 'not found'.")

            # Step 6: Show visualization
            if show_visualization:
                self.visualize_results(original_image, cropped_image, text_detections,
                                     signature_zone, classification)

            # Prepare results
            result_data = {
                "file_path": pdf_path,
                "classification": classification.classification,
                "confidence": classification.confidence,
                "reasoning": classification.reasoning,
                "gemini_response": classification.gemini_response,
                "zone_detected": signature_zone is not None,
                "detected_texts": [det['text'] for det in text_detections if det['confidence'] > 50],
                "zone_location": (signature_zone.x, signature_zone.y) if signature_zone else None,
                "zone_size": (signature_zone.width, signature_zone.height) if signature_zone else None,
                "text_above_zone": signature_zone.text_above if signature_zone else None,
                "processing_status": "success"
            }

            # Step 7: Save results
            if save_results:
                results_dir = Path("results")
                results_dir.mkdir(exist_ok=True)

                pdf_name = Path(pdf_path).stem
                output_path = results_dir / f"{pdf_name}_gemini_analysis.json"
                self.save_results(result_data, str(output_path))

            # Print summary
            print("\n" + "="*60)
            print("GEMINI SIGNATURE ZONE CLASSIFICATION SUMMARY")
            print("="*60)
            print(f"File: {Path(pdf_path).name}")
            print(f"Classification: {classification.classification.upper()}")
            print(f"Confidence: {classification.confidence:.2f}")
            print(f"Reasoning: {classification.reasoning}")
            if signature_zone:
                print(f"Zone Location: ({signature_zone.x}, {signature_zone.y})")
                print(f"Zone Size: {signature_zone.width}x{signature_zone.height}")
                print(f"Text Above Zone: \"{signature_zone.text_above}\"")
            print("="*60)
            print("✅ Processing completed successfully!")
            print(f"Classification: {classification.classification.upper()}")
            print()

            return result_data

        except Exception as e:
            logger.error(f"Error processing contract: {e}")
            # Mirror the success schema so consumers never hit a KeyError on
            # a failed contract.
            return {
                "file_path": pdf_path,
                "classification": "error",
                "confidence": 0.0,
                "reasoning": f"Processing failed: {e}",
                "gemini_response": "",
                "zone_detected": False,
                "detected_texts": [],
                "zone_location": None,
                "zone_size": None,
                "text_above_zone": None,
                "processing_status": "error"
            }



## Batch Gemini Processor

In [None]:
#!/usr/bin/env python3
"""
Batch Gemini OCR Processor
A comprehensive tool for batch processing PDF contracts with Gemini classification
"""

import os
import sys
import csv
import json
import logging
# import argparse # Removed argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional
# import tkinter as tk # Removed tkinter
# from tkinter import filedialog, messagebox # Removed tkinter modules

import cv2
import fitz  # PyMuPDF
import numpy as np
import pytesseract
import google.generativeai as genai
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Rectangle


# Configure logging: INFO and above, formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the code in this cell.
logger = logging.getLogger(__name__)

class BatchGeminiProcessor:
    """Batch processor for PDF contracts using Gemini classification.

    Wraps a ``GeminiZoneClassifier`` and runs it over every PDF in a folder,
    optionally saving one visualization image per contract, then writes a
    timestamped CSV report and prints a summary.

    Annotations naming ``DetectedZone`` and ``GeminiClassificationResult`` are
    quoted so that defining this class does not require those names to exist
    yet; they are only needed when the methods actually run.
    """

    # Column order of the CSV report written by save_csv_report().
    _CSV_FIELDNAMES = [
        'contract_name',
        'classification',
        'confidence',
        'reasoning',
        'zone_detected',
        'zone_location',
        'zone_size',
        'text_above_zone',
        'processing_status',
        'processed_at',
        'file_path',
        'visualization_path'
    ]

    # Maximum characters per line in the visualization's summary panel.
    _SUMMARY_WRAP_WIDTH = 45

    def __init__(self, api_key: str, verbose: bool = False):
        """Initialize the batch processor.

        Args:
            api_key: Gemini API key, forwarded to ``GeminiZoneClassifier``.
            verbose: Enables per-file log messages; also forwarded to the
                classifier.
        """
        self.api_key = api_key
        self.verbose = verbose
        self.classifier = GeminiZoneClassifier(api_key, verbose)
        self.processed_count = 0
        self.total_count = 0
        self.results = []

    def get_pdf_files(self, folder_path: str) -> List[str]:
        """Return the sorted paths of all PDF files directly inside a folder.

        The ``.pdf`` extension is matched case-insensitively, so ``X.PDF`` is
        picked up as well. Sub-folders are not searched.

        Args:
            folder_path: Directory to scan.

        Returns:
            Sorted list of file paths as strings; an empty list when the path
            is not a directory or the scan fails (the problem is logged).
        """
        try:
            folder = Path(folder_path)

            if not folder.is_dir():
                logger.error(f"Input path is not a directory: {folder_path}")
                return []

            # Sorted for a consistent processing order between runs.
            pdf_files = sorted(
                str(entry)
                for entry in folder.iterdir()
                if entry.is_file() and entry.name.lower().endswith(".pdf")
            )

            if self.verbose:
                logger.info(f"Found {len(pdf_files)} PDF files in {folder_path}")

            return pdf_files

        except Exception as e:
            logger.error(f"Error scanning folder {folder_path}: {e}")
            return []

    @staticmethod
    def _wrap_summary_text(text: str, width: int = 45) -> str:
        """Greedily re-flow lines longer than ``width`` at spaces; every output line ends with a newline."""
        wrapped_lines = []
        for line in text.split('\n'):
            if len(line) <= width:
                wrapped_lines.append(line)
                continue

            current_line = ""
            for word in line.split(' '):
                # Flush before the line would overflow, but never emit an
                # empty line just because the first word is itself too long.
                if current_line and len(current_line + word) > width:
                    wrapped_lines.append(current_line.strip())
                    current_line = ""
                current_line += word + ' '
            wrapped_lines.append(current_line.strip())

        return "".join(f"{line}\n" for line in wrapped_lines)

    def save_visualization(self, original_image: np.ndarray, cropped_image: np.ndarray,
                          text_detections: list, zone: "Optional[DetectedZone]",
                          classification: "GeminiClassificationResult",
                          output_path: str, contract_name: str) -> bool:
        """Render the 2x2 analysis figure for one contract and save it as an image.

        Args:
            original_image: Page image; converted with ``cv2.COLOR_BGR2RGB``
                before being displayed.
            cropped_image: Image the detections and ``zone`` coordinates are
                drawn on and the zone crop is sliced from.
            text_detections: Dicts providing ``x``, ``y``, ``width``,
                ``height``, ``confidence`` and ``text``.
            zone: Detected zone, or a falsy value when none was found.
            classification: Result shown in the panel title and the summary.
            output_path: Destination image path.
            contract_name: Used in the figure title and in log messages.

        Returns:
            True when the image was written, False when anything failed (the
            error is logged). The figure is closed in both cases.
        """
        fig = None
        try:
            # Create the visualization
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle(f'Gemini Analysis: {contract_name}', fontsize=16, fontweight='bold')

            # 1. Original second page
            axes[0, 0].imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
            axes[0, 0].set_title('Original Second Page')
            axes[0, 0].axis('off')

            # 2. Bottom 2/3 with text detections
            axes[0, 1].imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
            axes[0, 1].set_title('Bottom 2/3 with Text Detections')
            axes[0, 1].axis('off')

            # Draw text bounding boxes
            for detection in text_detections:
                rect = Rectangle((detection['x'], detection['y']), detection['width'], detection['height'],
                               linewidth=1, edgecolor='blue', facecolor='none', alpha=0.7)
                axes[0, 1].add_patch(rect)

                # Add text labels for significant detections
                if detection['confidence'] > 50 and len(detection['text']) > 3:
                    axes[0, 1].text(detection['x'], detection['y'] - 5, detection['text'][:15],
                                  color='blue', fontsize=8, alpha=0.8)

            # 3. Detected signature zone (highlighted on the crop and shown on its own)
            if zone:
                rect = Rectangle((zone.x, zone.y), zone.width, zone.height,
                               linewidth=3, edgecolor='red', facecolor='none', alpha=0.8)
                axes[0, 1].add_patch(rect)

                zone_image = cropped_image[zone.y:zone.y+zone.height, zone.x:zone.x+zone.width]
                if zone_image.size > 0:
                    axes[1, 0].imshow(cv2.cvtColor(zone_image, cv2.COLOR_BGR2RGB))
                    axes[1, 0].set_title(f'Signature Zone\nClassification: {classification.classification.upper()}')
                else:
                    axes[1, 0].text(0.5, 0.5, 'Zone extraction failed', ha='center', va='center')
                    axes[1, 0].set_title('Signature Zone (Error)')
            else:
                axes[1, 0].text(0.5, 0.5, 'No signature zone detected', ha='center', va='center')
                axes[1, 0].set_title('Signature Zone (Not Found)')
            axes[1, 0].axis('off')

            # 4. Classification results
            axes[1, 1].axis('off')

            # Create text summary
            summary_text = f"""GEMINI CLASSIFICATION
{'='*25}

Classification: {classification.classification.upper()}
Confidence: {classification.confidence:.2f}

Reasoning:
{classification.reasoning}

ZONE DETECTION
{'='*25}
"""

            if zone:
                summary_text += f"""Location: ({zone.x}, {zone.y})
Size: {zone.width}x{zone.height}
Text Above: "{zone.text_above}"
Detection Confidence: {zone.confidence:.0f}%
"""
            else:
                summary_text += "Zone not detected"

            # Wrap long lines so the summary fits inside its panel.
            wrapped_text = self._wrap_summary_text(summary_text, self._SUMMARY_WRAP_WIDTH)

            axes[1, 1].text(0.05, 0.95, wrapped_text, transform=axes[1, 1].transAxes,
                            fontsize=9, verticalalignment='top', fontfamily='monospace')

            # Operate on this figure explicitly instead of pyplot's implicit
            # "current figure".
            fig.tight_layout()
            fig.savefig(output_path, dpi=150, bbox_inches='tight')

            if self.verbose:
                logger.info(f"Visualization saved: {output_path}")

            return True

        except Exception as e:
            logger.error(f"Error saving visualization for {contract_name}: {e}")
            return False

        finally:
            # Always release the figure, including on failure; otherwise
            # figures pile up across a large batch.
            if fig is not None:
                plt.close(fig)

    def process_single_contract(self, pdf_path: str, visualization_folder: Optional[str] = None) -> Dict:
        """Process a single contract and return its result record.

        Args:
            pdf_path: Path of the PDF to analyse.
            visualization_folder: When given, an analysis image named
                ``<pdf stem>_analysis.png`` is saved into this folder.

        Returns:
            A dict with the keys used by the CSV report. ``processing_status``
            is ``"success"`` or ``"error"``. ``visualization_path`` is only
            set when the image was actually written.
        """
        contract_name = Path(pdf_path).stem

        try:
            if self.verbose:
                logger.info(f"Processing: {contract_name}")

            # Step 1: Extract second page
            original_image = self.classifier.extract_second_page(pdf_path)

            # Step 2: Get bottom 2/3 (the crop's vertical offset is not needed here)
            cropped_image, _ = self.classifier.get_bottom_two_thirds(original_image)

            # Step 3: Detect text with positions
            text_detections = self.classifier.detect_text_with_positions(cropped_image)

            # Step 4: Find signature zone with backup methods
            signature_zone = self.classifier.find_signature_zone_with_backup(cropped_image, text_detections)

            # Step 5: Classify with Gemini
            if signature_zone:
                zone_image = cropped_image[signature_zone.y:signature_zone.y+signature_zone.height,
                                         signature_zone.x:signature_zone.x+signature_zone.width]
                classification = self.classifier.classify_with_gemini(zone_image)
            else:
                # Zone detection failed - mark as "not found"
                classification = GeminiClassificationResult(
                    classification="not found",
                    confidence=0.0,
                    reasoning="Zone detection failed: Primary and backup methods could not locate a signature zone",
                    gemini_response=""
                )

            # Step 6: Save visualization if requested
            visualization_path = None
            if visualization_folder:
                candidate_path = os.path.join(visualization_folder, f"{contract_name}_analysis.png")
                saved = self.save_visualization(
                    original_image, cropped_image, text_detections,
                    signature_zone, classification, candidate_path, contract_name
                )
                # Report the path only when the file exists, so the CSV never
                # points at an image that failed to render.
                if saved:
                    visualization_path = candidate_path

            # Prepare result
            result = {
                "contract_name": contract_name,
                "file_path": pdf_path,
                "classification": classification.classification,
                "confidence": classification.confidence,
                "reasoning": classification.reasoning,
                "zone_detected": signature_zone is not None,
                "zone_location": (signature_zone.x, signature_zone.y) if signature_zone else None,
                "zone_size": (signature_zone.width, signature_zone.height) if signature_zone else None,
                "text_above_zone": signature_zone.text_above if signature_zone else None,
                "visualization_path": visualization_path,
                "processing_status": "success",
                "processed_at": datetime.now().isoformat()
            }

            self.processed_count += 1

            if self.verbose:
                logger.info(f"✅ {contract_name}: {classification.classification.upper()} (confidence: {classification.confidence:.2f})")

            return result

        except Exception as e:
            logger.error(f"❌ Error processing {contract_name}: {e}")

            return {
                "contract_name": contract_name,
                "file_path": pdf_path,
                "classification": "error",
                "confidence": 0.0,
                "reasoning": f"Processing failed: {str(e)}",
                "zone_detected": False,
                "zone_location": None,
                "zone_size": None,
                "text_above_zone": None,
                "visualization_path": None,
                "processing_status": "error",
                "processed_at": datetime.now().isoformat()
            }

    def save_csv_report(self, results: List[Dict], csv_path: str) -> bool:
        """Save results to a CSV file.

        Keys that are not report columns are ignored and missing columns are
        left empty, so one oddly shaped record cannot abort the whole report.

        Args:
            results: Result dicts as produced by ``process_single_contract``.
            csv_path: Destination file path (overwritten if it exists).

        Returns:
            True on success, False when writing failed (the error is logged).
        """
        try:
            with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=self._CSV_FIELDNAMES,
                                        extrasaction='ignore')
                writer.writeheader()

                for result in results:
                    # Convert tuples to strings for CSV compatibility
                    csv_row = dict(result)
                    location = csv_row.get('zone_location')
                    if location:
                        csv_row['zone_location'] = f"({location[0]}, {location[1]})"
                    size = csv_row.get('zone_size')
                    if size:
                        csv_row['zone_size'] = f"{size[0]}x{size[1]}"

                    writer.writerow(csv_row)

            logger.info(f"CSV report saved: {csv_path}")
            return True

        except Exception as e:
            logger.error(f"Error saving CSV report: {e}")
            return False

    def print_summary(self, results: List[Dict]) -> None:
        """Print totals, the average confidence, the classification distribution and failed files.

        Args:
            results: Result dicts; each needs ``processing_status``. Successful
                ones also need ``classification`` and ``confidence``; failed
                ones need ``contract_name`` and ``reasoning``.
        """
        successful = [r for r in results if r['processing_status'] == 'success']
        errors = [r for r in results if r['processing_status'] == 'error']

        # Count classifications (first-seen order is kept for the printout)
        classifications = {}
        total_confidence = 0
        for result in successful:
            cls = result['classification']
            classifications[cls] = classifications.get(cls, 0) + 1
            total_confidence += result['confidence']

        print("\n" + "="*70)
        print("BATCH PROCESSING SUMMARY")
        print("="*70)
        print(f"Total Files: {len(results)}")
        print(f"Successfully Processed: {len(successful)}")
        print(f"Errors: {len(errors)}")

        if successful:
            print(f"Average Confidence: {total_confidence / len(successful):.2f}")
            print("\nClassification Distribution:")
            for cls, count in classifications.items():
                percentage = (count / len(successful)) * 100
                print(f"  {cls.upper()}: {count} files ({percentage:.1f}%)")

        if errors:
            print("\nFailed Files:")
            for error in errors:
                print(f"  ❌ {error['contract_name']}: {error['reasoning']}")

        print("="*70)

    def process_batch(self, pdf_folder: str, csv_output_folder: str,
                     visualization_folder: Optional[str] = None) -> List[Dict]:
        """Process all PDFs in a folder and write a timestamped CSV report.

        Args:
            pdf_folder: Folder scanned (non-recursively) for PDF files.
            csv_output_folder: Folder the CSV report is written to; created
                if missing.
            visualization_folder: Optional folder for per-contract analysis
                images; created if missing.

        Returns:
            One result dict per PDF, in processing order. An empty list when
            no API key is set or no PDF files were found.
        """

        # Validate API key presence before proceeding
        if not self.api_key:
            logger.error("❌ Error: Gemini API key is not provided.")
            return []

        # Get PDF files
        pdf_files = self.get_pdf_files(pdf_folder)

        if not pdf_files:
            logger.warning(f"No PDF files found in {pdf_folder}")
            return []

        # Reset the per-batch state so the instance can be reused.
        self.total_count = len(pdf_files)
        self.processed_count = 0
        self.results = []

        print(f"\n🚀 Starting batch processing of {self.total_count} PDF files...")
        print(f"📁 Input folder: {pdf_folder}")
        if visualization_folder:
            print(f"🎨 Visualizations will be saved to: {visualization_folder}")
        print(f"📊 CSV report will be saved to: {csv_output_folder}")
        print("-" * 70)

        # Create output folders if they don't exist
        os.makedirs(csv_output_folder, exist_ok=True)
        if visualization_folder:
            os.makedirs(visualization_folder, exist_ok=True)

        # Process each PDF
        for i, pdf_file in enumerate(pdf_files, 1):
            print(f"[{i}/{self.total_count}] Processing: {Path(pdf_file).name}")

            result = self.process_single_contract(pdf_file, visualization_folder)
            self.results.append(result)

            # Show progress
            progress = (i / self.total_count) * 100
            print(f"    Status: {result.get('processing_status', 'N/A').upper()}, Classification: {result.get('classification', 'N/A').upper()} - {progress:.1f}% complete")

        # Generate timestamp for CSV filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_filename = f"gemini_classification_report_{timestamp}.csv"
        csv_path = os.path.join(csv_output_folder, csv_filename)

        # Save CSV report
        if self.save_csv_report(self.results, csv_path):
            print(f"\n📊 CSV report saved: {csv_path}")

        # Print summary
        self.print_summary(self.results)

        return self.results


## 🧪 Step 5: Test Single Contract Processing


In [None]:
# Colab cell: upload PDF(s), then run the detection pipeline step by step on
# the first one so it is visible which detection method located the zone.
from google.colab import files
import os

# Upload PDF(s); the result is indexed by uploaded filename below.
uploaded = files.upload()

# Save file(s) to local disk under ./contracts (non-PDF uploads are ignored;
# the extension check is case-insensitive).
pdf_files = []
for filename in uploaded.keys():
    if filename.lower().endswith('.pdf'):
        new_path = os.path.join("contracts", filename)
        os.makedirs("contracts", exist_ok=True)
        with open(new_path, "wb") as f:
            f.write(uploaded[filename])
        pdf_files.append(new_path)

if not pdf_files:
    print("❌ No valid PDF files uploaded.")
else:
    print("✅ Uploaded files:", pdf_files)

# Enhanced contract processing with diagnostic output.
# Guard against running this cell before the one that defines the classifier.
if 'GeminiZoneClassifier' in globals():
    classifier = GeminiZoneClassifier(api_key=GEMINI_API_KEY, verbose=True)

    if pdf_files:
        # Only the first uploaded PDF is analysed in this test cell.
        test_file = pdf_files[0]
        print(f"\n🧪 Processing: {test_file}")

        # Step-by-step processing
        original_image = classifier.extract_second_page(test_file)
        cropped_image, _ = classifier.get_bottom_two_thirds(original_image)
        text_detections = classifier.detect_text_with_positions(cropped_image)

        # The detection methods are tried one at a time (primary, then backup
        # 1, then backup 2) so `used_method` records which one succeeded.
        print("\n🔍 Attempting PRIMARY zone detection...")
        primary_zone = classifier.find_signature_zone(cropped_image, text_detections)
        if primary_zone:
            used_method = "Primary method (keyword-based)"
            signature_zone = primary_zone
        else:
            print("❌ Primary method failed. Trying Backup Method 1 (enhanced OCR)...")
            backup1 = classifier.backup_zone_detection_method1(cropped_image)
            if backup1:
                used_method = "Backup Method 1 (enhanced OCR + relaxed keywords)"
                signature_zone = backup1
            else:
                print("❌ Backup 1 failed. Trying Backup Method 2 (grey boundary)...")
                backup2 = classifier.backup_zone_detection_method2(cropped_image)
                if backup2:
                    used_method = "Backup Method 2 (grey boundary detection)"
                    signature_zone = backup2
                else:
                    used_method = "None (all detection methods failed)"
                    signature_zone = None

        if signature_zone:
            print(f"✅ Signature zone detected using: {used_method}")
            # Crop the zone out of the cropped page and classify only that region.
            zone_image = cropped_image[signature_zone.y:signature_zone.y + signature_zone.height,
                                       signature_zone.x:signature_zone.x + signature_zone.width]
            classification = classifier.classify_with_gemini(zone_image)
        else:
            # No zone was found, so Gemini is not called; record a placeholder result.
            classification = GeminiClassificationResult(
                classification="not found",
                confidence=0.0,
                reasoning="Zone detection failed with all methods",
                gemini_response=""
            )

        # Show visualization
        classifier.visualize_results(original_image, cropped_image, text_detections, signature_zone, classification)

        # Print result summary
        print("\n📊 SUMMARY")
        print("=" * 60)
        print(f"Detection Method Used: {used_method}")
        print(f"Gemini Classification: {classification.classification.upper()}")
        print(f"Confidence: {classification.confidence:.2f}")
        print(f"Reasoning: {classification.reasoning}")
        if signature_zone:
            print(f"Zone Location: {signature_zone.x}, {signature_zone.y}")
            print(f"Zone Size: {signature_zone.width}x{signature_zone.height}")
            print(f"Text Above Zone: {signature_zone.text_above}")
        else:
            print("No zone detected.")
        print("=" * 60)

    else:
        print("❌ No PDF files found. Please upload a PDF.")
else:
    print("❌ GeminiZoneClassifier not loaded. Please run the code cell defining it.")


## 🧪 Run Classifier + Update Sheet

In [None]:
import os
import time
import tempfile
import requests
import json
import traceback
import re
import shutil
from typing import Dict, List, Optional, Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import threading

import pandas as pd
import numpy as np
import cv2
import fitz  # PyMuPDF
import pytesseract
import google.generativeai as genai
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Rectangle
from dataclasses import dataclass
from pathlib import Path
import logging


# --- Files ------------------------------------------------------------------
CSV_FILE_PATH = "/content/drive/MyDrive/remaining.csv"
PROGRESS_FILE = "processing_progress.json"
BACKUP_CSV_PREFIX = "maids_backup"

# --- Run control ------------------------------------------------------------
BATCH_SAVE_INTERVAL = 100           # NOTE(review): presumably rows between saves - confirm at the use site
START_ROW = 0                       # Row to start processing (0=first data row)
MAX_ROWS_TO_PROCESS = None          # Maximum rows to process (None = all rows)
DELAY_BETWEEN_BATCHES = 0.1         # NOTE(review): presumably seconds - confirm at the use site
VERBOSE_LOGGING = True              # Enable detailed logging (was assigned twice; defined once here)

# --- CSV column names -------------------------------------------------------
MAID_ID_COLUMN = "Maid ID"
MAID_NAME_COLUMN = "Maid Name"
DOCUMENT_LINK_COLUMN = "Link to Document"
CLASSIFICATION_COLUMN = "final_classification"
CONFIDENCE_COLUMN = "final_confidence"
DETECTION_METHOD_COLUMN = "Detection Method"

# --- Downloads --------------------------------------------------------------
DOWNLOAD_TIMEOUT = 30               # Timeout for downloading files
MAX_RETRIES = 2                     # Number of retries for failed downloads

class EnhancedCSVProcessor:
    """Enhanced CSV-based processor for large-scale contract analysis.

    For every unprocessed row of the CSV the processor downloads the linked
    contract, locates the signature zone (PDFs) or takes the image as-is
    (JPEG/PNG), saves the resulting image into ``output_folder`` and writes
    the outcome back into the DataFrame. Rows are processed concurrently in
    a thread pool; counters and DataFrame writes are guarded by ``self.lock``
    and CSV writes are serialised by ``self.save_lock``.

    NOTE: Gemini classification of the cropped zone is currently disabled in
    this cell; crops are only saved and ``"signature"`` is written as a
    placeholder classification.
    """

    # Statuses from process_contract_file_with_method_tracking that count as
    # a successfully handled contract ("success" is kept for the Gemini path).
    _SUCCESS_STATUSES = frozenset({"success", "saved", "image_file_saved"})

    # Sentinel returned by _download_once when the attempt may be retried.
    _RETRY = object()

    def __init__(self, api_key: str, csv_path: str, max_workers: int = 40,
                 output_folder: str = "/content/drive/MyDrive/Remaining All"):
        """Create the processor, load the CSV and any saved progress.

        Args:
            api_key: Gemini API key handed to ``GeminiZoneClassifier``.
            csv_path: Path of the CSV to read and update in place.
            max_workers: Size of the thread pool used by ``process_contracts``.
            output_folder: Directory that receives the saved images.
        """
        self.api_key = api_key
        self.csv_path = csv_path
        self.max_workers = max_workers
        self.classifier = GeminiZoneClassifier(api_key=api_key, verbose=VERBOSE_LOGGING)
        self.output_folder = output_folder

        # Thread-safe counters (always modified while holding self.lock)
        self.processed_count = 0
        self.success_count = 0
        self.error_count = 0
        self.no_link_count = 0
        self.download_fail_count = 0
        self.method_counts = {
            "Primary": 0,
            "Backup Method 1": 0,
            "Backup Method 2": 0,
            "None": 0
        }
        self.lock = Lock()        # guards counters and DataFrame access
        self.save_lock = Lock()   # serialises CSV/progress writes

        # Load or create DataFrame
        self.df = self.load_csv()
        self.progress = self.load_progress()

    def load_csv(self) -> pd.DataFrame:
        """Load the CSV into a DataFrame, adding any missing result columns.

        Raises:
            Exception: whatever ``pandas.read_csv`` raised, after logging it.
        """
        try:
            print(f"📊 Loading CSV: {self.csv_path}")
            df = pd.read_csv(self.csv_path)

            # Ensure required columns exist
            for col in (CLASSIFICATION_COLUMN, CONFIDENCE_COLUMN, DETECTION_METHOD_COLUMN):
                if col not in df.columns:
                    df[col] = None
                    print(f"   ➕ Added missing column: {col}")

            print(f"   ✅ Loaded {len(df)} rows with {len(df.columns)} columns")
            return df

        except Exception as e:
            print(f"❌ Error loading CSV: {e}")
            raise

    def load_progress(self) -> Dict:
        """Load processing progress from PROGRESS_FILE, or return a fresh record."""
        try:
            if os.path.exists(PROGRESS_FILE):
                with open(PROGRESS_FILE, 'r') as f:
                    progress = json.load(f)
                print(f"📈 Loaded progress: {progress.get('processed_count', 0)} contracts processed")
                return progress
        except (OSError, ValueError, AttributeError) as e:
            # Unreadable or malformed progress file: start from scratch.
            print(f"⚠️ Could not load progress: {e}")

        return {
            'processed_count': 0,
            'last_processed_index': -1,
            'start_time': datetime.now().isoformat(),
            'last_save_time': None
        }

    def save_progress(self):
        """Write the current counters to PROGRESS_FILE (best effort)."""
        try:
            # Snapshot under the lock so the JSON is internally consistent.
            with self.lock:
                snapshot = {
                    'processed_count': self.processed_count,
                    'success_count': self.success_count,
                    'error_count': self.error_count,
                    'no_link_count': self.no_link_count,
                    'download_fail_count': self.download_fail_count,
                    'method_counts': dict(self.method_counts),
                    'last_save_time': datetime.now().isoformat()
                }
            self.progress.update(snapshot)

            with open(PROGRESS_FILE, 'w') as f:
                json.dump(self.progress, f, indent=2)

        except Exception as e:
            print(f"⚠️ Could not save progress: {e}")

    def create_backup(self):
        """Copy the CSV on disk to a timestamped backup file.

        Returns:
            The backup path, or None if the copy failed.
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_path = f"{BACKUP_CSV_PREFIX}_{timestamp}.csv"
            shutil.copy2(self.csv_path, backup_path)
            print(f"💾 Backup created: {backup_path}")
            return backup_path
        except Exception as e:
            print(f"⚠️ Could not create backup: {e}")
            return None

    def save_dataframe(self, force=False):
        """Save the DataFrame to the CSV file.

        The save happens when ``force`` is true or when the processed count is
        a multiple of BATCH_SAVE_INTERVAL. Errors are logged, never raised.
        """
        try:
            # save_lock is always taken before self.lock (never the reverse),
            # so concurrent saves are written in the order they snapshot.
            with self.save_lock:
                with self.lock:
                    if not (force or self.processed_count % BATCH_SAVE_INTERVAL == 0):
                        return
                    processed = self.processed_count
                    # Write from a copy so workers can keep updating self.df.
                    snapshot = self.df.copy()

                print(f"💾 Saving progress... ({processed} processed)")

                # Create backup before saving
                self.create_backup()

                # Save updated CSV
                snapshot.to_csv(self.csv_path, index=False)

                # Save progress
                self.save_progress()

                print(f"   ✅ CSV saved with {len(snapshot)} rows")

        except Exception as e:
            print(f"❌ Error saving CSV: {e}")

    def download_contract(self, url: str, base_filename: str) -> Optional[Tuple[str, str]]:
        """Download a contract file from ``url`` into the temp directory.

        Transient failures (connection errors, timeouts, HTTP 5xx) are retried
        up to MAX_RETRIES additional times.

        Returns:
            ``(path, file_format)`` with ``file_format`` one of ``'pdf'``,
            ``'jpeg'`` or ``'png'``, or None if the download failed. The
            caller owns (and must delete) the returned file.
        """
        thread_id = threading.current_thread().name[-3:]
        total_attempts = max(0, MAX_RETRIES) + 1

        try:
            print(f"   📥 [{thread_id}] Downloading from: {url[:50]}...")

            for attempt in range(1, total_attempts + 1):
                try:
                    outcome = self._download_once(url, base_filename, thread_id)
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.Timeout,
                        requests.exceptions.ChunkedEncodingError) as e:
                    print(f"   ❌ Download failed: {str(e)[:100]}")
                    outcome = self._RETRY

                if outcome is not self._RETRY:
                    return outcome

                if attempt < total_attempts:
                    print(f"   🔁 [{thread_id}] Retrying download ({attempt}/{total_attempts - 1})...")
                    time.sleep(0.5 * attempt)  # short linear back-off

            return None

        except Exception as e:
            # Non-transient failure (bad URL, disk error, ...): give up.
            print(f"   ❌ Download failed: {str(e)[:100]}")
            return None

    def _download_once(self, url: str, base_filename: str, thread_id: str):
        """Perform one download attempt.

        Returns ``(path, file_format)`` on success, None on a permanent
        failure, or ``self._RETRY`` when the server answered with HTTP 5xx.
        Any partially written temp file is removed before returning/raising.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': '*/*',
            'Connection': 'keep-alive'
        }

        temp_path = None
        try:
            # The context manager releases the streamed connection.
            with requests.get(url, headers=headers, timeout=DOWNLOAD_TIMEOUT, stream=True) as response:
                if response.status_code != 200:
                    print(f"   ❌ [{thread_id}] HTTP {response.status_code}: {response.reason}")
                    return self._RETRY if response.status_code >= 500 else None

                content_type = response.headers.get('content-type', '').lower()

                # Unique temp file so concurrent workers never share a path.
                fd, temp_path = tempfile.mkstemp(prefix=f"{base_filename}_", suffix=".tmp")

                total_size = 0
                with os.fdopen(fd, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            total_size += len(chunk)

            if total_size == 0:
                print(f"   ⚠️ [{thread_id}] Downloaded file is empty")
                return None

            # Prefer the file header; fall back to the Content-Type header.
            file_format = self.detect_file_format(temp_path) or self._format_from_content_type(content_type)
            if not file_format:
                print(f"   ❓ [{thread_id}] Unknown file format")
                return None

            # Rename with proper extension
            final_path = f"{temp_path}.{file_format}"
            os.rename(temp_path, final_path)
            temp_path = None  # ownership moves to the caller

            print(f"   ✅ [{thread_id}] Downloaded {file_format.upper()} ({total_size//1024} KB)")
            return final_path, file_format

        finally:
            # Remove the temp file on every failure path.
            if temp_path and os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

    @staticmethod
    def _format_from_content_type(content_type: str) -> Optional[str]:
        """Map a lower-cased Content-Type header to 'pdf'/'jpeg'/'png', else None."""
        if 'pdf' in content_type:
            return 'pdf'
        if 'jpeg' in content_type or 'jpg' in content_type:
            return 'jpeg'
        if 'png' in content_type:
            return 'png'
        return None

    def detect_file_format(self, file_path: str) -> Optional[str]:
        """Detect the file format from the first bytes of the file.

        Returns:
            ``'pdf'``, ``'jpeg'`` or ``'png'``; None for any other content or
            when the file cannot be read.
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(20)
        except (OSError, TypeError, ValueError):
            return None

        # '%PDF' may be preceded by a few stray bytes, so search the first 10.
        if b'%PDF' in header[:10]:
            return 'pdf'
        if header.startswith(b'\xff\xd8\xff'):
            return 'jpeg'
        if header.startswith(b'\x89PNG\r\n\x1a\n'):
            return 'png'
        return None

    def _save_image(self, image, maid_name, maid_id) -> str:
        """Write ``image`` to ``<output_folder>/<maid_id>_<safe name>.jpg``.

        Raises:
            OSError: if the folder cannot be created or OpenCV reports that
                the image was not written.
        """
        safe_maid_name = re.sub(r'[^\w\s-]', '', str(maid_name)).strip()
        safe_maid_name = re.sub(r'\s+', '_', safe_maid_name)
        filename = f"{maid_id}_{safe_maid_name}.jpg"
        output_path = os.path.join(self.output_folder, filename)

        os.makedirs(self.output_folder, exist_ok=True)
        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(output_path, image):
            raise OSError(f"Could not write image: {output_path}")
        return output_path

    def _detect_zone(self, cropped_image, text_detections, thread_id: str):
        """Try primary, then backup 1, then backup 2 zone detection.

        Returns:
            ``(method_name, zone)`` where ``method_name`` is a key of
            ``self.method_counts`` and ``zone`` is None when every method failed.
        """
        print(f"   🔍 [{thread_id}] Trying PRIMARY zone detection...")
        zone = self.classifier.find_signature_zone(cropped_image, text_detections)
        if zone:
            print(f"   ✅ [{thread_id}] Primary method succeeded")
            return "Primary", zone

        print(f"   ❌ [{thread_id}] Primary method failed. Trying Backup Method 1...")
        zone = self.classifier.backup_zone_detection_method1(cropped_image)
        if zone:
            print(f"   ✅ [{thread_id}] Backup Method 1 succeeded")
            return "Backup Method 1", zone

        print(f"   ❌ [{thread_id}] Backup 1 failed. Trying Backup Method 2...")
        zone = self.classifier.backup_zone_detection_method2(cropped_image)
        if zone:
            print(f"   ✅ [{thread_id}] Backup Method 2 succeeded")
            return "Backup Method 2", zone

        print(f"   ❌ [{thread_id}] All detection methods failed")
        return "None", None

    def _process_pdf(self, file_path: str, maid_name, maid_id, thread_id: str) -> Dict:
        """Locate the signature zone on the PDF's second page and save the crop."""
        # Extract pages and detect text
        original_image = self.classifier.extract_second_page(file_path)
        cropped_image, _ = self.classifier.get_bottom_two_thirds(original_image)
        text_detections = self.classifier.detect_text_with_positions(cropped_image)

        detection_method_used, signature_zone = self._detect_zone(
            cropped_image, text_detections, thread_id)

        # Update method counter
        with self.lock:
            self.method_counts[detection_method_used] += 1

        if not signature_zone:
            # Nothing was saved, so do not report a detected zone.
            return {
                "classification": "unknown",
                "confidence": 0.0,
                "reasoning": "No signature zone detected",
                "zone_detected": False,
                "detection_method": detection_method_used,
                "status": "no_zone"
            }

        zone_image = cropped_image[signature_zone.y:signature_zone.y + signature_zone.height,
                                   signature_zone.x:signature_zone.x + signature_zone.width]
        self._save_image(zone_image, maid_name, maid_id)

        # Gemini classification is disabled here; "signature" is a placeholder.
        return {
            "classification": "signature",
            "confidence": 0.0,
            "reasoning": "saved_contract_pdf",
            "zone_detected": True,
            "detection_method": detection_method_used,
            "status": "saved"
        }

    def process_contract_file_with_method_tracking(self, file_path: str, file_format: str, maid_name, maid_id) -> Dict:
        """Process a downloaded contract and report which detection method was used.

        Args:
            file_path: Local path of the downloaded file.
            file_format: ``'pdf'``, ``'jpeg'`` or ``'png'``.
            maid_name: Name used to build the output image filename.
            maid_id: Identifier used to build the output image filename.

        Returns:
            A dict with keys ``classification``, ``confidence``, ``reasoning``,
            ``zone_detected``, ``detection_method`` and ``status``. Failures
            are reported through the dict (``status='processing_failed'``),
            never raised.
        """
        try:
            thread_id = threading.current_thread().name[-3:]
            print(f"   🤖 [{thread_id}] Processing {file_format.upper()} file...")

            if file_format == 'pdf':
                return self._process_pdf(file_path, maid_name, maid_id, thread_id)

            if file_format in ('jpeg', 'png'):
                # Process image files directly
                image = cv2.imread(file_path)
                if image is None:
                    # cv2.imread returns None instead of raising on bad input.
                    raise ValueError(f"Could not decode image file: {file_path}")
                self._save_image(image, maid_name, maid_id)

                return {
                    "classification": "signature",
                    "confidence": 0.0,
                    "reasoning": "load and save image file",
                    "zone_detected": True,
                    "detection_method": "N/A",
                    "status": "image_file_saved"
                }

            return {
                "classification": "error",
                "confidence": 0.0,
                "reasoning": f"Unsupported file format: {file_format}",
                "zone_detected": False,
                "detection_method": "N/A",
                "status": "unsupported_format"
            }

        except Exception as e:
            print(f"   ❌ Processing failed: {str(e)[:100]}")
            return {
                "classification": "error",
                "confidence": 0.0,
                "reasoning": f"Processing error: {str(e)}",
                "zone_detected": False,
                "detection_method": "Error",
                "status": "processing_failed"
            }

    def _record_outcome(self, row_index: int, classification, confidence,
                        detection_method, counter_name: str) -> bool:
        """Atomically bump counters and write one row's result into the DataFrame.

        Args:
            counter_name: Name of the per-outcome counter attribute to
                increment (e.g. ``'success_count'``).

        Returns:
            True when the processed count just reached a multiple of
            BATCH_SAVE_INTERVAL, i.e. the caller should trigger a save.
        """
        with self.lock:
            setattr(self, counter_name, getattr(self, counter_name) + 1)
            self.processed_count += 1

            self.df.loc[row_index, CLASSIFICATION_COLUMN] = classification
            self.df.loc[row_index, CONFIDENCE_COLUMN] = confidence
            self.df.loc[row_index, DETECTION_METHOD_COLUMN] = detection_method

            return self.processed_count % BATCH_SAVE_INTERVAL == 0

    def process_single_row(self, row_index: int) -> Dict:
        """Process a single DataFrame row (download, process, record outcome).

        Args:
            row_index: Index label of the row, as returned by
                ``get_unprocessed_rows``.

        Returns:
            A result dict with keys ``row_index``, ``maid_id``, ``maid_name``,
            ``classification``, ``confidence``, ``reasoning``,
            ``detection_method``, ``processing_time`` and ``status``.
        """
        start_time = time.time()
        thread_id = threading.current_thread().name[-3:]

        # Read by label so reads and writes address the same row.
        with self.lock:
            row = self.df.loc[row_index]
        maid_id = str(row.get(MAID_ID_COLUMN, f'Row_{row_index}'))
        maid_name = str(row.get(MAID_NAME_COLUMN, 'Unknown'))
        document_link = str(row.get(DOCUMENT_LINK_COLUMN, '')).strip()

        print(f"\n🔄 [{thread_id}] Processing: {maid_name} (Row {row_index + 1})")

        temp_file = None
        save_due = False
        result = {
            "row_index": row_index,
            "maid_id": maid_id,
            "maid_name": maid_name,
            "classification": "error",
            "confidence": 0.0,
            "reasoning": "",
            "detection_method": "N/A",
            "processing_time": 0.0,
            "status": "unknown"
        }

        try:
            # Check if document link exists (empty cells stringify as 'nan')
            if not document_link or document_link.lower() in ('none', 'null', 'nan'):
                print(f"   ⚠️ [{thread_id}] No document link found")
                result.update({
                    "classification": "no_link",
                    "reasoning": "No document link provided",
                    "detection_method": "N/A",
                    "status": "no_link"
                })
                save_due = self._record_outcome(row_index, "no_link", 0.0, "N/A", "no_link_count")
                print(f"   📋 [{thread_id}] DataFrame updated: no_link")
                return result

            # Create filename
            safe_name = re.sub(r'[^\w\-_.]', '_', maid_name)
            base_filename = f"{safe_name}_{maid_id}_contract"

            # Download contract
            download_result = self.download_contract(document_link, base_filename)

            if not download_result:
                print(f"   ❌ [{thread_id}] Download failed")
                result.update({
                    "classification": "download_failed",
                    "reasoning": "Failed to download contract file",
                    "detection_method": "N/A",
                    "status": "download_failed"
                })
                save_due = self._record_outcome(
                    row_index, "download_failed", 0.0, "N/A", "download_fail_count")
                print(f"   📋 [{thread_id}] DataFrame updated: download_failed")
                return result

            temp_file, file_format = download_result

            # Process the contract with method tracking
            processing_result = self.process_contract_file_with_method_tracking(
                temp_file, file_format, maid_name, maid_id)

            result.update({
                "classification": processing_result["classification"],
                "confidence": processing_result["confidence"],
                "reasoning": processing_result["reasoning"],
                "detection_method": processing_result["detection_method"],
                "status": processing_result["status"],
                "processing_time": time.time() - start_time
            })

            # "saved"/"image_file_saved" are successes, not errors.
            succeeded = processing_result["status"] in self._SUCCESS_STATUSES
            save_due = self._record_outcome(
                row_index,
                processing_result["classification"],
                processing_result["confidence"],
                processing_result["detection_method"],
                "success_count" if succeeded else "error_count",
            )

            print(f"   ✅ [{thread_id}] {processing_result['classification'].upper()} (conf: {processing_result['confidence']:.2f}) via {processing_result['detection_method']} - DataFrame updated")

            return result

        except Exception as e:
            print(f"   ❌ [{thread_id}] Error: {str(e)[:100]}")
            result.update({
                "classification": "error",
                "reasoning": f"Processing error: {str(e)}",
                "detection_method": "Error",
                "status": "error",
                "processing_time": time.time() - start_time
            })

            # Update DataFrame even for errors
            save_due = self._record_outcome(row_index, "error", 0.0, "Error", "error_count") or save_due

            print(f"   📋 [{thread_id}] DataFrame updated: error")
            return result

        finally:
            # Clean up temporary file (best effort)
            if temp_file and os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except OSError:
                    pass

            # Periodic save, whichever path the row took.
            if save_due:
                self.save_dataframe(force=True)

    def get_unprocessed_rows(self) -> List[int]:
        """Return index labels of rows that still need processing.

        A row is unprocessed when its classification is missing, empty,
        ``'None'``, or one of the retryable outcomes (``'error'``,
        ``'download_failed'``, ``'no_link'``). The list is then restricted to
        labels in ``[START_ROW, START_ROW + MAX_ROWS_TO_PROCESS)`` when
        MAX_ROWS_TO_PROCESS is set.
        """
        column = self.df[CLASSIFICATION_COLUMN]
        retryable = ['', 'None', 'error', 'download_failed', 'no_link']
        unprocessed_mask = column.isna() | column.isin(retryable)

        unprocessed_indices = self.df[unprocessed_mask].index.tolist()

        # Apply START_ROW and MAX_ROWS_TO_PROCESS filters
        filtered_indices = [i for i in unprocessed_indices if i >= START_ROW]

        if MAX_ROWS_TO_PROCESS:
            end_row = START_ROW + MAX_ROWS_TO_PROCESS
            filtered_indices = [i for i in filtered_indices if i < end_row]

        return filtered_indices

    def process_contracts(self) -> Dict:
        """Process every unprocessed contract in the CSV using a thread pool.

        Returns:
            A summary dict with keys ``processed``, ``success``, ``errors``,
            ``no_links``, ``download_fails``, ``method_counts``,
            ``total_time`` and ``results`` (plus ``message`` when there was
            nothing to do). On a fatal error the dict contains ``error`` and
            ``processed`` instead.
        """
        print("\n🚀 Starting enhanced CSV batch processing...")
        print(f"   📊 Workers: {self.max_workers}")
        print(f"   🎯 Start row: {START_ROW}")
        print(f"   📏 Max rows: {MAX_ROWS_TO_PROCESS if MAX_ROWS_TO_PROCESS else 'All'}")
        print(f"   💾 Batch save interval: {BATCH_SAVE_INTERVAL}")

        start_time = time.time()

        try:
            # Get unprocessed rows
            unprocessed_rows = self.get_unprocessed_rows()
            total_to_process = len(unprocessed_rows)

            if total_to_process == 0:
                print("✅ All contracts have already been processed!")
                # Same keys as the normal summary so callers can read them.
                return {
                    "processed": 0,
                    "success": 0,
                    "errors": 0,
                    "no_links": 0,
                    "download_fails": 0,
                    "method_counts": dict(self.method_counts),
                    "total_time": time.time() - start_time,
                    "results": [],
                    "message": "No unprocessed contracts found"
                }

            print(f"   📊 Found {total_to_process} unprocessed contracts")
            print(f"   📈 Previous progress: {self.progress.get('processed_count', 0)} contracts")

            # Create initial backup
            self.create_backup()

            print(f"\n🔄 Processing {total_to_process} contracts with {self.max_workers} workers...")

            # Process in parallel
            results = []
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all tasks
                future_to_row = {
                    executor.submit(self.process_single_row, row_index): row_index
                    for row_index in unprocessed_rows
                }

                # Collect results
                for future in as_completed(future_to_row):
                    row_index = future_to_row[future]
                    try:
                        results.append(future.result())

                        # Progress update
                        completed = len(results)
                        progress = (completed / total_to_process) * 100
                        print(f"📊 Progress: {completed}/{total_to_process} ({progress:.1f}%) - ✅{self.success_count} ❌{self.error_count} ⚠️{self.no_link_count} 📥{self.download_fail_count}")

                        # Small delay to avoid overwhelming the API
                        if completed % self.max_workers == 0:
                            time.sleep(DELAY_BETWEEN_BATCHES)

                    except Exception as e:
                        print(f"❌ Task failed for row {row_index}: {e}")

            # Final save
            self.save_dataframe(force=True)

            # Calculate final statistics
            total_time = time.time() - start_time

            # Print comprehensive summary
            self.print_final_summary(total_time, results)

            return {
                "processed": self.processed_count,
                "success": self.success_count,
                "errors": self.error_count,
                "no_links": self.no_link_count,
                "download_fails": self.download_fail_count,
                "method_counts": self.method_counts,
                "total_time": total_time,
                "results": results
            }

        except Exception as e:
            print(f"❌ Fatal error: {e}")
            traceback.print_exc()

            # Emergency save
            self.save_dataframe(force=True)

            return {"error": str(e), "processed": self.processed_count}

    def print_final_summary(self, total_time: float, results: List[Dict]):
        """Print totals, the detection-method breakdown and per-class counts.

        Args:
            total_time: Wall-clock duration of the run in seconds.
            results: Result dicts returned by ``process_single_row``.
        """
        print("\n🎉 Enhanced CSV processing completed!")
        print(f"   ⏱️  Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
        print(f"   📊 Processed: {self.processed_count} contracts")
        print(f"   ✅ Successful: {self.success_count}")
        print(f"   ❌ Errors: {self.error_count}")
        print(f"   ⚠️  No links: {self.no_link_count}")
        print(f"   📥 Download failures: {self.download_fail_count}")
        print(f"   ⚡ Average time per contract: {total_time/max(1, self.processed_count):.1f}s")

        # Detection method breakdown
        print("\n🔍 DETECTION METHOD BREAKDOWN:")
        total_detected = sum(self.method_counts.values())
        if total_detected > 0:
            for method, count in self.method_counts.items():
                percentage = (count / total_detected) * 100
                print(f"   {method}: {count} ({percentage:.1f}%)")

        # Use the same notion of success as the counters.
        success_results = [r for r in results if r["status"] in self._SUCCESS_STATUSES]
        if success_results:
            avg_confidence = sum(r["confidence"] for r in success_results) / len(success_results)
            print(f"   🎯 Average confidence: {avg_confidence:.2f}")

            # Classification breakdown
            classifications = {}
            for r in success_results:
                cls = r["classification"]
                classifications[cls] = classifications.get(cls, 0) + 1

            print("   📊 Classification breakdown:")
            for cls, count in classifications.items():
                percentage = (count / len(success_results)) * 100
                print(f"      {cls.upper()}: {count} ({percentage:.1f}%)")

        print(f"\n💾 CSV file updated: {self.csv_path}")

def run_enhanced_batch_analysis():
    """Validate the configuration, run the batch processor and print a summary.

    Returns:
        The summary dict from ``EnhancedCSVProcessor.process_contracts``, or
        None when the configuration is invalid or a fatal error occurred.
    """

    print("="*80)
    print("🚀 ENHANCED MOHRE CONTRACT BATCH ANALYZER")
    print("📊 Pandas-based | 💾 Batch Saving | 📈 Progress Tracking")
    print("="*80)

    # Validate configuration
    if not GEMINI_API_KEY or "your_api_key" in GEMINI_API_KEY.lower():
        print("❌ ERROR: Please set your Gemini API key")
        return

    if not os.path.exists(CSV_FILE_PATH):
        print(f"❌ ERROR: CSV file not found: {CSV_FILE_PATH}")
        return

    try:
        # Create processor
        processor = EnhancedCSVProcessor(
            api_key=GEMINI_API_KEY,
            csv_path=CSV_FILE_PATH,
            max_workers=20
        )

        # Process the contracts
        results = processor.process_contracts()

        if "error" in results:
            print(f"❌ Processing failed: {results['error']}")
        elif "message" in results:
            # Nothing to do: process_contracts returns a short dict that may
            # not carry the statistics keys, so do not index them here.
            print(f"\nℹ️ {results['message']}")
        else:
            print("\n✨ Processing completed successfully!")
            print(f"📊 Total processed: {results.get('processed', 0)}")
            print(f"✅ Successful classifications: {results.get('success', 0)}")
            print(f"❌ Errors: {results.get('errors', 0)}")
            print(f"⏱️  Total time: {results.get('total_time', 0.0):.1f} seconds")

            # Show detection method summary
            print("\n🔍 Detection Methods Used:")
            for method, count in results.get('method_counts', {}).items():
                print(f"   {method}: {count} files")

        return results

    except Exception as e:
        print(f"❌ Fatal error occurred: {e}")
        traceback.print_exc()
        return None


# Run the batch; `results` is the summary dict returned by
# run_enhanced_batch_analysis(), or None when it aborted.
results = run_enhanced_batch_analysis()

🚀 ENHANCED MOHRE CONTRACT BATCH ANALYZER
📊 Pandas-based | 💾 Batch Saving | 📈 Progress Tracking
📊 Loading CSV: /content/drive/MyDrive/remaining.csv
   ➕ Added missing column: final_classification
   ➕ Added missing column: final_confidence
   ➕ Added missing column: Detection Method
   ✅ Loaded 384 rows with 16 columns

🚀 Starting enhanced CSV batch processing...
   📊 Workers: 20
   🎯 Start row: 0
   📏 Max rows: All
   💾 Batch save interval: 100
   📊 Found 384 unprocessed contracts
   📈 Previous progress: 0 contracts
💾 Backup created: maids_backup_20250808_064441.csv

🔄 Processing 384 contracts with 20 workers...

🔄 [0_0] Processing: Meskerem Desta Debesa (Row 1)
   📥 [0_0] Downloading from: https://erpbackendpro.maids.cc/public/download/c24...

🔄 [0_1] Processing: MARITES VISAYA PIMENTEL  (Row 2)
   📥 [0_1] Downloading from: https://erpbackendpro.maids.cc/public/download/207...

🔄 [0_2] Processing: Elyn Curiente Agrado  (Row 3)
   📥 [0_2] Downloading from: https://erpbackendpro.maids.c



   🔍 [0_7] Trying PRIMARY zone detection...
   ❌ [0_7] Primary method failed. Trying Backup Method 1...
   ✅ [0_1] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_1] Processing: Retchel Demesana Roma  (Row 21)
   📥 [0_1] Downloading from: https://erpbackendpro.maids.cc/public/download/f53...
📊 Progress: 1/384 (0.3%) - ✅0 ❌1 ⚠️0 📥0
   ✅ [0_4] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_4] Processing: Zarah Montilla Celajes (Row 22)
   📥 [0_4] Downloading from: https://erpbackendpro.maids.cc/public/download/9a4...
📊 Progress: 2/384 (0.5%) - ✅0 ❌2 ⚠️0 📥0
   ✅ [0_0] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_0] Processing: Riza Mae Delgado Fabrero (Row 23)
   📥 [0_0] Downloading from: https://erpbackendpro.maids.cc/public/download/bcb...
📊 Progress: 3/384 (0.8%) - ✅0 ❌3 ⚠️0 📥0
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_3] Processing: Rose Ann Gutierrez Gapasin (Row 24)
   📥 [0_3] Downloading from: https://erpbackendpro.maids.



   🔍 [0_8] Trying PRIMARY zone detection...
   ❌ [0_8] Primary method failed. Trying Backup Method 1...
   ✅ [0_1] Downloaded PDF (1860 KB)
   🤖 [0_1] Processing PDF file...
   🔍 [_16] Trying PRIMARY zone detection...




   🔍 [_14] Trying PRIMARY zone detection...   ✅ [0_4] Downloaded PDF (1873 KB)
   🤖 [0_4] Processing PDF file...
   🔍 [_12] Trying PRIMARY zone detection...
   🔍 [_10] Trying PRIMARY zone detection...

   🔍 [0_2] Trying PRIMARY zone detection...
   🔍 [_15] Trying PRIMARY zone detection...
   🔍 [_13] Trying PRIMARY zone detection...
   ✅ [0_0] Downloaded PDF (1821 KB)
   🤖 [0_0] Processing PDF file...
   🔍 [0_5] Trying PRIMARY zone detection...
   🔍 [_17] Trying PRIMARY zone detection...
   ❌ [_16] Primary method failed. Trying Backup Method 1...
   ❌ [_10] Primary method failed. Trying Backup Method 1...
   ❌ [_17] Primary method failed. Trying Backup Method 1...
   🔍 [_11] Trying PRIMARY zone detection...
   🔍 [0_6] Trying PRIMARY zone detection...




   🔍 [0_9] Trying PRIMARY zone detection...   🔍 [_18] Trying PRIMARY zone detection...
   ✅ [_12] Primary method succeeded
   🔍 [_19] Trying PRIMARY zone detection...
   ✅ [_12] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_12] Processing: Grecia Ojerio Yambao  (Row 25)
   📥 [_12] Downloading from: https://erpbackendpro.maids.cc/public/download/264...
📊 Progress: 5/384 (1.3%) - ✅0 ❌5 ⚠️0 📥0
   ✅ [_15] Primary method succeeded
   ✅ [_14] Primary method succeeded

   ❌ [0_6] Primary method failed. Trying Backup Method 1...
   ✅ [_14] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_14] Processing: Juvyna Ning Jugal  (Row 26)
   📥 [_14] Downloading from: https://erpbackendpro.maids.cc/public/download/3d8...
📊 Progress: 6/384 (1.6%) - ✅0 ❌6 ⚠️0 📥0
   ✅ [0_2] Primary method succeeded
   ✅ [_13] Primary method succeeded
   ✅ [0_5] Primary method succeeded
   ✅ [0_3] Downloaded PDF (1873 KB)
   🤖 [0_3] Processing PDF file...
   ✅ [0_5] SIGNATURE (conf: 0.00) via None - Data



   🔍 [0_4] Trying PRIMARY zone detection...
   ❌ [0_4] Primary method failed. Trying Backup Method 1...
   🔍 [0_1] Trying PRIMARY zone detection...
   ✅ [0_1] Primary method succeeded
   🔍 [0_3] Trying PRIMARY zone detection...
   ✅ [0_3] Primary method succeeded
   ✅ [0_0] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_0] Processing: Liamor Marcelino Corpuz  (Row 41)
   📥 [0_0] Downloading from: https://erpbackendpro.maids.cc/public/download/a41...
📊 Progress: 21/384 (5.5%) - ✅0 ❌21 ⚠️0 📥0
   ✅ [0_1] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_1] Processing: Melody Arellano Bongolan (Row 42)
   📥 [0_1] Downloading from: https://erpbackendpro.maids.cc/public/download/d31...
📊 Progress: 22/384 (5.7%) - ✅0 ❌22 ⚠️0 📥0
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_3] Processing: Maryjoy Tacata Mabascog  (Row 43)
   📥 [0_3] Downloading from: https://erpbackendpro.maids.cc/public/download/3e0...
📊 Progress: 23/384 (6.0%) - ✅0 ❌23 ⚠️0 📥0
   ✅ [0



   🔍 [0_2] Trying PRIMARY zone detection...
   ❌ [0_2] Primary method failed. Trying Backup Method 1...




   🔍 [_19] Trying PRIMARY zone detection...
   ❌ [_19] Primary method failed. Trying Backup Method 1...
   🔍 [_13] Trying PRIMARY zone detection...
   ✅ [_13] Primary method succeeded




   🔍 [0_5] Trying PRIMARY zone detection...
   ❌ [0_5] Primary method failed. Trying Backup Method 1...
   ✅ [_13] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_13] Processing: Mitchell Achieng Oyamo  (Row 44)
   📥 [_13] Downloading from: https://erpbackendpro.maids.cc/public/download/863...
📊 Progress: 24/384 (6.2%) - ✅0 ❌24 ⚠️0 📥0
   ✅ [_13] Downloaded PDF (1841 KB)
   🤖 [_13] Processing PDF file...
   🔍 [_14] Trying PRIMARY zone detection...
   🔍 [_12] Trying PRIMARY zone detection...




   🔍 [0_9] Trying PRIMARY zone detection...
   ❌ [_12] Primary method failed. Trying Backup Method 1...




   🔍 [_10] Trying PRIMARY zone detection...
   ❌ [_10] Primary method failed. Trying Backup Method 1...
   ✅ [_14] Primary method succeeded
   ✅ [0_9] Primary method succeeded
   🔍 [_18] Trying PRIMARY zone detection...
   ✅ [_18] Primary method succeeded
   🔍 [_11] Trying PRIMARY zone detection...
   🔍 [_15] Trying PRIMARY zone detection...
   ✅ [_11] Primary method succeeded
   ✅ [_15] Primary method succeeded
   ✅ [_14] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_14] Processing: Gilda De lima Ciron  (Row 45)
   📥 [_14] Downloading from: https://erpbackendpro.maids.cc/public/download/7f9...
📊 Progress: 25/384 (6.5%) - ✅0 ❌25 ⚠️0 📥0
   ✅ [0_9] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_9] Processing: Bernadette Pica Danao  (Row 46)
   📥 [0_9] Downloading from: https://erpbackendpro.maids.cc/public/download/ee8...
📊 Progress: 26/384 (6.8%) - ✅0 ❌26 ⚠️0 📥0
   ✅ [_18] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_18] Processing: Esmayla Ramos Dorado



   🔍 [_17] Trying PRIMARY zone detection...
   ❌ [_17] Primary method failed. Trying Backup Method 1...
   🔍 [0_7] Trying PRIMARY zone detection...
   ✅ [0_7] Primary method succeeded
   ✅ [0_7] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 32/384 (8.3%) - ✅0 ❌32 ⚠️0 📥0

🔄 [0_7] Processing: Marlyn Demasupil Gucio (Row 52)
   📥 [0_7] Downloading from: https://erpbackendpro.maids.cc/public/download/347...




   🔍 [0_8] Trying PRIMARY zone detection...
   ❌ [0_8] Primary method failed. Trying Backup Method 1...
   ✅ [0_7] Downloaded PDF (1819 KB)
   🤖 [0_7] Processing PDF file...




   🔍 [_16] Trying PRIMARY zone detection...
   ❌ [_16] Primary method failed. Trying Backup Method 1...
   ❌ [_19] Backup 1 failed. Trying Backup Method 2...
   ✅ [_19] Backup Method 2 succeeded
   ✅ [_19] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_19] Processing: Rose Ann Alba Pelaez (Row 53)
   📥 [_19] Downloading from: https://erpbackendpro.maids.cc/public/download/2e7...
📊 Progress: 33/384 (8.6%) - ✅0 ❌33 ⚠️0 📥0
   ❌ [0_2] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_2] Backup Method 2 succeeded
   ✅ [0_2] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_2] Processing: Maria Francia Esguerra Guevarra (Row 54)
   📥 [0_2] Downloading from: https://erpbackendpro.maids.cc/public/download/6fd...
📊 Progress: 34/384 (8.9%) - ✅0 ❌34 ⚠️0 📥0
   ✅ [_19] Downloaded PDF (1783 KB)
   🤖 [_19] Processing PDF file...
   ❌ [0_5] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_2] Downloaded PDF (1888 KB)
   🤖 [0_2] Processing PDF file...
   ❌ [_12] Backup 1 failed. Try



   🔍 [0_3] Trying PRIMARY zone detection...
   ❌ [0_3] Primary method failed. Trying Backup Method 1...




   🔍 [0_0] Trying PRIMARY zone detection...
   ❌ [0_0] Primary method failed. Trying Backup Method 1...
   ✅ [0_8] Downloaded PDF (1749 KB)
   🤖 [0_8] Processing PDF file...
   🔍 [0_1] Trying PRIMARY zone detection...
   ✅ [0_1] Primary method succeeded
   ✅ [0_1] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_1] Processing: Analyn Palma Cabildo (Row 60)
   📥 [0_1] Downloading from: https://erpbackendpro.maids.cc/public/download/c50...
📊 Progress: 40/384 (10.4%) - ✅0 ❌40 ⚠️0 📥0
   ✅ [0_1] Downloaded PDF (1858 KB)
   🤖 [0_1] Processing PDF file...
   🔍 [_13] Trying PRIMARY zone detection...
   ✅ [_13] Primary method succeeded
   ✅ [_13] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_13] Processing: Danica Camille Viclar De La Cruz  (Row 61)
   📥 [_13] Downloading from: https://erpbackendpro.maids.cc/public/download/6bb...
📊 Progress: 41/384 (10.7%) - ✅0 ❌41 ⚠️0 📥0
   ✅ [_13] Downloaded PDF (1877 KB)
   🤖 [_13] Processing PDF file...
   ❌ [_16] Backup 1 failed. Tryin



   🔍 [_14] Trying PRIMARY zone detection...
   ❌ [_14] Primary method failed. Trying Backup Method 1...
   🔍 [_11] Trying PRIMARY zone detection...
   ✅ [_11] Primary method succeeded
   ✅ [_11] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_11] Processing: Gene Braga Cruz  (Row 63)
   📥 [_11] Downloading from: https://erpbackendpro.maids.cc/public/download/ffe...
📊 Progress: 43/384 (11.2%) - ✅0 ❌43 ⚠️0 📥0




   ✅ [_11] Downloaded PDF (1831 KB)
   🤖 [_11] Processing PDF file...
   🔍 [0_4] Trying PRIMARY zone detection...
   🔍 [0_9] Trying PRIMARY zone detection...
   🔍 [_18] Trying PRIMARY zone detection...
   ❌ [0_4] Primary method failed. Trying Backup Method 1...
   ❌ [0_9] Primary method failed. Trying Backup Method 1...
   ❌ [_18] Primary method failed. Trying Backup Method 1...




   🔍 [_15] Trying PRIMARY zone detection...
   ❌ [_15] Primary method failed. Trying Backup Method 1...
   🔍 [0_6] Trying PRIMARY zone detection...
   ✅ [0_6] Primary method succeeded
   ✅ [0_6] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_6] Processing: Jean Rhea Aragoncillo Ramirez (Row 64)
   📥 [0_6] Downloading from: https://erpbackendpro.maids.cc/public/download/6e3...
📊 Progress: 44/384 (11.5%) - ✅0 ❌44 ⚠️0 📥0
   ❌ [0_3] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_3] Backup Method 2 succeeded
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 45/384 (11.7%) - ✅0 ❌45 ⚠️0 📥0

🔄 [0_3] Processing: Melanie Omandac Regala (Row 65)
   📥 [0_3] Downloading from: https://erpbackendpro.maids.cc/public/download/452...
   ✅ [0_6] Downloaded PDF (1877 KB)
   🤖 [0_6] Processing PDF file...
   ✅ [0_3] Downloaded PDF (1835 KB)
   🤖 [0_3] Processing PDF file...
   🔍 [0_7] Trying PRIMARY zone detection...
   ✅ [0_7] Primary method succeeded
   ✅ [0_7] SIG



   🔍 [0_5] Trying PRIMARY zone detection...
   ❌ [0_5] Primary method failed. Trying Backup Method 1...
   ❌ [0_4] Backup 1 failed. Trying Backup Method 2...
   ❌ [_15] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_4] Backup Method 2 succeeded
   🔍 [_19] Trying PRIMARY zone detection...
   ✅ [0_4] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 48/384 (12.5%) - ✅0 ❌48 ⚠️0 📥0

🔄 [0_4] Processing: Mary Jane Labsan Gabad (Row 68)
   📥 [0_4] Downloading from: https://erpbackendpro.maids.cc/public/download/76e...
   ✅ [_15] Backup Method 2 succeeded
   ✅ [_15] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_15] Processing: Irene Regina Nassuna  (Row 69)
   📥 [_15] Downloading from: https://erpbackendpro.maids.cc/public/download/259...
📊 Progress: 49/384 (12.8%) - ✅0 ❌49 ⚠️0 📥0
   ✅ [_19] Primary method succeeded
   ✅ [_19] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_19] Processing: Jacinta Wanjugu Matu (Row 70)
   📥 [_19] Downloading from: https://



   ✅ [0_2] Primary method succeeded
   ✅ [_12] Primary method succeeded
   ✅ [_12] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 51/384 (13.3%) - ✅0 ❌52 ⚠️0 📥0
   ✅ [0_2] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_12] Processing: Maureen Casbadillo Tordecilla (Row 71)
   📥 [_12] Downloading from: https://erpbackendpro.maids.cc/public/download/a1e...

🔄 [0_2] Processing: Rosilyn Patulot Medina  (Row 72)
   📥 [0_2] Downloading from: https://erpbackendpro.maids.cc/public/download/ff4...
📊 Progress: 52/384 (13.5%) - ✅0 ❌52 ⚠️0 📥0
   🔍 [_10] Trying PRIMARY zone detection...
   ❌ [_10] Primary method failed. Trying Backup Method 1...
   ✅ [0_4] Downloaded PDF (1851 KB)
   🤖 [0_4] Processing PDF file...
   ✅ [_15] Downloaded PDF (1856 KB)
   🤖 [_15] Processing PDF file...
   ✅ [_19] Downloaded PDF (1853 KB)
   🤖 [_19] Processing PDF file...
   ❌ [_14] Backup 1 failed. Trying Backup Method 2...
   ✅ [_14] Backup Method 2 succeeded
   ✅ [_14] SIGNATURE (conf: 0.0



   ✅ [0_2] Downloaded PDF (1779 KB)
   🤖 [0_2] Processing PDF file...
   ❌ [_18] Backup 1 failed. Trying Backup Method 2...
   🔍 [_17] Trying PRIMARY zone detection...
   ❌ [_17] Primary method failed. Trying Backup Method 1...
   ✅ [_18] Backup Method 2 succeeded
   ✅ [0_9] Backup Method 2 succeeded
   ✅ [0_9] SIGNATURE (conf: 0.00) via None - DataFrame updated
   ✅ [_18] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_18] Processing: Elsie Macababat Pelias (Row 75)
   📥 [_18] Downloading from: https://erpbackendpro.maids.cc/public/download/748...
📊 Progress: 54/384 (14.1%) - ✅0 ❌55 ⚠️0 📥0
📊 Progress: 55/384 (14.3%) - ✅0 ❌55 ⚠️0 📥0

🔄 [0_9] Processing: Norah Kemunto Ongeri  (Row 74)
   📥 [0_9] Downloading from: https://erpbackendpro.maids.cc/public/download/9bf...
   ✅ [_14] Downloaded PDF (1926 KB)
   🤖 [_14] Processing PDF file...
   ✅ [_18] Downloaded PDF (1855 KB)
   🤖 [_18] Processing PDF file...
   ✅ [0_9] Downloaded PDF (1864 KB)
   🤖 [0_9] Processing PDF file...
   🔍 



   ✅ [0_5] Downloaded PDF (1907 KB)
   🤖 [0_5] Processing PDF file...
   🔍 [_13] Trying PRIMARY zone detection...
   ❌ [_13] Primary method failed. Trying Backup Method 1...




   🔍 [_16] Trying PRIMARY zone detection...
   ❌ [_16] Primary method failed. Trying Backup Method 1...
   ❌ [_10] Backup 1 failed. Trying Backup Method 2...
   ✅ [_10] Backup Method 2 succeeded
   ✅ [_10] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_10] Processing: Ma Thereza Espino (Row 79)
   📥 [_10] Downloading from: https://erpbackendpro.maids.cc/public/download/11e...
📊 Progress: 59/384 (15.4%) - ✅0 ❌59 ⚠️0 📥0
   ✅ [_10] Downloaded PDF (1831 KB)
   🤖 [_10] Processing PDF file...
   🔍 [_11] Trying PRIMARY zone detection...
   ✅ [_11] Primary method succeeded
   ✅ [_11] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 60/384 (15.6%) - ✅0 ❌60 ⚠️0 📥0

🔄 [_11] Processing: Jennifer Valmoja Guzman  (Row 80)
   📥 [_11] Downloading from: https://erpbackendpro.maids.cc/public/download/40a...
   ✅ [_11] Downloaded PDF (1869 KB)
   🤖 [_11] Processing PDF file...
   ❌ [_17] Backup 1 failed. Trying Backup Method 2...
   ✅ [_17] Backup Method 2 succeeded
   ✅ [_17] SI



   🔍 [0_3] Trying PRIMARY zone detection...
   ❌ [0_3] Primary method failed. Trying Backup Method 1...
   🔍 [0_6] Trying PRIMARY zone detection...
   ✅ [0_6] Primary method succeeded
   ✅ [0_6] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_6] Processing: Aiza Pagatpatan Rosendo  (Row 82)
   📥 [0_6] Downloading from: https://erpbackendpro.maids.cc/public/download/ef3...
📊 Progress: 62/384 (16.1%) - ✅0 ❌62 ⚠️0 📥0
   ✅ [0_6] Downloaded PDF (1856 KB)
   🤖 [0_6] Processing PDF file...




   🔍 [0_0] Trying PRIMARY zone detection...
   ❌ [0_0] Primary method failed. Trying Backup Method 1...




   🔍 [0_7] Trying PRIMARY zone detection...
   ❌ [0_7] Primary method failed. Trying Backup Method 1...
   ❌ [_13] Backup 1 failed. Trying Backup Method 2...
   ❌ [_16] Backup 1 failed. Trying Backup Method 2...
   ✅ [_13] Backup Method 2 succeeded
   ✅ [_13] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_13] Processing: Monaliza Villas Geroche (Row 83)
   📥 [_13] Downloading from: https://erpbackendpro.maids.cc/public/download/7ff...
📊 Progress: 63/384 (16.4%) - ✅0 ❌63 ⚠️0 📥0
   ✅ [_16] Backup Method 2 succeeded
   ✅ [_16] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_16] Processing: Aida Paler Samijon  (Row 84)
   📥 [_16] Downloading from: https://erpbackendpro.maids.cc/public/download/99d...
📊 Progress: 64/384 (16.7%) - ✅0 ❌64 ⚠️0 📥0
   ✅ [_13] Downloaded PDF (1880 KB)
   🤖 [_13] Processing PDF file...
   ✅ [_16] Downloaded PDF (1843 KB)
   🤖 [_16] Processing PDF file...
   🔍 [0_4] Trying PRIMARY zone detection...
   🔍 [_15] Trying PRIMARY zone detection...
   ✅



   ✅ [_17] Primary method succeeded
   ✅ [_17] SIGNATURE (conf: 0.00) via None - DataFrame updated
   🔍 [0_6] Trying PRIMARY zone detection...
   ❌ [0_6] Primary method failed. Trying Backup Method 1...

🔄 [_17] Processing: Analyn Divinigracia Cruz  (Row 101)
   📥 [_17] Downloading from: https://erpbackendpro.maids.cc/public/download/a60...
📊 Progress: 81/384 (21.1%) - ✅0 ❌81 ⚠️0 📥0
   ✅ [_17] Downloaded PDF (1813 KB)
   🤖 [_17] Processing PDF file...




   🔍 [_16] Trying PRIMARY zone detection...
   ❌ [_16] Primary method failed. Trying Backup Method 1...




   🔍 [_13] Trying PRIMARY zone detection...
   ❌ [_13] Primary method failed. Trying Backup Method 1...
   🔍 [0_4] Trying PRIMARY zone detection...
   ✅ [0_4] Primary method succeeded
   ✅ [0_4] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_4] Processing: Elsa Apiasa Macaraeg (Row 102)
   📥 [0_4] Downloading from: https://erpbackendpro.maids.cc/public/download/52c...
📊 Progress: 82/384 (21.4%) - ✅0 ❌82 ⚠️0 📥0
   🔍 [_15] Trying PRIMARY zone detection...
   ✅ [0_4] Downloaded PDF (1866 KB)
   🤖 [0_4] Processing PDF file...
   ✅ [_15] Primary method succeeded
   ✅ [_15] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 83/384 (21.6%) - ✅0 ❌83 ⚠️0 📥0

🔄 [_15] Processing: Shiela Serafin Bornales (Row 103)
   📥 [_15] Downloading from: https://erpbackendpro.maids.cc/public/download/5f2...
   ✅ [_15] Downloaded PDF (1856 KB)
   🤖 [_15] Processing PDF file...
   ❌ [0_6] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_6] Backup Method 2 succeeded
   ✅ [0_6] SIGNATURE



   🔍 [_19] Trying PRIMARY zone detection...
   ❌ [_19] Primary method failed. Trying Backup Method 1...
   🔍 [0_3] Trying PRIMARY zone detection...
   ✅ [0_3] Primary method succeeded
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_3] Processing: Emma Delos Santos Bello  (Row 105)
   📥 [0_3] Downloading from: https://erpbackendpro.maids.cc/public/download/480...
📊 Progress: 85/384 (22.1%) - ✅0 ❌85 ⚠️0 📥0
   ✅ [0_3] Downloaded PDF (1870 KB)
   🤖 [0_3] Processing PDF file...




   🔍 [_12] Trying PRIMARY zone detection...
   ❌ [_12] Primary method failed. Trying Backup Method 1...




   🔍 [0_2] Trying PRIMARY zone detection...
   ❌ [0_2] Primary method failed. Trying Backup Method 1...




   🔍 [_14] Trying PRIMARY zone detection...
   ❌ [_14] Primary method failed. Trying Backup Method 1...
   ❌ [_16] Backup 1 failed. Trying Backup Method 2...
   ✅ [_16] Backup Method 2 succeeded
   ✅ [_16] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_16] Processing: Janet Layug Magno (Row 106)
   📥 [_16] Downloading from: https://erpbackendpro.maids.cc/public/download/0e3...
📊 Progress: 86/384 (22.4%) - ✅0 ❌86 ⚠️0 📥0
   ❌ [_13] Backup 1 failed. Trying Backup Method 2...
   ✅ [_13] Backup Method 2 succeeded
   ✅ [_13] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 87/384 (22.7%) - ✅0 ❌87 ⚠️0 📥0

🔄 [_13] Processing: Suzette Dela Pena Sibuco (Row 107)
   📥 [_13] Downloading from: https://erpbackendpro.maids.cc/public/download/451...
   ✅ [_16] Downloaded PDF (1777 KB)
   🤖 [_16] Processing PDF file...
   ✅ [_13] Downloaded PDF (1890 KB)
   🤖 [_13] Processing PDF file...
   ❌ [_19] Backup 1 failed. Trying Backup Method 2...
   ✅ [_19] Backup Method 2 succeeded




   ❌ [_12] Backup 1 failed. Trying Backup Method 2...

   🔍 [_11] Trying PRIMARY zone detection...
   ❌ [_17] Primary method failed. Trying Backup Method 1...
   ✅ [0_9] Downloaded PDF (1848 KB)
   🤖 [0_9] Processing PDF file...
   ✅ [0_0] Primary method succeeded
   ✅ [0_0] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_0] Processing: Maggie Melgar Dumato (Row 115)
   📥 [0_0] Downloading from: https://erpbackendpro.maids.cc/public/download/9ef...
📊 Progress: 95/384 (24.7%) - ✅0 ❌95 ⚠️0 📥0
   ✅ [_10] Primary method succeeded
   ✅ [_10] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_10] Processing: Marilou Torres De Queros (Row 116)
   📥 [_10] Downloading from: https://erpbackendpro.maids.cc/public/download/110...
📊 Progress: 96/384 (25.0%) - ✅0 ❌96 ⚠️0 📥0
   ✅ [_11] Primary method succeeded
   ✅ [_11] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_11] Processing: Vergilyn Roxas Alaurin (Row 117)
   📥 [_11] Downloading from: https://erpbackendpro.maids.cc/



   🔍 [_16] Trying PRIMARY zone detection...
   ❌ [_16] Primary method failed. Trying Backup Method 1...




   🔍 [_13] Trying PRIMARY zone detection...
   ❌ [_13] Primary method failed. Trying Backup Method 1...
   🔍 [_19] Trying PRIMARY zone detection...
   ✅ [_19] Primary method succeeded
   ✅ [_19] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_19] Processing: Doris Kimayong Dawin (Row 126)
   📥 [_19] Downloading from: https://erpbackendpro.maids.cc/public/download/2ba...
📊 Progress: 106/384 (27.6%) - ✅0 ❌106 ⚠️0 📥0
   ✅ [_19] Downloaded PDF (1802 KB)
   🤖 [_19] Processing PDF file...
   🔍 [0_1] Trying PRIMARY zone detection...
   ✅ [0_1] Primary method succeeded
   ✅ [0_1] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 107/384 (27.9%) - ✅0 ❌107 ⚠️0 📥0

🔄 [0_1] Processing: Eva Caballero Diaz (Row 127)
   📥 [0_1] Downloading from: https://erpbackendpro.maids.cc/public/download/bcb...
   ✅ [0_1] Downloaded PDF (1755 KB)
   🤖 [0_1] Processing PDF file...




   🔍 [0_8] Trying PRIMARY zone detection...
   ❌ [0_8] Primary method failed. Trying Backup Method 1...
   🔍 [_18] Trying PRIMARY zone detection...
   ✅ [_18] Primary method succeeded
   ✅ [_18] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_18] Processing: Rowena Labitoria Manuel  (Row 128)
   📥 [_18] Downloading from: https://erpbackendpro.maids.cc/public/download/33e...
📊 Progress: 108/384 (28.1%) - ✅0 ❌108 ⚠️0 📥0
   ✅ [_18] Downloaded PDF (1855 KB)
   🤖 [_18] Processing PDF file...
   🔍 [0_7] Trying PRIMARY zone detection...
   🔍 [0_9] Trying PRIMARY zone detection...
   ✅ [0_9] Primary method succeeded
   ✅ [0_9] SIGNATURE (conf: 0.00) via None - DataFrame updated
   ✅ [0_7] Primary method succeeded

🔄 [0_9] Processing: Almira Lacre Vistal (Row 129)
   📥 [0_9] Downloading from: https://erpbackendpro.maids.cc/public/download/844...
📊 Progress: 109/384 (28.4%) - ✅0 ❌109 ⚠️0 📥0
   ✅ [0_7] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_7] Processing: Marites Japso



   🔍 [0_5] Trying PRIMARY zone detection...
   ❌ [0_5] Primary method failed. Trying Backup Method 1...
   ✅ [0_9] Downloaded PDF (1841 KB)
   🤖 [0_9] Processing PDF file...
   ❌ [_16] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_7] Downloaded PDF (1874 KB)
   🤖 [0_7] Processing PDF file...
   ❌ [_13] Backup 1 failed. Trying Backup Method 2...
   ✅ [_16] Backup Method 2 succeeded
   ✅ [_16] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_16] Processing: Leonila Dagos Tapia (Row 131)
   📥 [_16] Downloading from: https://erpbackendpro.maids.cc/public/download/bd0...
📊 Progress: 111/384 (28.9%) - ✅0 ❌111 ⚠️0 📥0
   ✅ [_13] Backup Method 2 succeeded
   ✅ [_13] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_13] Processing: Janeth Matamog Puno (Row 132)
   📥 [_13] Downloading from: https://erpbackendpro.maids.cc/public/download/d45...
📊 Progress: 112/384 (29.2%) - ✅0 ❌112 ⚠️0 📥0




   🔍 [_10] Trying PRIMARY zone detection...
   ❌ [_10] Primary method failed. Trying Backup Method 1...
   ✅ [_16] Downloaded PDF (1863 KB)
   🤖 [_16] Processing PDF file...
   🔍 [_12] Trying PRIMARY zone detection...   🔍 [_11] Trying PRIMARY zone detection...
   ✅ [_13] Downloaded PDF (1858 KB)
   🤖 [_13] Processing PDF file...





   🔍 [_15] Trying PRIMARY zone detection...
   🔍 [0_0] Trying PRIMARY zone detection...
   ✅ [_12] Primary method succeeded
   🔍 [0_2] Trying PRIMARY zone detection...
   ❌ [0_2] Primary method failed. Trying Backup Method 1...
   ✅ [_12] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_12] Processing: Jene Navidad Lambrento (Row 133)
   📥 [_12] Downloading from: https://erpbackendpro.maids.cc/public/download/c0a...
📊 Progress: 113/384 (29.4%) - ✅0 ❌113 ⚠️0 📥0
   ✅ [_11] Primary method succeeded
   ✅ [_11] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_11] Processing: Louwanie Indoyon Budiongan (Row 134)
   📥 [_11] Downloading from: https://erpbackendpro.maids.cc/public/download/f77...
📊 Progress: 114/384 (29.7%) - ✅0 ❌114 ⚠️0 📥0
   ✅ [_15] Primary method succeeded
   ✅ [0_0] Primary method succeeded
   ✅ [_15] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 115/384 (29.9%) - ✅0 ❌115 ⚠️0 📥0

🔄 [_15] Processing: Geraldine Sumagaysay Predonio (Row 135)
 



   🔍 [_14] Trying PRIMARY zone detection...
   ❌ [_14] Primary method failed. Trying Backup Method 1...
   ✅ [_12] Downloaded PDF (1790 KB)
   🤖 [_12] Processing PDF file...
   ✅ [_11] Downloaded PDF (1809 KB)
   🤖 [_11] Processing PDF file...
   ✅ [_15] Downloaded PDF (1905 KB)
   🤖 [_15] Processing PDF file...




   🔍 [0_4] Trying PRIMARY zone detection...
   ❌ [0_4] Primary method failed. Trying Backup Method 1...
   ✅ [0_0] Downloaded PDF (1838 KB)
   🤖 [0_0] Processing PDF file...
   ❌ [0_8] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_8] Backup Method 2 succeeded
   ✅ [0_8] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_8] Processing: Suraya Pandulo Bedsok (Row 137)
   📥 [0_8] Downloading from: https://erpbackendpro.maids.cc/public/download/f91...
📊 Progress: 117/384 (30.5%) - ✅0 ❌117 ⚠️0 📥0
   ✅ [0_8] Downloaded PDF (1810 KB)
   🤖 [0_8] Processing PDF file...
   🔍 [_17] Trying PRIMARY zone detection...
   ✅ [_17] Primary method succeeded
   ✅ [_17] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_17] Processing: Edna Baladhay (Row 138)
   📥 [_17] Downloading from: https://erpbackendpro.maids.cc/public/download/044...
📊 Progress: 118/384 (30.7%) - ✅0 ❌118 ⚠️0 📥0
   🔍 [0_3] Trying PRIMARY zone detection...
   ❌ [0_5] Backup 1 failed. Trying Backup Method 2...
   ✅ [_



   🔍 [_12] Trying PRIMARY zone detection...
   ❌ [_12] Primary method failed. Trying Backup Method 1...
   ✅ [_13] Downloaded PDF (1812 KB)
   🤖 [_13] Processing PDF file...




   🔍 [_16] Trying PRIMARY zone detection...   🔍 [_11] Trying PRIMARY zone detection...

   ❌ [_11] Primary method failed. Trying Backup Method 1...
   ✅ [_16] Primary method succeeded
   ✅ [_16] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_16] Processing: Maria Lenie Oberes Dizon (Row 152)
   📥 [_16] Downloading from: https://erpbackendpro.maids.cc/public/download/bff...
📊 Progress: 132/384 (34.4%) - ✅0 ❌132 ⚠️0 📥0
   ✅ [_16] Downloaded PDF (1855 KB)
   🤖 [_16] Processing PDF file...
   🔍 [0_8] Trying PRIMARY zone detection...
   ✅ [0_8] Primary method succeeded
   ✅ [0_8] SIGNATURE (conf: 0.00) via None - DataFrame updated
   🔍 [0_0] Trying PRIMARY zone detection...

🔄 [0_8] Processing: Jevilma Cabrera Jabonete  (Row 153)
   📥 [0_8] Downloading from: https://erpbackendpro.maids.cc/public/download/de5...
📊 Progress: 133/384 (34.6%) - ✅0 ❌133 ⚠️0 📥0
   ✅ [0_0] Primary method succeeded
   ✅ [0_0] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_0] Processing: Harleen



   🔍 [0_6] Trying PRIMARY zone detection...
   ❌ [0_6] Primary method failed. Trying Backup Method 1...
   🔍 [0_5] Trying PRIMARY zone detection...
   ✅ [0_5] Primary method succeeded
   ✅ [0_5] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_5] Processing: Jovie Aporbo Balbarino (Row 156)
   📥 [0_5] Downloading from: https://erpbackendpro.maids.cc/public/download/ca3...
📊 Progress: 136/384 (35.4%) - ✅0 ❌136 ⚠️0 📥0




   🔍 [0_3] Trying PRIMARY zone detection...
   ❌ [0_3] Primary method failed. Trying Backup Method 1...
   🔍 [_17] Trying PRIMARY zone detection...
   ✅ [_17] Primary method succeeded
   ✅ [_17] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_17] Processing: Diana Mmela Limo  (Row 157)
   📥 [_17] Downloading from: https://erpbackendpro.maids.cc/public/download/01b...
📊 Progress: 137/384 (35.7%) - ✅0 ❌137 ⚠️0 📥0
   ✅ [0_5] Downloaded PDF (1809 KB)
   🤖 [0_5] Processing PDF file...
   ✅ [_17] Downloaded PDF (1836 KB)
   🤖 [_17] Processing PDF file...
   🔍 [_10] Trying PRIMARY zone detection...
   ❌ [_12] Backup 1 failed. Trying Backup Method 2...
   ✅ [_10] Primary method succeeded
   ✅ [_10] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_10] Processing: Melyn Conzon Umpacan (Row 158)
   📥 [_10] Downloading from: https://erpbackendpro.maids.cc/public/download/75d...
📊 Progress: 138/384 (35.9%) - ✅0 ❌138 ⚠️0 📥0
   ✅ [_12] Backup Method 2 succeeded
   ✅ [_12] SIGNATURE (



   🔍 [0_2] Trying PRIMARY zone detection...
   🔍 [_19] Trying PRIMARY zone detection...
   ❌ [0_2] Primary method failed. Trying Backup Method 1...
   ❌ [_19] Primary method failed. Trying Backup Method 1...
   ✅ [_11] Backup Method 2 succeeded
   ✅ [_11] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_11] Processing: Maria Fe Alegasin Doromal (Row 160)
   📥 [_11] Downloading from: https://erpbackendpro.maids.cc/public/download/70f...
📊 Progress: 140/384 (36.5%) - ✅0 ❌140 ⚠️0 📥0
   ✅ [_12] Downloaded PDF (1786 KB)
   🤖 [_12] Processing PDF file...
   🔍 [_14] Trying PRIMARY zone detection...
   ✅ [_14] Primary method succeeded
   ✅ [_14] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 141/384 (36.7%) - ✅0 ❌141 ⚠️0 📥0

🔄 [_14] Processing: Elma Aplasca Sagon (Row 161)
   📥 [_14] Downloading from: https://erpbackendpro.maids.cc/public/download/846...




   ✅ [_11] Downloaded PDF (1798 KB)
   🤖 [_11] Processing PDF file...
   🔍 [0_4] Trying PRIMARY zone detection...
   ❌ [0_4] Primary method failed. Trying Backup Method 1...
   ✅ [_14] Downloaded PDF (1832 KB)
   🤖 [_14] Processing PDF file...
   ❌ [0_3] Backup 1 failed. Trying Backup Method 2...
   🔍 [0_1] Trying PRIMARY zone detection...
   ✅ [0_3] Backup Method 2 succeeded
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame updated
   ✅ [0_1] Primary method succeeded
   ✅ [0_1] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_3] Processing: Brendah Banderi Khatsenzi (Row 162)
   📥 [0_3] Downloading from: https://erpbackendpro.maids.cc/public/download/d29...

🔄 [0_1] Processing: Jaqualyn Lerion Garcia (Row 163)
   📥 [0_1] Downloading from: https://erpbackendpro.maids.cc/public/download/7e6...
📊 Progress: 142/384 (37.0%) - ✅0 ❌143 ⚠️0 📥0
📊 Progress: 143/384 (37.2%) - ✅0 ❌143 ⚠️0 📥0
   ✅ [0_3] Downloaded PDF (1871 KB)
   🤖 [0_3] Processing PDF file...
   ✅ [0_1] Downloaded 



   🔍 [_18] Trying PRIMARY zone detection...   ❌ [0_6] Backup 1 failed. Trying Backup Method 2...

   ❌ [_18] Primary method failed. Trying Backup Method 1...
   ✅ [0_6] Backup Method 2 succeeded
   ✅ [0_6] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 144/384 (37.5%) - ✅0 ❌144 ⚠️0 📥0

🔄 [0_6] Processing: Leonita Frias Magtoto  (Row 164)
   📥 [0_6] Downloading from: https://erpbackendpro.maids.cc/public/download/10c...
   ✅ [0_6] Downloaded PDF (1775 KB)
   🤖 [0_6] Processing PDF file...
   🔍 [0_9] Trying PRIMARY zone detection...
   🔍 [0_7] Trying PRIMARY zone detection...
   ✅ [0_9] Primary method succeeded
   ✅ [0_7] Primary method succeeded
   ✅ [0_9] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_9] Processing: Maria Coro Rivera (Row 165)
   📥 [0_9] Downloading from: https://erpbackendpro.maids.cc/public/download/116...
📊 Progress: 145/384 (37.8%) - ✅0 ❌145 ⚠️0 📥0
   ✅ [0_7] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_7] Processing: Nelly 



   ✅ [0_2] Downloaded PDF (1822 KB)
   🤖 [0_2] Processing PDF file...
   🔍 [0_3] Trying PRIMARY zone detection...
   ❌ [0_3] Primary method failed. Trying Backup Method 1...
   🔍 [0_1] Trying PRIMARY zone detection...
   ✅ [0_1] Primary method succeeded
   ✅ [0_1] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 164/384 (42.7%) - ✅0 ❌164 ⚠️0 📥0

🔄 [0_1] Processing: EVER JOHN MALISH ANDREW  (Row 184)
   📥 [0_1] Downloading from: https://erpbackendpro.maids.cc/public/download/2c8...
   ✅ [0_1] Downloaded PDF (1887 KB)
   🤖 [0_1] Processing PDF file...
   🔍 [0_6] Trying PRIMARY zone detection...
   ✅ [0_6] Primary method succeeded
   ✅ [0_6] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 165/384 (43.0%) - ✅0 ❌165 ⚠️0 📥0

🔄 [0_6] Processing: JESSYLYN LORETO SAN PEDRO (Row 185)
   📥 [0_6] Downloading from: https://erpbackendpro.maids.cc/public/download/2ae...
   ✅ [0_6] Downloaded PDF (1831 KB)
   🤖 [0_6] Processing PDF file...
   🔍 [0_9] Trying PRIMARY zone 



   ✅ [0_2] SIGNATURE (conf: 0.00) via None - DataFrame updated
   🔍 [0_7] Trying PRIMARY zone detection...
   🔍 [0_8] Trying PRIMARY zone detection...

🔄 [0_2] Processing: EVA CABALTEA ENCINARES (Row 194)
   📥 [0_2] Downloading from: https://erpbackendpro.maids.cc/public/download/d51...
   ❌ [0_8] Primary method failed. Trying Backup Method 1...
📊 Progress: 174/384 (45.3%) - ✅0 ❌174 ⚠️0 📥0
   ✅ [0_7] Primary method succeeded
   ✅ [0_7] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 175/384 (45.6%) - ✅0 ❌175 ⚠️0 📥0

🔄 [0_7] Processing: MARILOU MABILEN DABLIO  (Row 195)
   📥 [0_7] Downloading from: https://erpbackendpro.maids.cc/public/download/6cd...
   ✅ [0_2] Downloaded PDF (1808 KB)
   🤖 [0_2] Processing PDF file...
   🔍 [_18] Trying PRIMARY zone detection...
   🔍 [_15] Trying PRIMARY zone detection...
   🔍 [0_0] Trying PRIMARY zone detection...




   ❌ [_18] Primary method failed. Trying Backup Method 1...
   ✅ [0_7] Downloaded PDF (1763 KB)
   🤖 [0_7] Processing PDF file...
   🔍 [0_6] Trying PRIMARY zone detection...




   🔍 [0_5] Trying PRIMARY zone detection...
   ❌ [0_5] Primary method failed. Trying Backup Method 1...
   ✅ [_15] Primary method succeeded
   ✅ [0_0] Primary method succeeded
   ✅ [0_0] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_0] Processing: IRENE MUKAMI NDEGWA  (Row 196)
   📥 [0_0] Downloading from: https://erpbackendpro.maids.cc/public/download/1d1...
📊 Progress: 176/384 (45.8%) - ✅0 ❌176 ⚠️0 📥0
   ✅ [_15] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_15] Processing: ELMA DELA CRUZ SIN (Row 197)
   📥 [_15] Downloading from: https://erpbackendpro.maids.cc/public/download/0eb...
📊 Progress: 177/384 (46.1%) - ✅0 ❌177 ⚠️0 📥0
   ✅ [0_6] Primary method succeeded
   ✅ [0_6] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 178/384 (46.4%) - ✅0 ❌178 ⚠️0 📥0

🔄 [0_6] Processing: MAGDALENA ENRIQUEZ QUESADA (Row 198)
   📥 [0_6] Downloading from: https://erpbackendpro.maids.cc/public/download/c3c...
   🔍 [0_1] Trying PRIMARY zone detection...
   ✅ [0_1] 



   🔍 [0_3] Trying PRIMARY zone detection...
   🔍 [_13] Trying PRIMARY zone detection...
   ❌ [0_3] Primary method failed. Trying Backup Method 1...
   ✅ [_13] Primary method succeeded
   ✅ [_13] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_13] Processing: GRACE SAN BUENAVENTURA MAGDAMIT (Row 249)
   📥 [_13] Downloading from: https://erpbackendpro.maids.cc/public/download/faa...
📊 Progress: 229/384 (59.6%) - ✅0 ❌229 ⚠️0 📥0
   🔍 [_11] Trying PRIMARY zone detection...
   ✅ [_11] Primary method succeeded
   ✅ [_11] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 230/384 (59.9%) - ✅0 ❌230 ⚠️0 📥0

🔄 [_11] Processing: DEVE JENE CONTANG GARCIA (Row 250)
   📥 [_11] Downloading from: https://erpbackendpro.maids.cc/public/download/226...
   ✅ [_13] Downloaded PDF (1811 KB)
   🤖 [_13] Processing PDF file...
   🔍 [_16] Trying PRIMARY zone detection...
   🔍 [_18] Trying PRIMARY zone detection...
   ✅ [_18] Primary method succeeded
   ✅ [_18] SIGNATURE (conf: 0.00) via Non



   🔍 [0_1] Trying PRIMARY zone detection...
   ❌ [0_1] Primary method failed. Trying Backup Method 1...
   ✅ [0_2] Primary method succeeded
   ✅ [0_2] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_2] Processing: Zaheda Abdul Rahuf Shaikh Rahmatullah (Row 353)
   📥 [0_2] Downloading from: https://erpbackendpro.maids.cc/public/download/62d...
📊 Progress: 333/384 (86.7%) - ✅0 ❌333 ⚠️0 📥0
   ✅ [_16] Primary method succeeded
   ✅ [_16] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_16] Processing: Madanag Enora Panganta  (Row 354)
   📥 [_16] Downloading from: https://erpbackendpro.maids.cc/public/download/efd...
📊 Progress: 334/384 (87.0%) - ✅0 ❌334 ⚠️0 📥0
   🔍 [0_3] Trying PRIMARY zone detection...
   ✅ [_16] Downloaded PDF (1760 KB)
   🤖 [_16] Processing PDF file...
   🔍 [0_6] Trying PRIMARY zone detection...
   ✅ [0_2] Downloaded PDF (1783 KB)
   🤖 [0_2] Processing PDF file...
   ✅ [0_3] Primary method succeeded
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame



   ✅ [_10] Downloaded PDF (1763 KB)
   🤖 [_10] Processing PDF file...
   ❌ [_14] Primary method failed. Trying Backup Method 1...
   ✅ [_11] Downloaded PDF (1795 KB)
   🤖 [_11] Processing PDF file...
   🔍 [0_5] Trying PRIMARY zone detection...   ✅ [_19] Downloaded PDF (1780 KB)
   🤖 [_19] Processing PDF file...

   ✅ [0_1] Downloaded PDF (2297 KB)
   🤖 [0_1] Processing PDF file...
   ✅ [0_5] Primary method succeeded
   ✅ [0_5] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_5] Processing: Veronica Wangari Machua  (Row 367)
   📥 [0_5] Downloading from: https://erpbackendpro.maids.cc/public/download/fec...
📊 Progress: 347/384 (90.4%) - ✅0 ❌347 ⚠️0 📥0
   ✅ [0_5] Downloaded PDF (2305 KB)
   🤖 [0_5] Processing PDF file...
   🔍 [_13] Trying PRIMARY zone detection...
   ✅ [_13] Primary method succeeded
   ✅ [_13] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_13] Processing: Mailyn Ambaras Eton (Row 368)📊 Progress: 348/384 (90.6%) - ✅0 ❌348 ⚠️0 📥0

   📥 [_13] Downloading f



   ✅ [_16] Primary method succeeded
   ✅ [_16] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_16] Processing: Minda Abalos Juliano (Row 372)
   📥 [_16] Downloading from: https://erpbackendpro.maids.cc/public/download/0d5...
📊 Progress: 352/384 (91.7%) - ✅0 ❌352 ⚠️0 📥0
   🔍 [0_3] Trying PRIMARY zone detection...
   ❌ [0_3] Primary method failed. Trying Backup Method 1...
   ✅ [_14] Backup Method 2 succeeded
   ✅ [_14] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_14] Processing: Jonalyn Pagatpat Rosas (Row 373)
   📥 [_14] Downloading from: https://erpbackendpro.maids.cc/public/download/de3...
📊 Progress: 353/384 (91.9%) - ✅0 ❌353 ⚠️0 📥0
   🔍 [0_2] Trying PRIMARY zone detection...
   ✅ [0_2] Primary method succeeded
   ✅ [0_2] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_2] Processing: Sharina Perez Fabila (Row 374)
   📥 [0_2] Downloading from: https://erpbackendpro.maids.cc/public/download/555...
📊 Progress: 354/384 (92.2%) - ✅0 ❌354 ⚠️0 📥0
   🔍 [0_6] T



   🔍 [_17] Trying PRIMARY zone detection...
   🔍 [0_4] Trying PRIMARY zone detection...
   ❌ [0_4] Primary method failed. Trying Backup Method 1...
   ✅ [0_6] Downloaded PDF (2276 KB)
   🤖 [0_6] Processing PDF file...
   ✅ [_17] Primary method succeeded
   ✅ [_17] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [_17] Processing: Irene Mulowooza (Row 378)
   📥 [_17] Downloading from: https://erpbackendpro.maids.cc/public/download/8d5...
📊 Progress: 358/384 (93.2%) - ✅0 ❌358 ⚠️0 📥0
   ✅ [0_0] Downloaded PDF (1759 KB)
   🤖 [0_0] Processing PDF file...
   ✅ [_15] Downloaded PDF (1739 KB)
   🤖 [_15] Processing PDF file...
   ❌ [0_3] Backup 1 failed. Trying Backup Method 2...
   ✅ [0_3] Backup Method 2 succeeded
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame updated

🔄 [0_3] Processing: Shaquira Muhenje (Row 379)
   📥 [0_3] Downloading from: https://erpbackendpro.maids.cc/public/download/6a3...
📊 Progress: 359/384 (93.5%) - ✅0 ❌359 ⚠️0 📥0
   ✅ [_17] Downloaded PDF (1727 KB)
  



   🔍 [_17] Trying PRIMARY zone detection...
   ❌ [_17] Primary method failed. Trying Backup Method 1...
   🔍 [_16] Trying PRIMARY zone detection...




   🔍 [0_2] Trying PRIMARY zone detection...
   ❌ [0_2] Primary method failed. Trying Backup Method 1...
   ✅ [_16] Primary method succeeded
   ✅ [_16] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 372/384 (96.9%) - ✅0 ❌372 ⚠️0 📥0
   🔍 [_14] Trying PRIMARY zone detection...
   ✅ [_14] Primary method succeeded
   ✅ [_14] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 373/384 (97.1%) - ✅0 ❌373 ⚠️0 📥0
   🔍 [0_0] Trying PRIMARY zone detection...
   ✅ [0_0] Primary method succeeded
   ✅ [0_0] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 374/384 (97.4%) - ✅0 ❌374 ⚠️0 📥0




   🔍 [_15] Trying PRIMARY zone detection...
   ❌ [_15] Primary method failed. Trying Backup Method 1...
   🔍 [0_6] Trying PRIMARY zone detection...
   ✅ [0_6] Primary method succeeded
   ✅ [0_6] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 375/384 (97.7%) - ✅0 ❌375 ⚠️0 📥0
   ❌ [_17] Backup 1 failed. Trying Backup Method 2...
   ❌ [0_2] Backup 1 failed. Trying Backup Method 2...




   ✅ [_17] Backup Method 2 succeeded
   ✅ [_17] SIGNATURE (conf: 0.00) via None - DataFrame updated
   ✅ [0_2] Backup Method 2 succeeded
📊 Progress: 376/384 (97.9%) - ✅0 ❌376 ⚠️0 📥0
   ✅ [0_2] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 377/384 (98.2%) - ✅0 ❌377 ⚠️0 📥0
   🔍 [0_1] Trying PRIMARY zone detection...
   ❌ [0_1] Primary method failed. Trying Backup Method 1...
   🔍 [0_3] Trying PRIMARY zone detection...
   ✅ [0_3] Primary method succeeded
   ✅ [0_3] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 378/384 (98.4%) - ✅0 ❌378 ⚠️0 📥0
   🔍 [_12] Trying PRIMARY zone detection...
   ✅ [_12] Primary method succeeded
   ✅ [_12] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 379/384 (98.7%) - ✅0 ❌379 ⚠️0 📥0
   ❌ [_15] Backup 1 failed. Trying Backup Method 2...
   ✅ [_15] Backup Method 2 succeeded
   ✅ [_15] SIGNATURE (conf: 0.00) via None - DataFrame updated
📊 Progress: 380/384 (99.0%) - ✅0 ❌380 ⚠️0 📥0
   ❌ [0_1] Backup 1 failed. Try

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!/usr/bin/env python3
"""
Enhanced Google Sheets Batch Contract Processor
Now tracks which signature zone detection method was used
"""

import os
import time
import tempfile
import requests
import json
import traceback
import re
from typing import Dict, List, Optional, Tuple
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import threading

# Google Sheets dependencies
%pip install --upgrade gspread google-auth google-auth-oauthlib pandas --quiet

from google.colab import auth
import gspread
from google.auth import default
import pandas as pd

# =============================================================================
# CONFIGURATION SECTION - MODIFY THESE VALUES
# =============================================================================

# Your Google Sheet URL (the spreadsheet id is the path segment that follows "/d/")
GOOGLE_SHEET_URL = "https://docs.google.com/spreadsheets/d/1E2Cvtv4ZbIBjY4oyorrJZVwgSkpAeeFk2x_X4B9EEXY/edit?gid=1652789691#gid=1652789691"

# Processing Configuration
# MAX_CONCURRENT_WORKERS is passed to the processor as its thread-pool size.
MAX_CONCURRENT_WORKERS = 40        # Number of parallel workers (⚠️ HIGH CONCURRENCY MODE)
SHEET_TAB_NAME = "Initial"           # Name of the sheet tab to process
START_ROW = 2                       # Row to start processing (1=header, 2=first data row)
MAX_ROWS_TO_PROCESS = None          # Maximum rows to process (None = all rows)
DELAY_BETWEEN_BATCHES = 0.5         # Delay between batches in seconds (reduced for high concurrency)

# Column names in your sheet (modify if different)
MAID_ID_COLUMN = "Maid ID"
MAID_NAME_COLUMN = "Maid Name"
DOCUMENT_LINK_COLUMN = "Link to Document"
CLASSIFICATION_COLUMN = "Classification"
CONFIDENCE_COLUMN = "Confidence"
DETECTION_METHOD_COLUMN = "Detection Method"  # NEW: Track detection method used

# Processing Settings
DOWNLOAD_TIMEOUT = 30               # Timeout for downloading files
MAX_RETRIES = 2                     # Number of retries for failed downloads
VERBOSE_LOGGING = True              # Enable detailed logging

# Echo the active settings. The sheet line prints only the first 20 characters
# of the id found between "/d/" and the next "/"; a URL without "/d/" makes
# that line raise IndexError.
print("🔧 Configuration loaded:")
print(f"   📊 Sheet: {GOOGLE_SHEET_URL.split('/d/')[1].split('/')[0][:20]}...")
print(f"   👥 Workers: {MAX_CONCURRENT_WORKERS}")
print(f"   📋 Tab: {SHEET_TAB_NAME}")
print(f"   🎯 Start row: {START_ROW}")
print(f"   📏 Max rows: {MAX_ROWS_TO_PROCESS if MAX_ROWS_TO_PROCESS else 'All'}")

# =============================================================================
# ENHANCED BATCH PROCESSOR CLASS WITH DETECTION METHOD TRACKING
# =============================================================================

class GoogleSheetsBatchProcessor:
    """Enhanced batch processor for Google Sheets contract analysis with detection method tracking"""

    def __init__(self, api_key: str, max_workers: int = 3):
        """Store the settings, build the zone classifier and authenticate.

        Args:
            api_key: Gemini API key, forwarded to ``GeminiZoneClassifier``.
            max_workers: Thread-pool size used by ``process_google_sheet``.
        """
        self.api_key = api_key
        self.max_workers = max_workers
        self.classifier = GeminiZoneClassifier(api_key=api_key, verbose=VERBOSE_LOGGING)

        # Running totals; worker threads change them while holding ``self.lock``.
        self.processed_count = self.success_count = self.error_count = 0
        self.no_link_count = self.download_fail_count = 0

        # One tally per zone-detection outcome ("None" = no method found a zone).
        self.method_counts = dict.fromkeys(
            ("Primary", "Backup Method 1", "Backup Method 2", "None"), 0
        )
        self.lock = Lock()

        # Authenticate last so the object is fully populated beforehand.
        self.setup_google_sheets()

    def setup_google_sheets(self):
        """Authenticate the Colab user and store a gspread client in ``self.gc``.

        Any exception from the authentication calls is reported and re-raised.
        """
        try:
            print("🔐 Authenticating with Google Sheets...")
            auth.authenticate_user()
            credentials = default()[0]
            self.gc = gspread.authorize(credentials)
            print("✅ Google Sheets authentication successful")
        except Exception as auth_error:
            print(f"❌ Google Sheets authentication failed: {auth_error}")
            raise

    def extract_sheet_id(self, url: str) -> str:
        """Extract sheet ID from Google Sheets URL"""
        if "/d/" in url:
            return url.split("/d/")[1].split("/")[0]
        else:
            raise ValueError("Invalid Google Sheets URL format")

    def download_contract(self, url: str, base_filename: str) -> Optional[Tuple[str, str]]:
        """Download a contract into the temp directory and identify its format.

        The body is streamed to ``<tmpdir>/<base_filename>.tmp``. The format is
        taken from the file header, falling back to the Content-Type response
        header, and the file is then renamed to ``<...>.tmp.<format>``.

        Network-level failures (``requests.RequestException``) are retried up to
        ``MAX_RETRIES`` extra times. A non-200 status is not retried.

        Args:
            url: Location of the contract file.
            base_filename: File name stem used inside the temp directory.

        Returns:
            ``(final_path, file_format)`` on success, where ``file_format`` is
            ``'pdf'``, ``'jpeg'`` or ``'png'``. ``None`` on any failure; no
            temporary file is left behind in that case.
        """
        temp_path = None

        def discard_temp_file():
            # Best-effort removal of a partial or unusable download.
            if temp_path:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

        try:
            thread_id = threading.current_thread().name[-3:]
            print(f"   📥 [{thread_id}] Downloading from: {url[:50]}...")

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': '*/*',
                'Connection': 'keep-alive'
            }

            temp_path = os.path.join(tempfile.gettempdir(), f"{base_filename}.tmp")
            total_attempts = 1 + max(0, MAX_RETRIES)
            total_size = 0
            content_type = ''

            for attempt in range(1, total_attempts + 1):
                try:
                    # The context manager releases the streamed connection on
                    # every exit path, including the early non-200 return.
                    with requests.get(url, headers=headers, timeout=DOWNLOAD_TIMEOUT, stream=True) as response:
                        if response.status_code != 200:
                            print(f"   ❌ [{thread_id}] HTTP {response.status_code}: {response.reason}")
                            return None

                        content_type = response.headers.get('content-type', '').lower()
                        total_size = 0
                        with open(temp_path, 'wb') as f:
                            for chunk in response.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)
                                    total_size += len(chunk)
                    break
                except requests.RequestException as e:
                    discard_temp_file()
                    if attempt == total_attempts:
                        raise
                    print(f"   🔁 [{thread_id}] Attempt {attempt}/{total_attempts} failed: {str(e)[:60]} - retrying")
                    time.sleep(1)

            if total_size == 0:
                print(f"   ⚠️ [{thread_id}] Downloaded file is empty")
                discard_temp_file()
                return None

            # Prefer the magic bytes; fall back to what the server declared.
            file_format = self.detect_file_format(temp_path)
            if not file_format:
                if 'pdf' in content_type:
                    file_format = 'pdf'
                elif 'jpeg' in content_type or 'jpg' in content_type:
                    file_format = 'jpeg'
                elif 'png' in content_type:
                    file_format = 'png'
                else:
                    print(f"   ❓ [{thread_id}] Unknown file format")
                    discard_temp_file()
                    return None

            # os.replace overwrites a stale file from an earlier run on every platform.
            final_path = f"{temp_path}.{file_format}"
            os.replace(temp_path, final_path)

            print(f"   ✅ [{thread_id}] Downloaded {file_format.upper()} ({total_size//1024} KB)")
            return final_path, file_format

        except Exception as e:
            print(f"   ❌ Download failed: {str(e)[:100]}")
            discard_temp_file()
            return None

    def detect_file_format(self, file_path: str) -> Optional[str]:
        """Detect file format from file header"""
        try:
            with open(file_path, 'rb') as f:
                header = f.read(20)

            if header.startswith(b'%PDF') or b'%PDF' in header[:10]:
                return 'pdf'
            elif header.startswith(b'\xff\xd8\xff'):
                return 'jpeg'
            elif header.startswith(b'\x89PNG\r\n\x1a\n'):
                return 'png'
            else:
                return None
        except:
            return None

    def process_contract_file_with_method_tracking(self, file_path: str, file_format: str, maid_name, maid_id) -> Dict:
        """Process contract file and track which detection method was used"""
        try:
            thread_id = threading.current_thread().name[-3:]
            print(f"   🤖 [{thread_id}] Processing {file_format.upper()} file...")

            if file_format == 'pdf':
                # Extract pages and detect text (same as original)
                original_image = self.classifier.extract_second_page(file_path)
                cropped_image, _ = self.classifier.get_bottom_two_thirds(original_image)
                text_detections = self.classifier.detect_text_with_positions(cropped_image)

                # NEW: Try detection methods in order and track which one succeeds
                detection_method_used = "None"
                signature_zone = None

                # Primary method
                print(f"   🔍 [{thread_id}] Trying PRIMARY zone detection...")
                primary_zone = self.classifier.find_signature_zone(cropped_image, text_detections)
                if primary_zone:
                    detection_method_used = "Primary"
                    signature_zone = primary_zone
                    print(f"   ✅ [{thread_id}] Primary method succeeded")
                else:
                    print(f"   ❌ [{thread_id}] Primary method failed. Trying Backup Method 1...")
                    backup1 = self.classifier.backup_zone_detection_method1(cropped_image)
                    if backup1:
                        detection_method_used = "Backup Method 1"
                        signature_zone = backup1
                        print(f"   ✅ [{thread_id}] Backup Method 1 succeeded")
                    else:
                        print(f"   ❌ [{thread_id}] Backup 1 failed. Trying Backup Method 2...")
                        backup2 = self.classifier.backup_zone_detection_method2(cropped_image)
                        if backup2:
                            detection_method_used = "Backup Method 2"
                            signature_zone = backup2
                            print(f"   ✅ [{thread_id}] Backup Method 2 succeeded")
                        else:
                            detection_method_used = "None"
                            signature_zone = None
                            print(f"   ❌ [{thread_id}] All detection methods failed")

                # Update method counter
                with self.lock:
                    self.method_counts[detection_method_used] += 1

                # Classify with Gemini if zone was found
                if signature_zone:
                    zone_image = cropped_image[signature_zone.y:signature_zone.y + signature_zone.height,
                                               signature_zone.x:signature_zone.x + signature_zone.width]
                    classification = self.classifier.classify_with_gemini(zone_image)

                    return {
                        "classification": classification.classification,
                        "confidence": classification.confidence,
                        "reasoning": f"Detection: {detection_method_used}. {classification.reasoning}",
                        "zone_detected": True,
                        "detection_method": detection_method_used,
                        "status": "success"
                    }
                else:
                    return {
                        "classification": "not found",
                        "confidence": 0.0,
                        "reasoning": f"Zone detection failed with all methods: Primary, Backup 1, Backup 2",
                        "zone_detected": False,
                        "detection_method": "None",
                        "status": "zone_not_found"
                    }

            elif file_format in ['jpeg', 'png']:
                # Process image files directly (no zone detection needed)
                import cv2
                image = cv2.imread(file_path)
                if image is None:
                    return {
                        "classification": "error",
                        "confidence": 0.0,
                        "reasoning": "Could not load image file",
                        "zone_detected": False,
                        "detection_method": "N/A",
                        "status": "image_load_failed"
                    }

                # Classify the entire image with Gemini
                classification = self.classifier.classify_with_gemini(image)

                return {
                    "classification": classification.classification,
                    "confidence": classification.confidence,
                    "reasoning": f"Image file processed: {classification.reasoning}",
                    "zone_detected": True,
                    "detection_method": "N/A (Full Image)",
                    "status": "success"
                }

            else:
                return {
                    "classification": "error",
                    "confidence": 0.0,
                    "reasoning": f"Unsupported file format: {file_format}",
                    "zone_detected": False,
                    "detection_method": "N/A",
                    "status": "unsupported_format"
                }

        except Exception as e:
            print(f"   ❌ Processing failed: {str(e)[:100]}")
            return {
                "classification": "error",
                "confidence": 0.0,
                "reasoning": f"Processing error: {str(e)}",
                "zone_detected": False,
                "detection_method": "Error",
                "status": "processing_failed"
            }

    def process_single_row(self, row_data: Dict, worksheet, classification_col: int, confidence_col: int = None, method_col: int = None) -> Dict:
        """Download, classify and record the contract referenced by one sheet row.

        Args:
            row_data: The row's column values plus a ``'row_number'`` entry.
                Results are written to sheet row ``row_number + 1``.
            worksheet: Worksheet object whose cells receive the results.
            classification_col: Column index for the classification text.
            confidence_col: Column index for the confidence value, or ``None``
                to skip writing it.
            method_col: Column index for the detection method, or ``None`` to
                skip writing it.

        Returns:
            A result dict (``row_number``, ``maid_id``, ``maid_name``,
            ``classification``, ``confidence``, ``reasoning``,
            ``detection_method``, ``processing_time``, ``status``). The method
            does not raise; failures are reported through ``status``.
        """
        start_time = time.time()
        thread_id = threading.current_thread().name[-3:]

        row_number = row_data['row_number']
        sheet_row = row_number + 1
        maid_id = str(row_data.get(MAID_ID_COLUMN, f'Row_{row_number}'))
        maid_name = str(row_data.get(MAID_NAME_COLUMN, 'Unknown'))
        # Strip so that a cell holding only whitespace counts as "no link".
        document_link = str(row_data.get(DOCUMENT_LINK_COLUMN, '')).strip()

        print(f"\n🔄 [{thread_id}] Processing: {maid_name} (Row {row_number})")

        def write_outcome(classification_text, confidence_text, method_text):
            # Write the three result cells; columns left as None are skipped.
            cells = (
                (classification_col, classification_text),
                (confidence_col, confidence_text),
                (method_col, method_text),
            )
            for column, text in cells:
                if column is not None:
                    self.update_sheet_cell(worksheet, sheet_row, column, text)

        temp_file = None
        result = {
            "row_number": row_number,
            "maid_id": maid_id,
            "maid_name": maid_name,
            "classification": "error",
            "confidence": 0.0,
            "reasoning": "",
            "detection_method": "N/A",
            "processing_time": 0.0,
            "status": "unknown"
        }

        try:
            if not document_link or document_link.lower() in ('none', 'null'):
                print(f"   ⚠️ [{thread_id}] No document link found")
                result.update({
                    "classification": "no_link",
                    "reasoning": "No document link provided",
                    "detection_method": "N/A",
                    "status": "no_link"
                })

                with self.lock:
                    self.no_link_count += 1
                    self.processed_count += 1

                write_outcome("no_link", "0.00", "N/A")
                print(f"   📋 [{thread_id}] Sheet updated: no_link")
                return result

            # File-system-safe stem for the temporary download.
            safe_name = re.sub(r'[^\w\-_.]', '_', maid_name)
            base_filename = f"{safe_name}_{maid_id}_contract"

            download_result = self.download_contract(document_link, base_filename)

            if not download_result:
                print(f"   ❌ [{thread_id}] Download failed")
                result.update({
                    "classification": "download_failed",
                    "reasoning": "Failed to download contract file",
                    "detection_method": "N/A",
                    "status": "download_failed"
                })

                with self.lock:
                    self.download_fail_count += 1
                    self.processed_count += 1

                write_outcome("download_failed", "0.00", "N/A")
                print(f"   📋 [{thread_id}] Sheet updated: download_failed")
                return result

            temp_file, file_format = download_result

            processing_result = self.process_contract_file_with_method_tracking(
                temp_file, file_format, maid_name, maid_id
            )

            # Format everything that can fail BEFORE touching the counters, so
            # a formatting error is counted once (as an error) instead of twice.
            classification_value = str(processing_result["classification"])
            confidence_text = f"{float(processing_result['confidence']):.2f}"
            method_value = processing_result["detection_method"]

            result.update({
                "classification": processing_result["classification"],
                "confidence": processing_result["confidence"],
                "reasoning": processing_result["reasoning"],
                "detection_method": method_value,
                "status": processing_result["status"],
                "processing_time": time.time() - start_time
            })

            with self.lock:
                if processing_result["status"] == "success":
                    self.success_count += 1
                else:
                    self.error_count += 1
                self.processed_count += 1

            write_outcome(classification_value, confidence_text, method_value)

            print(f"   ✅ [{thread_id}] {classification_value.upper()} (conf: {confidence_text}) via {method_value} - Sheet updated")

            return result

        except Exception as e:
            print(f"   ❌ [{thread_id}] Error: {str(e)[:100]}")
            result.update({
                "classification": "error",
                "reasoning": f"Processing error: {str(e)}",
                "detection_method": "Error",
                "status": "error",
                "processing_time": time.time() - start_time
            })

            with self.lock:
                self.error_count += 1
                self.processed_count += 1

            # Record the failure in the sheet as well.
            write_outcome("error", "0.00", "Error")
            print(f"   📋 [{thread_id}] Sheet updated: error")
            return result

        finally:
            # Remove the downloaded file whatever the outcome was.
            if temp_file and os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except OSError:
                    pass

    def update_sheet_cell(self, worksheet, row: int, col: int, value: str):
        """Update a single cell in the Google Sheet"""
        try:
            worksheet.update_cell(row, col, value)
            return True
        except Exception as e:
            print(f"      ⚠️ Sheet update failed: {str(e)[:50]}")
            return False

    def process_google_sheet(self, sheet_url: str, sheet_name: str = "Sheet1",
                           start_row: int = 2, max_rows: int = None) -> Dict:
        """Classify every contract listed in a worksheet and write results back.

        Rows are processed in parallel by ``self.max_workers`` threads, each
        running ``process_single_row``. Missing result columns are appended
        after the last existing header so no existing column is overwritten.

        Args:
            sheet_url: Google Sheets URL containing ``/d/<sheet id>``.
            sheet_name: Name of the worksheet tab.
            start_row: Sheet row to start at (1 is the header, 2 is the first
                data row).
            max_rows: Maximum number of rows to process; ``None`` or 0 = all.

        Returns:
            On success a dict with ``processed``, ``success``, ``errors``,
            ``no_links``, ``download_fails``, ``method_counts``, ``total_time``
            and ``results``. On failure ``{"error": <message>, "processed": 0}``.
        """
        print("\n🚀 Starting batch processing...")
        print(f"   📊 Workers: {self.max_workers}")
        print(f"   📋 Sheet: {sheet_name}")
        print(f"   🎯 Start row: {start_row}")
        print(f"   📏 Max rows: {max_rows if max_rows else 'All'}")

        start_time = time.time()

        try:
            if start_row < 2:
                raise ValueError("start_row must be 2 or greater (row 1 is the header)")

            sheet_id = self.extract_sheet_id(sheet_url)
            spreadsheet = self.gc.open_by_key(sheet_id)
            worksheet = spreadsheet.worksheet(sheet_name)

            # One dict per data row; the header row is not part of this list,
            # so sheet row r corresponds to all_data[r - 2].
            all_data = worksheet.get_all_records()
            if not all_data:
                return {"error": "No data found in sheet", "processed": 0}

            total_rows = len(all_data)
            first_index = start_row - 2
            last_index = first_index + max_rows if max_rows else total_rows
            rows_to_process = all_data[first_index:last_index]
            total_to_process = len(rows_to_process)

            print(f"   📊 Found {total_rows} total rows, processing {total_to_process} rows")

            # Locate the result columns by their configured titles.
            headers = worksheet.row_values(1)
            wanted_titles = (CLASSIFICATION_COLUMN, CONFIDENCE_COLUMN, DETECTION_METHOD_COLUMN)
            column_of = {}
            for position, header in enumerate(headers, 1):
                normalized = str(header).lower().strip()
                for title in wanted_titles:
                    if normalized == title.lower().strip():
                        column_of[title] = position
                        print(f"   📋 Found existing {title} column at position {position}")

            # Append whichever are missing AFTER the last header, so that an
            # unrelated column sitting next to an existing one is never clobbered.
            next_free_col = len(headers) + 1
            for title in wanted_titles:
                if title not in column_of:
                    worksheet.update_cell(1, next_free_col, title)
                    column_of[title] = next_free_col
                    print(f"   📋 Added {title} column at position {next_free_col}")
                    next_free_col += 1

            classification_col = column_of[CLASSIFICATION_COLUMN]
            confidence_col = column_of[CONFIDENCE_COLUMN]
            method_col = column_of[DETECTION_METHOD_COLUMN]

            # 'row_number' is the 1-based data-row index; process_single_row
            # writes to sheet row row_number + 1 (the header occupies row 1).
            processing_data = []
            for offset, row in enumerate(rows_to_process):
                row_data = row.copy()
                row_data['row_number'] = first_index + 1 + offset
                processing_data.append(row_data)

            print(f"\n🔄 Processing {len(processing_data)} contracts with {self.max_workers} workers...")

            results = []
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                future_to_row = {
                    executor.submit(self.process_single_row, row_data, worksheet, classification_col, confidence_col, method_col): row_data
                    for row_data in processing_data
                }

                for future in as_completed(future_to_row):
                    row_data = future_to_row[future]
                    try:
                        result = future.result()
                        results.append(result)

                        completed = len(results)
                        progress = (completed / total_to_process) * 100
                        print(f"📊 Progress: {completed}/{total_to_process} ({progress:.1f}%) - ✅{self.success_count} ❌{self.error_count} ⚠️{self.no_link_count} 📥{self.download_fail_count}")

                        # Brief pause after each worker-sized batch of results.
                        if completed % self.max_workers == 0:
                            time.sleep(DELAY_BETWEEN_BATCHES)

                    except Exception as e:
                        print(f"❌ Task failed for row {row_data.get('row_number', 'unknown')}: {e}")

            total_time = time.time() - start_time

            print("\n🎉 Batch processing completed!")
            print(f"   ⏱️  Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
            print(f"   📊 Processed: {self.processed_count} contracts")
            print(f"   ✅ Successful: {self.success_count}")
            print(f"   ❌ Errors: {self.error_count}")
            print(f"   ⚠️  No links: {self.no_link_count}")
            print(f"   📥 Download failures: {self.download_fail_count}")
            print(f"   ⚡ Average time per contract: {total_time/max(1, self.processed_count):.1f}s")

            print("\n🔍 DETECTION METHOD BREAKDOWN:")
            total_detected = sum(self.method_counts.values())
            if total_detected > 0:
                for method, count in self.method_counts.items():
                    percentage = (count / total_detected) * 100
                    print(f"   {method}: {count} ({percentage:.1f}%)")

            success_results = [r for r in results if r["status"] == "success"]
            if success_results:
                avg_confidence = sum(r["confidence"] for r in success_results) / len(success_results)
                print(f"   🎯 Average confidence: {avg_confidence:.2f}")

                classifications = {}
                for r in success_results:
                    cls = r["classification"]
                    classifications[cls] = classifications.get(cls, 0) + 1

                print("   📊 Classification breakdown:")
                for cls, count in classifications.items():
                    percentage = (count / len(success_results)) * 100
                    print(f"      {cls.upper()}: {count} ({percentage:.1f}%)")

            return {
                "processed": self.processed_count,
                "success": self.success_count,
                "errors": self.error_count,
                "no_links": self.no_link_count,
                "download_fails": self.download_fail_count,
                "method_counts": self.method_counts,
                "total_time": total_time,
                "results": results
            }

        except Exception as e:
            print(f"❌ Fatal error: {e}")
            traceback.print_exc()
            return {"error": str(e), "processed": 0}

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def run_batch_analysis():
    """Validate the notebook settings, run the sheet processor, print a recap.

    Reads the module-level settings ``GEMINI_API_KEY``, ``GOOGLE_SHEET_URL``,
    ``SHEET_TAB_NAME``, ``START_ROW``, ``MAX_ROWS_TO_PROCESS`` and
    ``MAX_CONCURRENT_WORKERS``.

    Returns:
        The value returned by
        ``GoogleSheetsBatchProcessor.process_google_sheet`` (this includes a
        dict carrying an ``"error"`` key when processing reports a failure),
        or ``None`` when a required setting is missing or an exception is
        raised while creating/running the processor.
    """
    banner = "=" * 80
    print(banner)
    print("🚀 MOHRE CONTRACT BATCH ANALYZER - Enhanced with Detection Method Tracking")
    print(banner)

    # Configuration guards: bail out before any processing work is started.
    api_key_unset = not GEMINI_API_KEY or "your_api_key" in GEMINI_API_KEY.lower()
    if api_key_unset:
        print("❌ ERROR: Please set your Gemini API key at the top of this notebook")
        return None

    if not GOOGLE_SHEET_URL:
        print("❌ ERROR: Please set your Google Sheet URL")
        return None

    try:
        batch_processor = GoogleSheetsBatchProcessor(
            api_key=GEMINI_API_KEY,
            max_workers=MAX_CONCURRENT_WORKERS,
        )
        outcome = batch_processor.process_google_sheet(
            sheet_url=GOOGLE_SHEET_URL,
            sheet_name=SHEET_TAB_NAME,
            start_row=START_ROW,
            max_rows=MAX_ROWS_TO_PROCESS,
        )

        # A reported failure is still handed back to the caller unchanged.
        if "error" in outcome:
            print(f"❌ Processing failed: {outcome['error']}")
            return outcome

        print("\n✨ Processing completed successfully!")
        print(f"📊 Total processed: {outcome['processed']}")
        print(f"✅ Successful classifications: {outcome['success']}")
        print(f"❌ Errors: {outcome['errors']}")
        print(f"⏱️  Total time: {outcome['total_time']:.1f} seconds")

        # Per-method tally of how many files each detection method handled.
        print("\n🔍 Detection Methods Used:")
        for method_name, file_count in outcome['method_counts'].items():
            print(f"   {method_name}: {file_count} files")

        return outcome

    except Exception as exc:
        # Top-level boundary for the notebook: report, dump the traceback,
        # and signal failure with None instead of propagating.
        print(f"❌ Fatal error occurred: {exc}")
        traceback.print_exc()
        return None

# Announce that setup is done and show how to start the run.
# Nothing is processed in this cell; it only prints instructions.
print(
    "🎯 Configuration complete. Ready to start batch processing!",
    "📋 New feature: Detection method will be tracked and saved to 'Detection Method' column",
    "\nTo start processing, run:",
    "results = run_batch_analysis()",
    sep="\n",
)

In [None]:
# =============================================================================
# 🚀 RUN BATCH ANALYSIS
# =============================================================================

# Tell the user what is about to happen before the run starts.
print(
    "🎬 Starting batch analysis...",
    "This will process all contracts in your Google Sheet",
    "Each contract will be downloaded, analyzed with Gemini AI, and results updated in real-time",
    "",
    "=" * 60,
    sep="\n",
)

# Kick off the run; `results` stays bound at module level for later cells.
results = run_batch_analysis()

# Failure first: a falsy result (e.g. None) or a dict carrying an "error" key.
if not results or "error" in results:
    print("\n❌ Batch analysis failed. Please check the error messages above.")
else:
    print(
        "\n" + "=" * 60,
        "🎉 BATCH ANALYSIS COMPLETED SUCCESSFULLY!",
        "=" * 60,
        "📊 Summary:",
        sep="\n",
    )
    # One print per counter so a missing key still shows the lines before it.
    print(f"   Total processed: {results['processed']}")
    print(f"   ✅ Successful: {results['success']}")
    print(f"   ❌ Errors: {results['errors']}")
    print(f"   ⚠️  No links: {results['no_links']}")
    print(f"   📥 Download fails: {results['download_fails']}")
    print(f"   ⏱️  Total time: {results['total_time']:.1f} seconds")
    print(
        "\n✨ Check your Google Sheet - the 'Classification' column has been updated!",
        "💡 You can re-run this cell to process additional contracts",
        sep="\n",
    )
