In [None]:
# Cell 1: Import Libraries and Setup
# Import necessary libraries
import os
import re
import cv2
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import easyocr
from paddleocr import PaddleOCR
import zipcodes
import platform
from datetime import datetime
import warnings

# Silence every Python warning for the rest of the session and pick the
# ggplot look for all matplotlib figures.
warnings.filterwarnings("ignore")
plt.style.use("ggplot")

# Set up OCR engines
# An empty CUDA_VISIBLE_DEVICES hides all GPUs from CUDA-aware libraries.
# NOTE(review): this is set after `paddleocr` has already been imported;
# confirm that PaddleOCR still honours it at that point.
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force PaddleOCR to use CPU on macOS

# English PaddleOCR engine with the text-angle classifier enabled and
# library logging turned off. The later cells shown here build their own
# `ocr` instance and do not reference `paddle_ocr`.
paddle_ocr = PaddleOCR(
    use_angle_cls=True, lang="en", show_log=False
)  # CPU-only for PaddleOCR

# Create output directory (no error if it already exists)
os.makedirs("reports", exist_ok=True)


In [None]:
# Cell 2: Environment Setup with Unified Schema & Visualization Support

# Setup OCR engine and file discovery
# Images are read from a "test" directory under the current working
# directory; os.listdir raises if that directory does not exist.
test_folder = os.path.join(os.getcwd(), "test")
ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)

# File names are matched case-insensitively against these suffixes.
image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
file_list = [
    f
    for f in os.listdir(test_folder)
    if any(f.lower().endswith(ext) for ext in image_extensions)
]
print(f"Found {len(file_list)} image files in test folder")

# Target column order of the results table; ensure_schema() enforces it.
EXPECTED_COLUMNS = [
    "serial_number",
    "event_type",
    "event_date",
    "associated_name",
    "associated_address",
    "filename",
    "file_creation_date",
    "file_modification_date",
    "file_location",
]
# NOTE(review): presumably a cache slot for a lazily created EasyOCR reader;
# nothing in the visible code reads or assigns it - confirm it is still needed.
_reader_instance = None


def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* restricted to ``EXPECTED_COLUMNS``, in that order.

    Columns listed in ``EXPECTED_COLUMNS`` that *df* lacks are added and
    filled with ``None``; columns not listed are dropped from the result.

    The input frame is left untouched. (The previous implementation added
    the missing columns to the caller's frame as a side effect before
    returning the copy.)

    Args:
        df: Frame to normalise.

    Returns:
        A new DataFrame whose columns are exactly ``EXPECTED_COLUMNS``.
    """
    # Work on a copy so the caller's frame never gains columns behind its back.
    result = df.copy()
    for col in EXPECTED_COLUMNS:
        if col not in result.columns:
            result[col] = None
    # Selecting with the list both drops extra columns and fixes the order.
    return result[EXPECTED_COLUMNS].copy()


# Keep an existing `df` across re-runs of this cell (normalised to the
# target schema); otherwise start from an empty frame with that schema.
if "df" not in globals():
    df = pd.DataFrame(columns=EXPECTED_COLUMNS)
else:
    df = ensure_schema(df)

print("Environment setup complete with target schema:")
print(EXPECTED_COLUMNS)

# Visualization helper with semantic highlighting
# One colour per category used by visualize_ocr(). The tuples are written in
# (R, G, B) order, as the colour names in the trailing comments indicate.
CATEGORY_COLORS = {
    "serial_number": (255, 0, 0),      # Red - Serial Number
    "event_type": (255, 140, 0),       # Orange - Event Type
    "event_date": (255, 215, 0),       # Gold/Yellow - Event Date
    "associated_name": (30, 144, 255),  # Blue - Associated Name
    "associated_address": (148, 0, 211), # Purple - Associated Address
    "other": (128, 128, 128),          # Gray - Other (fallback category)
}


def classify_token(text_lower):
    """Assign a coarse display category to one piece of OCR text.

    The categories are checked in a fixed order and the first hit wins:
    event type, then event date, then address; anything else is ``"other"``.

    Keywords are matched on letter boundaries rather than as raw substrings,
    so short abbreviations such as "st", "rd", "dr" or "mar" no longer fire
    inside unrelated words ("pistol", "card", "address", "marlin"). Digits and
    punctuation still count as boundaries, so "12jan2020", "st." and "apt4"
    match. Event keywords may carry a suffix ("inspections", "transferred").

    Args:
        text_lower: OCR text, expected to be lowercased by the caller
            (all keywords are lowercase).

    Returns:
        One of ``"event_type"``, ``"event_date"``, ``"associated_address"``
        or ``"other"``.
    """
    event_keywords = (
        "inventory",
        "inspection",
        "theft",
        "loss",
        "stolen",
        "missing",
        "burglary",
        "incident",
        "transfer",
        "disposal",
        "larceny",
    )

    # Abbreviations plus full month names: whole-word matching would
    # otherwise stop "january" from being recognised through "jan".
    date_keywords = (
        "date",
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "sept", "oct", "nov", "dec",
        "january", "february", "march", "april", "june", "july",
        "august", "september", "october", "november", "december",
    )

    date_patterns = (
        r"\d{1,2}/\d{1,2}/\d{2,4}",  # MM/DD/YYYY or DD/MM/YYYY
        r"\d{1,2}-\d{1,2}-\d{2,4}",  # MM-DD-YYYY or DD-MM-YYYY
    )

    address_keywords = (
        "street",
        "st",
        "ave",
        "avenue",
        "road",
        "rd",
        "dr",
        "drive",
        "lane",
        "ln",
        "blvd",
        "boulevard",
        "suite",
        "ste",
        "apt",
        "unit",
        "city",
        "state",
        "zip",
    )

    def has_keyword(keywords, allow_suffix=False):
        # A keyword must not be preceded by a letter; unless a suffix is
        # allowed it must not be followed by one either.
        alternatives = "|".join(re.escape(keyword) for keyword in keywords)
        tail = "" if allow_suffix else r"(?![a-z])"
        pattern = rf"(?<![a-z])(?:{alternatives}){tail}"
        return re.search(pattern, text_lower) is not None

    # Check for event type
    if has_keyword(event_keywords, allow_suffix=True):
        return "event_type"

    # Check for date words or numeric date patterns
    if has_keyword(date_keywords) or any(
        re.search(pattern, text_lower) for pattern in date_patterns
    ):
        return "event_date"

    # Check for address components
    if has_keyword(address_keywords):
        return "associated_address"

    # Default category
    return "other"


def visualize_ocr(
    img_bgr,
    ocr_results,
    serial_candidate=None,
    metadata_info=None,
    show=True,
    title=None,
):
    """Draw colour-coded OCR boxes and labels on a copy of an image.

    Each OCR entry is assigned a category in this order: exact match with the
    serial candidate, then membership in the tokens of the supplied metadata
    (name, address, event type, event date), and finally ``classify_token``.

    ``CATEGORY_COLORS`` is written in RGB order while the canvas is a BGR
    image, so every colour is reversed before drawing. Without that swap the
    "red" serial box shows up blue once the figure is converted for display.

    Args:
        img_bgr: Image array in OpenCV BGR channel order; it is not modified.
        ocr_results: Iterable of ``(bbox, text, confidence)`` entries, where
            ``bbox`` is a sequence of (x, y) corner points.
        serial_candidate: Serial number to highlight; compared as a string
            against the stripped OCR text.
        metadata_info: Optional mapping that may hold ``associated_name``,
            ``associated_address``, ``event_type`` and ``event_date``.
        show: When true, display the annotated image with matplotlib.
        title: Figure title; a colour legend is used when omitted.

    Returns:
        The annotated copy of the image, still in BGR order.
    """
    draw = img_bgr.copy()
    serial_norm = str(serial_candidate) if serial_candidate else None

    # Lookup priority of the metadata-derived categories.
    component_order = (
        "associated_name",
        "associated_address",
        "event_type",
        "event_date",
    )
    component_tokens = {component: set() for component in component_order}

    if metadata_info:
        for component in component_order:
            value = metadata_info.get(component)
            if not value:
                continue
            # Addresses are split on commas as well as whitespace.
            if component == "associated_address":
                parts = re.split(r"[\s,]", str(value))
            else:
                parts = str(value).split()
            component_tokens[component].update(
                part.lower() for part in parts if part
            )

    # Process each OCR result
    for bbox, text, conf in ocr_results:
        t_norm = text.strip()
        t_lower = t_norm.lower()

        # Determine category for color coding
        if serial_norm and t_norm == serial_norm:
            color_key = "serial_number"
        else:
            for component in component_order:
                if t_lower in component_tokens[component]:
                    color_key = component
                    break
            else:
                color_key = classify_token(t_lower)

        # The palette is RGB; OpenCV draws in BGR, so reverse the channels.
        rgb = CATEGORY_COLORS.get(color_key, CATEGORY_COLORS["other"])
        color = tuple(int(channel) for channel in reversed(rgb))

        # Draw bounding box (the serial number gets a thicker outline)
        pts = np.array(bbox, np.int32).reshape((-1, 1, 2))
        cv2.polylines(
            draw,
            [pts],
            isClosed=True,
            color=color,
            thickness=2 if color_key != "serial_number" else 3,
        )

        # Add text label just above the first corner; plain ints keep
        # cv2.putText from rejecting numpy scalar coordinates.
        x, y = (int(value) for value in pts[0][0])
        label = f"{t_norm} ({conf:.2f})"

        # Handle long labels gracefully
        if len(label) > 38:
            label = label[:35] + "..."

        cv2.putText(
            draw,
            label,
            (x, y - 6),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.55,
            color,
            2,
            cv2.LINE_AA,
        )

    # Display visualization if requested
    if show:
        img_rgb = cv2.cvtColor(draw, cv2.COLOR_BGR2RGB)
        plt.figure(figsize=(13, 9))
        plt.imshow(img_rgb)
        plt.axis("off")
        plt.title(
            title
            or "OCR Visualization (serial=red, event_type=orange, event_date=yellow, name=blue, address=purple, other=gray)"
        )
        plt.show()

    return draw

In [None]:
# Cell 3: Metadata Extraction
import os
import platform
from datetime import datetime


def get_file_metadata(file_path):
    """Collect name, location and timestamps for a file, labelled honestly.

    The creation time comes from ``st_birthtime`` whenever the stat result
    provides it. ``st_ctime`` is used only as a fallback: on Windows it is
    the creation time on interpreters that lack ``st_birthtime``; elsewhere
    it is the metadata-change time and is flagged as unreliable.

    Preferring ``st_birthtime`` also covers Windows on Python 3.12+, where
    using ``st_ctime`` as the creation time is deprecated.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A dict with ``filename``, ``file_creation_date``,
        ``file_modification_date`` (both ``"%Y-%m-%d %H:%M:%S"``, local
        time), ``file_location`` (absolute path), ``platform``,
        ``creation_source`` and ``creation_reliable``. If anything fails,
        the error is printed and the dates are ``"Unknown"`` with
        ``creation_reliable`` set to ``False``.
    """
    try:
        file_stats = os.stat(file_path)
        system = platform.system().lower()

        # Report what the OS actually provides, most trustworthy source first.
        if hasattr(file_stats, "st_birthtime"):
            creation_time = file_stats.st_birthtime
            if system == "windows":
                creation_source = "st_birthtime (Windows creation time)"
            else:
                creation_source = "st_birthtime (macOS/BSD creation time)"
            creation_reliable = True
        elif system == "windows":
            # Older interpreters on Windows: st_ctime is the creation time.
            creation_time = file_stats.st_ctime
            creation_source = "st_ctime (Windows creation time)"
            creation_reliable = True
        else:
            # Linux/Other: no creation time available; report st_ctime but
            # mark it as unreliable.
            creation_time = file_stats.st_ctime
            creation_source = "st_ctime (Linux - metadata change time, NOT creation)"
            creation_reliable = False

        # Modification time is consistent across platforms
        modification_time = file_stats.st_mtime

        metadata = {
            "filename": os.path.basename(file_path),
            "file_creation_date": datetime.fromtimestamp(creation_time).strftime(
                "%Y-%m-%d %H:%M:%S"
            ),
            "file_modification_date": datetime.fromtimestamp(
                modification_time
            ).strftime("%Y-%m-%d %H:%M:%S"),
            "file_location": os.path.abspath(file_path),
            "platform": system,
            "creation_source": creation_source,
            "creation_reliable": creation_reliable,  # Indicates if creation time is trustworthy
        }

        return metadata

    except Exception as e:
        # Best-effort by design: one unreadable file must not stop a batch.
        print(f"Error getting metadata for {file_path}: {e}")
        return {
            "filename": os.path.basename(file_path) if file_path else "Unknown",
            "file_creation_date": "Unknown",
            "file_modification_date": "Unknown",
            "file_location": (
                os.path.abspath(file_path)
                if file_path and os.path.exists(file_path)
                else str(file_path)
            ),
            "platform": platform.system().lower(),
            "creation_source": "Error occurred",
            "creation_reliable": False,
        }


# Optional: Diagnostic function to show what timestamps are actually available
def diagnose_file_timestamps(file_path):
    """Print every timestamp the OS reports for *file_path* (debug aid).

    Prints a "File not found" line and returns when the path does not exist.
    Any error raised while reading the timestamps is printed, not raised.
    Always returns ``None``.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    def render(epoch_seconds):
        # Local-time rendering shared by all the lines below.
        return datetime.fromtimestamp(epoch_seconds).strftime("%Y-%m-%d %H:%M:%S")

    try:
        stats = os.stat(file_path)
        os_name = platform.system().lower()

        print(f"\nFile: {os.path.basename(file_path)}")
        print(f"Platform: {os_name}")
        print("\nAvailable timestamps:")

        # st_birthtime only exists on some platforms.
        if not hasattr(stats, "st_birthtime"):
            print("  st_birthtime: Not available")
        else:
            print(f"  st_birthtime: {render(stats.st_birthtime)} (creation time)")

        # st_ctime is labelled according to the platform.
        ctime_meaning = "creation" if os_name == "windows" else "metadata change"
        print(f"  st_ctime:     {render(stats.st_ctime)} ({ctime_meaning} time)")
        print(f"  st_mtime:     {render(stats.st_mtime)} (modification time)")
        print(f"  st_atime:     {render(stats.st_atime)} (access time)")

    except Exception as err:
        print(f"Error reading timestamps: {err}")


In [None]:
# CELL 4: Function Definitions for Serial Number Extraction
"""
Function definitions for the Advanced Serial Number Detection System.
This cell contains all helper functions used by the main processing pipeline for serial number extraction.
"""

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import platform
from datetime import datetime


def get_file_metadata(file_path):
    """Return creation and modification timestamps of a file as short strings.

    Both values use the ``"%m/%d/%y %H:%M"`` format. On Windows the creation
    time is taken from ``st_ctime``; elsewhere from ``st_birthtime`` when it
    exists, otherwise the modification time is reused. Any failure yields
    ``"Unknown"`` for both fields.

    NOTE(review): this redefines the ``get_file_metadata`` from the metadata
    cell earlier in the file. Once this cell has run, callers get this
    two-key dict rather than the richer one - confirm that is intended.
    """
    stamp_format = "%m/%d/%y %H:%M"
    try:
        info = os.stat(file_path)
        modified = datetime.fromtimestamp(info.st_mtime).strftime(stamp_format)
        if platform.system() == "Windows":
            created = datetime.fromtimestamp(info.st_ctime).strftime(stamp_format)
        elif hasattr(info, "st_birthtime"):
            created = datetime.fromtimestamp(info.st_birthtime).strftime(
                stamp_format
            )
        else:
            # No birth time on this platform: fall back to the mtime string.
            created = modified
    except Exception:
        return {"file_creation_date": "Unknown", "file_modification_date": "Unknown"}
    return {
        "file_creation_date": created,
        "file_modification_date": modified,
    }


def combine_spaced_alphanumeric(text):
    """Join text that OCR split into spaced characters, e.g. "A B C 1 2 3".

    The whitespace-separated pieces are merged only when there are at least
    three of them, at least two are a single letter or digit, and the merged
    string is purely alphanumeric with 5 to 20 characters. Otherwise *text*
    is returned unchanged.
    """
    pieces = text.split()
    if len(pieces) < 3:
        return text

    lone_chars = sum(
        1
        for piece in pieces
        if len(piece) == 1 and (piece.isalpha() or piece.isdigit())
    )
    if lone_chars < 2:
        return text

    merged = "".join(pieces)
    if merged.isalnum() and 5 <= len(merged) <= 20:
        return merged
    return text


def simple_clustering(text_positions, eps=150, min_samples=2):
    """Group text positions by distance to a seed point (no sklearn needed).

    Items are 5-tuples whose first two fields are the x and y centre. Every
    still-unlabelled item in turn acts as a seed and claims all other
    unlabelled items within *eps* of the seed itself; membership does not
    chain from neighbour to neighbour. A group with fewer than
    *min_samples* members is discarded.

    Returns:
        One label per input item: 0, 1, 2, ... for groups in the order they
        were formed, and -1 for items left ungrouped.
    """
    total = len(text_positions)
    labels = [-1] * total
    if total < min_samples:
        return labels

    next_id = 0
    for seed, (seed_x, seed_y, _, _, _) in enumerate(text_positions):
        if labels[seed] != -1:
            continue

        # Unlabelled items within reach of the seed, the seed listed first.
        members = [seed] + [
            other
            for other, (other_x, other_y, _, _, _) in enumerate(text_positions)
            if other != seed
            and labels[other] == -1
            and ((seed_x - other_x) ** 2 + (seed_y - other_y) ** 2) ** 0.5 <= eps
        ]

        # Groups that are too small keep their -1 labels.
        if len(members) >= min_samples:
            for member in members:
                labels[member] = next_id
            next_id += 1

    return labels


def detect_image_regions(processed_results, image_dimensions):
    """Split OCR results into spatial regions and classify each one.

    Text-box centres are clustered with ``simple_clustering``. Every cluster
    becomes a region typed by ``classify_region_type``; all unclustered
    texts are gathered into one ``"isolated"`` region.

    The isolated region's id is one past the largest cluster id. (It was
    previously the number of regions emitted so far, which could duplicate a
    real cluster's id, for example both being 0.)

    Args:
        processed_results: List of ``(bbox, text, confidence)`` entries;
            ``bbox[0]`` and ``bbox[2]`` are read as opposite corners.
        image_dimensions: Accepted for interface compatibility; not used.

    Returns:
        A list of dicts with keys ``type``, ``texts`` and ``region_id``.
        With fewer than three OCR entries, a single ``"unknown"`` region
        holding all of them is returned.
    """
    if len(processed_results) < 3:
        return [{"type": "unknown", "texts": processed_results, "region_id": 0}]

    # Centre of each box, taken from two opposite corners.
    text_positions = [
        (
            (bbox[0][0] + bbox[2][0]) / 2,
            (bbox[0][1] + bbox[2][1]) / 2,
            text,
            bbox,
            confidence,
        )
        for bbox, text, confidence in processed_results
    ]

    labels = simple_clustering(text_positions, eps=150, min_samples=2)

    regions = {}
    for result, label in zip(processed_results, labels):
        regions.setdefault(label, []).append(result)

    # Reserve an id for the isolated bucket that no cluster can already hold.
    isolated_id = max((label for label in regions if label != -1), default=-1) + 1

    classified_regions = []
    for label, region_texts in regions.items():
        if label == -1:
            region_type, region_id = "isolated", isolated_id
        else:
            region_type, region_id = classify_region_type(region_texts), label
        classified_regions.append(
            {
                "type": region_type,
                "texts": region_texts,
                "region_id": region_id,
            }
        )

    return classified_regions


def classify_region_type(region_texts):
    """Label a text region as "document", "firearm" or "mixed".

    Two scores are built from three signals: text density (summed box area
    divided by the area of the region's bounding rectangle), the number of
    text entries, and keyword hits in the uppercased, space-joined text.
    A label is assigned only when its score leads the other by more than 2;
    otherwise the region is "mixed".
    """
    joined_upper = " ".join([text for _, text, _ in region_texts]).upper()
    entry_count = len(region_texts)

    density = 0
    if region_texts:
        # bbox[0] and bbox[2] are used as the two opposite corners of a box.
        corner_pairs = [(bbox[0], bbox[2]) for bbox, _, _ in region_texts]
        xs = [first[0] for first, _ in corner_pairs] + [
            second[0] for _, second in corner_pairs
        ]
        ys = [first[1] for first, _ in corner_pairs] + [
            second[1] for _, second in corner_pairs
        ]
        region_area = (max(xs) - min(xs)) * (max(ys) - min(ys))
        covered_area = sum(
            (second[0] - first[0]) * (second[1] - first[1])
            for first, second in corner_pairs
        )
        if region_area > 0:
            density = covered_area / region_area

    def keyword_points(keywords):
        # Two points per keyword found anywhere in the joined text.
        return sum(2 for keyword in keywords if keyword in joined_upper)

    doc_score = keyword_points(
        ["LICENSE", "NAME", "ADDRESS", "DATE", "ISSUED", "EXPIRES"]
    )
    if density > 0.1:
        doc_score += 2
    if entry_count > 8:
        doc_score += 2

    firearm_score = keyword_points(
        ["GLOCK", "SMITH", "COLT", "CAL", "MM", "MODEL", "MADE IN"]
    )
    if density < 0.05:
        firearm_score += 2
    if entry_count < 6:
        firearm_score += 2

    if doc_score > firearm_score + 2:
        return "document"
    if firearm_score > doc_score + 2:
        return "firearm"
    return "mixed"


def analyze_region_context(region_texts, region_type):
    """Build the context dict used to score serial candidates in a region.

    Depending on *region_type*:

    * "document": collects words (3+ letters) seen more than once plus any
      known form label found in the texts, and the vertical buckets holding
      more than two short texts ("label zones").
    * "firearm": finds the first known manufacturer name in the joined text
      and the set of texts that look like manufacturing marks.
    * "isolated" / "unknown": a single text of at least five characters that
      is alphanumeric (spaces ignored only for "unknown") is treated as a
      bare firearm context; anything else is "mixed".
    * any other value: an empty "mixed" context.

    Returns:
        A dict whose "type" is "document", "firearm" or "mixed".
    """

    def mixed_context():
        # Fresh sets on every call so callers never share state.
        return {
            "type": "mixed",
            "frequent_words": set(),
            "label_zones": set(),
            "manufacturing_marks": set(),
        }

    def bare_firearm_context():
        return {
            "type": "firearm",
            "manufacturer": None,
            "manufacturing_marks": set(),
        }

    texts_only = [text for _, text, _ in region_texts]

    if region_type == "document":
        # Words of three or more letters that occur more than once.
        tally = {}
        for text in texts_only:
            for word in re.findall(r"[A-Z]{3,}", text.upper()):
                tally[word] = tally.get(word, 0) + 1
        frequent_words = {word for word, seen in tally.items() if seen > 1}

        # Known form labels count as frequent wherever they appear.
        document_labels = {
            "LICENSE",
            "NAME",
            "ADDRESS",
            "DATE",
            "DOB",
            "EXPIRES",
            "ISSUED",
            "POBOX",
            "PO",
            "BOX",
        }
        frequent_words |= {
            label
            for text in texts_only
            for label in document_labels
            if label in text.upper()
        }

        # Bucket texts vertically from the two corner y-coordinates.
        rows = {}
        for bbox, text, _ in region_texts:
            bucket = int((bbox[0][1] + bbox[2][1]) / 100) * 50
            rows.setdefault(bucket, []).append(text)

        # A bucket with more than two texts of short average length.
        label_zones = set()
        for bucket, row_texts in rows.items():
            if (
                len(row_texts) > 2
                and sum(len(t) for t in row_texts) / len(row_texts) < 15
            ):
                label_zones.add(bucket)

        return {
            "type": "document",
            "frequent_words": frequent_words,
            "label_zones": label_zones,
        }

    if region_type == "firearm":
        joined_upper = " ".join(texts_only).upper()
        manufacturer = None
        for maker in ["GLOCK", "SMITH", "COLT", "RUGER", "SIG"]:
            if maker in joined_upper:
                manufacturer = maker
                break

        manufacturing_marks = set()
        for text in texts_only:
            upper = text.upper()
            is_caliber = bool(re.search(r"\d+MM|CAL", upper))
            names_country = any(
                country in upper for country in ["USA", "AUSTRIA", "GERMANY"]
            )
            is_maker_stamp = bool(
                manufacturer
                and upper.startswith(manufacturer[:4])
                and len(text) <= 8
            )
            if is_caliber or names_country or is_maker_stamp:
                manufacturing_marks.add(text)

        return {
            "type": "firearm",
            "manufacturer": manufacturer,
            "manufacturing_marks": manufacturing_marks,
        }

    if region_type == "isolated":
        if len(region_texts) == 1:
            lone_text = region_texts[0][1]
            if len(lone_text) >= 5 and lone_text.isalnum():
                return bare_firearm_context()
        return mixed_context()

    if region_type == "unknown":
        if len(region_texts) == 1:
            lone_text = region_texts[0][1]
            # Unlike "isolated", spaces are ignored for the alphanumeric test.
            if len(lone_text) >= 5 and lone_text.replace(" ", "").isalnum():
                return bare_firearm_context()
        return mixed_context()

    # "mixed" and anything unrecognised
    return mixed_context()


def is_valid_serial_number(candidate):
    """Check a serial-number candidate and flag likely OCR misreads.

    A candidate passes the format check when it consists only of uppercase
    letters, digits and at most one hyphen, with 5 to 12 characters once the
    hyphen is removed.

    The whole string is checked with ``re.fullmatch``. The earlier
    ``re.match(r"^...$")`` let a candidate with a trailing newline through,
    because ``$`` also matches just before a final newline.

    Args:
        candidate: Candidate string; it is not uppercased here.

    Returns:
        ``True`` when the candidate mixes letters and digits,
        ``"potential_ocr_error"`` when it is digits only, ``False`` otherwise.
        Callers must compare against these values explicitly because the
        string result is truthy.
    """
    if candidate.count("-") > 1 or not re.fullmatch(r"[A-Z0-9-]+", candidate):
        return False

    clean = candidate.replace("-", "")
    if not 5 <= len(clean) <= 12:
        return False

    has_letters = bool(re.search(r"[A-Z]", clean))
    has_digits = bool(re.search(r"[0-9]", clean))

    # Flag digits-only sequences as potential OCR errors
    if has_digits and not has_letters:
        return "potential_ocr_error"

    return has_letters and has_digits


def calculate_context_score(candidate, source_text, bbox, context):
    """Score how plausible a serial candidate is given its region context.

    Starts at 0.5, applies penalties and bonuses that depend on the region
    type, adds the bonuses common to all types, and clamps to 0..2.

    The document label penalties are now applied as regular expressions.
    The previous test ``pattern.startswith("r")`` could never hold, because
    the ``r`` of a raw-string literal is not part of the string. As a result
    the DOB / LICENSE / DATE / EXPIRES / ISSUED penalties never fired.

    Args:
        candidate: Candidate serial string.
        source_text: OCR text the candidate was found in; the candidate is
            "standalone" when it equals this text after stripping.
        bbox: Box of the source text; ``bbox[0]`` and ``bbox[2]`` supply the
            y-coordinates used for the label-zone bucket.
        context: Dict produced by ``analyze_region_context``.

    Returns:
        A score between 0 and 2 inclusive.
    """
    score = 0.5
    candidate_upper = candidate.upper()
    is_standalone = source_text.strip() == candidate

    if context["type"] == "document":
        # Extreme document-label penalties: the first matching one is applied.
        extreme_penalties = (
            (r"POBOX", -1.5),
            (r"PO", -1.5),
            (r"DOB\d+", -1.5),
            (r"LICENSE\d+", -1.4),
            (r"DATE\d+", -1.3),
            (r"EXPIRES\d+", -1.3),
            (r"ISSUED\d+", -1.2),
        )

        for pattern, penalty in extreme_penalties:
            if re.search(pattern, candidate_upper):
                score += penalty
                break
        else:
            # Strong document penalties, only when no extreme one applied.
            if any(
                word in context["frequent_words"]
                for word in re.findall(r"[A-Z]+", candidate_upper)
            ):
                score -= 0.8
            elif any(
                addr in candidate_upper for addr in ["ST", "AVE", "BLVD", "RD", "DR"]
            ):
                score -= 0.8
            elif any(name in candidate_upper for name in ["MR", "MS", "DR"]):
                score -= 0.8

        # Same vertical bucketing as analyze_region_context uses.
        y_bucket = int((bbox[0][1] + bbox[2][1]) / 100) * 50
        if y_bucket in context["label_zones"]:
            score -= 0.6

        if is_standalone:
            score += 0.2

    elif context["type"] in ["firearm", "isolated"]:
        # Manufacturing penalties
        if re.search(r"\d+MM|CAL", candidate_upper):
            score -= 0.5
        elif candidate_upper in ["USA", "AUSTRIA", "GERMANY", "ITALY"]:
            score -= 0.4
        elif candidate in context.get("manufacturing_marks", set()):
            score -= 0.3

        if is_standalone:
            score += 0.8

    else:  # mixed
        if any(
            label in candidate_upper
            for label in ["POBOX", "DOB", "DATE", "EXP", "ISSUED", "LIC"]
        ):
            score += -1.0 if not is_standalone else -0.5

        if is_standalone:
            score += 0.4

    # Common bonuses: typical length and a balanced letter/digit mix.
    if 6 <= len(candidate.replace("-", "")) <= 10:
        score += 0.2

    letters = sum(1 for c in candidate if c.isalpha())
    digits = sum(1 for c in candidate if c.isdigit())
    if letters + digits > 0 and 0.3 <= letters / (letters + digits) <= 0.7:
        score += 0.2

    return max(0, min(2, score))


def get_method_description(method):
    """Translate a pattern method id into a human-readable description.

    Ids of the form "Pattern_N" map to a fixed description. Ids starting
    with "Combined_" get a "Combined Text: " prefix in front of the
    description of the id with every "Combined_" removed. Unknown ids fall
    back to the original *method* string.
    """
    pattern_labels = {
        "Pattern_0": "Mixed Letters+Digits 5-12 chars (Highest Priority)",
        "Pattern_1": "Letters followed by Digits (High Priority)",
        "Pattern_2": "Digits followed by Letters (Medium-High Priority)",
        "Pattern_3": "Mixed alphanumeric with hyphen (Medium Priority)",
        "Pattern_4": "Digits-only (potential OCR error) (Low Priority)",
    }
    lookup_key = method.replace("Combined_", "")
    if lookup_key in pattern_labels:
        label = pattern_labels[lookup_key]
    else:
        label = method

    if method.startswith("Combined_"):
        return f"Combined Text: {label}"
    return label


def find_serial_number(
    processed_results, all_text_combined, image_dimensions, debug=True
):
    """Main serial number detection function with multi-region analysis.

    The OCR results are split into regions, every text in every region is
    scanned with five patterns, and each match that passes
    ``is_valid_serial_number`` is scored. The space-joined text of the whole
    image is scanned as well, at a fixed low score. Candidates are then
    de-duplicated by reported serial (highest score kept) and sorted by
    score, highest first.

    Each candidate is a 10-tuple:
    ``(serial, score, method, source description, OCR confidence,
    priority, source text, status, context score, region id)``.
    Digits-only matches are reported as ``"<match>_potential_error"`` with
    status "Potential OCR Error" and half the score. Candidates from the
    combined text carry region id -1.

    Args:
        processed_results: List of ``(bbox, text, confidence)`` entries.
        all_text_combined: All OCR texts joined into one string; skipped
            when empty.
        image_dimensions: Passed through to ``detect_image_regions``.
        debug: When true, print the region classification and every
            candidate.

    Returns:
        ``(best, candidates)``: the top-scoring candidate tuple (``None``
        if there is none) and the full sorted candidate list.
    """
    regions = detect_image_regions(processed_results, image_dimensions)

    if debug:
        if len(regions) > 1:
            print(
                f"  Image classified as: multi-region ({len(regions)} regions detected)"
            )
            for i, region in enumerate(regions):
                print(
                    f"    Region {i+1}: {region['type']} ({len(region['texts'])} texts)"
                )
        else:
            print(f"  Image classified as: {regions[0]['type']}")

    # Ordered by priority; the list index becomes the "Pattern_N" method id
    # and earlier patterns get a slightly larger score bonus.
    patterns = [
        r"\b[A-Z0-9]{5,12}\b",
        r"\b[A-Z]{1,6}[0-9]{1,11}\b",
        r"\b[0-9]{1,11}[A-Z]{1,6}\b",
        r"\b[A-Z0-9]{2,6}-[A-Z0-9]{2,6}\b",
        r"\b[0-9]{5,12}\b",
    ]

    candidates = []

    # Process each region
    for region in regions:
        context = analyze_region_context(region["texts"], region["type"])

        for pattern_idx, pattern in enumerate(patterns):
            for bbox, text, confidence in region["texts"]:
                for match in re.findall(pattern, text):
                    validation = is_valid_serial_number(match)

                    # The validator returns True, False or the string
                    # "potential_ocr_error", hence the explicit comparisons.
                    if validation == True or validation == "potential_ocr_error":
                        context_score = calculate_context_score(
                            match, text, bbox, context
                        )
                        region_bonus = (
                            0.3 if context["type"] in ["firearm", "isolated"] else 0.0
                        )

                        # Weighted sum: OCR confidence, context score, a
                        # small pattern-priority term and the region bonus.
                        score = (
                            confidence * 0.2
                            + context_score * 0.7
                            + (len(patterns) - pattern_idx) * 0.02
                            + region_bonus
                        )

                        # Determine status and serial number to report
                        if validation == "potential_ocr_error":
                            score *= 0.5
                            status = "Potential OCR Error"
                            reported_serial = f"{match}_potential_error"  # Add suffix for potential errors
                        else:
                            status = "Valid"
                            reported_serial = match

                        candidates.append(
                            (
                                reported_serial,
                                score,
                                f"Pattern_{pattern_idx}",
                                f"From OCR text: '{text}'",
                                confidence,
                                pattern_idx + 1,
                                text,
                                status,
                                context_score,
                                region["region_id"],
                            )
                        )

    # Process combined text: only fully valid matches are kept, with a fixed
    # confidence and context score of 0.3 and region id -1.
    if all_text_combined:
        for pattern_idx, pattern in enumerate(patterns):
            for match in re.findall(pattern, all_text_combined):
                validation = is_valid_serial_number(match)
                if validation == True:
                    score = 0.3 * 0.2 + 0.3 * 0.7 + (len(patterns) - pattern_idx) * 0.01
                    candidates.append(
                        (
                            match,
                            score,
                            f"Combined_Pattern_{pattern_idx}",
                            "From combined text",
                            0.3,
                            pattern_idx + 1,
                            "Combined OCR text",
                            "Valid",
                            0.3,
                            -1,
                        )
                    )

    # Remove duplicates, keeping highest score
    unique_candidates = {}
    for candidate in candidates:
        serial = candidate[0]
        if (
            serial not in unique_candidates
            or candidate[1] > unique_candidates[serial][1]
        ):
            unique_candidates[serial] = candidate

    final_candidates = sorted(
        unique_candidates.values(), key=lambda x: x[1], reverse=True
    )

    # Display results
    if debug and final_candidates:
        print(f"Serial number candidates (ALL {len(final_candidates)}):")
        for i, (
            serial,
            score,
            method,
            source,
            conf,
            priority,
            source_text,
            status,
            context_score,
            region_id,
        ) in enumerate(final_candidates):
            method_desc = get_method_description(method)
            region_info = (
                f" [Region {region_id+1}]" if region_id >= 0 else " [Combined]"
            )
            print(
                f"  {i+1}. '{serial}' (score: {score:.3f}, method: {method}, priority: {priority}, conf: {conf:.3f}, status: {status}){region_info}"
            )
            print(f"      Method: {method_desc}")
            print(f"      Source text: '{source_text}'")

    # Two-element tuple: (best candidate or None, all candidates).
    return final_candidates[0] if final_candidates else None, final_candidates


In [None]:
# CELL 5: Serial Number Extraction - Main Execution
"""
Main execution pipeline for serial number detection.
This cell processes images using the functions defined in the previous cell.
"""

import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
import matplotlib.pyplot as plt

# Point at the ./test folder under the current working directory and build
# the PaddleOCR engine used by the rest of this cell.
test_folder = os.path.join(os.getcwd(), "test")
ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)

# File-name suffixes (compared case-insensitively) that are treated as images.
image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
file_list = [
    entry
    for entry in os.listdir(test_folder)
    if entry.lower().endswith(tuple(image_extensions))
]
print(f"Found {len(file_list)} image files in test folder")

# Process each file: OCR it, pick a serial-number candidate, plot the OCR
# boxes, and collect one CSV row per image in which a serial number was found.
csv_results = []

for filename in file_list:
    file_path = os.path.join(test_folder, filename)

    print(f"\nProcessing: {filename}")

    # Extract file metadata (get_file_metadata is defined in an earlier cell;
    # its result is only read through .get() below)
    file_metadata = get_file_metadata(file_path)

    # Load and validate image; cv2.imread returns None for unreadable files
    img = cv2.imread(file_path)
    if img is None:
        print(f"Could not read image: {filename}")
        continue

    print(f"  Dimensions: {img.shape[1]}x{img.shape[0]}")

    # Perform OCR; an empty result or an empty first page means no text
    ocr_results = ocr.ocr(img, cls=True)
    if not ocr_results or not ocr_results[0]:
        print(f"  No text detected")
        continue

    # Process OCR results into (bbox, text, confidence) triples and keep the
    # cleaned text of every line for the combined-text search
    processed_results = []
    all_text = []

    for line in ocr_results[0]:
        # Each OCR line unpacks as bbox, (text, confidence)
        bbox, (text, confidence) = line
        text = combine_spaced_alphanumeric(text.strip())
        processed_results.append((bbox, text, confidence))
        all_text.append(text)

    all_text_combined = " ".join(all_text)
    print(f"  OCR extracted {len(all_text_combined)} characters")

    # Find serial numbers using multi-region analysis.
    # find_serial_number returns (best_candidate_or_None, all_candidates);
    # the image size is passed as (width, height).
    selected_candidate, all_candidates = find_serial_number(
        processed_results, all_text_combined, (img.shape[1], img.shape[0]), debug=True
    )

    # Serial strings (element 0 of each candidate tuple) used for highlighting
    candidate_serials = (
        [candidate[0] for candidate in all_candidates] if all_candidates else []
    )

    # Display image with colored bounding boxes
    plt.figure(figsize=(15, 10))
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img_rgb)

    for bbox, text, confidence in processed_results:
        # Determine box color based on serial number detection.
        # The "_potential_error" marker is stripped before the substring test
        # (presumably a suffix added by find_serial_number to flag doubtful
        # serials; confirm against that function).
        if (
            selected_candidate
            and selected_candidate[0].replace("_potential_error", "") in text
        ):
            color, linewidth = "red", 3  # Best candidate
        elif any(
            candidate.replace("_potential_error", "") in text
            for candidate in candidate_serials
        ):
            color, linewidth = "blue", 2  # Other candidates
        else:
            color, linewidth = "gray", 1  # Regular text

        # Draw bounding box as a closed polyline through its four corners
        points = np.array(bbox, dtype=np.int32)
        plt.plot(
            [points[0][0], points[1][0], points[2][0], points[3][0], points[0][0]],
            [points[0][1], points[1][1], points[2][1], points[3][1], points[0][1]],
            color=color,
            linewidth=linewidth,
        )

        # Add text label just above the first corner of the box
        plt.text(
            points[0][0],
            points[0][1] - 5,
            f"{text} ({confidence:.2f})",
            fontsize=8,
            color=color,
            weight="bold",
        )

    plt.title(f"Multi-Region Serial Number Detection: {filename}")
    plt.axis("off")
    plt.tight_layout()
    plt.show()

    # Output results and store for CSV
    if selected_candidate:
        # A candidate is a 10-tuple; the score, source and OCR confidence
        # fields are not needed here and are discarded.
        (
            serial_number,
            _,
            method,
            _,
            _,
            priority,
            source_text,
            status,
            context_score,
            region_id,
        ) = selected_candidate
        method_desc = get_method_description(method)

        print(f"Found 1 serial numbers")
        print(f"Serial numbers: {serial_number}")
        print(f"Method: {method} (Priority: {priority}) - {method_desc}")
        print(f"Status: {status}")
        print(f"Context Score: {context_score:.3f}")
        # A negative region_id marks a candidate taken from the combined text
        if region_id >= 0:
            print(f"Region: {region_id+1}")
        print(f"Extracted from: '{source_text}'")

        # Store result for CSV output: eight fields per row, with the four
        # event/name/address fields left blank at this stage
        csv_results.append(
            [
                serial_number,
                "",
                "",
                "",
                "",
                filename,
                file_metadata.get("file_creation_date", "Unknown"),
                file_metadata.get("file_modification_date", "Unknown"),
            ]
        )
    else:
        print(f"Found 0 serial numbers")

# Display final summary
print(f"\n📋 Processed {len(csv_results)} files with serial numbers")

# Display results in CSV format.
# csv.writer handles quoting for fields that contain commas, double quotes or
# line breaks; wrapping comma-containing fields in quotes by hand leaves
# embedded quotes unescaped and produces malformed rows.
import csv
import sys

print("\nEXTRACTION RESULTS (CSV FORMAT)")
print(
    "serial_number,event_type,event_date,associated_name,associated_address,source_file,file_created,file_modified"
)
csv_writer = csv.writer(sys.stdout, lineterminator="\n")
for row in csv_results:
    # str() keeps the previous rendering of non-string values (e.g. None)
    csv_writer.writerow([str(item) for item in row])


In [None]:
# CELL 6.3: Event Type Extraction Implementation with Checkbox Detection

"""
Implementation of the event type extraction function using TF-IDF and cosine similarity.
Includes checkbox detection to identify selected event types.
"""

import re
import numpy as np
import cv2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def detect_checkboxes(image, processed_results, debug=False):
    """
    Locate small square contours and report which event-type labels sit next
    to a checked one.

    Args:
        image: BGR image (it is converted with cv2.COLOR_BGR2GRAY)
        processed_results: List of (bbox, text, confidence) tuples from OCR
        debug: Whether to print debug information

    Returns:
        Dictionary mapping an event-type label to True. Only labels found
        next to a checked box are stored; unchecked boxes add no entry.
    """
    # Binarise with Otsu's threshold, inverted so dark ink becomes non-zero
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Only outermost contours are considered
    outlines, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Keep contours that are roughly square, 15-25 px on each side, and whose
    # polygon approximation has four vertices
    boxes = []
    for outline in outlines:
        left, top, width, height = cv2.boundingRect(outline)

        roughly_square = 0.8 <= float(width) / height <= 1.2
        plausible_size = 15 <= width <= 25 and 15 <= height <= 25
        if not (roughly_square and plausible_size):
            continue

        outline_length = cv2.arcLength(outline, True)
        polygon = cv2.approxPolyDP(outline, 0.04 * outline_length, True)
        if len(polygon) == 4:
            boxes.append((left, top, width, height))

    if debug:
        print(f"Found {len(boxes)} potential checkboxes")

    # Labels that may be paired with a checkbox (exact text match)
    event_types = ["Burglary", "Robbery", "Larceny", "Missing Inventory"]

    checkbox_status = {}
    for left, top, width, height in boxes:
        # Share of non-zero pixels inside the box's bounding rectangle.
        # NOTE(review): the rectangle includes the box outline itself, so the
        # border pixels count towards this ratio; confirm the 20% cut-off
        # still separates empty from ticked boxes on the real forms.
        patch = binary[top:top + height, left:left + width]
        filled_ratio = np.count_nonzero(patch) / (width * height)

        is_checked = filled_ratio > 0.2
        if not is_checked:
            continue

        box_cx = left + width / 2
        box_cy = top + height / 2

        # Closest event-type label whose centre lies within 50 px of the box
        best_distance = float('inf')
        best_label = None
        for bbox, text, _ in processed_results:
            if text not in event_types:
                continue

            label_cx = sum([corner[0] for corner in bbox]) / len(bbox)
            label_cy = sum([corner[1] for corner in bbox]) / len(bbox)
            distance = ((label_cx - box_cx) ** 2 +
                        (label_cy - box_cy) ** 2) ** 0.5

            if distance < 50 and distance < best_distance:
                best_distance = distance
                best_label = text

        if best_label:
            checkbox_status[best_label] = is_checked
            if debug:
                print(f"Checkbox near '{best_label}' is checked (Distance: {best_distance:.1f})")

    return checkbox_status

def calculate_position_score(bbox, document_height=3000):
    """
    Score a text element by how close it is to the top of the document.

    Args:
        bbox: Sequence of (x, y) corner points; only the y-coordinate of the
            first point is used.
        document_height: Height in pixels used to normalise the y-coordinate.
            Defaults to 3000, the value that was previously hard-coded.

    Returns:
        Float in [0.0, 1.0]: 1.0 at the top edge (or above it), falling
        linearly to 0.0 at ``document_height`` and below.

    Raises:
        ValueError: If ``document_height`` is not positive.
    """
    if document_height <= 0:
        raise ValueError("document_height must be positive")

    # Vertical position of the first corner of the box
    y_position = bbox[0][1]

    # Clamp so coordinates outside the page still yield a score in [0, 1]
    normalized_position = min(1.0, max(0.0, y_position / document_height))

    # Higher score for elements closer to the top
    return 1.0 - normalized_position

def calculate_contextual_relevance(text, bbox, processed_results):
    """
    Score how plausible a text element is as an event type from its context.

    The score blends the element's vertical position (weight 0.4) with its
    proximity to the nearest OCR element that contains an event-related
    keyword (weight 0.6).

    Args:
        text: The element's text. Currently unused; kept for interface
            compatibility.
        bbox: Corner points of the element being scored.
        processed_results: List of (bbox, text, confidence) tuples from OCR.

    Returns:
        Float relevance score; 0.4 * position_score + 0.6 * proximity_score.
    """
    # Position-based score (prefer text near the top of the document)
    position_score = calculate_position_score(bbox)

    # Keywords that mark a label for an event-type field
    event_indicators = ["event", "incident", "type", "description", "category"]
    proximity_score = 0

    # Centre of the element being scored; invariant across the loop
    center1_x = sum([point[0] for point in bbox]) / len(bbox)
    center1_y = sum([point[1] for point in bbox]) / len(bbox)

    for other_bbox, other_text, _ in processed_results:
        if bbox == other_bbox:
            continue  # do not compare the element with itself

        other_text_lower = other_text.lower()
        if not any(indicator in other_text_lower for indicator in event_indicators):
            continue

        center2_x = sum([point[0] for point in other_bbox]) / len(other_bbox)
        center2_y = sum([point[1] for point in other_bbox]) / len(other_bbox)
        distance = ((center1_x - center2_x) ** 2 + (center1_y - center2_y) ** 2) ** 0.5

        if distance < 100:
            tier_score = 1.0
        elif distance < 200:
            tier_score = 0.8
        elif distance < 300:
            tier_score = 0.6
        elif distance < 400:
            tier_score = 0.4
        else:
            tier_score = 0.2

        # Keep the best tier seen so far: a farther indicator processed later
        # must not overwrite the score earned by a closer one.
        proximity_score = max(proximity_score, tier_score)

    # Combine scores
    return 0.4 * position_score + 0.6 * proximity_score

def find_event_type(processed_results, all_text_combined, image=None, serial_number=None, debug=True):
    """
    Choose the OCR text element that best names the event type.

    A checked checkbox next to a known event-type label wins outright.
    Otherwise every OCR element is compared, via TF-IDF vectors and cosine
    similarity, with example phrases for each event category and ranked by
    a blend of semantic similarity, contextual relevance and OCR confidence.

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR
        all_text_combined: Combined text from all OCR results
        image: Original image for checkbox detection (skipped when None)
        serial_number: Optional serial number for context (not used here)
        debug: Whether to print debug information

    Returns:
        Tuple ``(event_type, candidates)``. ``event_type`` is the chosen text
        or "" when nothing qualifies; ``candidates`` is the ranked list of
        candidate dicts (empty on every early exit, including the checkbox
        shortcut).
    """
    # Candidates below this cosine similarity are discarded
    SEMANTIC_THRESHOLD = 0.3

    # Too little text to say anything meaningful
    if len(all_text_combined) < 20 or len(processed_results) < 3:
        if debug:
            print("Not enough text to extract event type")
        return "", []

    # Example phrases per event category
    event_categories = {
        "theft": ["theft", "stolen", "burglary", "robbery", "shoplifting", "larceny"],
        "loss": ["loss", "lost", "misplaced", "missing", "inventory loss", "missing inventory"],
        "damage": ["damage", "damaged", "destruction", "vandalism", "defaced"],
        "other": ["other", "unknown", "miscellaneous", "unspecified"]
    }

    # Checkbox shortcut: a ticked box beside a known label decides the result
    checkbox_status = {}
    if image is not None:
        checkbox_status = detect_checkboxes(image, processed_results, debug)

        checked_items = [label for label, ticked in checkbox_status.items() if ticked]
        if debug and checked_items:
            print(f"Detected checked items: {checked_items}")

        common_event_types = ["Burglary", "Robbery", "Larceny", "Missing Inventory"]
        for item in checked_items:
            if item not in common_event_types:
                continue
            if debug:
                print(f"Selected event type from checkbox: {item}")
            return item, []

    # Lay the example phrases out in one list and remember each category's
    # half-open [start, end) slice within it
    example_phrases = []
    category_spans = {}
    for label, phrases in event_categories.items():
        span_start = len(example_phrases)
        example_phrases.extend(phrases)
        category_spans[label] = (span_start, len(example_phrases))

    ocr_texts = [entry[1] for entry in processed_results]
    if not ocr_texts:
        if debug:
            print("No text found in processed results")
        return "", []

    vectorizer = TfidfVectorizer(stop_words='english')
    corpus = example_phrases + ocr_texts

    try:
        # One shared vocabulary: example rows first, OCR rows after them
        tfidf_matrix = vectorizer.fit_transform(corpus)
        split_at = len(example_phrases)
        examples_matrix = tfidf_matrix[:split_at]
        texts_matrix = tfidf_matrix[split_at:]
    except Exception as e:
        if debug:
            print(f"Error in TF-IDF vectorization: {e}")
        return "", []

    candidates = []

    for row, (bbox, text, confidence) in enumerate(processed_results):
        # Very short fragments carry no usable signal
        if len(text) < 3:
            continue

        try:
            text_vector = texts_matrix[row]

            # Best similarity against any example phrase, per category
            scores = {}
            for label, (span_start, span_end) in category_spans.items():
                similarities = cosine_similarity(text_vector, examples_matrix[span_start:span_end])[0]
                scores[label] = np.max(similarities) if len(similarities) > 0 else 0

            if scores:
                best_label, best_similarity = max(scores.items(), key=lambda pair: pair[1])
            else:
                best_label, best_similarity = "unknown", 0

            context_score = calculate_contextual_relevance(text, bbox, processed_results)

            # Text sitting next to a ticked checkbox has its score doubled
            is_checked = checkbox_status.get(text, False)
            checkbox_bonus = 2.0 if is_checked else 1.0

            total_score = (best_similarity * 0.5 + context_score * 0.5) * confidence * checkbox_bonus

            candidates.append({
                'text': text,
                'score': total_score,
                'category': best_label,
                'semantic_score': best_similarity,
                'context_score': context_score,
                'bbox': bbox,
                'confidence': confidence,
                'is_checked': checkbox_status.get(text, False)
            })
        except Exception as e:
            if debug:
                print(f"Error processing text '{text}': {e}")
            continue

    if not candidates:
        if debug:
            print("No event type candidates found")
        return "", []

    # Drop weak semantic matches, then prefer checked entries when any exist
    valid_candidates = [entry for entry in candidates if entry['semantic_score'] >= SEMANTIC_THRESHOLD]
    checked_candidates = [entry for entry in valid_candidates if entry['is_checked']]
    if checked_candidates:
        valid_candidates = checked_candidates

    # Highest score first
    valid_candidates = sorted(valid_candidates, key=lambda entry: entry['score'], reverse=True)

    if debug:
        if valid_candidates:
            print(f"\nEvent type candidates ({len(valid_candidates)}):")
            for rank, candidate in enumerate(valid_candidates[:5]):  # Show top 5
                checked = ", Checked: Yes" if candidate['is_checked'] else ""
                print(f"  {rank+1}. '{candidate['text']}' - Score: {candidate['score']:.3f} "
                      f"(Category: {candidate['category']}, Semantic: {candidate['semantic_score']:.3f}, "
                      f"Context: {candidate['context_score']:.3f}{checked})")
        else:
            print("No valid event type candidates found")

    if not valid_candidates:
        return "", valid_candidates
    return valid_candidates[0]['text'], valid_candidates


# Test the event type extraction on ALL images: OCR each file, extract the
# serial number and event type, plot the OCR boxes, and print a summary table.
print("\n===== TESTING EVENT TYPE EXTRACTION ON ALL IMAGES =====")

# Initialize results storage (one dict per successfully OCR'd image)
event_type_results = []

if file_list:
    for test_file in file_list:
        print(f"\n{'-'*60}")
        print(f"Processing: {test_file}")
        test_path = os.path.join(test_folder, test_file)

        # Load and process image; cv2.imread returns None for unreadable files
        image = cv2.imread(test_path)
        if image is None:
            print(f"Could not read image: {test_file}")
            continue

        print(f"  Dimensions: {image.shape[1]}x{image.shape[0]}")

        # Perform OCR
        ocr_results = ocr.ocr(image, cls=True)
        if not ocr_results or not ocr_results[0]:
            print(f"  No text detected")
            continue

        # Process OCR results into (bbox, text, confidence) triples
        processed_results = []
        all_text = []

        for line in ocr_results[0]:
            bbox, (text, confidence) = line
            text = combine_spaced_alphanumeric(text.strip())
            processed_results.append((bbox, text, confidence))
            all_text.append(text)

        all_text_combined = " ".join(all_text)
        print(f"  OCR extracted {len(all_text_combined)} characters")

        # Get image dimensions for serial number extraction
        image_dimensions = (image.shape[1], image.shape[0])  # width, height

        # Extract serial number first to use as context.
        # find_serial_number returns (best_candidate_or_None, all_candidates),
        # where a candidate is a tuple whose first element is the serial
        # string, so the string has to be pulled out of the tuple.
        best_serial_candidate, serial_candidates = find_serial_number(processed_results, all_text_combined, image_dimensions, debug=False)
        serial_number = best_serial_candidate[0] if best_serial_candidate else None
        if serial_number:
            print(f"Serial Number: '{serial_number}'")

        # Extract event type
        event_type, all_candidates = find_event_type(processed_results, all_text_combined, image, serial_number, debug=True)
        print(f"\nExtracted Event Type: '{event_type}'")

        # Store result
        event_type_results.append({
            'filename': test_file,
            'event_type': event_type
        })

        # Visualize the result with colored bounding boxes
        plt.figure(figsize=(15, 10))
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.imshow(img_rgb)

        # Get all candidate texts for visualization
        candidate_texts = [candidate['text'] for candidate in all_candidates if candidate['bbox'] is not None]

        for bbox, text, confidence in processed_results:
            # Determine box color based on detection
            if event_type and text.strip() == event_type:
                color, linewidth = "red", 3  # Event type
            elif any(text.strip() == candidate_text for candidate_text in candidate_texts):
                color, linewidth = "blue", 2  # Other candidates
            else:
                color, linewidth = "gray", 1  # Regular text

            # Draw bounding box as a closed polyline through its four corners
            points = np.array(bbox, dtype=np.int32)
            plt.plot(
                [points[0][0], points[1][0], points[2][0], points[3][0], points[0][0]],
                [points[0][1], points[1][1], points[2][1], points[3][1], points[0][1]],
                color=color,
                linewidth=linewidth,
            )

            # Add text label just above the first corner of the box
            plt.text(
                points[0][0],
                points[0][1] - 5,
                f"{text} ({confidence:.2f})",
                fontsize=8,
                color=color,
                weight="bold",
            )

        plt.title(f"Event Type Detection: {test_file}")
        plt.axis("off")
        plt.tight_layout()
        plt.show()

    # Display summary of results
    print(f"\n{'-'*60}")
    print(f"EVENT TYPE EXTRACTION SUMMARY")
    print(f"{'-'*60}")
    print(f"Processed {len(event_type_results)} images")
    print(f"Found event types in {sum(1 for r in event_type_results if r['event_type'])} images")

    # Display table of results
    if event_type_results:
        results_df = pd.DataFrame(event_type_results)
        print("\nExtracted Event Types:")
        print(results_df.to_string(index=False))
else:
    print("No test images available for event type extraction testing")

In [None]:
# CELL 6.4: Event Date Extraction Implementation

"""
Implementation of the event date extraction function.
Uses proximity to event_type and date pattern recognition.
"""

import re
import dateutil.parser
from datetime import datetime

def is_valid_date_format(text):
    """Return True when *text* contains something shaped like a date.

    The patterns are searched anywhere in the text, case-insensitively, and
    cover numeric dates with ``-``, ``/`` or ``.`` separators as well as
    dates written with abbreviated or full month names.
    """
    date_patterns = (
        # Numeric with three parts: MM/DD/YYYY, DD/MM/YYYY, etc.
        r'\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4}',
        # Numeric with two parts: MM/YY, MM/YYYY
        r'\d{1,2}[-/\.]\d{2,4}',
        # Abbreviated month first: Jan 1, 2020
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{2,4}\b',
        # Day first with abbreviated month: 1 Jan 2020
        r'\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{2,4}\b',
        # Full month name first: January 1, 2020
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{2,4}\b',
        # Day first with full month name: 1 January 2020
        r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{2,4}\b',
    )

    return any(
        re.search(pattern, text, re.IGNORECASE) is not None
        for pattern in date_patterns
    )

def extract_date_from_text(text):
    """
    Parse a date out of free text and normalise it to MM/DD/YYYY.

    Parsing is done with ``dateutil.parser.parse(text, fuzzy=True)``. When a
    year that is not spelled out in the text (i.e. expanded from a two-digit
    year) lands more than one year in the future, it is moved back a century.

    Args:
        text: OCR text that may contain a date.

    Returns:
        The date formatted as "MM/DD/YYYY", or None when no date could be
        parsed.
    """
    try:
        date = dateutil.parser.parse(text, fuzzy=True)

        # Two-digit years can be expanded into the future (e.g. "68" -> 2068).
        # The previous check compared ``date.year - 2000`` with the current
        # year, which never holds for realistic dates, so the correction
        # never ran. Years written out in full are left untouched.
        current_year = datetime.now().year
        if date.year > current_year + 1 and str(date.year) not in text:
            date = date.replace(year=date.year - 100)

        # Format date consistently
        return date.strftime("%m/%d/%Y")
    except Exception:
        # Best-effort parsing of noisy OCR text: any parse failure means
        # "no date". Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return None

def find_date_labels(processed_results):
    """Collect the OCR entries that look like labels of a date field.

    An entry qualifies when its lower-cased text contains one of the date
    keywords and, in addition, either contains "date" or the original text
    contains a colon. Matching (bbox, text, confidence) tuples are returned
    in their original order.
    """
    date_keywords = ["date", "occurred", "reported", "discovered", "incident", "event"]

    def looks_like_label(raw_text):
        lowered = raw_text.lower()
        if not any(keyword in lowered for keyword in date_keywords):
            return False
        return "date" in lowered or ":" in raw_text

    return [
        (bbox, text, confidence)
        for bbox, text, confidence in processed_results
        if looks_like_label(text)
    ]

def calculate_distance(bbox1, bbox2):
    """Return the Euclidean distance between the centres of two boxes.

    Each box is a sequence of (x, y) points; its centre is the mean of the
    points' coordinates.
    """
    def centre_of(box):
        count = len(box)
        return (
            sum([point[0] for point in box]) / count,
            sum([point[1] for point in box]) / count,
        )

    first_x, first_y = centre_of(bbox1)
    second_x, second_y = centre_of(bbox2)
    return ((first_x - second_x) ** 2 + (first_y - second_y) ** 2) ** 0.5

def find_event_date(processed_results, event_type_bbox=None, debug=True):
    """
    Pick the most likely event date from the OCR results.

    Dates are ranked mainly by how close they sit to the detected event type
    (weight 0.9) and secondarily by how close they sit to a date label
    (weight 0.1), scaled by OCR confidence.

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR
        event_type_bbox: Bounding box of the detected event type
        debug: Whether to print debug information

    Returns:
        Tuple ``(event_date, candidates)``. ``event_date`` is the best date in
        MM/DD/YYYY format, or "" when none was found (or when no event-type
        box was given); ``candidates`` is the score-sorted list of candidate
        dicts.
    """
    # Without an event type there is nothing to anchor the search to
    if event_type_bbox is None:
        if debug:
            print("No event_type found, skipping event_date extraction")
        return "", []

    # Beyond this distance from the event type the proximity score collapses
    DISTANCE_THRESHOLD = 2000

    def event_proximity(distance_to_event):
        # Linear fall-off inside the threshold, severe penalty beyond it
        if distance_to_event < DISTANCE_THRESHOLD:
            return max(0, 1 - (distance_to_event / DISTANCE_THRESHOLD))
        return max(0, 0.1 - (distance_to_event - DISTANCE_THRESHOLD) / 1000)

    def make_candidate(bbox, text, confidence, date_str, label_text, label_distance, label_score):
        # Assemble one candidate record with its combined score
        distance_to_event = calculate_distance(bbox, event_type_bbox)
        total_score = (event_proximity(distance_to_event) * 0.9 + label_score * 0.1) * confidence
        return {
            'date': date_str,
            'text': text,
            'score': total_score,
            'bbox': bbox,
            'confidence': confidence,
            'distance_to_event': distance_to_event,
            'nearest_label': label_text,
            'label_distance': label_distance
        }

    date_labels = find_date_labels(processed_results)
    date_candidates = []

    # First pass: text that is itself shaped like a date
    for bbox, text, confidence in processed_results:
        if len(text) < 4:
            continue
        if not is_valid_date_format(text):
            continue

        date_str = extract_date_from_text(text)
        if not date_str:
            continue

        # Nearest date label (first one wins on ties)
        min_label_distance = float('inf')
        nearest_label = None
        for label_bbox, label_text, _ in date_labels:
            gap = calculate_distance(bbox, label_bbox)
            if gap < min_label_distance:
                min_label_distance = gap
                nearest_label = label_text

        label_proximity_score = max(0, 1 - (min_label_distance / 500)) if nearest_label else 0

        date_candidates.append(
            make_candidate(bbox, text, confidence, date_str,
                           nearest_label, min_label_distance, label_proximity_score)
        )

    # Second pass: anything parseable as a date within 200 px of a date label
    for label_bbox, label_text, _ in date_labels:
        for bbox, text, confidence in processed_results:
            # Text already recorded as a candidate is not added twice
            if any(existing['text'] == text for existing in date_candidates):
                continue

            distance = calculate_distance(bbox, label_bbox)
            if not distance < 200:
                continue

            date_str = extract_date_from_text(text)
            if not date_str:
                continue

            label_proximity_score = max(0, 1 - (distance / 500))
            date_candidates.append(
                make_candidate(bbox, text, confidence, date_str,
                               label_text, distance, label_proximity_score)
            )

    # Best score first
    date_candidates.sort(key=lambda entry: entry['score'], reverse=True)

    if debug and date_candidates:
        print(f"\nEvent date candidates ({len(date_candidates)}):")
        for rank, candidate in enumerate(date_candidates[:5]):  # Show top 5
            print(f"  {rank+1}. '{candidate['date']}' (from '{candidate['text']}') - Score: {candidate['score']:.3f}, "
                  f"Distance to event: {candidate['distance_to_event']:.1f}, "
                  f"Label: '{candidate['nearest_label']}' ({candidate['label_distance']:.1f})")

    if not date_candidates:
        return "", date_candidates
    return date_candidates[0]['date'], date_candidates


# Test the event date extraction on ALL images: OCR each file, extract the
# event type and then the event date, plot the OCR boxes, and print a summary.
print("\n===== TESTING EVENT DATE EXTRACTION ON ALL IMAGES =====")

# Initialize results storage (one dict per successfully OCR'd image)
event_date_results = []

if file_list:
    for test_file in file_list:
        print(f"\n{'-'*60}")
        print(f"Processing: {test_file}")
        test_path = os.path.join(test_folder, test_file)

        # Load and process image; cv2.imread returns None for unreadable files
        image = cv2.imread(test_path)
        if image is None:
            print(f"Could not read image: {test_file}")
            continue

        print(f"  Dimensions: {image.shape[1]}x{image.shape[0]}")

        # Perform OCR
        ocr_results = ocr.ocr(image, cls=True)
        if not ocr_results or not ocr_results[0]:
            print(f"  No text detected")
            continue

        # Process OCR results into (bbox, text, confidence) triples
        processed_results = []
        all_text = []

        for line in ocr_results[0]:
            bbox, (text, confidence) = line
            text = combine_spaced_alphanumeric(text.strip())
            processed_results.append((bbox, text, confidence))
            all_text.append(text)

        all_text_combined = " ".join(all_text)
        print(f"  OCR extracted {len(all_text_combined)} characters")

        # First extract event type. The image is passed so checkbox detection
        # runs here too and the event type agrees with the one reported by
        # the event-type extraction cell.
        event_type, event_type_candidates = find_event_type(processed_results, all_text_combined, image, debug=False)

        # Get event type bbox if found
        event_type_bbox = None
        if event_type:
            print(f"Event Type: '{event_type}'")
            # Find the bbox for the event type (first OCR element with that text)
            for bbox, text, _ in processed_results:
                if text.strip() == event_type:
                    event_type_bbox = bbox
                    break
        else:
            print("No event type found")

        # Extract event date only if event type was found
        if event_type_bbox is not None:
            event_date, date_candidates = find_event_date(processed_results, event_type_bbox, debug=True)
            print(f"\nExtracted Event Date: '{event_date}'")
        else:
            event_date = ""
            date_candidates = []
            print("\nSkipping event date extraction (no event type found)")

        # Store result
        event_date_results.append({
            'filename': test_file,
            'event_type': event_type,
            'event_date': event_date
        })

        # Visualize the result with colored bounding boxes
        plt.figure(figsize=(15, 10))
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.imshow(img_rgb)

        # Get all candidate texts for visualization
        date_texts = [candidate['text'] for candidate in date_candidates if candidate['bbox'] is not None]

        # OCR text the winning date was parsed from. event_date itself is
        # normalised to MM/DD/YYYY, so it generally differs from the raw OCR
        # text and cannot be used to find the winning box.
        best_date_text = date_candidates[0]['text'] if date_candidates else None

        for bbox, text, confidence in processed_results:
            # Determine box color based on detection
            if event_type and text.strip() == event_type:
                color, linewidth = "red", 3  # Event type
            elif best_date_text is not None and text.strip() == best_date_text:
                color, linewidth = "green", 3  # Event date
            elif any(text.strip() == date_text for date_text in date_texts):
                color, linewidth = "blue", 2  # Other date candidates
            else:
                color, linewidth = "gray", 1  # Regular text

            # Draw bounding box as a closed polyline through its four corners
            points = np.array(bbox, dtype=np.int32)
            plt.plot(
                [points[0][0], points[1][0], points[2][0], points[3][0], points[0][0]],
                [points[0][1], points[1][1], points[2][1], points[3][1], points[0][1]],
                color=color,
                linewidth=linewidth,
            )

            # Add text label just above the first corner of the box
            plt.text(
                points[0][0],
                points[0][1] - 5,
                f"{text} ({confidence:.2f})",
                fontsize=8,
                color=color,
                weight="bold",
            )

        plt.title(f"Event Date Detection: {test_file}")
        plt.axis("off")
        plt.tight_layout()
        plt.show()

    # Display summary of results
    print(f"\n{'-'*60}")
    print(f"EVENT DATE EXTRACTION SUMMARY")
    print(f"{'-'*60}")
    print(f"Processed {len(event_date_results)} images")
    print(f"Found event dates in {sum(1 for r in event_date_results if r['event_date'])} images")

    # Display table of results
    if event_date_results:
        results_df = pd.DataFrame(event_date_results)
        print("\nExtracted Event Dates:")
        print(results_df.to_string(index=False))
else:
    print("No test images available for event date extraction testing")

In [None]:
# CELL 6.5: Associated Name Extraction Implementation

"""
Implementation of the associated name extraction function.
Identifies person names associated with the event (reporter or owner).
"""

import re
import numpy as np

def is_person_name(text):
    """Return True when the whole of ``text`` is shaped like a personal name.

    The string must match, end to end, one of the layouts listed below
    (mixed-case or all-caps, optionally comma-separated "Last, First").
    """
    layouts = (
        r'^[A-Z][a-z]+\s+[A-Z][a-z]+$',  # First Last
        r'^[A-Z][a-z]+\s+[A-Z]\.\s*[A-Z][a-z]+$',  # First M. Last
        r'^[A-Z][a-z]+\s+[A-Z][a-z]+\s+[A-Z][a-z]+$',  # First Middle Last
        r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$',  # Last, First
        r'^[A-Z][a-z]+,\s*[A-Z][a-z]+\s+[A-Z][a-z]+$',  # Last, First Middle
        r'^[A-Z][a-z]+,\s*[A-Z][a-z]+\s+[A-Z]\.$',  # Last, First M.
        r'^[A-Z]+\s+[A-Z]+$',  # FIRST LAST (all caps)
        r'^[A-Z]+,\s*[A-Z]+$',  # LAST, FIRST (all caps)
        r'^[A-Z]+,\s*[A-Z]+\s+[A-Z]+$',  # LAST, FIRST MIDDLE (all caps)
        r'^[A-Z]+,\s*[A-Z]+\s+[A-Z]\.?$',  # LAST, FIRST M. (all caps)
    )

    # A single matching layout is enough.
    return any(re.match(layout, text) for layout in layouts)

def contains_name_indicator(text):
    """Return True if ``text`` mentions any keyword that hints at a name field.

    The comparison is case-insensitive and matches keywords as substrings.
    """
    lowered = text.lower()
    keywords = ("name", "reported by", "owner", "licensee", "person", "signature", "signed")
    return any(keyword in lowered for keyword in keywords)

def find_name_labels(processed_results):
    """Collect the OCR elements whose text looks like a name-field label.

    Each element is a ``(bbox, text, confidence)`` triple; an element is kept
    when its lower-cased text contains one of the label phrases below.
    Matching elements are returned, in input order, as tuples.
    """
    label_phrases = ("full name", "person making report", "reported by", "owner",
                     "licensee", "person", "signature of", "signed by")

    return [
        (bbox, text, confidence)
        for bbox, text, confidence in processed_results
        if any(phrase in text.lower() for phrase in label_phrases)
    ]

def calculate_distance(bbox1, bbox2):
    """Return the Euclidean distance between the centroids of two boxes.

    Each box is a sequence of ``(x, y)`` points; its centroid is the mean of
    those points.
    """
    def centroid(corners):
        count = len(corners)
        mean_x = sum(corner[0] for corner in corners) / count
        mean_y = sum(corner[1] for corner in corners) / count
        return mean_x, mean_y

    x1, y1 = centroid(bbox1)
    x2, y2 = centroid(bbox2)
    delta_x = x1 - x2
    delta_y = y1 - y2
    return (delta_x ** 2 + delta_y ** 2) ** 0.5

# Indicators that rule out a person name wherever they occur in the
# lower-cased text.
_NON_NAME_SUBSTRINGS = (
    "missing", "inventory", "theft", "report", "license",
    "serial", "number", "address", "phone", "email",
    "corporation", "company", "department", "bureau",
    "state", "street", "tower", "bridge", "services",
    "security", "universal", "police", "notification", "signature", "felony",
)

# Short indicators are matched only at the start of a word. As plain
# substrings they occur inside ordinary names ("inc" in "Vincent", "city" in
# "Felicity", "fax" in "Fairfax") and caused those names to be rejected.
_NON_NAME_WORD_STARTS = re.compile(r"\b(?:loss|form|date|fax|llc|inc|zip|code|city)")


def is_likely_not_a_name(text):
    """Return True when ``text`` is unlikely to be a person's name.

    Text is rejected when any of the following holds:

    * it contains one of the long non-name keywords anywhere, or one of the
      short keywords (``inc``, ``llc``, ``zip``, ...) at the start of a word;
    * it contains a digit;
    * it has fewer than two or more than five whitespace-separated words.

    Args:
        text: OCR text of a single element.

    Returns:
        True if the text should be excluded from name candidates, else False.
    """
    lowered = text.lower()

    if any(keyword in lowered for keyword in _NON_NAME_SUBSTRINGS):
        return True

    if _NON_NAME_WORD_STARTS.search(lowered):
        return True

    # Names do not contain digits.
    if re.search(r'\d', text):
        return True

    # Single words and long phrases are not treated as names.
    word_count = len(text.split())
    return word_count < 2 or word_count > 5

# Label fragments that identify a form field wherever they occur in the
# lower-cased text.
_FORM_LABEL_SUBSTRINGS = (
    "name:", "address:", "code:", "number:", "telephone",
    "signature", "street", "state",
)

# Short label words are matched only at the start of a word; as plain
# substrings they also hit ordinary names and words ("city" in "Felicity",
# "date" in "Candidate").
_FORM_LABEL_WORD_STARTS = re.compile(r"\b(?:city|zip|date)")


def is_form_field_label(text):
    """Return True when ``text`` looks like a form field label.

    The check is case-insensitive. Longer label fragments match anywhere in
    the text; the short words ``city``, ``zip`` and ``date`` must begin a word.

    Args:
        text: OCR text of a single element.

    Returns:
        True if the text looks like a field label rather than a field value.
    """
    lowered = text.lower()

    if any(fragment in lowered for fragment in _FORM_LABEL_SUBSTRINGS):
        return True

    return _FORM_LABEL_WORD_STARTS.search(lowered) is not None

def _nearest_name_label(bbox, name_labels):
    """Return ``(label_text, distance)`` for the name label closest to ``bbox``.

    Returns ``(None, inf)`` when ``name_labels`` is empty.
    """
    nearest_text = None
    nearest_distance = float('inf')
    for label_bbox, label_text, _ in name_labels:
        distance = calculate_distance(bbox, label_bbox)
        if distance < nearest_distance:
            nearest_distance = distance
            nearest_text = label_text
    return nearest_text, nearest_distance


def _build_name_candidate(bbox, text, confidence, label_text, label_distance, event_type_bbox):
    """Score one OCR element and return its candidate record."""
    is_name = is_person_name(text)

    # Label proximity falls linearly to zero at a distance of 500; an infinite
    # distance (no label available) therefore scores 0 as well.
    label_proximity_score = max(0, 1 - (label_distance / 500))

    # Event type proximity is less important for names (wider falloff, and a
    # smaller weight below).
    if event_type_bbox:
        distance_to_event = calculate_distance(bbox, event_type_bbox)
        event_proximity_score = max(0, 1 - (distance_to_event / 2000))
    else:
        distance_to_event = None
        event_proximity_score = 0

    # Text shaped like a person name is strongly favoured over other text.
    person_name_bonus = 2.0 if is_name else 0.3

    total_score = (label_proximity_score * 0.8 + event_proximity_score * 0.2) * confidence * person_name_bonus

    return {
        'name': text,
        'score': total_score,
        'bbox': bbox,
        'confidence': confidence,
        'nearest_label': label_text,
        'label_distance': label_distance,
        'distance_to_event': distance_to_event,
        'is_person_name': is_name
    }


def find_associated_name(processed_results, event_type_bbox=None, debug=True):
    """
    Extract associated name (person reporting the event or owner).

    Every OCR element that survives the non-name and form-label filters is
    scored by its proximity to the nearest name label and to the event type,
    weighted by OCR confidence and by whether it matches a person-name pattern.

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR
        event_type_bbox: Bounding box of the detected event type (optional)
        debug: Whether to print debug information

    Returns:
        Tuple ``(name, name_candidates)``. ``name`` is the text of the best
        person-name candidate, else of the best candidate overall, else an
        empty string. ``name_candidates`` is the list of all candidate dicts
        sorted by descending score.
    """
    name_labels = find_name_labels(processed_results)

    name_candidates = []
    # Texts that already produced a candidate; consulted by the second pass.
    seen_names = set()

    def rejected(text):
        return is_likely_not_a_name(text) or is_form_field_label(text)

    # First pass: every plausible text, scored against its nearest label.
    for bbox, text, confidence in processed_results:
        if len(text) < 4 or rejected(text):
            continue

        label_text, label_distance = _nearest_name_label(bbox, name_labels)
        name_candidates.append(
            _build_name_candidate(bbox, text, confidence, label_text, label_distance, event_type_bbox)
        )
        seen_names.add(text)

    # Second pass: texts not picked up above (i.e. shorter than 4 characters)
    # are still accepted when they sit close to a name label.
    for label_bbox, label_text, _ in name_labels:
        for bbox, text, confidence in processed_results:
            if text in seen_names or rejected(text):
                continue

            distance = calculate_distance(bbox, label_bbox)
            if distance >= 200:  # Not close to this label
                continue

            name_candidates.append(
                _build_name_candidate(bbox, text, confidence, label_text, distance, event_type_bbox)
            )
            seen_names.add(text)

    # Highest score first; the sort is stable, so ties keep insertion order.
    name_candidates.sort(key=lambda x: x['score'], reverse=True)
    person_name_candidates = [c for c in name_candidates if c['is_person_name']]

    # Debug output
    if debug and name_candidates:
        print(f"\nAssociated name candidates ({len(name_candidates)}):")
        for i, candidate in enumerate(name_candidates[:5]):  # Show top 5
            # Compare against None so that a genuine distance of 0.0 is shown.
            if candidate['distance_to_event'] is not None:
                event_dist = f", Distance to event: {candidate['distance_to_event']:.1f}"
            else:
                event_dist = ""
            person_name = ", Is person name: Yes" if candidate['is_person_name'] else ", Is person name: No"
            print(f"  {i+1}. '{candidate['name']}' - Score: {candidate['score']:.3f}{event_dist}, "
                  f"Label: '{candidate['nearest_label']}' ({candidate['label_distance']:.1f}){person_name}")
    elif debug:
        print("No associated name candidates found")

    # IMPORTANT: Prioritize person name candidates if available
    if person_name_candidates:
        return person_name_candidates[0]['name'], name_candidates
    elif name_candidates:
        return name_candidates[0]['name'], name_candidates
    else:
        return "", name_candidates


# Test the associated name extraction on ALL images
# Runs OCR on every file in `file_list` (built in Cell 2), extracts the event
# type and the associated name, plots the annotated image, and finally prints
# a summary table. All names bound here are notebook globals.
print("\n===== TESTING ASSOCIATED NAME EXTRACTION ON ALL IMAGES =====")

# Initialize results storage
# One dict per successfully processed image: filename, event_type, associated_name.
associated_name_results = []

if file_list:
    for test_file in file_list:
        print(f"\n{'-'*60}")
        print(f"Processing: {test_file}")
        test_path = os.path.join(test_folder, test_file)

        # Load and process image
        # cv2.imread returns None when the file cannot be read; skip such files.
        image = cv2.imread(test_path)
        if image is None:
            print(f"Could not read image: {test_file}")
            continue

        print(f"  Dimensions: {image.shape[1]}x{image.shape[0]}")

        # Perform OCR
        # `ocr` is the PaddleOCR instance created in Cell 2.
        ocr_results = ocr.ocr(image, cls=True)
        if not ocr_results or not ocr_results[0]:
            print(f"  No text detected")
            continue

        # Process OCR results
        # Flatten each OCR line into a (bbox, text, confidence) tuple.
        processed_results = []
        all_text = []

        for line in ocr_results[0]:
            bbox, (text, confidence) = line
            # combine_spaced_alphanumeric is defined in an earlier cell (not shown here).
            text = combine_spaced_alphanumeric(text.strip())
            processed_results.append((bbox, text, confidence))
            all_text.append(text)

        all_text_combined = " ".join(all_text)
        print(f"  OCR extracted {len(all_text_combined)} characters")

        # First extract event type to use as reference
        # find_event_type is defined in an earlier cell (not shown here).
        event_type, event_type_candidates = find_event_type(processed_results, all_text_combined, debug=False)

        # Get event type bbox if found
        event_type_bbox = None
        if event_type:
            print(f"Event Type: '{event_type}'")
            # Find the bbox for the event type
            # Uses the first OCR element whose text equals the event type exactly;
            # the bbox stays None when no element matches.
            for bbox, text, _ in processed_results:
                if text.strip() == event_type:
                    event_type_bbox = bbox
                    break
        else:
            print("No event type found")

        # Extract associated name
        # Returns the chosen name plus the full, score-sorted candidate list.
        associated_name, name_candidates = find_associated_name(processed_results, event_type_bbox, debug=True)
        print(f"\nExtracted Associated Name: '{associated_name}'")

        # Store result
        associated_name_results.append({
            'filename': test_file,
            'event_type': event_type,
            'associated_name': associated_name
        })

        # Visualize the result with colored bounding boxes
        plt.figure(figsize=(15, 10))
        # OpenCV loads images as BGR; matplotlib expects RGB.
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.imshow(img_rgb)

        # Get all candidate texts for visualization
        name_texts = [candidate['name'] for candidate in name_candidates if candidate['bbox'] is not None]

        for bbox, text, confidence in processed_results:
            # Determine box color based on detection
            # Matching is by text, so repeated strings are all highlighted alike.
            if associated_name and text.strip() == associated_name:
                color, linewidth = "red", 3  # Associated name (selected candidate)
            elif any(text.strip() == name_text for name_text in name_texts):
                color, linewidth = "blue", 2  # Other name candidates
            else:
                color, linewidth = "gray", 1  # Regular text

            # Draw bounding box
            # The first corner is repeated at the end to close the quadrilateral.
            points = np.array(bbox, dtype=np.int32)
            plt.plot(
                [points[0][0], points[1][0], points[2][0], points[3][0], points[0][0]],
                [points[0][1], points[1][1], points[2][1], points[3][1], points[0][1]],
                color=color,
                linewidth=linewidth,
            )

            # Add text label
            # Placed 5 units above the box's first corner.
            plt.text(
                points[0][0],
                points[0][1] - 5,
                f"{text} ({confidence:.2f})",
                fontsize=8,
                color=color,
                weight="bold",
            )

        plt.title(f"Associated Name Detection: {test_file}")
        plt.axis("off")
        plt.tight_layout()
        plt.show()

    # Display summary of results
    # Images skipped above (unreadable or without text) are not counted.
    print(f"\n{'-'*60}")
    print(f"ASSOCIATED NAME EXTRACTION SUMMARY")
    print(f"{'-'*60}")
    print(f"Processed {len(associated_name_results)} images")
    print(f"Found associated names in {sum(1 for r in associated_name_results if r['associated_name'])} images")

    # Display table of results
    if associated_name_results:
        results_df = pd.DataFrame(associated_name_results)
        print("\nExtracted Associated Names:")
        print(results_df.to_string(index=False))
else:
    print("No test images available for associated name extraction testing")

In [None]:
# CELL 6.6: Associated Address Extraction Implementation

"""
Implementation of the associated address extraction function.
Identifies the full address (street, city, state, zip) associated with the person.
"""

import re
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy NER model, downloading it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Only that case triggers a download; any other failure propagates
    # instead of being swallowed by a bare `except`.
    import subprocess
    import sys

    # Run the download with the interpreter that runs this notebook.
    # subprocess replaces the IPython-only `!` shell magic, so the cell is
    # also valid plain Python; check=True raises if the download fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# NOTE: This cell reuses the associated-name helpers defined in Cell 6.5
# (is_person_name, contains_name_indicator, find_name_labels, calculate_distance,
# is_likely_not_a_name, is_form_field_label, find_associated_name); run that cell first.

# Address extraction functions
def is_street_address(text):
    """Return True when ``text`` looks like a street address.

    The text qualifies if it contains a "number followed by words" or
    P.O. Box pattern, or if it contains a common street suffix as a whole
    word. All matching is case-insensitive.
    """
    number_patterns = (
        r'\d+\s+[A-Za-z]+\s+[A-Za-z]+',  # 123 Main Street
        r'\d+\s+[A-Za-z]+\s+[A-Za-z]+\s+[A-Za-z]+',  # 123 East Main Street
        r'\d+\s+[A-Za-z]+',  # 123 Main
        r'P\.?O\.?\s+Box\s+\d+',  # P.O. Box 123
    )
    suffix_words = ('street', 'st', 'avenue', 'ave', 'road', 'rd', 'boulevard', 'blvd',
                    'drive', 'dr', 'lane', 'ln', 'place', 'pl', 'court', 'ct', 'way')

    has_number_pattern = any(
        re.search(pattern, text, re.IGNORECASE) for pattern in number_patterns
    )
    if has_number_pattern:
        return True

    # Fall back to a whole-word street suffix anywhere in the text.
    return any(
        re.search(r'\b' + word + r'\b', text, re.IGNORECASE) for word in suffix_words
    )

def is_state(text):
    """Return True when ``text`` is exactly a US state name or abbreviation.

    The comparison ignores case but not surrounding whitespace: two-letter
    postal abbreviations and full names of the 50 states are accepted.
    """
    abbreviations = frozenset((
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
        'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
        'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
        'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
        'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
    ))
    full_names = frozenset((
        'alabama', 'alaska', 'arizona', 'arkansas', 'california',
        'colorado', 'connecticut', 'delaware', 'florida', 'georgia',
        'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas',
        'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts',
        'michigan', 'minnesota', 'mississippi', 'missouri', 'montana',
        'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico',
        'new york', 'north carolina', 'north dakota', 'ohio', 'oklahoma',
        'oregon', 'pennsylvania', 'rhode island', 'south carolina',
        'south dakota', 'tennessee', 'texas', 'utah', 'vermont',
        'virginia', 'washington', 'west virginia', 'wisconsin', 'wyoming',
    ))

    return text.upper() in abbreviations or text.lower() in full_names

def is_zipcode(text):
    """Return True when ``text`` is a 5-digit ZIP or a ZIP+4 code."""
    zip_formats = (
        r'^\d{5}$',  # 5-digit ZIP code
        r'^\d{5}-\d{4}$',  # 9-digit ZIP+4 code
    )
    return any(re.match(zip_format, text) for zip_format in zip_formats)

def find_address_blocks(processed_results):
    """Split OCR elements into vertically contiguous blocks.

    Elements are ordered by the vertical centre of their bounding box; a new
    block begins whenever two consecutive centres are more than 100 apart.
    With fewer than five elements the input is returned as a single block.
    """
    if len(processed_results) < 5:
        # Too little text to segment meaningfully.
        return [processed_results]

    def vertical_center(corners):
        return sum(corner[1] for corner in corners) / len(corners)

    top_to_bottom = sorted(processed_results, key=lambda entry: vertical_center(entry[0]))

    blocks = [[]]
    last_center = None
    for bbox, text, confidence in top_to_bottom:
        center = vertical_center(bbox)

        # A large vertical jump closes the current block.
        if last_center is not None and abs(center - last_center) > 100:
            blocks.append([])

        blocks[-1].append((bbox, text, confidence))
        last_center = center

    return blocks

def find_city_contextual(processed_results, state_candidates, zip_candidates, associated_name_bbox=None, debug=False):
    """
    Rank OCR elements by how likely they are to be the city of an address.

    Each eligible element accumulates points from four cues: proximity to a
    "City" label, position relative to a state/ZIP found in the same address
    block, proximity to the associated name, and city-like capitalization.

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR
        state_candidates: List of state candidate dicts ('text', 'bbox', ...)
        zip_candidates: List of ZIP code candidate dicts ('text', ...)
        associated_name_bbox: Bounding box of the associated name (optional)
        debug: Whether to print debug information

    Returns:
        List of city candidate dicts with a positive score, best first
    """
    def centroid(corners):
        count = len(corners)
        return (
            sum(corner[0] for corner in corners) / count,
            sum(corner[1] for corner in corners) / count,
        )

    # Elements that are themselves a "City" label.
    city_labels = [
        (bbox, text, confidence)
        for bbox, text, confidence in processed_results
        if text.lower() in ('city', 'city:')
    ]
    if debug:
        for _, found_label, _ in city_labels:
            print(f"Found city label: '{found_label}'")

    # Spatial grouping used by the city-state-zip cue.
    address_blocks = find_address_blocks(processed_results)

    field_labels = ('city', 'city:', 'state', 'state:', 'zip', 'zip code', 'zip:')
    city_candidates = []

    for bbox, text, confidence in processed_results:
        # Too short, or contains a digit: not a city.
        if len(text) < 3 or re.search(r'\d', text):
            continue

        # Field labels and anything with a colon are not values.
        if ':' in text or text.lower() in field_labels:
            continue

        score = 0.0
        reasons = []

        # Cue 1: near a city label (counted at most once).
        for label_bbox, label_text, _ in city_labels:
            distance = calculate_distance(bbox, label_bbox)
            if distance < 200:
                label_score = 5.0 * confidence
                score += label_score
                reasons.append(f"Near city label '{label_text}' (distance: {distance:.1f}): +{label_score:.2f} points")
                break

        # Cue 2: city-state-zip layout inside the element's own address block.
        if state_candidates or zip_candidates:
            # The block is located by text equality (first block holding this text).
            home_block = next(
                (blk for blk in address_blocks if any(entry[1] == text for entry in blk)),
                None,
            )

            if home_block:
                block_texts = [entry[1] for entry in home_block]

                # First state candidate whose text occurs in the block, if any.
                block_state = next(
                    (cand for cand in state_candidates if cand['text'] in block_texts),
                    None,
                )
                state_in_block = block_state is not None

                if state_in_block:
                    state_x, state_y = centroid(block_state['bbox'])
                    text_x, text_y = centroid(bbox)

                    if abs(text_y - state_y) < 30 and text_x < state_x:
                        # City typically appears before state on same line
                        pattern_score = 6.0 * confidence
                        score += pattern_score
                        reasons.append(f"Appears before state '{block_state['text']}' on same line: +{pattern_score:.2f} points")
                    elif text_y < state_y:
                        # City typically appears before state in address block
                        pattern_score = 3.0 * confidence
                        score += pattern_score
                        reasons.append(f"Appears before state '{block_state['text']}' in address block: +{pattern_score:.2f} points")

                zip_in_block = any(cand['text'] in block_texts for cand in zip_candidates)

                if state_in_block and zip_in_block:
                    # Strong indicator: city, state, and ZIP in same block
                    block_score = 4.0 * confidence
                    score += block_score
                    reasons.append(f"In same address block as state and ZIP: +{block_score:.2f} points")
                elif state_in_block:
                    # Medium indicator: city and state in same block
                    block_score = 2.0 * confidence
                    score += block_score
                    reasons.append(f"In same address block as state: +{block_score:.2f} points")

        # Cue 3: close to the associated name, fading out at a distance of 500.
        if associated_name_bbox:
            distance_to_name = calculate_distance(bbox, associated_name_bbox)
            if distance_to_name < 500:
                name_proximity_score = max(0, 1 - (distance_to_name / 500)) * 2.0 * confidence
                score += name_proximity_score
                reasons.append(f"Near associated name (distance: {distance_to_name:.1f}): +{name_proximity_score:.2f} points")

        # Cue 4: capitalized, not all caps, at most two words.
        looks_like_proper_noun = (
            text[0].isupper() and not text.isupper() and len(text.split()) <= 2
        )
        if looks_like_proper_noun:
            cap_score = 1.0 * confidence
            score += cap_score
            reasons.append(f"Has city-like capitalization pattern: +{cap_score:.2f} points")

        # Only elements that earned points become candidates.
        if score > 0:
            city_candidates.append({
                'text': text,
                'score': score,
                'bbox': bbox,
                'confidence': confidence,
                'reasons': reasons
            })

    # Best score first.
    city_candidates.sort(key=lambda cand: cand['score'], reverse=True)

    if debug and city_candidates:
        print(f"\nContextual city candidates ({len(city_candidates)}):")
        for rank, cand in enumerate(city_candidates[:3], start=1):  # Show top 3
            print(f"  {rank}. '{cand['text']}' - Score: {cand['score']:.3f}")
            for reason in cand['reasons']:
                print(f"     - {reason}")

    return city_candidates

def find_associated_address(processed_results, associated_name_bbox, debug=True):
    """
    Assemble a full address from the OCR elements of a document.

    The street, state and ZIP are the highest-confidence elements recognised
    as such; the city is the top-ranked result of ``find_city_contextual``.

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR
        associated_name_bbox: Bounding box of the associated name
        debug: Whether to print debug information

    Returns:
        Tuple of (full_address, address_components). When no name bounding
        box is given the result is ("", {}); otherwise address_components has
        the keys 'street', 'city', 'state' and 'zip' (None when not found).
    """
    # Without a name there is nothing to associate an address with.
    if not associated_name_bbox:
        if debug:
            print("No associated name bounding box provided")
        return "", {}

    # Classify every element in one scan; an element may fall into several lists.
    street_candidates = []
    state_candidates = []
    zip_candidates = []
    for bbox, text, confidence in processed_results:
        if len(text) < 2:
            continue

        if is_state(text):
            state_candidates.append({'text': text, 'bbox': bbox, 'confidence': confidence})
        if is_zipcode(text):
            zip_candidates.append({'text': text, 'bbox': bbox, 'confidence': confidence})
        if is_street_address(text):
            street_candidates.append({'text': text, 'bbox': bbox, 'confidence': confidence})

    # Most confident first (stable, so ties keep document order).
    by_confidence = lambda cand: cand['confidence']
    state_candidates.sort(key=by_confidence, reverse=True)
    zip_candidates.sort(key=by_confidence, reverse=True)

    # City detection relies on the state/ZIP candidates found above.
    city_candidates = find_city_contextual(processed_results, state_candidates, zip_candidates, associated_name_bbox, debug=debug)

    street_candidates.sort(key=by_confidence, reverse=True)

    def best_text(candidates):
        return candidates[0]['text'] if candidates else None

    address_components = {
        'street': best_text(street_candidates),
        'city': best_text(city_candidates),
        'state': best_text(state_candidates),
        'zip': best_text(zip_candidates)
    }

    # Debug output
    if debug:
        print("\nAddress component candidates:")
        report_rows = (
            ("Street", "street", street_candidates, "Confidence", "confidence"),
            ("City", "city", city_candidates, "Score", "score"),
            ("State", "state", state_candidates, "Confidence", "confidence"),
            ("ZIP", "ZIP", zip_candidates, "Confidence", "confidence"),
        )
        for heading, missing_name, candidates, metric_label, metric_key in report_rows:
            if not candidates:
                print(f"  No {missing_name} candidates found")
                continue
            print(f"  {heading} candidates ({len(candidates)}):")
            for rank, cand in enumerate(candidates[:3], start=1):  # Show top 3
                print(f"    {rank}. '{cand['text']}' - {metric_label}: {cand[metric_key]:.3f}")

    # Format full address: "street, city, state zip"
    leading_parts = [
        address_components[key]
        for key in ('street', 'city', 'state')
        if address_components[key]
    ]
    full_address = ", ".join(leading_parts)

    zip_code = address_components['zip']
    if zip_code:
        if full_address:
            # ZIP follows the state after a space, anything else after a comma.
            full_address += " " if address_components['state'] else ", "
        full_address += zip_code

    if debug:
        print(f"\nExtracted Address: '{full_address}'")

    return full_address, address_components


# Test the associated address extraction on ALL images
#
# Driver script for this cell. For every image in file_list it:
#   1. loads the image and runs OCR,
#   2. finds the event type and, from it, the associated name,
#   3. extracts the address (only when the name's bounding box was located),
#   4. plots the image with every OCR box, colour-coded by detected role.
# A per-image record is appended to address_results and summarised at the end.
# Relies on names defined in other cells: file_list, test_folder, ocr,
# combine_spaced_alphanumeric, find_event_type, find_associated_name.
print("\n===== TESTING ASSOCIATED ADDRESS EXTRACTION ON ALL IMAGES =====")

# Initialize results storage
address_results = []

if file_list:
    for test_file in file_list:
        print(f"\n{'-'*60}")
        print(f"Processing: {test_file}")
        test_path = os.path.join(test_folder, test_file)

        # Load and process image
        # cv2.imread returns None (instead of raising) when the file cannot be decoded
        image = cv2.imread(test_path)
        if image is None:
            print(f"Could not read image: {test_file}")
            continue

        print(f"  Dimensions: {image.shape[1]}x{image.shape[0]}")

        # Perform OCR
        # Only ocr_results[0] is consumed below (presumably the detections for
        # this single input image - confirm against the PaddleOCR version in use)
        ocr_results = ocr.ocr(image, cls=True)
        if not ocr_results or not ocr_results[0]:
            print(f"  No text detected")
            continue

        # Process OCR results
        # processed_results holds (bbox, text, confidence) tuples, the shape
        # every extraction helper in this notebook is called with
        processed_results = []
        all_text = []

        for line in ocr_results[0]:
            bbox, (text, confidence) = line
            text = combine_spaced_alphanumeric(text.strip())
            processed_results.append((bbox, text, confidence))
            all_text.append(text)

        all_text_combined = " ".join(all_text)
        print(f"  OCR extracted {len(all_text_combined)} characters")

        # First extract event type to use as reference for associated_name
        event_type, event_type_candidates = find_event_type(processed_results, all_text_combined, debug=False)

        # Get event type bbox if found
        event_type_bbox = None
        if event_type:
            # Find the bbox for the event type
            # Exact text match: the first OCR element equal to event_type wins;
            # the bbox stays None when no single element matches exactly
            for bbox, text, _ in processed_results:
                if text.strip() == event_type:
                    event_type_bbox = bbox
                    break

        # Extract associated name using the exact same function from the first cell
        associated_name, name_candidates = find_associated_name(processed_results, event_type_bbox, debug=False)

        # Get associated name bbox if found
        associated_name_bbox = None
        if associated_name:
            print(f"Associated Name: '{associated_name}'")
            # Find the bbox for the associated name
            # Same exact-match lookup as above; a name that does not equal one
            # OCR element verbatim leaves the bbox as None
            for bbox, text, _ in processed_results:
                if text.strip() == associated_name:
                    associated_name_bbox = bbox
                    break
        else:
            print("No associated name found")

        # Extract associated address only if we have an associated name
        if associated_name_bbox:
            associated_address, address_components = find_associated_address(processed_results, associated_name_bbox, debug=True)
            print(f"\nExtracted Associated Address: '{associated_address}'")
        else:
            # Empty dict keeps the .get() lookups in the drawing loop below safe
            associated_address = ""
            address_components = {}
            print("No associated address extracted (no associated name found)")

        # Store result
        address_results.append({
            'filename': test_file,
            'associated_name': associated_name if associated_name else "",
            'associated_address': associated_address
        })

        # Visualize the result with colored bounding boxes
        plt.figure(figsize=(15, 10))
        # OpenCV loads images as BGR; matplotlib expects RGB
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.imshow(img_rgb)

        # Draw bounding boxes for address components
        for bbox, text, confidence in processed_results:
            # Determine box color based on detection
            # The elif chain gives priority name > street > city > state > ZIP
            # when one OCR text equals more than one detected value
            color, linewidth = "gray", 1  # Default

            if associated_name and text.strip() == associated_name:
                color, linewidth = "red", 3  # Associated name
            elif address_components.get('street') and text.strip() == address_components['street']:
                color, linewidth = "blue", 2  # Street
            elif address_components.get('city') and text.strip() == address_components['city']:
                color, linewidth = "green", 2  # City
            elif address_components.get('state') and text.strip() == address_components['state']:
                color, linewidth = "purple", 2  # State
            elif address_components.get('zip') and text.strip() == address_components['zip']:
                color, linewidth = "orange", 2  # ZIP

            # Draw bounding box
            # Four corner points are indexed; the first is repeated to close the outline
            points = np.array(bbox, dtype=np.int32)
            plt.plot(
                [points[0][0], points[1][0], points[2][0], points[3][0], points[0][0]],
                [points[0][1], points[1][1], points[2][1], points[3][1], points[0][1]],
                color=color,
                linewidth=linewidth,
            )

            # Add text label
            # Placed 5 px above the first corner, in the same colour as the box
            plt.text(
                points[0][0],
                points[0][1] - 5,
                f"{text} ({confidence:.2f})",
                fontsize=8,
                color=color,
                weight="bold",
            )

        plt.title(f"Associated Address Detection: {test_file}")
        plt.axis("off")
        plt.tight_layout()
        plt.show()

    # Display summary of results
    # Images skipped above (unreadable, or no text detected) never reach
    # address_results, so they are not included in the "Processed" count
    print(f"\n{'-'*60}")
    print(f"ASSOCIATED ADDRESS EXTRACTION SUMMARY")
    print(f"{'-'*60}")
    print(f"Processed {len(address_results)} images")
    print(f"Found addresses in {sum(1 for r in address_results if r['associated_address'])} images")

    # Display table of results
    if address_results:
        results_df = pd.DataFrame(address_results)
        print("\nExtracted Associated Addresses:")
        print(results_df.to_string(index=False))
else:
    print("No test images available for associated address extraction testing")

In [None]:
# CELL 6.6: Associated Address Extraction Implementation

"""
Implementation of the associated address extraction function.
Identifies the full address (street, city, state, zip) associated with the person.
"""

import re
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy NER model (bound to the module-level name `nlp`)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps unrelated failures (and Ctrl-C) visible.
    # Download the model with the interpreter that runs this notebook, then
    # retry. subprocess is used instead of the IPython-only "!" shell escape
    # so the cell is also valid plain Python.
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,  # fail loudly if the download did not succeed
    )
    nlp = spacy.load("en_core_web_sm")

# NOTE: This cell depends on the associated-name helpers from Cell 6.5, which must be
# kept exactly as provided there and are not repeated here:
# is_person_name, contains_name_indicator, find_name_labels, calculate_distance,
# is_likely_not_a_name, is_form_field_label, find_associated_name

# Address extraction functions
def is_street_address(text):
    """Return True when ``text`` looks like a street address.

    The text qualifies if any number-led / P.O. Box pattern is found anywhere
    in it, or if it contains a common street suffix as a whole word. All
    matching is case-insensitive.
    """
    address_patterns = (
        r'\d+\s+[A-Za-z]+\s+[A-Za-z]+',  # 123 Main Street
        r'\d+\s+[A-Za-z]+\s+[A-Za-z]+\s+[A-Za-z]+',  # 123 East Main Street
        r'\d+\s+[A-Za-z]+',  # 123 Main
        r'P\.?O\.?\s+Box\s+\d+',  # P.O. Box 123
    )
    suffix_words = (
        'street', 'st', 'avenue', 'ave', 'road', 'rd', 'boulevard', 'blvd',
        'drive', 'dr', 'lane', 'ln', 'place', 'pl', 'court', 'ct', 'way',
    )

    # Number-led patterns first, then fall back to whole-word suffixes
    if any(re.search(regex, text, re.IGNORECASE) for regex in address_patterns):
        return True

    return any(
        re.search(r'\b' + word + r'\b', text, re.IGNORECASE)
        for word in suffix_words
    )

def is_state(text):
    """Return True when ``text`` is exactly a US state name or postal abbreviation.

    The comparison ignores case but is otherwise exact: surrounding whitespace
    or punctuation makes the text fail the check.
    """
    postal_codes = frozenset((
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
        'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
        'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
        'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
        'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
    ))
    full_names = frozenset((
        'alabama', 'alaska', 'arizona', 'arkansas', 'california',
        'colorado', 'connecticut', 'delaware', 'florida', 'georgia',
        'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas',
        'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts',
        'michigan', 'minnesota', 'mississippi', 'missouri', 'montana',
        'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico',
        'new york', 'north carolina', 'north dakota', 'ohio', 'oklahoma',
        'oregon', 'pennsylvania', 'rhode island', 'south carolina',
        'south dakota', 'tennessee', 'texas', 'utah', 'vermont',
        'virginia', 'washington', 'west virginia', 'wisconsin', 'wyoming',
    ))

    # Abbreviation match is tried first, then the spelled-out name
    return text.upper() in postal_codes or text.lower() in full_names

def is_zipcode(text):
    """Check if text is a US ZIP code.

    Accepts exactly five ASCII digits, optionally followed by a hyphen and
    four more digits (ZIP+4).

    Args:
        text: Candidate string; it must match in full, with no surrounding
            whitespace.

    Returns:
        True if text is a 5-digit ZIP or a ZIP+4 code, otherwise False.
    """
    # fullmatch (rather than match with ^...$) rejects a trailing newline, and
    # re.ASCII stops \d from accepting non-ASCII digits such as full-width forms.
    return re.fullmatch(r'\d{5}(?:-\d{4})?', text, re.ASCII) is not None

def find_address_blocks(processed_results, max_gap=100):
    """Group text elements into potential address blocks based on spatial proximity.

    Elements are ordered top-to-bottom by the mean y-coordinate of their
    bounding-box points. A new block starts whenever the vertical distance
    between one element and the next exceeds ``max_gap``.

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR,
            where bbox is a sequence of (x, y) points.
        max_gap: Largest vertical distance, in the same units as the bbox
            coordinates, allowed between consecutive elements of one block.
            The default of 100 keeps the previous fixed behaviour; adjust it
            for images of a very different resolution.

    Returns:
        List of blocks, each a list of (bbox, text, confidence) tuples. With
        fewer than five elements no grouping is attempted and the input is
        returned unchanged (and unsorted) as a single block.
    """
    # If too few elements, return a single block
    if len(processed_results) < 5:
        return [processed_results]

    def center_y(bbox):
        # Vertical centre taken as the mean y of the box's points
        return sum(point[1] for point in bbox) / len(bbox)

    # Sort by vertical position (y-coordinate)
    sorted_results = sorted(processed_results, key=lambda item: center_y(item[0]))

    address_blocks = []
    current_block = []
    prev_y = None

    # Group by vertical proximity; the gap is measured against the previous
    # element, not against the start of the block
    for bbox, text, confidence in sorted_results:
        y = center_y(bbox)

        # Close the running block when the jump from the previous element is too large
        if prev_y is not None and abs(y - prev_y) > max_gap:
            address_blocks.append(current_block)
            current_block = []

        current_block.append((bbox, text, confidence))
        prev_y = y

    # Add the last block
    if current_block:
        address_blocks.append(current_block)

    return address_blocks

# Exact (case-insensitive) texts that are form-field labels rather than values.
_CITY_FORM_LABELS = frozenset([
    'city', 'city:', 'state', 'state:', 'zip', 'zip code', 'zip:',
])

# Words that mark form headers, organization names or field captions.
_NON_CITY_WORDS = frozenset([
    'department', 'bureau', 'federal', 'firearms', 'license', 'report', 'form',
    'section', 'universal', 'security', 'services', 'corporation', 'corporate',
    'trade', 'full', 'name', 'person', 'making', 'street', 'address', 'theft',
    'loss', 'inventory', 'notification', 'alcohol', 'tobacco', 'explosives',
    'signature', 'justice', 'licensee', 'date', 'time', 'code', 'telephone',
    'bridge', 'tower', 'building', 'plaza', 'center', 'complex', 'director',
    'category', 'hair', 'eyes', 'sex', 'height', 'weight', 'expires', 'print',
])


def _has_non_city_word(text_lower):
    """Return True if a whole word of text_lower, or its simple plural, is a non-city indicator."""
    for word in re.findall(r'[a-z]+', text_lower):
        if word in _NON_CITY_WORDS:
            return True
        # Still catch plain plurals such as "reports" or "addresses"
        if word.endswith('es') and word[:-2] in _NON_CITY_WORDS:
            return True
        if word.endswith('s') and word[:-1] in _NON_CITY_WORDS:
            return True
    return False


def identify_city_with_ner(text, debug=False):
    """
    Use NER to identify if text is likely a city.

    Cheap textual filters run first (length, digits, form labels, header
    words, colons); only text that passes all of them is sent to the spaCy
    pipeline held in the module-level ``nlp``. The first GPE or LOC entity
    found decides the result.

    Header words are matched as whole words. Matching them as substrings
    rejected genuine city names that merely contain one, e.g. "Fullerton"
    ('full'), "Bridgeport" ('bridge') or "Essex" ('sex').

    Args:
        text: Text to analyze
        debug: Whether to print debug information

    Returns:
        Tuple of (is_city, confidence_score): (True, 0.9) for a GPE entity,
        (True, 0.7) for a LOC entity, otherwise (False, 0.0).
    """
    # Skip very short text
    if len(text) < 3:
        return False, 0.0

    # Skip text with numbers
    if re.search(r'\d', text):
        return False, 0.0

    text_lower = text.lower()

    # Skip form field labels
    if text_lower in _CITY_FORM_LABELS:
        return False, 0.0

    # Skip common form headers and organization names
    if _has_non_city_word(text_lower):
        return False, 0.0

    # Check for labels/headers with colons
    if ':' in text:
        return False, 0.0

    # Use spaCy NER to check if it's recognized as a location
    doc = nlp(text)

    # Initialize confidence score
    confidence = 0.0

    # Check for GPE (Geo-Political Entity) or LOC (Location) entities;
    # the first one encountered wins
    for ent in doc.ents:
        if ent.label_ == "GPE":
            confidence = 0.9  # High confidence for GPE (cities, states, countries)
            if debug:
                print(f"  '{text}' recognized as GPE by NER: +0.9 confidence")
            break
        elif ent.label_ == "LOC":
            confidence = 0.7  # Medium confidence for LOC (non-GPE locations)
            if debug:
                print(f"  '{text}' recognized as LOC by NER: +0.7 confidence")
            break

    # IMPORTANT: Only return True if NER actually identified it as a location
    # Do NOT provide fallback confidence for non-location entities
    if confidence > 0.0:
        return True, confidence

    if debug:
        print(f"  '{text}' not recognized as a location by NER")
    return False, 0.0

def find_city_with_contextual_validation(processed_results, state_candidates, zip_candidates, debug=False):
    """
    Extract city name using NER and contextual validation.

    Every OCR element accepted by identify_city_with_ner becomes a candidate
    and is scored as follows (``confidence`` is the element's OCR confidence):

      * base:  NER confidence x 5.0
      * +3.0 x confidence if calculate_distance to any 'city' / 'city:' label
        is below 200 (counted once)
      * only when both state and ZIP candidates exist, using the element's
        address block from find_address_blocks and the first state candidate
        whose text occurs in that block:
          - +4.0 x confidence if the element is on the state's line
            (centre-y difference < 30) and to its left, otherwise
          - +2.0 x confidence if the element's centre is above the state's
          - +3.0 x confidence if a state and a ZIP text are both in the
            block, or +1.5 x confidence if only a state text is

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR
        state_candidates: List of state candidates (dicts with at least
            'text' and 'bbox')
        zip_candidates: List of ZIP code candidates (dicts with at least 'text')
        debug: Whether to print debug information

    Returns:
        List of city candidates with scores, sorted by descending score. Each
        is a dict with keys 'text', 'score', 'bbox', 'confidence',
        'ner_confidence' and 'reasons' (human-readable scoring steps).
    """
    city_candidates = []

    # Find city labels
    city_labels = []
    for bbox, text, confidence in processed_results:
        text_lower = text.lower()
        if text_lower == 'city' or text_lower == 'city:':
            city_labels.append((bbox, text, confidence))
            if debug:
                print(f"Found city label: '{text}'")

    # Group text elements into potential address blocks
    address_blocks = find_address_blocks(processed_results)

    # Process each text element
    for bbox, text, confidence in processed_results:
        # Use NER to check if it's a city
        is_city, ner_confidence = identify_city_with_ner(text, debug)

        if not is_city:
            continue

        # Initialize score with NER confidence
        score = ner_confidence * 5.0  # Scale up NER confidence
        reasons = [f"NER confidence: {ner_confidence:.2f} x 5.0 = {ner_confidence * 5.0:.2f} points"]

        # Check proximity to city labels
        # calculate_distance is defined in another cell; presumably it returns a
        # distance between the two boxes in pixels - confirm before tuning the 200 cutoff
        for label_bbox, label_text, _ in city_labels:
            distance = calculate_distance(bbox, label_bbox)
            if distance < 200:
                # Text near a city label is likely a city
                label_score = 3.0 * confidence
                score += label_score
                reasons.append(f"Near city label '{label_text}': +{label_score:.2f} points")
                break  # Only count once

        # Check for city-state-zip pattern within the same address block
        # Skipped entirely unless at least one state AND one ZIP candidate exist
        if state_candidates and zip_candidates:
            # Find which address block this text belongs to
            # NOTE(review): membership is decided by text equality, so when the
            # same text occurs in several blocks the first block is used
            text_block = None
            for block in address_blocks:
                if any(t == text for _, t, _ in block):
                    text_block = block
                    break

            if text_block:
                # Check if any state candidate is in the same block
                block_texts = [t for _, t, _ in text_block]
                state_in_block = False
                zip_in_block = False

                for state_candidate in state_candidates:
                    if state_candidate['text'] in block_texts:
                        state_in_block = True

                        # Find positions to check if city is before state
                        # Centres are the mean of each box's points
                        state_bbox = state_candidate['bbox']
                        state_x = sum([point[0] for point in state_bbox]) / len(state_bbox)
                        state_y = sum([point[1] for point in state_bbox]) / len(state_bbox)
                        text_x = sum([point[0] for point in bbox]) / len(bbox)
                        text_y = sum([point[1] for point in bbox]) / len(bbox)

                        # City typically appears before state on same line
                        if abs(text_y - state_y) < 30 and text_x < state_x:
                            pattern_score = 4.0 * confidence
                            score += pattern_score
                            reasons.append(f"Appears before state '{state_candidate['text']}' on same line: +{pattern_score:.2f} points")

                        # City typically appears before state in address block
                        elif text_y < state_y:
                            pattern_score = 2.0 * confidence
                            score += pattern_score
                            reasons.append(f"Appears before state '{state_candidate['text']}' in address block: +{pattern_score:.2f} points")

                        break  # Only count once

                # Only the presence of a ZIP text in the block matters; its position is not used
                for zip_candidate in zip_candidates:
                    if zip_candidate['text'] in block_texts:
                        zip_in_block = True
                        break

                if state_in_block and zip_in_block:
                    # Strong indicator: city, state, and ZIP in same block
                    block_score = 3.0 * confidence
                    score += block_score
                    reasons.append(f"In same address block as state and ZIP: +{block_score:.2f} points")
                elif state_in_block:
                    # Medium indicator: city and state in same block
                    block_score = 1.5 * confidence
                    score += block_score
                    reasons.append(f"In same address block as state: +{block_score:.2f} points")

        # Add to candidates if score is positive
        if score > 0:
            city_candidates.append({
                'text': text,
                'score': score,
                'bbox': bbox,
                'confidence': confidence,
                'ner_confidence': ner_confidence,
                'reasons': reasons
            })

    # Sort candidates by score
    city_candidates.sort(key=lambda x: x['score'], reverse=True)

    if debug and city_candidates:
        print(f"\nNER-based city candidates ({len(city_candidates)}):")
        for i, candidate in enumerate(city_candidates[:3]):  # Show top 3
            print(f"  {i+1}. '{candidate['text']}' - Score: {candidate['score']:.3f}, NER confidence: {candidate['ner_confidence']:.2f}")
            for reason in candidate['reasons']:
                print(f"     - {reason}")

    return city_candidates

def find_associated_address(processed_results, associated_name_bbox, debug=True):
    """
    Assemble the street / city / state / ZIP address for the associated person.

    Each component is chosen independently from the OCR output: street, state
    and ZIP take the matching element with the highest OCR confidence, and the
    city takes the top-scoring result of find_city_with_contextual_validation.

    NOTE(review): associated_name_bbox only gates the extraction; it is not
    used to prefer text located near the name.

    Args:
        processed_results: List of (bbox, text, confidence) tuples from OCR
        associated_name_bbox: Bounding box of the associated name
        debug: Whether to print debug information

    Returns:
        Tuple of (full_address, address_components). address_components maps
        'street', 'city', 'state' and 'zip' to the selected text or None.
        Without a name bounding box the result is ("", {}).
    """
    # Nothing to anchor the address to
    if not associated_name_bbox:
        if debug:
            print("No associated name bounding box provided")
        return "", {}

    def ranked_matches(predicate):
        # OCR elements (two characters or longer) accepted by predicate,
        # ordered from highest to lowest OCR confidence
        matches = [
            {'text': text, 'bbox': bbox, 'confidence': confidence}
            for bbox, text, confidence in processed_results
            if len(text) >= 2 and predicate(text)
        ]
        return sorted(matches, key=lambda match: match['confidence'], reverse=True)

    # State and ZIP are ranked first because city scoring takes them as input
    state_candidates = ranked_matches(is_state)
    zip_candidates = ranked_matches(is_zipcode)
    street_candidates = ranked_matches(is_street_address)

    # City detection combines NER with contextual validation
    city_candidates = find_city_with_contextual_validation(
        processed_results, state_candidates, zip_candidates, debug=debug
    )

    # Best candidate per component, or None when there is none
    candidates_by_field = {
        'street': street_candidates,
        'city': city_candidates,
        'state': state_candidates,
        'zip': zip_candidates,
    }
    address_components = {
        field: (found[0]['text'] if found else None)
        for field, found in candidates_by_field.items()
    }

    if debug:
        print("\nAddress component candidates:")
        # (heading, wording for the "none found" line, candidates, metric label, metric key)
        report_rows = (
            ("Street", "street", street_candidates, "Confidence", 'confidence'),
            ("City", "city", city_candidates, "Score", 'score'),
            ("State", "state", state_candidates, "Confidence", 'confidence'),
            ("ZIP", "ZIP", zip_candidates, "Confidence", 'confidence'),
        )
        for heading, noun, found, metric_label, metric_key in report_rows:
            if not found:
                print(f"  No {noun} candidates found")
                continue
            print(f"  {heading} candidates ({len(found)}):")
            # Only the three best candidates are listed
            for rank, candidate in enumerate(found[:3], start=1):
                print(f"    {rank}. '{candidate['text']}' - {metric_label}: {candidate[metric_key]:.3f}")

    # Street, city and state are comma-separated; the ZIP follows the state
    # after a single space, or after a comma when there is no state
    street = address_components['street']
    city = address_components['city']
    state = address_components['state']
    zip_code = address_components['zip']

    full_address = ", ".join(part for part in (street, city, state) if part)
    if zip_code:
        if full_address:
            full_address += " " if state else ", "
        full_address += zip_code

    if debug:
        print(f"\nExtracted Address: '{full_address}'")

    return full_address, address_components

# Test the associated address extraction on ALL images
#
# Driver script for this cell. For every image in file_list it:
#   1. loads the image and runs OCR,
#   2. finds the event type and, from it, the associated name,
#   3. extracts the address (only when the name's bounding box was located),
#   4. plots the image with every OCR box, colour-coded by detected role.
# A per-image record is appended to address_results and summarised at the end.
# Relies on names defined in other cells: file_list, test_folder, ocr,
# combine_spaced_alphanumeric, find_event_type, find_associated_name.
print("\n===== TESTING ASSOCIATED ADDRESS EXTRACTION ON ALL IMAGES =====")

# Initialize results storage
address_results = []

if file_list:
    for test_file in file_list:
        print(f"\n{'-'*60}")
        print(f"Processing: {test_file}")
        test_path = os.path.join(test_folder, test_file)

        # Load and process image
        # cv2.imread returns None (instead of raising) when the file cannot be decoded
        image = cv2.imread(test_path)
        if image is None:
            print(f"Could not read image: {test_file}")
            continue

        print(f"  Dimensions: {image.shape[1]}x{image.shape[0]}")

        # Perform OCR
        # Only ocr_results[0] is consumed below (presumably the detections for
        # this single input image - confirm against the PaddleOCR version in use)
        ocr_results = ocr.ocr(image, cls=True)
        if not ocr_results or not ocr_results[0]:
            print(f"  No text detected")
            continue

        # Process OCR results
        # processed_results holds (bbox, text, confidence) tuples, the shape
        # every extraction helper in this notebook is called with
        processed_results = []
        all_text = []

        for line in ocr_results[0]:
            bbox, (text, confidence) = line
            text = combine_spaced_alphanumeric(text.strip())
            processed_results.append((bbox, text, confidence))
            all_text.append(text)

        all_text_combined = " ".join(all_text)
        print(f"  OCR extracted {len(all_text_combined)} characters")

        # First extract event type to use as reference for associated_name
        event_type, event_type_candidates = find_event_type(processed_results, all_text_combined, debug=False)

        # Get event type bbox if found
        event_type_bbox = None
        if event_type:
            # Find the bbox for the event type
            # Exact text match: the first OCR element equal to event_type wins;
            # the bbox stays None when no single element matches exactly
            for bbox, text, _ in processed_results:
                if text.strip() == event_type:
                    event_type_bbox = bbox
                    break

        # Extract associated name using the exact same function from the first cell
        associated_name, name_candidates = find_associated_name(processed_results, event_type_bbox, debug=False)

        # Get associated name bbox if found
        associated_name_bbox = None
        if associated_name:
            print(f"Associated Name: '{associated_name}'")
            # Find the bbox for the associated name
            # Same exact-match lookup as above; a name that does not equal one
            # OCR element verbatim leaves the bbox as None
            for bbox, text, _ in processed_results:
                if text.strip() == associated_name:
                    associated_name_bbox = bbox
                    break
        else:
            print("No associated name found")

        # Extract associated address only if we have an associated name
        if associated_name_bbox:
            associated_address, address_components = find_associated_address(processed_results, associated_name_bbox, debug=True)
            print(f"\nExtracted Associated Address: '{associated_address}'")
        else:
            # Empty dict keeps the .get() lookups in the drawing loop below safe
            associated_address = ""
            address_components = {}
            print("No associated address extracted (no associated name found)")

        # Store result
        address_results.append({
            'filename': test_file,
            'associated_name': associated_name if associated_name else "",
            'associated_address': associated_address
        })

        # Visualize the result with colored bounding boxes
        plt.figure(figsize=(15, 10))
        # OpenCV loads images as BGR; matplotlib expects RGB
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.imshow(img_rgb)

        # Draw bounding boxes for address components
        for bbox, text, confidence in processed_results:
            # Determine box color based on detection
            # The elif chain gives priority name > street > city > state > ZIP
            # when one OCR text equals more than one detected value
            color, linewidth = "gray", 1  # Default

            if associated_name and text.strip() == associated_name:
                color, linewidth = "red", 3  # Associated name
            elif address_components.get('street') and text.strip() == address_components['street']:
                color, linewidth = "blue", 2  # Street
            elif address_components.get('city') and text.strip() == address_components['city']:
                color, linewidth = "green", 2  # City
            elif address_components.get('state') and text.strip() == address_components['state']:
                color, linewidth = "purple", 2  # State
            elif address_components.get('zip') and text.strip() == address_components['zip']:
                color, linewidth = "orange", 2  # ZIP

            # Draw bounding box
            # Four corner points are indexed; the first is repeated to close the outline
            points = np.array(bbox, dtype=np.int32)
            plt.plot(
                [points[0][0], points[1][0], points[2][0], points[3][0], points[0][0]],
                [points[0][1], points[1][1], points[2][1], points[3][1], points[0][1]],
                color=color,
                linewidth=linewidth,
            )

            # Add text label
            # Placed 5 px above the first corner, in the same colour as the box
            plt.text(
                points[0][0],
                points[0][1] - 5,
                f"{text} ({confidence:.2f})",
                fontsize=8,
                color=color,
                weight="bold",
            )

        plt.title(f"Associated Address Detection: {test_file}")
        plt.axis("off")
        plt.tight_layout()
        plt.show()

    # Display summary of results
    # Images skipped above (unreadable, or no text detected) never reach
    # address_results, so they are not included in the "Processed" count
    print(f"\n{'-'*60}")
    print(f"ASSOCIATED ADDRESS EXTRACTION SUMMARY")
    print(f"{'-'*60}")
    print(f"Processed {len(address_results)} images")
    print(f"Found addresses in {sum(1 for r in address_results if r['associated_address'])} images")

    # Display table of results
    if address_results:
        results_df = pd.DataFrame(address_results)
        print("\nExtracted Associated Addresses:")
        print(results_df.to_string(index=False))
else:
    print("No test images available for associated address extraction testing")

In [None]:
# CELL 6.7: Main Processing Pipeline

"""
Main processing pipeline that uses the specialized extraction functions
to process each image and extract all fields.
"""

import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

# Reuse the OCR engine left behind by an earlier cell; build one only when
# this cell is run on its own and no `ocr` global exists yet.
if "ocr" not in globals():
    ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)

# One dict per successfully processed image; written out as CSV at the end
csv_results = []

# Process each image file in the test folder.
#
# For every file: load it with OpenCV, run OCR, keep detections whose
# confidence exceeds 0.5, run the field extractors defined in earlier cells,
# render an annotated preview, and append one record to csv_results.
# Files that cannot be loaded or that yield no text are skipped without a
# record; any other exception is reported with a traceback and the loop
# moves on to the next file.
for filename in file_list:
    print(f"\n{'='*60}")
    print(f"Processing: {filename}")
    print(f"{'='*60}")

    # Construct full file path
    file_path = os.path.join(test_folder, filename)

    try:
        # cv2.imread reports an unreadable file by returning None, not raising
        image = cv2.imread(file_path)
        if image is None:
            print(f"Error: Could not load image {filename}")
            continue

        print(f"Image dimensions: {image.shape}")

        # Get file metadata for record keeping
        metadata = get_file_metadata(file_path)

        # Execute OCR on the image
        ocr_results = ocr.ocr(image, cls=True)

        # The detections for this single image live in the first element
        if not ocr_results or not ocr_results[0]:
            print(f"No text detected in {filename}")
            continue

        results = ocr_results[0]
        processed_results = []  # (bbox, combined_text, confidence) tuples
        all_text_parts = []

        print(f"OCR detected {len(results)} text elements")

        # Process each detected text element
        for bbox, (text, confidence) in results:
            if confidence > 0.5:  # Filter low-confidence detections
                cleaned_text = text.strip()

                # Apply spaced character combination for potential serial numbers
                combined_text = combine_spaced_alphanumeric(cleaned_text)

                processed_results.append((bbox, combined_text, confidence))
                all_text_parts.append(combined_text)

                print(f"  Text: '{cleaned_text}' -> '{combined_text}' (Confidence: {confidence:.2f})")

        # Combine all detected text for comprehensive analysis
        all_text_combined = " ".join(all_text_parts)
        print(f"\nCombined text: {all_text_combined}")

        # ====================================================================
        # FIELD EXTRACTION
        # ====================================================================

        print("\nExtracting structured data...")

        # image.shape is (height, width, ...); the extractor receives
        # (width, height)
        selected_candidate, all_candidates = find_serial_number(
            processed_results, all_text_combined, (image.shape[1], image.shape[0]), debug=True
        )

        # Fall back to an empty string when no candidate was selected
        serial_number = selected_candidate["text"] if selected_candidate else ""
        print(f"Serial Number: '{serial_number}'")

        # Extract event type
        event_type = find_event_type(processed_results, all_text_combined, debug=True)
        print(f"Event Type: '{event_type}'")

        # Extract associated name
        associated_name = find_associated_name(processed_results, all_text_combined, debug=True)
        print(f"Associated Name: '{associated_name}'")

        # Extract event date
        event_date = find_event_date(processed_results, all_text_combined, debug=True)
        print(f"Event Date: '{event_date}'")

        # Extract associated address
        associated_address = find_associated_address(processed_results, all_text_combined, debug=True)
        print(f"Associated Address: '{associated_address}'")

        # ====================================================================
        # VISUALIZATION
        # ====================================================================

        print("\nGenerating visualization...")

        # cv2.imread yields BGR pixels, while CATEGORY_COLORS holds RGB tuples
        # (the legend below hands them to matplotlib unchanged). Convert to
        # RGB *before* drawing so box colors agree with the legend; drawing on
        # the BGR image and converting afterwards would swap red and blue.
        # cvtColor returns a new array, so `image` itself is left untouched.
        vis_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Draw bounding boxes with color coding
        for bbox, text, confidence in processed_results:
            # Exact matches against extracted fields take priority, in this order
            if text == serial_number:
                color_key = "serial_number"
            elif text == associated_name:
                color_key = "associated_name"
            elif text == associated_address:
                color_key = "associated_address"
            elif text == event_date:
                color_key = "event_date"
            elif text == event_type:
                color_key = "event_type"
            else:
                # Use classification function for general categorization
                color_key = classify_token(text.lower())

            # Unknown categories fall back to the "other" color
            color = CATEGORY_COLORS.get(color_key, CATEGORY_COLORS["other"])

            # Draw bounding box as a closed polygon
            pts = np.array(bbox, np.int32).reshape((-1, 1, 2))
            cv2.polylines(vis_image, [pts], True, color, 2)

            # Label just above the first bbox corner; add an ellipsis only
            # when the text was actually truncated
            label = text if len(text) <= 20 else f"{text[:20]}..."
            cv2.putText(vis_image, label,
                        (int(bbox[0][0]), int(bbox[0][1]) - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

        # Display the visualization (vis_image is already RGB)
        plt.figure(figsize=(15, 10))
        plt.imshow(vis_image)
        plt.title(f"OCR Results for {filename}\n"
                  f"Serial: {serial_number} | Event: {event_type} | "
                  f"Name: {associated_name}")
        plt.axis('off')

        # Legend: one swatch per category, 0-255 channels scaled to 0-1
        legend_elements = [
            plt.Rectangle((0, 0), 1, 1,
                          facecolor=[channel / 255 for channel in category_color],
                          edgecolor='black', label=category_name.title())
            for category_name, category_color in CATEGORY_COLORS.items()
        ]
        plt.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1, 1))

        plt.tight_layout()
        plt.show()

        # ====================================================================
        # RECORD RESULTS
        # ====================================================================

        # Prepare CSV record
        csv_record = {
            'filename': filename,
            'file_creation_date': metadata['file_creation_date'],
            'file_modification_date': metadata['file_modification_date'],
            'file_location': file_path,
            'serial_number': serial_number,
            'event_type': event_type,
            'associated_name': associated_name,
            'event_date': event_date,
            'associated_address': associated_address,
            'processing_timestamp': datetime.now().strftime("%m/%d/%y %H:%M")
        }

        csv_results.append(csv_record)

        print(f"\nProcessing completed for {filename}")
        print(f"Results recorded: {len(csv_results)} files processed so far")

    except Exception as e:
        # Batch boundary: report the failure in full and move on so one bad
        # image does not abort the whole run
        print(f"Error processing {filename}: {e}")
        import traceback
        traceback.print_exc()

# ============================================================================
# FINAL OUTPUT GENERATION
# ============================================================================
# Prints a completion banner, a per-field hit count, writes every record to
# extracted_data_detailed.csv, and previews the first rows.

rule = "=" * 60
print(f"\n{rule}")
print("PROCESSING COMPLETE")
print(rule)
print(f"Successfully processed {len(csv_results)} files")

if not csv_results:
    print("No files were successfully processed.")
else:
    # Tabular view of everything collected by the processing loop
    results_df = pd.DataFrame(csv_results)

    # For each field, count the records where it holds a truthy value
    print("\nExtraction Summary:")
    summary_lines = [
        f"- Files with {field_label}: {sum(bool(rec[field_key]) for rec in csv_results)}"
        for field_label, field_key in (
            ("serial numbers", "serial_number"),
            ("event types", "event_type"),
            ("associated names", "associated_name"),
            ("dates", "event_date"),
            ("addresses", "associated_address"),
        )
    ]
    print("\n".join(summary_lines))

    # Persist the full table, without the index column
    output_file = "extracted_data_detailed.csv"
    results_df.to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")

    # Preview the first rows of the table
    print("\nSample Results:")
    print(results_df.head().to_string(index=False))

finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"\nProcessing pipeline completed at {finished_at}")