In [None]:
# ============================================
# IMPORTS AND DEPENDENCIES
# ============================================

import os
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from typing import Optional, Dict, Any

# File reading libraries
import pdfplumber
from PIL import Image
import pytesseract
import xml.etree.ElementTree as ET

# CrewAI imports
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool

# Progress banner for the imports cell: it is only reached when every import
# statement above has succeeded.
print("‚úÖ All dependencies imported successfully")

In [None]:
# ============================================
# ENVIRONMENT SETUP
# ============================================

# Load environment variables before reading the key (the warning below points
# the user at a .env file as the expected source).
load_dotenv()

# Read the OpenAI key from the process environment; None when it is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# A missing key only produces a warning here - execution continues either way.
if not OPENAI_API_KEY:
    print("‚ö†Ô∏è  Warning: OPENAI_API_KEY not found in environment")
    print("   Please add it to your .env file")
else:
    print("‚úÖ OpenAI API key loaded")

# Initialize LLM
# NOTE(review): the LLM object is built even when OPENAI_API_KEY is None, so a
# missing key will presumably only surface when the model is first called -
# confirm this is intended.
llm = LLM(
    model="gpt-4o",
    api_key=OPENAI_API_KEY,
    temperature=0.1  # Low temperature for consistent extraction
)

print("‚úÖ LLM initialized: gpt-4o")

In [None]:
# ============================================
# CANONICAL INVOICE SCHEMA DEFINITION
# ============================================
# Prompt fragment describing the canonical invoice structure. It is embedded
# verbatim in the extraction task description, so its wording is behaviour:
#   * it asks for ONE JSON object (the task's output requirement also says
#     "single valid JSON object"; the previous "JSON ARRAY" wording contradicted
#     the object shown right below it);
#   * the insertion order key is spelled "insertion_order_id", which is the
#     spelling the reconciliation code reads from extracted line items;
#   * a top-level "notes" field is declared because the rules and the task
#     instructions both tell the model to write explanations there.
CANONICAL_SCHEMA_DOC = """
You MUST output a single JSON OBJECT matching this structure:
 
{
  "invoice_header": {
    "invoice_number": string or null,
    "vendor_name": string or null,
    "invoice_date": string or null,
    "billing_start_date": string or null,
    "billing_end_date": string or null,
    "currency": string or null,
    "gross_revenue": number or null,
    "discount_amount": number or null,
    "discount_percent": number or null,
    "tax": number or null
  },
  "line_items": [
    {
      "line_id": 1,
      "campaign_name": "Example Campaign",
      "campaign_id": "12345",
      "insertion_order_id": "IO-900",
      "start_date": "2025-10-14",
      "end_date": "2025-10-30",
      "duration_days": 17,
      "booked_impressions": 400000,
      "billed_impressions": 350000,
      "views": 5000,
      "gross_revenue": 3500,
      "net_revenue": 3000,
      "discount_amount": 500,
      "discount_percent": 5,
      "profit": 1000,
      "rate_type": "CPM",
      "rate": 5.0
    }
  ],
  "notes": string or null
}
 
RULES:
- If a value is not present in the invoice, use null.
- MANDATORY: Always extract start_date, end_date from the Dates column and calculate duration_days
- For duration_days: If Dates column shows "2025-10-14 to 2025-10-30", then start_date="2025-10-14", end_date="2025-10-30", duration_days=17 (count includes both dates: Oct 14,15,16...30 = 17 days)
- If only one type of revenue is present, store it in gross_revenue and leave net_revenue null (or vice versa if clearly net).
- Discounts can be explicit (discount column) or implicit (difference between gross and net) ‚Äî explain in notes if inferred.
- Profit = revenue - cost, if not directly provided.
- Be conservative: do NOT invent numbers if they are not in the invoice.
- Campaign ID can be the segment ID
- Insertion order id can not be same as Campaign ID. Insertion order id can be a short form
- IMPORTANT: For duration_days, parse the Dates column (format: "YYYY-MM-DD to YYYY-MM-DD"), extract start and end dates, then calculate the number of days INCLUDING both start and end dates (end_date - start_date + 1)
"""
print("‚úÖ Canonical schema defined")

In [None]:
# ============================================
# FILE READING FUNCTIONS
# ============================================

def read_pdf_content(pdf_path: str, max_pages: int = 5) -> str:
    """
    Return the text of the first ``max_pages`` pages of a PDF.

    Each page is read with pdfplumber's text extraction. When a page yields
    only whitespace, the page is rendered at 300 dpi and passed to Tesseract.
    OCR problems are written into that page's text instead of being raised.

    Args:
        pdf_path: Location of the PDF file.
        max_pages: Number of leading pages to read.

    Returns:
        Page texts, each prefixed with a "--- Page N ---" header and separated
        by blank lines; "No text extracted from PDF" when nothing was
        collected; or "Error reading PDF: ..." when any other exception occurs.
    """
    try:
        collected = []
        with pdfplumber.open(pdf_path) as document:
            for page_index, page in enumerate(document.pages):
                if page_index >= max_pages:
                    break

                page_text = page.extract_text() or ""

                # Whitespace-only result: fall back to OCR of the rendered page.
                if not page_text.strip():
                    try:
                        rendered_page = page.to_image(resolution=300).original
                        page_text = pytesseract.image_to_string(rendered_page)
                    except pytesseract.TesseractNotFoundError:
                        page_text = """[ERROR: Tesseract OCR not installed]
                        
To install Tesseract:
‚Ä¢ macOS: brew install tesseract
‚Ä¢ Ubuntu/Debian: sudo apt-get install tesseract-ocr
‚Ä¢ Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki"""
                    except Exception as ocr_error:
                        page_text = f"[OCR failed for page {page_index+1}: {str(ocr_error)}]"

                # Pages that are still empty after OCR are left out entirely.
                if page_text:
                    collected.append(f"--- Page {page_index+1} ---\n{page_text}")

        if not collected:
            return "No text extracted from PDF"
        return "\n\n".join(collected)
    except Exception as e:
        return f"Error reading PDF: {str(e)}"


def read_image_content(image_path: str) -> str:
    """
    Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Location of the image file.

    Returns:
        The recognised text with surrounding whitespace removed;
        "No text extracted from image" when OCR yields nothing; installation
        instructions when the Tesseract binary is missing; or
        "Error reading image: ..." for any other failure.
    """
    try:
        # Image.open keeps the file open until the image is closed; the
        # context manager releases the handle as soon as OCR is done.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)

        stripped = text.strip()
        return stripped if stripped else "No text extracted from image"
    except pytesseract.TesseractNotFoundError:
        return """ERROR: Tesseract OCR is not installed.
        
To install Tesseract:
‚Ä¢ macOS: brew install tesseract
‚Ä¢ Ubuntu/Debian: sudo apt-get install tesseract-ocr
‚Ä¢ Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki

After installation, restart your kernel."""
    except Exception as e:
        return f"Error reading image: {str(e)}"


def read_excel_content(excel_path: str, sheet_name=None, max_rows=50) -> Dict[str, Any]:
    """
    Read an Excel workbook and return a structured preview of one sheet.

    Args:
        excel_path: Location of the workbook.
        sheet_name: Sheet to read; when None the first sheet is used.
        max_rows: Number of leading rows to include in the preview.

    Returns:
        A dict with "file_name", "total_sheets", "sheet_names" and "data".
        "data" maps the sheet that was read to its "total_rows", normalized
        "columns", "preview" (list of row dicts) and "preview_text" (plain-text
        table). On any failure the result is {"error": <message>} instead.
    """
    try:
        # One ExcelFile handle serves both the sheet listing and the parsing,
        # and the context manager closes it even when parsing fails.
        with pd.ExcelFile(excel_path) as excel_file:
            # Determine which sheet to read
            if sheet_name is not None:
                sheets = [sheet_name]
            else:
                # Read first sheet only for production
                sheets = [excel_file.sheet_names[0]]

            result = {
                "file_name": Path(excel_path).name,
                "total_sheets": len(excel_file.sheet_names),
                "sheet_names": excel_file.sheet_names,
                "data": {}
            }

            for sheet in sheets:
                df = excel_file.parse(sheet_name=sheet)

                # Clean column names: trimmed, lower-case, spaces and hyphens
                # replaced by underscores.
                df.columns = [
                    str(c).strip().lower().replace(" ", "_").replace("-", "_")
                    for c in df.columns
                ]

                # Limit rows
                preview_df = df.head(max_rows)

                result["data"][sheet] = {
                    "total_rows": len(df),
                    "columns": list(df.columns),
                    "preview": preview_df.to_dict(orient="records"),
                    "preview_text": preview_df.to_string(index=False, max_colwidth=30)
                }

        return result

    except Exception as e:
        return {"error": str(e)}


def read_csv_content(csv_path: str, max_rows=50) -> Dict[str, Any]:
    """
    Load a CSV file and summarise it for prompt building.

    Column labels are trimmed, lower-cased, and have spaces and hyphens turned
    into underscores before the summary is produced.

    Args:
        csv_path: Location of the CSV file.
        max_rows: Number of leading rows to include in the preview.

    Returns:
        A dict with "file_name", "total_rows", "columns", "preview" (list of
        row dicts) and "preview_text" (plain-text table), or
        {"error": <message>} when reading fails.
    """
    try:
        frame = pd.read_csv(csv_path)

        normalized_labels = []
        for raw_label in frame.columns:
            label = str(raw_label).strip().lower()
            normalized_labels.append(label.replace(" ", "_").replace("-", "_"))
        frame.columns = normalized_labels

        head_rows = frame.head(max_rows)

        summary = {}
        summary["file_name"] = Path(csv_path).name
        summary["total_rows"] = len(frame)
        summary["columns"] = list(frame.columns)
        summary["preview"] = head_rows.to_dict(orient="records")
        summary["preview_text"] = head_rows.to_string(index=False, max_colwidth=30)
        return summary

    except Exception as e:
        return {"error": str(e)}


def read_text_content(text_path: str) -> str:
    """
    Return the full contents of a UTF-8 text file.

    Failures are not raised: the function returns
    "Error reading text file: <message>" instead.
    """
    try:
        with open(text_path, mode='r', encoding='utf-8') as source:
            contents = source.read()
    except Exception as read_error:
        return f"Error reading text file: {str(read_error)}"
    return contents


# Progress banner for the file-reader cell; printed once the reader functions
# above have been defined.
print("‚úÖ File reading functions created (with OCR support for images and image-based PDFs)")

In [None]:
# ============================================
# INVOICE CONTEXT BUILDER
# ============================================

def build_invoice_context(file_path: str, max_rows: int = 50) -> str:
    """
    Render an invoice file as a plain-text block for the extraction prompt.

    The reader is selected from the file suffix (case-insensitive): PDF,
    Excel (.xlsx/.xls), CSV, image (.png/.jpg/.jpeg/.bmp/.tiff) or .txt.

    Args:
        file_path: Path to invoice file
        max_rows: Maximum rows for tabular data

    Returns:
        "ERROR: File not found: <path>" when the path does not exist.
        Otherwise a newline-joined report that starts with the file name and a
        70-character rule, followed by the type-specific content, a reader
        error, or an "Unsupported file type" line.
    """
    source = Path(file_path)
    if not source.exists():
        return f"ERROR: File not found: {file_path}"

    extension = source.suffix.lower()
    raw_path = str(file_path)
    lines = [f"FILE: {source.name}", "=" * 70]

    if extension == '.pdf':
        # PDF files
        lines.append("TYPE: PDF Invoice")
        lines.append("\nCONTENT:")
        lines.append(read_pdf_content(raw_path))

    elif extension in ('.xlsx', '.xls'):
        # Excel files
        lines.append("TYPE: Excel Spreadsheet")
        workbook = read_excel_content(raw_path, max_rows=max_rows)

        if "error" in workbook:
            lines.append(f"\nERROR: {workbook['error']}")
        else:
            lines.append(f"\nSheets: {', '.join(workbook['sheet_names'])}")
            for sheet_name, sheet_data in workbook['data'].items():
                lines.extend([
                    f"\n--- Sheet: {sheet_name} ---",
                    f"Total Rows: {sheet_data['total_rows']}",
                    f"Columns: {', '.join(sheet_data['columns'])}",
                    f"\nData Preview (first {max_rows} rows):",
                    sheet_data['preview_text'],
                ])

    elif extension == '.csv':
        # CSV files
        lines.append("TYPE: CSV File")
        table = read_csv_content(raw_path, max_rows=max_rows)

        if "error" in table:
            lines.append(f"\nERROR: {table['error']}")
        else:
            lines.extend([
                f"\nTotal Rows: {table['total_rows']}",
                f"Columns: {', '.join(table['columns'])}",
                f"\nData Preview (first {max_rows} rows):",
                table['preview_text'],
            ])

    elif extension in ('.png', '.jpg', '.jpeg', '.bmp', '.tiff'):
        # Image files
        lines.append("TYPE: Image File (OCR Extraction)")
        lines.append("\nCONTENT:")
        lines.append(read_image_content(raw_path))

    elif extension == '.txt':
        # Text files
        lines.append("TYPE: Text Invoice")
        lines.append("\nCONTENT:")
        lines.append(read_text_content(raw_path))

    else:
        lines.append(f"ERROR: Unsupported file type: {extension}")

    return "\n".join(lines)


# Progress banner for the context-builder cell; printed once
# build_invoice_context has been defined.
print("‚úÖ Invoice context builder created (supports PDF, Excel, CSV, Images, Text)")

In [None]:
# ============================================
# CREWAI AGENT DEFINITION
# ============================================

# Single agent used for every extraction task. It is given no tools: the file
# content is read up front and placed directly in the task description.
invoice_extraction_agent = Agent(
    role='Media Invoice Data Extraction Specialist',
    goal='Extract structured financial and delivery data from media invoices into canonical JSON format, including accurate campaign duration calculations',
    backstory="""You are an expert in media billing and invoice processing. 
    You understand advertising metrics (impressions, views, clicks), financial terms 
    (revenue, costs, discounts, profit), and how to extract data accurately from 
    various invoice formats including OCR-extracted text from images and scanned PDFs.
    
    You are skilled at handling noisy or imperfectly formatted text from OCR, identifying
    patterns, and extracting meaningful data even when formatting is inconsistent.
    You always follow the canonical schema strictly and never invent data - you use null 
    for missing values. When dealing with OCR text, you intelligently parse tables and 
    structured data even when spacing or alignment is imperfect.
    
    You are meticulous about extracting date ranges from the Dates column and calculating
    campaign duration in days. You parse date ranges like "2025-10-14 to 2025-10-30" and
    calculate duration_days as the number of days from start to end, inclusive (e.g., 
    Oct 14 to Oct 30 = 17 days, calculated as: (30-14)+1 = 17).""",
    llm=llm,
    tools=[],  # No tools needed - direct file reading
    verbose=True,
    allow_delegation=False
)

# The banner lines must follow the closing parenthesis: they read
# invoice_extraction_agent, which only exists once the call above has returned.
print("‚úÖ Invoice extraction agent created (optimized for OCR text handling)")
print(f"   Role: {invoice_extraction_agent.role}")


In [None]:
# ============================================
# TASK CREATION FUNCTION
# ============================================

def create_extraction_task(file_path: str, max_rows: int = 50) -> Task:
    """
    Create extraction task with invoice context and schema.

    The file is read immediately via build_invoice_context and its rendered
    text is embedded in the task description together with
    CANONICAL_SCHEMA_DOC, so the returned task is self-contained.

    Args:
        file_path: Path to invoice file
        max_rows: Maximum rows for tabular data

    Returns:
        Task configured for invoice extraction, bound to
        invoice_extraction_agent.
    """
    # Build context from file
    context_str = build_invoice_context(file_path, max_rows=max_rows)

    description = f"""
Extract structured invoice data from the provided file and map it to the canonical schema.

**CANONICAL SCHEMA:**
{CANONICAL_SCHEMA_DOC}

**INVOICE DATA:**
{context_str}

**INSTRUCTIONS:**
1. Identify invoice header information (vendor, dates, totals, currency)
2. Extract all line items with sequential line_id starting from 1
3. Map financial metrics (revenue, costs, discounts, profit)
4. Map delivery metrics (impressions, views, clicks)
5. **CRITICAL: Extract date ranges from the Dates column and calculate duration_days:**
   - Parse the Dates field (format: "YYYY-MM-DD to YYYY-MM-DD")
   - Extract start_date and end_date separately
   - Calculate duration_days = (end_date - start_date) + 1 (inclusive count)
   - Example: "2025-10-14 to 2025-10-30" ‚Üí start_date: "2025-10-14", end_date: "2025-10-30", duration_days: 17
6. Calculate implicit discounts if gross and net revenue differ
7. Use null for missing values - DO NOT INVENT DATA
8. For OCR-extracted text: Look for patterns and table structures even if spacing/formatting is imperfect
9. Handle OCR artifacts gracefully (e.g., misread characters, spacing issues)
10. Add clarifications to 'notes' field if needed or if OCR quality affected extraction
11. Return ONLY valid JSON - no markdown, no explanations

**OUTPUT REQUIREMENT:**
Return a single valid JSON object following the canonical schema exactly.
""".strip()

    task = Task(
        description=description,
        agent=invoice_extraction_agent,
        expected_output="Valid JSON object with invoice_header, line_items array, and notes field"
    )

    return task


# Cell banner, emitted once when the function is defined. It previously sat
# inside the function body and was printed on every task creation instead.
print("‚úÖ Task creation function defined (with OCR-specific instructions)")


In [None]:
# ============================================
# MAIN EXTRACTION FUNCTION
# ============================================

def extract_invoice_data(file_path: str, max_rows: int = 50) -> Dict[str, Any]:
    """
    Extract structured invoice data from any supported file format.

    Builds a task for the file, runs it through a single-agent crew, parses
    the model's reply as JSON and fills in missing duration_days values.

    Args:
        file_path: Path to invoice file (PDF, Excel, CSV, or text)
        max_rows: Maximum rows to process from tabular files

    Returns:
        Dictionary with extracted invoice data in canonical format. When the
        reply cannot be turned into a JSON object, a dictionary with an
        "error" key and the first 500 characters of the reply under
        "raw_response" is returned instead.
    """
    print(f"\n{'='*70}")
    print(f"üìÑ EXTRACTING INVOICE DATA")
    print(f"{'='*70}")
    print(f"File: {Path(file_path).name}")
    print(f"{'='*70}\n")

    # Create task
    task = create_extraction_task(file_path, max_rows=max_rows)

    # Create crew with single agent
    crew = Crew(
        agents=[invoice_extraction_agent],
        tasks=[task],
        process=Process.sequential,
        verbose=True
    )

    # Execute extraction
    result = crew.kickoff()
    result_str = str(result).strip()

    # Parse JSON from result
    try:
        parsed = json.loads(result_str)
    except json.JSONDecodeError:
        # The reply may wrap the JSON in prose or a code fence: retry on the
        # span from the first "{" to the last "}".
        start = result_str.find("{")
        end = result_str.rfind("}")

        if start != -1 and end != -1 and start < end:
            json_str = result_str[start : end + 1]
            try:
                parsed = json.loads(json_str)
            except json.JSONDecodeError as e:
                return {
                    "error": "Failed to parse JSON response",
                    "details": str(e),
                    "raw_response": result_str[:500]
                }
        else:
            return {
                "error": "No JSON object found in response",
                "raw_response": result_str[:500]
            }

    # Valid JSON is not necessarily an object. A reply wrapped in a
    # one-element array is unwrapped; anything else that is not an object is
    # reported as an error rather than handed to callers that expect a dict.
    if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], dict):
        parsed = parsed[0]
    if not isinstance(parsed, dict):
        return {
            "error": "Response JSON is not an object",
            "raw_response": result_str[:500]
        }

    print(f"\n{'='*70}")
    print(f"‚úÖ EXTRACTION COMPLETE")
    print(f"{'='*70}\n")

    # Post-process: Calculate duration_days if missing
    parsed = calculate_missing_durations(parsed)

    return parsed


def calculate_missing_durations(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Fill in duration_days for line items that have dates but no duration.

    For every dict in data["line_items"] whose duration_days is missing or
    None and whose start_date and end_date are both set, the inclusive day
    count (end - start + 1) is computed from 'YYYY-MM-DD' strings. A range
    whose end precedes its start, or a date that does not parse, leaves
    duration_days as None and prints a warning.

    Args:
        data: Extracted invoice data; modified in place.

    Returns:
        The same object that was passed in. Input without a list under
        "line_items" is returned untouched, and entries of the list that are
        not dicts are skipped.
    """
    from datetime import datetime

    if not isinstance(data, dict):
        return data

    line_items = data.get("line_items")
    if not isinstance(line_items, list):
        return data

    for item in line_items:
        # Model output is not guaranteed to be well-formed; skip stray
        # entries instead of failing on item.get.
        if not isinstance(item, dict):
            continue

        # Only calculate if duration_days is missing but we have dates
        if item.get("duration_days") is not None:
            continue

        start_date = item.get("start_date")
        end_date = item.get("end_date")
        if not (start_date and end_date):
            continue

        campaign = item.get('campaign_name', 'Unknown')
        try:
            start = datetime.strptime(str(start_date).strip(), '%Y-%m-%d')
            end = datetime.strptime(str(end_date).strip(), '%Y-%m-%d')
        except (ValueError, AttributeError) as e:
            print(f"   ‚ö† Could not calculate duration for {campaign}: {e}")
            continue

        # Inclusive count: a one-day flight (start == end) lasts 1 day.
        duration = (end - start).days + 1
        if duration > 0:
            item["duration_days"] = duration
            print(f"   ‚úì Calculated duration for {campaign}: {duration} days")
        else:
            # A reversed range is reported as a failure; it used to be logged
            # as a successful calculation with a non-positive day count.
            item["duration_days"] = None
            print(f"   ‚ö† Could not calculate duration for {campaign}: end_date {end_date} is before start_date {start_date}")

    return data


# Progress banner for the extraction cell, with a one-line usage hint.
print("‚úÖ Main extraction function created")
print("   Usage: result = extract_invoice_data('invoice.xlsx')")

In [None]:
# ============================================
# VALIDATION FUNCTIONS
# ============================================

def validate_extracted_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate extracted data against canonical schema.

    Structural problems (missing or wrongly typed "invoice_header" /
    "line_items", line items that are not objects) are errors and make the
    result invalid. Missing identifying details (invoice number and vendor,
    currency, line_id) are warnings only.

    Args:
        data: Parsed extraction result.

    Returns:
        Validation report: {"valid": bool, "errors": [...], "warnings": [...]}.
        The function never raises on malformed input.
    """
    validation = {
        "valid": True,
        "errors": [],
        "warnings": []
    }

    def add_error(message: str) -> None:
        validation["valid"] = False
        validation["errors"].append(message)

    if not isinstance(data, dict):
        add_error("Extracted data must be a JSON object")
        return validation

    # Check top-level structure
    if "invoice_header" not in data:
        add_error("Missing 'invoice_header' field")
    elif not isinstance(data["invoice_header"], dict):
        # e.g. "invoice_header": null - previously crashed on header.get
        add_error("'invoice_header' must be an object")

    if "line_items" not in data:
        add_error("Missing 'line_items' field")
    elif not isinstance(data["line_items"], list):
        add_error("'line_items' must be an array")

    # Check header has minimal info
    header = data.get("invoice_header")
    if isinstance(header, dict):
        if not header.get("invoice_number") and not header.get("vendor_name"):
            validation["warnings"].append("Missing both invoice_number and vendor_name")
        if not header.get("currency"):
            validation["warnings"].append("Currency not specified")

    # Check line items have IDs
    line_items = data.get("line_items")
    if isinstance(line_items, list):
        for idx, item in enumerate(line_items):
            if not isinstance(item, dict):
                add_error(f"Line item at index {idx} must be an object")
            elif "line_id" not in item:
                validation["warnings"].append(f"Line item at index {idx} missing 'line_id'")

    return validation


# Progress banner for the validation cell.
print("‚úÖ Validation function created")

In [None]:
# ============================================
# EXPORT FUNCTIONS
# ============================================

def save_to_json(data: Dict[str, Any], output_path: str = None) -> str:
    """
    Write extracted invoice data to disk as pretty-printed UTF-8 JSON.

    Args:
        data: Extracted invoice data
        output_path: Destination file. When omitted, a timestamped name of
            the form data/invoice_extract_YYYYMMDD_HHMMSS.json is used.

    Returns:
        The path that was written. Missing parent directories are created.
    """
    if output_path is None:
        stamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"data/invoice_extract_{stamp}.json"

    # Make sure the target directory is there before opening the file.
    target_dir = Path(output_path).parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, sink, indent=2, ensure_ascii=False)

    return output_path


def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """
    Flatten extracted invoice data into one row per line item.

    Args:
        data: Extracted invoice data

    Returns:
        DataFrame built from data["line_items"]. The header's invoice_number,
        vendor_name, invoice_date and currency (whichever are present) are
        repeated on every row. An empty DataFrame is returned when there are
        no line items.
    """
    has_items = "line_items" in data and bool(data["line_items"])
    if not has_items:
        return pd.DataFrame()

    frame = pd.DataFrame(data["line_items"])

    # Broadcast selected header fields onto every line-item row.
    if "invoice_header" in data:
        header = data["invoice_header"]
        for field in ("invoice_number", "vendor_name", "invoice_date", "currency"):
            if field in header:
                frame[field] = header[field]

    return frame


# Progress banner for the export cell, listing the two helpers defined above.
print("‚úÖ Export functions created")
print("   ‚Ä¢ save_to_json() - Save to JSON file")
print("   ‚Ä¢ convert_to_dataframe() - Convert to DataFrame")

# Phase 2: Invoice Reconciliation & Discrepancy Detection

This section implements Phase 2 of the invoice processing system - comparing extracted data with internal mapping files to detect discrepancies.

## Features:
- Load and normalize mapping JSON files
- Exact match discrepancy detection
- Fuzzy logic comparison with configurable thresholds
- Severity classification (CRITICAL, HIGH, MEDIUM, LOW)
- Comprehensive reporting with CSV export

In [None]:
# ============================================
# PHASE 2: MAPPING DATA LOADER
# ============================================

from difflib import SequenceMatcher

def load_mapping_files(mapping_folder: str = 'mapping') -> list:
    """
    Read every *.json file in a folder into a list of mapping records.

    Each loaded record is tagged with the name of the file it came from under
    the '_source_file' key. Files that cannot be read, parsed or tagged are
    reported on stdout and left out of the result.

    Args:
        mapping_folder: Path to folder containing mapping JSON files

    Returns:
        List of loaded records; an empty list when the folder does not exist
        or holds no JSON files.
    """
    folder = Path(mapping_folder)

    if not folder.exists():
        print(f"‚ö†Ô∏è  Warning: Mapping folder not found: {mapping_folder}")
        return []

    json_paths = list(folder.glob('*.json'))

    if not json_paths:
        print(f"‚ö†Ô∏è  Warning: No JSON files found in {mapping_folder}")
        return []

    loaded = []
    for json_path in json_paths:
        try:
            with open(json_path, 'r', encoding='utf-8') as handle:
                record = json.load(handle)
            # Remember where the record came from for later reporting.
            record['_source_file'] = json_path.name
            loaded.append(record)
            print(f"‚úÖ Loaded: {json_path.name}")
        except Exception as e:
            print(f"‚ùå Error loading {json_path.name}: {str(e)}")

    print(f"\nüìÇ Total mapping files loaded: {len(loaded)}")
    return loaded


def normalize_mapping_data(mapping_data: dict) -> dict:
    """
    Reshape one raw mapping record into the canonical invoice layout.

    Header values are looked up under several alternative labels; the first
    non-empty one is used. Line items are numbered from 1 in their original
    order, with impression/click counts, net cost and duration parsed into
    numbers and the remaining fields copied through.

    Args:
        mapping_data: Raw mapping data from JSON file

    Returns:
        Dict with 'invoice_header', 'line_items', '_source_file' (defaults to
        'unknown') and '_invoice_index'.
    """
    def pick(source: dict, *labels):
        # Mirrors an `a or b or c` chain: the first truthy lookup wins,
        # otherwise the result of the last lookup is returned as-is.
        found = None
        for label in labels:
            found = source.get(label)
            if found:
                break
        return found

    header = mapping_data.get('Header', {})

    invoice_header = {
        'invoice_number': pick(header, 'Invoice ID', 'Bill Number', 'Reference No.'),
        'vendor_name': mapping_data.get('Vendor'),
        'invoice_date': pick(header, 'Invoice Date', 'Date Issued', 'Date'),
        'currency': pick(header, 'Currency Type', 'Currency'),
        'total_amount': pick(header, 'Total Due', 'Grand Total', 'Amount'),
    }

    canonical_items = []
    for position, raw_item in enumerate(mapping_data.get('LineItems', []), start=1):
        date_range = raw_item.get('Dates')
        canonical_items.append({
            'line_id': position,
            'campaign_name': raw_item.get('Campaign'),
            'insertion_order_id': raw_item.get('IO'),
            'ad_unit': raw_item.get('Ad Unit'),
            'format': raw_item.get('Format'),
            'booked_impressions': parse_number(raw_item.get('Booked')),
            'billed_impressions': parse_number(raw_item.get('Billed')),
            'clicks': parse_number(raw_item.get('Clicks')),
            'rate': raw_item.get('Rate'),
            'discount': raw_item.get('Discount'),
            'net_cost': parse_currency(raw_item.get('Net Cost')),
            'geo': raw_item.get('Geo'),
            'dates': date_range,
            'duration_days': parse_duration(date_range),
            'creative': raw_item.get('Creative'),
            'tracking': raw_item.get('Tracking'),
            'notes': raw_item.get('Notes'),
        })

    return {
        'invoice_header': invoice_header,
        'line_items': canonical_items,
        '_source_file': mapping_data.get('_source_file', 'unknown'),
        '_invoice_index': mapping_data.get('InvoiceIndex'),
    }


def parse_number(value) -> Optional[float]:
    """
    Parse a number that may be a string with thousands separators.

    Args:
        value: A number, a numeric string such as "1,234,567", or None.

    Returns:
        The value as a float, or None when it is None or cannot be converted.
    """
    if value is None:
        return None
    try:
        if isinstance(value, str):
            return float(value.replace(',', ''))
        return float(value)
    except (ValueError, TypeError, AttributeError):
        # TypeError covers non-numeric containers (list, dict) that float()
        # rejects; they are treated like any other unparseable value.
        return None


def parse_currency(value) -> Optional[float]:
    """
    Parse a currency amount such as "$1,234.50" into a float.

    Args:
        value: A number, a string optionally containing '$' and thousands
            separators, or None.

    Returns:
        The amount as a float, or None when it is None or cannot be converted.
    """
    if value is None:
        return None
    try:
        if isinstance(value, str):
            clean_value = value.replace('$', '').replace(',', '').strip()
            return float(clean_value)
        return float(value)
    except (ValueError, TypeError, AttributeError):
        # TypeError covers non-numeric containers (list, dict) that float()
        # rejects; they are treated like any other unparseable value.
        return None


def parse_duration(dates_str) -> Optional[int]:
    """
    Parse duration from date range string (e.g., '2025-10-01 to 2025-10-15').

    Args:
        dates_str: Range written as 'YYYY-MM-DD to YYYY-MM-DD'; ' - ' is also
            accepted as the separator.

    Returns:
        Number of days in the range counting both end points, or None when the
        input is empty, is not a two-part range, contains an unparseable date,
        or ends before it starts.
    """
    if not dates_str:
        return None

    try:
        from datetime import datetime

        # Split on ' to ' and handle various formats
        parts = dates_str.replace(' - ', ' to ').split(' to ')
        if len(parts) != 2:
            return None

        # Try to parse dates
        start_date = datetime.strptime(parts[0].strip(), '%Y-%m-%d')
        end_date = datetime.strptime(parts[1].strip(), '%Y-%m-%d')

        # Calculate duration in days (inclusive)
        duration = (end_date - start_date).days + 1
        return duration if duration > 0 else None
    except (ValueError, AttributeError):
        # ValueError: a date did not match the format.
        # AttributeError: dates_str was not a string.
        return None


print("‚úÖ Mapping data loader functions created")

In [None]:
# ============================================
# FUZZY MATCHING & COMPARISON FUNCTIONS
# ============================================

def fuzzy_string_match(str1: str, str2: str) -> float:
    """
    Score how alike two values are as text, from 0.0 (nothing in common)
    to 1.0 (identical).

    Both values are converted to strings, trimmed and lower-cased before
    comparison. If either value is None the score is 0.0.
    """
    if any(candidate is None for candidate in (str1, str2)):
        return 0.0

    left, right = (str(candidate).strip().lower() for candidate in (str1, str2))
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()


def fuzzy_number_match(num1: float, num2: float, tolerance_percent: float = 5.0) -> dict:
    """
    Compare two numbers using a relative tolerance.

    The difference is expressed as a percentage of the larger absolute value
    (0% when both are zero) and accepted when it does not exceed
    ``tolerance_percent``.

    Returns:
        Dict with 'is_match', 'difference' (absolute), 'difference_percent'
        (rounded to 2 decimals) and 'within_tolerance'. 'is_match' and
        'within_tolerance' always carry the same value. When either number is
        None both flags are False and both differences are None.
    """
    if num1 is None or num2 is None:
        return dict(
            is_match=False,
            difference=None,
            difference_percent=None,
            within_tolerance=False,
        )

    gap = abs(num1 - num2)
    scale = max(abs(num1), abs(num2))

    if scale != 0:
        gap_percent = (gap / scale) * 100
    elif gap == 0:
        gap_percent = 0.0
    else:
        gap_percent = 100.0

    accepted = gap_percent <= tolerance_percent

    return dict(
        is_match=accepted,
        difference=gap,
        difference_percent=round(gap_percent, 2),
        within_tolerance=accepted,
    )


def get_discrepancy_severity(percent_diff: float) -> str:
    """
    Map a percentage difference to a severity label.

    Below 1 -> 'LOW', below 5 -> 'MEDIUM', below 10 -> 'HIGH',
    anything else -> 'CRITICAL'.
    """
    # Upper bounds are exclusive and checked from smallest to largest.
    severity_bands = ((1, 'LOW'), (5, 'MEDIUM'), (10, 'HIGH'))
    for upper_bound, label in severity_bands:
        if percent_diff < upper_bound:
            return label
    return 'CRITICAL'


def compare_line_items_fuzzy(extracted: dict, mapping: dict,
                             string_threshold: float = 0.8,
                             number_tolerance: float = 5.0) -> dict:
    """
    Score how well an extracted line item agrees with a mapping line item.

    Fields are compared in a fixed order: campaign name and insertion order
    ID (weight 3.0 each), then numeric fields, then descriptive text fields
    (weight 1.0 each). A field that agrees is added to 'matched_fields' and
    contributes its score, weighted, to the overall average. A numeric field
    outside tolerance, or a text field with similarity between 0.5 and the
    threshold, is recorded as a discrepancy.

    NOTE(review): only fields that matched enter the weighted average, so an
    item with a single matching field can score close to 1.0 - confirm this
    is the intended scoring before relying on the overall score alone.

    Args:
        extracted: Line item produced by the extraction step.
        mapping: Line item from normalized mapping data.
        string_threshold: Minimum similarity for a text field to match
            (insertion order ID always uses 0.9).
        number_tolerance: Allowed percentage difference for numeric fields.

    Returns:
        Dict with 'overall_score' (rounded to 3 decimals, 0.0 when nothing
        matched), 'matched_fields', 'discrepancies' and 'field_scores'
        (list of (field, score rounded to 2 decimals)).
    """
    weighted_scores = []   # (field, score, weight) for every matched field
    matched = []
    mismatches = []

    def record_match(field, score, weight):
        matched.append(field)
        weighted_scores.append((field, score, weight))

    # Identifying strings carry the highest weight.
    name_ratio = fuzzy_string_match(
        extracted.get('campaign_name'),
        mapping.get('campaign_name')
    )
    if name_ratio >= string_threshold:
        record_match('campaign_name', name_ratio, 3.0)

    io_ratio = fuzzy_string_match(
        extracted.get('insertion_order_id'),
        mapping.get('insertion_order_id')
    )
    if io_ratio >= 0.9:
        record_match('insertion_order_id', io_ratio, 3.0)

    # Numeric fields: skipped entirely when either side has no value.
    numeric_weights = (
        ('booked_impressions', 2.0),
        ('billed_impressions', 2.5),
        ('clicks', 1.5),
        ('net_cost', 2.5),
        ('gross_revenue', 2.0),
        ('net_revenue', 2.0),
        ('duration_days', 2.0),
    )
    for field, weight in numeric_weights:
        ours = extracted.get(field)
        theirs = mapping.get(field)
        if ours is None or theirs is None:
            continue

        outcome = fuzzy_number_match(ours, theirs, number_tolerance)
        if outcome['within_tolerance']:
            record_match(field, 1.0 - (outcome['difference_percent'] / 100), weight)
            continue

        mismatches.append({
            'field': field,
            'extracted_value': ours,
            'mapping_value': theirs,
            'difference': outcome['difference'],
            'difference_percent': outcome['difference_percent'],
            'severity': get_discrepancy_severity(outcome['difference_percent'])
        })

    # Descriptive text fields: near misses are logged as low-severity issues,
    # clearly different values are ignored.
    for field, weight in (('ad_unit', 1.0), ('format', 1.0), ('geo', 1.0)):
        ratio = fuzzy_string_match(extracted.get(field), mapping.get(field))
        if ratio >= string_threshold:
            record_match(field, ratio, weight)
        elif ratio > 0.5:
            mismatches.append({
                'field': field,
                'extracted_value': extracted.get(field),
                'mapping_value': mapping.get(field),
                'similarity': round(ratio, 2),
                'severity': 'LOW'
            })

    # Weighted mean over the matched fields only.
    overall = 0.0
    if weighted_scores:
        numerator = sum(score * weight for _, score, weight in weighted_scores)
        denominator = sum(weight for _, _, weight in weighted_scores)
        overall = numerator / denominator

    return {
        'overall_score': round(overall, 3),
        'matched_fields': matched,
        'discrepancies': mismatches,
        'field_scores': [(field, round(score, 2)) for field, score, _ in weighted_scores]
    }


def find_fuzzy_matches(extracted_data: dict, mapping_data: list, 
                       string_threshold: float = 0.8,
                       number_tolerance: float = 5.0,
                       match_threshold: float = 0.7) -> dict:
    """Pair every extracted invoice line with its best-scoring mapping line.

    Each extracted line item is compared against every line item of every
    mapping via ``compare_line_items_fuzzy``; the candidate with the highest
    ``overall_score`` wins (first one wins on ties).

    Args:
        extracted_data: Extracted invoice dict; its ``line_items`` list is
            iterated (a missing or null list is treated as empty).
        mapping_data: List of mapping dicts, each optionally carrying
            ``line_items`` and ``_source_file``.
        string_threshold: Forwarded to ``compare_line_items_fuzzy``.
        number_tolerance: Forwarded to ``compare_line_items_fuzzy``.
        match_threshold: Minimum ``overall_score`` for the best candidate to
            count as a match (defaults to the previously hard-coded 0.7).

    Returns:
        Dict with three lists:
        ``fuzzy_matches`` (best candidates at or above ``match_threshold``),
        ``potential_discrepancies`` (those matches that carry discrepancies),
        ``no_match_found`` (every extracted line without a strong match,
        including lines for which no candidate scored above zero).
    """
    results = {
        'fuzzy_matches': [],
        'potential_discrepancies': [],
        'no_match_found': []
    }

    # `or []` also covers an explicit `"line_items": null` in the payload.
    extracted_items = extracted_data.get('line_items') or []

    for ext_item in extracted_items:
        best_match = None
        best_score = 0

        for mapping in mapping_data:
            for map_item in mapping.get('line_items') or []:
                match_result = compare_line_items_fuzzy(
                    ext_item,
                    map_item,
                    string_threshold,
                    number_tolerance
                )

                # Strictly greater: the first candidate wins on equal scores.
                if match_result['overall_score'] > best_score:
                    best_score = match_result['overall_score']
                    best_match = {
                        # Source file is metadata only; its absence must not abort matching.
                        'mapping_file': mapping.get('_source_file'),
                        'extracted_line': ext_item.get('line_id'),
                        'mapping_line': map_item.get('line_id'),
                        'campaign': ext_item.get('campaign_name'),
                        'overall_score': best_score,
                        'match_details': match_result
                    }

        if best_match is not None and best_score >= match_threshold:
            results['fuzzy_matches'].append(best_match)

            discrepancies = best_match['match_details'].get('discrepancies', [])
            if discrepancies:
                results['potential_discrepancies'].append({
                    **best_match,
                    'discrepancies': discrepancies
                })
        else:
            # Reached both for weak candidates and when nothing scored above
            # zero (e.g. no mapping line items at all), so every extracted
            # line is accounted for in exactly one bucket.
            results['no_match_found'].append({
                'extracted_line': ext_item.get('line_id'),
                'campaign': ext_item.get('campaign_name'),
                # The canonical schema spells this key 'insertion_order_ID';
                # accept the lower-case variant as well.
                'io': ext_item.get('insertion_order_id',
                                   ext_item.get('insertion_order_ID')),
                'best_score': best_score,
                'reason': 'No strong match found in mapping files'
            })

    return results


# Cell feedback: printed once the fuzzy-matching definitions above have been executed.
print("‚úÖ Fuzzy matching and comparison functions created")

In [None]:
# ============================================
# REPORTING & EXPORT FUNCTIONS
# ============================================

def generate_discrepancy_report(fuzzy_matches: dict) -> pd.DataFrame:
    """Flatten fuzzy-match discrepancies into a one-row-per-field DataFrame.

    Args:
        fuzzy_matches: Result dict whose ``potential_discrepancies`` entries
            each carry ``mapping_file``, ``campaign``, ``extracted_line`` and
            a ``discrepancies`` list of per-field dicts.

    Returns:
        DataFrame with a fixed set of columns (present even when there are no
        rows), ordered CRITICAL -> HIGH -> MEDIUM -> LOW -> UNKNOWN. Rows of
        equal severity keep their input order; unrecognised severity labels
        sort last.
    """
    columns = [
        'Source', 'Mapping File', 'Campaign', 'Line ID', 'Field',
        'Extracted Value', 'Planned Value', 'Difference', 'Difference %',
        'Severity',
    ]
    report_data = []

    # Process fuzzy match discrepancies
    for disc in fuzzy_matches.get('potential_discrepancies') or []:
        for field_disc in disc.get('discrepancies') or []:
            field_name = field_disc.get('field')

            # Use more descriptive field name for duration
            if field_name == 'duration_days':
                display_field = 'Incorrect Duration Days'
            else:
                display_field = field_name

            report_data.append({
                'Source': 'Fuzzy Match',
                'Mapping File': disc.get('mapping_file'),
                'Campaign': disc.get('campaign'),
                'Line ID': disc.get('extracted_line'),
                'Field': display_field,
                'Extracted Value': field_disc.get('extracted_value'),
                'Planned Value': field_disc.get('mapping_value'),
                # Text-field discrepancies carry no numeric difference.
                'Difference': field_disc.get('difference', 'N/A'),
                'Difference %': field_disc.get('difference_percent', 'N/A'),
                'Severity': field_disc.get('severity', 'UNKNOWN')
            })

    # Passing `columns` keeps the schema stable for an empty report, so
    # callers can reference e.g. df['Severity'] without a KeyError.
    df = pd.DataFrame(report_data, columns=columns)

    if not df.empty:
        severity_order = {'CRITICAL': 0, 'HIGH': 1, 'MEDIUM': 2, 'LOW': 3, 'UNKNOWN': 4}
        df['_severity_rank'] = df['Severity'].map(severity_order)
        # mergesort is stable: equal-severity rows stay in input order.
        df = df.sort_values('_severity_rank', kind='mergesort').drop('_severity_rank', axis=1)

    return df


def save_discrepancy_report(df: pd.DataFrame, output_path: str = None) -> str:
    """Write the discrepancy report as CSV and return the path it was written to.

    Args:
        df: Report to persist; written without its index.
        output_path: Destination file. When omitted, a timestamped file name
            under ``output/`` is generated.

    Returns:
        The path string that was written.
    """
    destination = output_path
    if destination is None:
        stamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
        destination = f'output/discrepancy_report_{stamp}.csv'

    # Make sure the target folder exists before pandas opens the file.
    target_dir = Path(destination).parent
    target_dir.mkdir(parents=True, exist_ok=True)

    df.to_csv(destination, index=False)
    return destination


def print_discrepancy_summary(fuzzy_matches: dict, max_unmatched: int = 5):
    """Print a formatted summary of discrepancy analysis.

    Args:
        fuzzy_matches: Result dict with the ``fuzzy_matches``,
            ``potential_discrepancies`` and ``no_match_found`` lists; missing
            or null lists are counted as empty.
        max_unmatched: Maximum number of unmatched items to list individually
            (defaults to the previously hard-coded 5). When more exist, a
            trailing line states how many were left out.
    """
    matched = fuzzy_matches.get('fuzzy_matches') or []
    flagged = fuzzy_matches.get('potential_discrepancies') or []
    unmatched = fuzzy_matches.get('no_match_found') or []

    print("\n" + "="*80)
    print("üìä DISCREPANCY ANALYSIS SUMMARY")
    print("="*80)

    print("\nüîç FUZZY MATCH ANALYSIS:")
    print(f"   ‚úÖ Fuzzy Matches: {len(matched)}")
    print(f"   ‚ö†Ô∏è  Potential Discrepancies: {len(flagged)}")
    print(f"   ‚ùå No Match Found: {len(unmatched)}")

    if unmatched:
        print("\n   Unmatched Items:")
        # Clamp so a negative limit cannot turn into a "drop from the end" slice.
        shown = unmatched[:max(max_unmatched, 0)]
        for item in shown:
            print(f"   ‚Ä¢ Line {item.get('extracted_line')}: {item.get('campaign')} (IO: {item.get('io')})")

        # Make the truncation visible instead of silently hiding items.
        hidden = len(unmatched) - len(shown)
        if hidden > 0:
            print(f"   ... and {hidden} more")

    print("\n" + "="*80)


# Cell feedback: printed once the reporting/export definitions above have been executed.
print("‚úÖ Reporting and export functions created")

In [None]:
# ============================================
# COMPLETE RECONCILIATION WORKFLOW
# ============================================

def run_invoice_reconciliation(invoice_file_path: str, 
                               mapping_folder: str = 'mapping',
                               string_threshold: float = 0.8,
                               number_tolerance: float = 5.0,
                               save_report: bool = True) -> dict:
    """
    Complete end-to-end invoice reconciliation workflow.
    
    Phase 1: Extract data from the invoice
    Phase 2: Load and normalize the mapping files
    Phase 3: Fuzzy-match extracted line items against the mappings
    Phase 4: Print the summary and build (optionally save) the report
    
    Args:
        invoice_file_path: Path to invoice file to process
        mapping_folder: Folder containing mapping JSON files
        string_threshold: Minimum similarity for fuzzy string matching
        number_tolerance: Acceptable percentage difference for numbers
        save_report: Whether to save the report to file (only done when the
            report has at least one row)
    
    Returns:
        On extraction failure: ``{"error": <extraction result>, "status": "failed"}``.
        Otherwise a dict with ``status``, ``extracted_data``,
        ``mapping_files_count``, ``fuzzy_matches``, ``discrepancy_report``
        (list of records), ``discrepancy_report_df``, ``report_path`` and
        ``summary``. When no mapping files are found the same keys are
        returned with empty values, plus ``mapping_data`` and ``warning``.
    """
    print("\n" + "="*80)
    print("üöÄ STARTING INVOICE RECONCILIATION WORKFLOW")
    print("="*80)
    
    # ========== PHASE 1: EXTRACT INVOICE DATA ==========
    print("\nüì• PHASE 1: EXTRACTING INVOICE DATA")
    print("-" * 80)
    
    extracted_data = extract_invoice_data(invoice_file_path)
    
    # The extractor signals failure through an "error" key in its result.
    if "error" in extracted_data:
        print("\n‚ùå ERROR: Failed to extract invoice data")
        print(f"   {extracted_data['error']}")
        return {"error": extracted_data, "status": "failed"}
    
    # `or` fallbacks: the schema allows null for the header, its fields and
    # the line-item list, and a null must not crash the progress output.
    invoice_header = extracted_data.get('invoice_header') or {}
    line_items = extracted_data.get('line_items') or []
    
    print("\n‚úÖ Extraction complete!")
    print(f"   Invoice: {invoice_header.get('invoice_number') or 'N/A'}")
    print(f"   Vendor: {invoice_header.get('vendor_name') or 'N/A'}")
    print(f"   Line Items: {len(line_items)}")
    
    # ========== PHASE 2: LOAD MAPPING DATA ==========
    print("\nüìÇ PHASE 2: LOADING MAPPING DATA")
    print("-" * 80)
    
    mapping_data_raw = load_mapping_files(mapping_folder)
    
    if not mapping_data_raw:
        print("\n‚ö†Ô∏è  WARNING: No mapping files found. Skipping reconciliation.")
        # Same keys as the full result so callers can read e.g.
        # result['summary'] without special-casing this path.
        return {
            "status": "success",
            "extracted_data": extracted_data,
            "mapping_data": [],
            "mapping_files_count": 0,
            "warning": "No mapping files available",
            "fuzzy_matches": {
                'fuzzy_matches': [],
                'potential_discrepancies': [],
                'no_match_found': []
            },
            "discrepancy_report": [],
            "discrepancy_report_df": pd.DataFrame(),
            "report_path": None,
            "summary": {
                "total_line_items": len(line_items),
                "fuzzy_matches": 0,
                "discrepancies": 0,
                "unmatched": 0
            }
        }
    
    mapping_data = [normalize_mapping_data(m) for m in mapping_data_raw]
    
    # ========== PHASE 3: FUZZY MATCH ANALYSIS ==========
    print("\nüîç PHASE 3: FUZZY MATCH ANALYSIS")
    print("-" * 80)
    
    fuzzy_matches = find_fuzzy_matches(
        extracted_data, 
        mapping_data,
        string_threshold,
        number_tolerance
    )
    print(f"   Fuzzy Matches: {len(fuzzy_matches['fuzzy_matches'])}")
    print(f"   Potential Discrepancies: {len(fuzzy_matches['potential_discrepancies'])}")
    print(f"   Unmatched Items: {len(fuzzy_matches['no_match_found'])}")
    
    # ========== PHASE 4: GENERATE REPORTS ==========
    print("\nüìä PHASE 4: GENERATING REPORTS")
    print("-" * 80)
    
    print_discrepancy_summary(fuzzy_matches)
    
    discrepancy_df = generate_discrepancy_report(fuzzy_matches)
    
    # An empty report is never written to disk, even with save_report=True.
    report_path = None
    if save_report and not discrepancy_df.empty:
        report_path = save_discrepancy_report(discrepancy_df)
        print(f"\nüíæ Discrepancy report saved to: {report_path}")
    
    # ========== RETURN COMPLETE RESULTS ==========
    results = {
        "status": "success",
        "extracted_data": extracted_data,
        "mapping_files_count": len(mapping_data),
        "fuzzy_matches": fuzzy_matches,
        "discrepancy_report": discrepancy_df.to_dict('records') if not discrepancy_df.empty else [],
        "discrepancy_report_df": discrepancy_df,
        "report_path": report_path,
        "summary": {
            "total_line_items": len(line_items),
            "fuzzy_matches": len(fuzzy_matches['fuzzy_matches']),
            "discrepancies": len(fuzzy_matches['potential_discrepancies']),
            "unmatched": len(fuzzy_matches['no_match_found'])
        }
    }
    
    print("\n" + "="*80)
    print("‚úÖ RECONCILIATION WORKFLOW COMPLETE")
    print("="*80)
    
    return results


# Cell feedback: printed once the workflow function above has been defined.
print("‚úÖ Complete reconciliation workflow function created")