# FinScribe ETL Pipeline - Full Demo

This notebook demonstrates the complete ETL pipeline for financial document processing:

1. **Extract**: Ingest documents from various sources
2. **Transform**: OCR, semantic parsing, normalization
3. **Load**: Store structured data in PostgreSQL + MinIO
4. **Validate**: Business rule validation

## Setup


In [None]:
# Install dependencies
!pip install -q opencv-python pillow psycopg2-binary dateparser requests boto3


In [None]:
# Clone repo (if not already cloned)
import os
if not os.path.exists('finscribe-smart-scan'):
    !git clone https://github.com/lucylow/finscribe-smart-scan.git
    %cd finscribe-smart-scan
else:
    %cd finscribe-smart-scan


## Import ETL Pipeline Modules


In [None]:
import sys
sys.path.insert(0, '.')

from data_pipeline.ingestion import ingest_from_local
from data_pipeline.preprocess import preprocess
from data_pipeline.ocr_client import run_ocr
from data_pipeline.semantic_parser import parse
from data_pipeline.normalizer import normalize_invoice_data
from data_pipeline.validator import validate

print("ETL modules imported successfully")


## Run ETL Pipeline


In [None]:
# Process a single invoice through every pipeline stage.
# The path is relative to the current working directory.
invoice_path = "examples/sample_invoice_1.png"

if not os.path.exists(invoice_path):
    # Nothing to process: tell the user how to fix it instead of raising.
    print(f"Invoice not found: {invoice_path}")
    print("Please add a sample invoice to the examples/ directory")
else:
    print(f"Processing: {invoice_path}")

    # Extract
    print("\n[1/5] Ingesting...")
    src = ingest_from_local(invoice_path)

    # Transform: each stage consumes the previous stage's output.
    print("\n[2/5] Preprocessing...")
    clean = preprocess(src)

    print("\n[3/5] Running OCR...")
    ocr_result = run_ocr(clean)

    print("\n[4/5] Parsing semantic structure...")
    parsed = parse(ocr_result)
    normalized = normalize_invoice_data(parsed)

    # Validate
    print("\n[5/5] Validating...")
    validation = validate(normalized)

    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("\n✅ ETL Pipeline Complete!")
    print(f"Validation: {'PASSED' if validation['ok'] else 'FAILED'}")
    if not validation['ok']:
        print(f"Errors: {validation['errors']}")


## Visualize Results


In [None]:
import json
from pprint import pprint

print("=" * 60)
print("RAW OCR TEXT (first 500 chars):")
print("=" * 60)
# The pipeline cell only assigns its results when the sample invoice exists,
# so check for them first and say so explicitly when they are missing.
if 'ocr_result' in locals():
    print(ocr_result.get("text", "")[:500])
else:
    print("(no OCR result available - run the pipeline cell with a valid invoice first)")

print("\n" + "=" * 60)
print("STRUCTURED JSON:")
print("=" * 60)
if 'normalized' in locals():
    # default=str renders any value that is not JSON-native as a string instead
    # of raising TypeError; JSON-native data is printed exactly as before.
    print(json.dumps(normalized, indent=2, default=str))
else:
    print("(no normalized data available - run the pipeline cell with a valid invoice first)")
