# 🇺🇦 Ukrainian OCR Pipeline - Google Colab Demo

This notebook demonstrates the **Ukrainian OCR Pipeline Package** in Google Colab.

## Quick Start:
1. **Upload your document** using the file uploader
2. **Run all cells** for complete processing  
3. **Download results** at the end

**Two-Stage Processing**: Segmentation → Recognition & Enhancement

## 🔧 Setup & Installation

In [None]:
# Clone repository and install package
print("📥 Cloning Ukrainian OCR repository...")
!git clone https://github.com/mary-lev/ukrainian-ocr-package.git /content/ukrainian_ocr_package

print("⚙️ Installing Ukrainian OCR package...")
!cd /content/ukrainian_ocr_package && pip install -e .

print("✅ Installation complete!")

## 📦 Import Libraries

In [None]:
import os
import sys
import time
from pathlib import Path

# Add package to path
sys.path.insert(0, '/content/ukrainian_ocr_package')

# Core libraries
import torch
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt

# Ukrainian OCR Package
from ukrainian_ocr import UkrainianOCRPipeline
from ukrainian_ocr.core.config import OCRPipelineConfig

# Colab utilities
from google.colab import files

print("✅ All libraries loaded successfully")

In [None]:
# Start from the package defaults, then let the package adjust them for Colab.
# NOTE(review): what update_for_colab() changes is defined inside the package;
# the resulting device and batch size are echoed below for inspection.
config = OCRPipelineConfig()
config.update_for_colab()

print(
    "✅ Configuration ready",
    f"🎯 Device: {config.device}",
    f"📦 Batch size: {config.batch_size}",
    sep="\n",
)

## 📤 Upload Document

In [None]:
# Ask for a document and remember its filename in `test_image_path`
# (None when nothing was uploaded) for the processing cells below.
print("📤 Upload your Ukrainian document (JPG, PNG, TIFF):")
uploaded = files.upload()

if not uploaded:
    print("❌ No file uploaded")
    test_image_path = None
else:
    # Only the first uploaded file is used; any additional files are ignored.
    test_image_path = next(iter(uploaded))
    print(f"✅ Document uploaded: {test_image_path}")

    # Show a preview; the context manager closes the image file afterwards.
    with Image.open(test_image_path) as preview:
        width, height = preview.size
        print(f"📐 Dimensions: {width} x {height} pixels")
        plt.figure(figsize=(10, 6))
        plt.imshow(preview)
        plt.title(f"Document: {test_image_path}")
        plt.axis('off')
        plt.show()

## 🏁 Stage 1: Document Segmentation

In [None]:
# Stage 1: segment the uploaded image into text lines, write a basic ALTO XML
# built from those lines, preview/save the detected polygons, and publish
# `stage1_results` for the recognition cell.
if not test_image_path:
    print("❌ Please upload a document first")
else:
    print("🚀 Starting segmentation...")

    # Initialize pipeline
    pipeline = UkrainianOCRPipeline(config=config)
    output_dir = Path("/content/ukrainian_ocr_output")
    output_dir.mkdir(parents=True, exist_ok=True)
    image_stem = Path(test_image_path).stem

    # Load and segment image.
    # NOTE: the timer covers image loading and component initialisation as
    # well as the segmentation call itself.
    start_time = time.time()
    image = cv2.imread(test_image_path)
    if image is None:
        # cv2.imread reports an unreadable/unsupported file by returning None
        # instead of raising; fail here with a clear message rather than
        # deep inside the segmenter.
        raise ValueError(f"Could not read image file: {test_image_path}")
    pipeline._init_components()
    lines = pipeline.segmenter.segment_image(image)
    seg_time = time.time() - start_time

    print(f"✅ Segmentation complete: {seg_time:.2f}s")
    print(f"📊 Detected {len(lines)} text lines")

    # Create basic ALTO XML
    basic_alto_xml = pipeline._create_alto_xml(Path(test_image_path), image, lines)
    basic_alto_path = output_dir / f"{image_stem}_basic_alto.xml"
    with open(basic_alto_path, 'w', encoding='utf-8') as f:
        f.write(basic_alto_xml)

    print(f"✅ Basic ALTO created: {basic_alto_path}")

    # Create visualization on a copy so the image handed to OCR stays clean.
    vis_image = image.copy()
    colors = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0)]

    for idx, line in enumerate(lines[:100]):  # Show first 100 lines
        color = colors[idx % len(colors)]
        polygon = line.get('polygon', [])
        # A closed outline needs at least three points.
        if polygon and len(polygon) >= 3:
            pts = np.array(polygon, np.int32)
            cv2.polylines(vis_image, [pts], True, color, 2)

    # Persist the visualization so it is part of the downloadable results
    # (the results summary lists a segmentation PNG).
    vis_path = output_dir / f"{image_stem}_segmentation.png"
    if cv2.imwrite(str(vis_path), vis_image):
        print(f"✅ Segmentation preview saved: {vis_path}")
    else:
        print(f"⚠️ Could not save segmentation preview: {vis_path}")

    plt.figure(figsize=(12, 8))
    # OpenCV images are BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(vis_image, cv2.COLOR_BGR2RGB))
    plt.title(f"Segmentation: {len(lines)} lines detected")
    plt.axis('off')
    plt.show()

    # Store for next stage
    stage1_results = {
        'image': image,
        'lines': lines,
        'segmentation_time': seg_time,
        'output_dir': output_dir
    }

## 🤖 Stage 2: Text Recognition & Enhancement

In [None]:
def group_entities_by_line(lines_with_text, all_entities):
    """Map ALTO line ids to the entities whose text occurs in that line.

    Args:
        lines_with_text: sequence of line dicts; the recognized text is read
            from each dict's ``'text'`` key (missing/None counts as empty).
        all_entities: sequence of entity dicts; each entity's ``'text'`` key
            is matched as a substring of the line text.

    Returns:
        ``{"line_<index>": {"entities": [...]}}`` containing only lines that
        matched at least one entity.

    Entities with empty or whitespace-only text are ignored: the empty string
    is a substring of every string, so they would otherwise be attached to
    every line.
    NOTE(review): the ``line_<index>`` id format is assumed to match the ids
    used in the ALTO file being enhanced — confirm against the package.
    """
    usable_entities = [e for e in all_entities if (e.get('text') or '').strip()]
    entities_by_line_id = {}
    for idx, line in enumerate(lines_with_text):
        line_text = line.get('text') or ''
        if not line_text:
            continue
        line_entities = [e for e in usable_entities if e['text'] in line_text]
        if line_entities:
            entities_by_line_id[f"line_{idx}"] = {'entities': line_entities}
    return entities_by_line_id


# Stage 2: run OCR on the segmented lines, extract named entities, match
# surnames, and write the complete / enhanced ALTO files.
if 'stage1_results' not in locals():
    print("❌ Please run Stage 1 first")
else:
    print("🚀 Starting text recognition & enhancement...")

    image = stage1_results['image']
    lines = stage1_results['lines']
    output_dir = stage1_results['output_dir']
    image_path = Path(test_image_path)
    image_stem = image_path.stem

    # Text Recognition
    print(f"🤖 Processing {len(lines)} lines...")
    start_time = time.time()
    lines_with_text = pipeline.ocr_processor.process_lines(image, lines)
    ocr_time = time.time() - start_time

    # `or ''` guards against a line whose 'text' is present but None.
    recognized_lines = [
        line for line in lines_with_text if (line.get('text') or '').strip()
    ]
    print(f"✅ OCR complete: {ocr_time:.2f}s")
    print(f"📊 Text in {len(recognized_lines)}/{len(lines)} lines")

    # Show sample text
    print("\n📝 Sample recognized text:")
    for number, line in enumerate(recognized_lines[:5], start=1):
        text = line.get('text', '')
        print(f"  {number}. '{text}'")

    # Named Entity Recognition
    print("\n🏷️ Named Entity Recognition...")
    start_time = time.time()
    ner_results = pipeline.ner_extractor.extract_entities_from_lines(lines_with_text)
    ner_time = time.time() - start_time

    all_entities = ner_results.get('all_entities', [])
    print(f"✅ NER complete: {ner_time:.2f}s")
    print(f"🏷️ Found {len(all_entities)} entities")

    if all_entities:
        print("\nSample entities:")
        for entity in all_entities[:5]:
            print(f"  '{entity.get('text', '')}' -> {entity.get('label', '')}")

    # Surname Matching
    print("\n👥 Surname Matching...")
    start_time = time.time()
    surname_matches = pipeline.surname_matcher.find_in_lines(lines_with_text)
    surname_time = time.time() - start_time

    print(f"✅ Surname matching: {surname_time:.2f}s")
    print(f"👥 Found {len(surname_matches)} matches")

    if surname_matches:
        print("\nSample matches:")
        for match in surname_matches[:5]:
            print(f"  '{match.found_text}' -> '{match.matched_surname}' ({match.confidence:.2f})")

        # Export matches (the JSON file is only written when there are matches)
        matches_file = output_dir / f"{image_stem}_surnames.json"
        pipeline.surname_matcher.export_matches(surname_matches, str(matches_file))

    # Create Complete ALTO
    print("\n✨ Creating enhanced ALTO...")
    complete_alto_xml = pipeline._create_alto_xml(image_path, image, lines_with_text)
    complete_alto_path = output_dir / f"{image_stem}_complete_alto.xml"
    with open(complete_alto_path, 'w', encoding='utf-8') as f:
        f.write(complete_alto_xml)

    # Enhanced ALTO with NER (only when at least one entity maps onto a line)
    if all_entities:
        entities_by_line_id = group_entities_by_line(lines_with_text, all_entities)

        if entities_by_line_id:
            enhanced_alto_path = output_dir / f"{image_stem}_enhanced_alto.xml"
            pipeline.alto_enhancer.enhance_alto_with_ner(
                str(complete_alto_path), entities_by_line_id, str(enhanced_alto_path)
            )

    # Summary
    total_time = stage1_results['segmentation_time'] + ocr_time + ner_time + surname_time
    print("\n📊 PROCESSING COMPLETE")
    print(f"⏱️ Total time: {total_time:.2f}s")
    print(f"🔍 Lines detected: {len(lines)}")
    print(f"📝 Lines with text: {len(recognized_lines)}")
    print(f"🏷️ Entities found: {len(all_entities)}")
    print(f"👥 Surname matches: {len(surname_matches)}")

    # Store for download
    processing_results = {
        'output_dir': output_dir,
        'total_time': total_time
    }

## 📥 Download Results

In [None]:
import zipfile


def build_results_archive(output_dir, zip_path):
    """Zip the regular files directly inside ``output_dir`` into ``zip_path``.

    Args:
        output_dir: ``pathlib.Path`` of the directory holding the results.
        zip_path: destination of the archive (str or path); overwritten if
            it already exists.

    Returns:
        The archived file paths, sorted, so callers can list exactly what
        went into the archive without globbing a second time.

    Files are stored flat (by name only) and deflate-compressed, since the
    default ``ZIP_STORED`` mode does not compress at all. Sub-directories are
    skipped.
    """
    result_files = sorted(p for p in output_dir.glob("*") if p.is_file())
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for file_path in result_files:
            zipf.write(file_path, file_path.name)
    return result_files


# Bundle everything the processing cells wrote and hand it to the browser.
if 'processing_results' in locals():
    print("📥 Downloading your OCR results...")

    output_dir = processing_results['output_dir']

    # Create a zip file with all results
    zip_path = "/content/ukrainian_ocr_results.zip"
    result_files = build_results_archive(output_dir, zip_path)

    print("📦 Created results archive")

    # List generated files
    print("\n📁 Generated files:")
    for file_path in result_files:
        size_kb = file_path.stat().st_size / 1024
        print(f"  📄 {file_path.name} ({size_kb:.1f} KB)")

    # Download the zip file
    files.download(zip_path)
    print("\n✅ Download started!")

else:
    print("❌ No processing results found. Please run the processing cells first.")

## 📋 Results Summary

### Generated Files:
- **Basic ALTO XML**: Segmentation with coordinates
- **Complete ALTO XML**: Full transcription with confidence scores  
- **Enhanced ALTO XML**: With NER semantic annotations (if entities found)
- **Surname Matches JSON**: Genealogical findings with fuzzy matching (if matches found)
- **Segmentation PNG**: Visual representation of detected text lines

### Usage:
- Import ALTO files into eScriptorium or XML editors
- Analyze surname matches for genealogical research
- Process additional documents by re-running the cells from **Upload Document** onward (setup and installation do not need to be repeated)

---
**🇺🇦 Ukrainian OCR Pipeline** - [GitHub Repository](https://github.com/mary-lev/ukrainian-ocr-package)