# 📷 OCR Script - Membaca Text dari Gambar
### Menggunakan Tesseract OCR (Tanpa AI/LLM)
### Support: Bahasa Indonesia & English

---

**Cara Pakai:**
1. Jalankan Cell 1-3 untuk setup
2. Jalankan Cell 4 untuk upload dan proses gambar
3. Hasil akan ditampilkan dan bisa di-download

---

## 📦 Cell 1: Instalasi Dependencies
Jalankan cell ini **sekali** di awal

In [None]:
# Announce the start of the dependency installation step.
print("=" * 50)
print("üì¶ INSTALASI DEPENDENCIES")
print("=" * 50)

# Install the Tesseract OCR engine plus the Indonesian and English language
# packs. The leading "!" is an IPython shell escape; -qq keeps apt quiet.
!apt-get update -qq
!apt-get install -y tesseract-ocr tesseract-ocr-ind tesseract-ocr-eng -qq

# Install the Python packages used by the later cells.
!pip install pytesseract pillow natsort -q

# NOTE: these messages are printed unconditionally; the exit status of the
# shell commands above is not checked.
print("\n‚úÖ Instalasi selesai!")
print("‚úÖ Tesseract OCR + Bahasa Indonesia & English terinstall")
print("‚úÖ Natural sorting library terinstall")

## 🔧 Cell 2: Import dan Verifikasi

In [None]:
import pytesseract
from PIL import Image
from google.colab import files
from datetime import datetime
from natsort import natsorted
import os
import io
import re

# Banner for the verification step.
print("=" * 50)
print("üîß VERIFIKASI INSTALASI")
print("=" * 50)

# Ask pytesseract which Tesseract binary version it found.
tesseract_version = pytesseract.get_tesseract_version()
print(f"‚úÖ Tesseract versi: {tesseract_version}")

# List every language pack Tesseract reports as installed.
available_langs = pytesseract.get_languages()
print(f"‚úÖ Bahasa tersedia: {', '.join(available_langs)}")

# Confirm the two language packs this notebook relies on, one line each.
# Nothing is printed for a pack that is missing.
for _code, _label in (("ind", "Indonesia"), ("eng", "English")):
    if _code in available_langs:
        print(f"‚úÖ Bahasa {_label}: Tersedia")

# Printed unconditionally: reaching this point means the natsort import
# earlier in the cell did not fail.
print("‚úÖ Natural sorting: Tersedia")

## ⚙️ Cell 3: Fungsi OCR

In [None]:
def get_languages():
    """Choose the Tesseract language string from the installed language packs.

    Returns "ind+eng" when both packs are reported, "ind" when only the
    Indonesian pack is, and "eng" in every other case (including when the
    English pack is not reported either).
    """
    installed = pytesseract.get_languages()

    # Without Indonesian there is nothing to combine; fall back to English.
    if 'ind' not in installed:
        return "eng"

    return "ind+eng" if 'eng' in installed else "ind"


def extract_text(image, languages="ind+eng"):
    """
    Run Tesseract OCR on an image and return the recognised text.

    Args:
        image: Image passed straight to pytesseract.image_to_string().
        languages: Tesseract language string, e.g. "ind+eng".

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    # Per the Tesseract CLI docs: --oem 3 selects the default engine mode and
    # --psm 6 assumes a single uniform block of text.
    raw_text = pytesseract.image_to_string(
        image,
        lang=languages,
        config='--oem 3 --psm 6',
    )
    return raw_text.strip()


def process_single_image(image, filename, languages=None):
    """
    Run OCR on one image and return the extracted text.

    Args:
        image: Image object handed to extract_text().
        filename: Name of the source file. It is not used by the OCR itself
            and is kept so existing callers keep working.
        languages: Optional Tesseract language string (e.g. "ind+eng").
            When omitted, get_languages() is called to pick one. Callers that
            process many images can detect the languages once and pass the
            result here instead of re-detecting per image.

    Returns:
        The text returned by extract_text().
    """
    if languages is None:
        languages = get_languages()
    return extract_text(image, languages)


def process_and_save(image, filename="uploaded_image", save_to_drive=False):
    """
    OCR a single image and write the result to a timestamped .txt file.

    Args:
        image: Image object; its ``size`` is printed and it is passed to
            extract_text().
        filename: Name of the source image. Only its final path component,
            without the extension, is used to build the output file name.
        save_to_drive: When True, write under /content/drive/MyDrive/
            (assumes Google Drive is already mounted there); otherwise write
            under /content/.

    Returns:
        Tuple (extracted_text, output_path).
    """
    print("\n" + "=" * 50)
    print("‚è≥ MEMPROSES GAMBAR...")
    print("=" * 50)

    languages = get_languages()

    print(f"üåê Bahasa: {languages}")
    print(f"üì∑ Gambar: {filename}")
    print(f"üìê Ukuran: {image.size[0]} x {image.size[1]} pixels")

    # Extract the text.
    extracted_text = extract_text(image, languages)

    # Build the output file name. Only the basename is used so that a
    # filename carrying directory components (e.g. "scans/page1.png") cannot
    # leak path separators into the output path and point it at a directory
    # that does not exist.
    base_name = os.path.splitext(os.path.basename(filename))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"ocr_result_{base_name}_{timestamp}.txt"

    # Choose the output location.
    if save_to_drive:
        output_path = f"/content/drive/MyDrive/{output_filename}"
    else:
        output_path = f"/content/{output_filename}"

    # Write a separator line, a blank line, then the OCR text.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"{'=' * 50}\n\n")
        f.write(extracted_text)

    print(f"\n‚úÖ SELESAI!")
    print(f"üìÑ Output disimpan: {output_path}")

    return extracted_text, output_path


def display_result(text):
    """Print the OCR text between banner lines, followed by basic statistics.

    When ``text`` is empty a placeholder message is printed instead and the
    statistics section is skipped.
    """
    rule = "=" * 50

    print("\n" + rule)
    print("üìù HASIL OCR")
    print(rule)
    print(text if text else "(Tidak ada text yang terdeteksi)")
    print(rule)

    # Nothing to count when no text was detected.
    if not text:
        return

    # Characters, whitespace-separated words, and newline-separated lines.
    char_count = len(text)
    word_count = len(text.split())
    line_count = len(text.split('\n'))

    print(f"\nüìä Statistik:")
    print(f"   ‚Ä¢ Karakter: {char_count}")
    print(f"   ‚Ä¢ Kata: {word_count}")
    print(f"   ‚Ä¢ Baris: {line_count}")


# Printed once the definitions above have run, to confirm the cell completed.
print("‚úÖ Fungsi OCR siap digunakan!")

---
## 🚀 Cell 4: Upload & Proses Gambar (Single)
Jalankan cell ini untuk **upload satu gambar dan ekstrak text**

In [None]:
print("=" * 50, "üì§ UPLOAD GAMBAR", "=" * 50, sep="\n")
print("Pilih file gambar (PNG, JPG, JPEG, BMP, TIFF, WEBP)")
print()

# Open the Colab upload dialog. The result is iterated below as
# filename -> raw file content.
uploaded = files.upload()

if not uploaded:
    print("‚ö†Ô∏è Tidak ada file yang diupload")
else:
    # Every uploaded file is processed, one after another.
    for filename, content in uploaded.items():
        print(f"\nüì∑ File diterima: {filename}")

        # Decode the uploaded bytes, run OCR, and write the .txt result.
        image = Image.open(io.BytesIO(content))
        text, output_path = process_and_save(image, filename)

        # Show the recognised text and its statistics.
        display_result(text)

        # Hand the result file to the browser as a download.
        print("\n" + "=" * 50)
        print("üì• DOWNLOAD HASIL")
        print("=" * 50)
        files.download(output_path)

---
# 📚 BATCH PROCESSING
## Proses Banyak Gambar → Satu File Output

**Fitur:**
- ✅ Natural sorting otomatis (11, 11b, 12a, 12b)
- ✅ Semua hasil digabung ke satu file .txt
- ✅ Separator antar halaman yang jelas

## ⚙️ Cell 5: Fungsi Batch Processing

In [None]:
def natural_sort_key(filename):
    """
    Build a sort key that orders embedded numbers by value, not by character.

    The lowercased name is split into alternating text and digit runs, and
    each digit run is converted to int, so '2.png' sorts before '11.png' and
    '11', '11b', '12a', '12b' come out in that order.

    Args:
        filename: File name to build the key for.

    Returns:
        List alternating str and int parts, starting with a (possibly empty)
        str, e.g. '11b.png' -> ['', 11, 'b.png'].
    """
    # The capture group makes re.split keep the digit runs in the result.
    parts = re.split(r'(\d+)', filename.lower())
    # isdecimal() accepts exactly the characters \d matches and int() can
    # parse. isdigit() is also true for characters such as '²', which \d does
    # not split on and int() rejects with ValueError.
    return [int(part) if part.isdecimal() else part for part in parts]


def sort_files_naturally(file_dict):
    """
    Return the (filename, content) pairs of a dict in natural filename order.

    Example:
        {'12b.png': c1, '11.png': c2, '11b.png': c3, '12a.png': c4}
        -> [('11.png', c2), ('11b.png', c3), ('12a.png', c4), ('12b.png', c1)]
    """
    ordered_pairs = []
    # natsort decides the order of the keys; the contents are looked up
    # afterwards so each pair stays matched to its own filename.
    for name in natsorted(file_dict.keys()):
        ordered_pairs.append((name, file_dict[name]))
    return ordered_pairs


def process_batch_to_single_file(uploaded_files, output_name="combined_ocr_result",
                                  save_to_drive=False, separator_style="default"):
    """
    OCR many images and merge the results into a single text file.

    Files are processed in the order produced by sort_files_naturally(). A
    file that raises during opening or OCR is recorded with an
    "[ERROR: ...]" placeholder text and does not stop the batch.

    Args:
        uploaded_files: Mapping of filename -> raw image bytes, e.g. the
            dictionary returned by files.upload().
        output_name: Output file name without ".txt"; a timestamp is appended.
        save_to_drive: When True, write under /content/drive/MyDrive/
            (assumes Google Drive is already mounted there); otherwise write
            under /content/.
        separator_style: 'default', 'minimal' or 'detailed'; passed on to
            generate_combined_output().

    Returns:
        Tuple (combined_text, output_path, all_results). all_results is a
        list with one dict per file, with keys 'filename', 'text', 'size'
        and 'success'.
    """
    print("\n" + "=" * 60)
    print("üìö BATCH PROCESSING - MULAI")
    print("=" * 60)

    # Put the files into natural filename order.
    print("\nüîÑ Mengurutkan file secara natural...")
    sorted_files = sort_files_naturally(uploaded_files)

    print(f"\nüìã Urutan file setelah di-sort:")
    for i, (fname, _) in enumerate(sorted_files, 1):
        print(f"   {i}. {fname}")

    # Detect the OCR languages once; the same value is reused for every image
    # instead of querying Tesseract again per file.
    languages = get_languages()
    print(f"\nüåê Bahasa: {languages}")
    print(f"\n‚è≥ Memproses {len(sorted_files)} gambar...\n")

    all_results = []

    for i, (filename, content) in enumerate(sorted_files, 1):
        print(f"   [{i}/{len(sorted_files)}] Memproses: {filename}", end="")

        try:
            # Decode the bytes and run OCR with the languages chosen above.
            image = Image.open(io.BytesIO(content))
            text = extract_text(image, languages)

            all_results.append({
                'filename': filename,
                'text': text,
                'size': image.size,
                'success': True
            })
            print(" ‚úÖ")

        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the
            # batch. The failure is kept in the results and in the output.
            all_results.append({
                'filename': filename,
                'text': f"[ERROR: {str(e)}]",
                'size': (0, 0),
                'success': False
            })
            print(f" ‚ùå Error: {e}")

    # Merge all per-file results into one text document.
    print("\nüìù Menggabungkan hasil...")

    combined_text = generate_combined_output(all_results, separator_style)

    # Build the timestamped output file name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"{output_name}_{timestamp}.txt"

    if save_to_drive:
        output_path = f"/content/drive/MyDrive/{output_filename}"
    else:
        output_path = f"/content/{output_filename}"

    # Write the combined document.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(combined_text)

    # Summary of successes and failures.
    success_count = sum(1 for r in all_results if r['success'])

    print("\n" + "=" * 60)
    print("‚úÖ BATCH PROCESSING - SELESAI")
    print("=" * 60)
    print(f"\nüìä Ringkasan:")
    print(f"   ‚Ä¢ Total file: {len(all_results)}")
    print(f"   ‚Ä¢ Berhasil: {success_count}")
    print(f"   ‚Ä¢ Gagal: {len(all_results) - success_count}")
    print(f"\nüìÑ Output disimpan: {output_path}")

    return combined_text, output_path, all_results


def generate_combined_output(results, separator_style="default"):
    """
    Assemble the combined OCR document from the per-file results.

    The document has a header (timestamp, file count, languages, file index),
    one section per result introduced by a separator, and a footer.

    Args:
        results: List of dicts with keys 'filename', 'text' and 'success'.
        separator_style: 'minimal' or 'detailed'; any other value selects the
            default separator.

    Returns:
        The whole document as one newline-joined string.
    """
    total = len(results)
    bar = "=" * 60

    def _separator(position, name):
        """Return the separator lines that introduce one file's section."""
        if separator_style == "minimal":
            return [f"--- [{position}] {name} ---", ""]
        if separator_style == "detailed":
            hashes = "#" * 60
            return [hashes, f"# FILE {position}: {name}", hashes, ""]
        return [bar, f"üìÑ [{position}/{total}] {name}", bar, ""]

    # Header block.
    out = [
        "OCR Combined Result - Google Colab",
        bar,
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Total Files: {total}",
        f"Languages: {get_languages()}",
        "",
        "Files (in order):",
    ]

    # Index of files with a success/failure mark.
    for position, entry in enumerate(results, 1):
        mark = "‚úì" if entry['success'] else "‚úó"
        out.append(f"  {position}. [{mark}] {entry['filename']}")
    out += [bar, "", ""]

    # One section per file: separator, text, two blank lines.
    for position, entry in enumerate(results, 1):
        out += _separator(position, entry['filename'])
        out += [entry['text'], "", ""]

    # Footer block.
    out += [bar, "END OF DOCUMENT", bar]

    return "\n".join(out)


# Printed once the definitions above have run, to confirm the cell completed.
print("‚úÖ Fungsi Batch Processing siap digunakan!")

---
## 🚀 Cell 6: Upload & Proses Banyak Gambar
Jalankan cell ini untuk **batch processing**

**Fitur:**
- File akan diurutkan otomatis secara natural (11 → 11b → 12a → 12b)
- Semua hasil digabung menjadi **satu file .txt**

In [None]:
print("=" * 60, "üì§ UPLOAD BANYAK GAMBAR (BATCH PROCESSING)", "=" * 60, sep="\n")
print("\nüìå Pilih beberapa file gambar sekaligus")
print("üìå File akan diurutkan otomatis secara natural")
print("üìå Contoh urutan: 11, 11b, 12a, 12b\n")

# Open the Colab upload dialog; several files can be selected at once.
uploaded = files.upload()

if not uploaded:
    print("‚ö†Ô∏è Tidak ada file yang diupload")
else:
    print(f"\nüì∑ {len(uploaded)} file diterima")

    # OCR every file and merge the results into one .txt under /content/.
    combined_text, output_path, results = process_batch_to_single_file(
        uploaded,
        output_name="combined_ocr_result",
        save_to_drive=False,
        separator_style="default",  # Options: 'default', 'minimal', 'detailed'
    )

    # Show the beginning of the combined document, truncated at 500 chars.
    print("\n" + "=" * 60)
    print("üëÄ PREVIEW HASIL (500 karakter pertama)")
    print("=" * 60)
    if len(combined_text) > 500:
        preview = combined_text[:500] + "..."
    else:
        preview = combined_text
    print(preview)

    # Hand the combined file to the browser as a download.
    print("\n" + "=" * 60)
    print("üì• DOWNLOAD HASIL")
    print("=" * 60)
    files.download(output_path)

---
# 📂 OPSI TAMBAHAN

## 💾 Opsi A: Batch Processing + Simpan ke Google Drive

In [None]:
# Mount Google Drive at /content/drive so later cells can write into
# /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

print("\n‚úÖ Google Drive terhubung!")

In [None]:
# Batch OCR whose combined result is written to Google Drive instead of
# /content/. Run the mount cell above first.
print("=" * 60)
print("üì§ BATCH PROCESSING ‚Üí GOOGLE DRIVE")
print("=" * 60)

# Open the Colab upload dialog.
uploaded = files.upload()

# Nothing is printed when no file was uploaded.
if uploaded:
    combined_text, output_path, results = process_batch_to_single_file(
        uploaded,
        output_name="combined_ocr_result",
        save_to_drive=True,  # Save to Google Drive
        separator_style="default"
    )

    print(f"\n‚úÖ Hasil disimpan di Google Drive: {output_path}")

## 🎨 Opsi B: Pilih Style Separator

Ada 3 pilihan style separator:
- `default` - Separator standar dengan emoji
- `minimal` - Separator simpel
- `detailed` - Separator dengan info lengkap

In [None]:
# CHOOSE THE SEPARATOR STYLE BELOW
SEPARATOR_STYLE = "minimal"  # Options: 'default', 'minimal', 'detailed'

print("=" * 60)
print(f"üì§ BATCH PROCESSING (Style: {SEPARATOR_STYLE})")
print("=" * 60)

# Open the Colab upload dialog.
uploaded = files.upload()

# Nothing is printed when no file was uploaded.
if uploaded:
    # Same batch run as Cell 6, but with the separator style chosen above.
    combined_text, output_path, results = process_batch_to_single_file(
        uploaded,
        output_name="combined_ocr_result",
        save_to_drive=False,
        separator_style=SEPARATOR_STYLE
    )

    # Hand the combined file to the browser as a download.
    files.download(output_path)

## 📁 Opsi C: Batch Processing dari Google Drive Folder

In [None]:
# Mount Google Drive first so the folder below is reachable.
from google.colab import drive
drive.mount('/content/drive')

import glob

# CHANGE THE FOLDER PATH BELOW
folder_path = "/content/drive/MyDrive/folder_gambar/"  # <-- change this path

# Supported image extensions, written as '*.ext' patterns. They are matched
# case-insensitively below, so 'IMG_001.JPG' is picked up as well.
image_extensions = ['*.png', '*.jpg', '*.jpeg', '*.bmp', '*.tif', '*.tiff', '*.webp']

# Turn each '*.ext' pattern into a lowercase '.ext' suffix for str.endswith().
_image_suffixes = tuple(pattern.lstrip('*').lower() for pattern in image_extensions)

# List the folder once and keep regular files with a supported suffix.
# glob.escape() keeps characters such as '[' in folder_path from being read as
# wildcards, and a missing folder simply yields an empty list.
image_files = [
    fpath
    for fpath in glob.glob(os.path.join(glob.escape(folder_path), '*'))
    if os.path.isfile(fpath) and fpath.lower().endswith(_image_suffixes)
]

print(f"üìÅ Folder: {folder_path}")
print(f"üì∑ Ditemukan {len(image_files)} file gambar")

if image_files:
    # Read each file and build a filename -> bytes dictionary, the same shape
    # the upload cells pass to process_batch_to_single_file().
    file_dict = {}
    for fpath in image_files:
        fname = os.path.basename(fpath)
        with open(fpath, 'rb') as f:
            file_dict[fname] = f.read()

    # Run the batch; ordering is handled inside process_batch_to_single_file().
    combined_text, output_path, results = process_batch_to_single_file(
        file_dict,
        output_name="combined_ocr_from_drive",
        save_to_drive=True,
        separator_style="default"
    )

    print(f"\n‚úÖ Hasil disimpan: {output_path}")
else:
    print("‚ö†Ô∏è Tidak ada file gambar ditemukan di folder tersebut")

## 🔧 Opsi D: Custom Output Filename

In [None]:
# CUSTOM OUTPUT FILE NAME
CUSTOM_OUTPUT_NAME = "hasil_scan_dokumen"  # <-- change this name

print("=" * 60)
print(f"üì§ BATCH PROCESSING")
print(f"üìÑ Output: {CUSTOM_OUTPUT_NAME}_[timestamp].txt")
print("=" * 60)

# Open the Colab upload dialog.
uploaded = files.upload()

# Nothing is printed when no file was uploaded.
if uploaded:
    # Same batch run as Cell 6, but the output file uses the name set above.
    combined_text, output_path, results = process_batch_to_single_file(
        uploaded,
        output_name=CUSTOM_OUTPUT_NAME,
        save_to_drive=False,
        separator_style="default"
    )

    # Hand the combined file to the browser as a download.
    files.download(output_path)

## 📊 Opsi E: Lihat Preview Tiap File Sebelum Download

In [None]:
print("=" * 60, "üì§ BATCH PROCESSING + PREVIEW", "=" * 60, sep="\n")

# Open the Colab upload dialog.
uploaded = files.upload()

# Nothing is printed when no file was uploaded.
if uploaded:
    combined_text, output_path, results = process_batch_to_single_file(
        uploaded,
        output_name="combined_ocr_result",
        save_to_drive=False,
        separator_style="default",
    )

    # Show the first 100 characters of each file's text.
    print("\n" + "=" * 60)
    print("üëÄ PREVIEW TIAP FILE (100 karakter pertama)")
    print("=" * 60)

    for i, result in enumerate(results, 1):
        print(f"\nüìÑ [{i}] {result['filename']}:")
        preview = result['text']
        if len(preview) > 100:
            preview = preview[:100] + "..."
        # An empty OCR result is shown as a placeholder.
        print(f"   {preview or '(kosong)'}")

    # Hand the combined file to the browser as a download.
    print("\n" + "=" * 60)
    print("üì• DOWNLOAD HASIL")
    print("=" * 60)
    files.download(output_path)