# 🚀 Bangla/English OCR & IE Pipeline (Stable & Interactive)
This notebook implements a complete Document AI pipeline with robust error handling and multi-threaded API support for Google Colab.

## 1. Install Dependencies

In [None]:
!pip install paddlepaddle-gpu
!pip install paddleocr opencv-python-headless fastapi uvicorn pyngrok pydantic python-multipart requests
print('‚úÖ Dependencies installed.')

## 2. Setup Ollama (Local LLM)
Run this cell to install Ollama, start its server in the background, and pull the `llama3.2:1b` extraction model.

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

import subprocess
import time

# Start Ollama server in background
subprocess.Popen(['ollama', 'serve'])
time.sleep(5)

# Pull model
!ollama pull llama3.2:1b
print('‚úÖ Ollama is ready.')

## 3. Core Logic (OCR & IE)

In [None]:
import cv2
import numpy as np
from paddleocr import PaddleOCR
import unicodedata
import re
import requests
import json

# --- Singleton OCR initialization ---
# Re-running this cell reuses the existing `ocr` object instead of building a
# second PaddleOCR engine (the original author notes this avoids Colab crashes).
# NOTE: if construction fails, the error is only printed and `ocr` stays
# undefined, so run_ocr() will fail until this cell succeeds.
if 'ocr' in globals():
    print("‚ÑπÔ∏è PaddleOCR already initialized.")
else:
    print("Initializing PaddleOCR (this may take a minute)...")
    try:
        ocr = PaddleOCR(use_angle_cls=True, lang='bn', use_gpu=True)
    except Exception as init_error:
        print(f"‚ùå OCR Init Error: {init_error}")
    else:
        print("‚úÖ PaddleOCR initialized.")

def preprocess_image(image_path):
    """Load an image from disk and return an Otsu-binarized grayscale copy.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        The thresholded image (second element of ``cv2.threshold``'s result).

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``image_path``.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError("Could not read image.")

    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    # THRESH_OTSU selects the threshold itself; the 0 passed here is a placeholder.
    _, binary = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

def run_ocr(image_path):
    """Run the module-level ``ocr`` engine on an image and return its text.

    Args:
        image_path: Image location passed straight to ``ocr.ocr``.

    Returns:
        The recognized strings of the first result page joined with newlines
        and NFKC-normalized; an empty string when nothing was recognized.
    """
    pages = ocr.ocr(image_path, cls=True)

    recognized = []
    if pages and pages[0]:
        for detection in pages[0]:
            # Each detection is indexed as [..., (text, ...)]; keep the text only.
            recognized.append(detection[1][0])

    return unicodedata.normalize("NFKC", "\n".join(recognized))

def extract_info(text):
    """Extract structured fields from OCR text using regexes and a local LLM.

    Args:
        text: OCR output to analyse.

    Returns:
        dict that may contain:
          * ``nid_number``    - a standalone 10-, 13- or 17-digit run;
          * ``date_of_birth`` - first ``DD-MM-YYYY`` style date (``-``, ``/`` or ``.``);
          * whatever keys the LLM's JSON object contains;
          * ``llm_error``     - message of any failure in the LLM step.
        The LLM step is best-effort: its failures never raise.
    """
    data = {}

    # Regex patterns.
    # Longest alternative first and digit lookarounds on both sides, so a
    # 13/17-digit NID is not truncated to its first 10 digits and a longer
    # digit run (e.g. an 11-digit phone number) is not mistaken for an NID.
    # Note: \d also matches non-ASCII decimal digits such as Bengali numerals.
    nid_match = re.search(r'(?<!\d)(\d{17}|\d{13}|\d{10})(?!\d)', text)
    if nid_match:
        data['nid_number'] = nid_match.group(1)

    date_match = re.search(r'(\d{2}[-/\.]\d{2}[-/\.]\d{4})', text)
    if date_match:
        data['date_of_birth'] = date_match.group(1)

    # Ollama extraction
    prompt = f"Extract person name, father name, mother name, and address from this text. Return valid JSON only.\n\nText: {text}"
    try:
        payload = {"model": "llama3.2:1b", "prompt": prompt, "stream": False, "format": "json"}
        resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=30)
        # Surface HTTP failures (e.g. model not pulled) instead of silently
        # treating the error body as an empty extraction.
        resp.raise_for_status()
        llm_fields = json.loads(resp.json().get("response", "{}"))
        if not isinstance(llm_fields, dict):
            raise ValueError(
                f"LLM returned a JSON {type(llm_fields).__name__}, expected an object"
            )
        data.update(llm_fields)
    except Exception as e:
        data['llm_error'] = str(e)

    return data

## 4. FastAPI Deployment (Interactive)
This cell starts the API in a background thread so you can continue using the notebook.

In [None]:
import getpass
import os
import shutil
import tempfile
import threading
import traceback

import nest_asyncio
import uvicorn
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from pyngrok import ngrok

app = FastAPI()

@app.post("/extract")
async def api_extract(file: UploadFile = File(...)):
    """Run preprocessing, OCR and information extraction on an uploaded image.

    Args:
        file: Uploaded image (multipart form field ``file``).

    Returns:
        On success, a dict with ``status``, ``structured_data`` and ``raw_text``.
        On any exception, a 500 ``JSONResponse`` with ``status``, ``message``
        and ``traceback``.
    """
    # The client-supplied filename is untrusted and must never become part of a
    # filesystem path. Both working files get unique names from tempfile, which
    # also keeps requests from overwriting each other's intermediate image.
    temp_file = None
    clean_file = None
    try:
        print(f"üì• Received: {file.filename}")
        with tempfile.NamedTemporaryFile(prefix="temp_", delete=False) as buffer:
            temp_file = buffer.name
            shutil.copyfileobj(file.file, buffer)

        clean = preprocess_image(temp_file)
        fd, clean_file = tempfile.mkstemp(prefix="clean_", suffix=".jpg")
        os.close(fd)  # cv2.imwrite reopens the path itself
        if not cv2.imwrite(clean_file, clean):
            raise IOError("Could not write preprocessed image.")

        # NOTE(review): OCR and the LLM call are blocking and run on the event
        # loop, so requests are effectively handled one at a time.
        print("üîç OCR Running...")
        text = run_ocr(clean_file)

        print("üß¨ IE Running...")
        structured = extract_info(text)

        return {"status": "success", "structured_data": structured, "raw_text": text}

    except Exception as e:
        error_trace = traceback.format_exc()
        print(f"‚ùå ERROR: {error_trace}")
        # NOTE(review): the traceback is returned to the caller, and this API is
        # exposed through a public tunnel - consider dropping it outside debugging.
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": str(e), "traceback": error_trace}
        )
    finally:
        # Remove both the upload and the preprocessed image, whatever happened.
        for path in (temp_file, clean_file):
            if path and os.path.exists(path):
                os.remove(path)

# ngrok authtoken
# Never store the token in the notebook: read it from the NGROK_AUTHTOKEN
# environment variable, or prompt for it without echoing.
# SECURITY: a token was previously committed on this line; treat it as leaked
# and revoke it in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

def start_api():
    """Apply nest_asyncio, then serve ``app`` with uvicorn on 0.0.0.0:8000.

    Used as the target of the background server thread.
    """
    nest_asyncio.apply()
    bind_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **bind_options)

try:
    # Kill existing tunnels to avoid port conflicts
    for tunnel in ngrok.get_tunnels():
        ngrok.disconnect(tunnel.public_url)

    # Start server in background thread - but only once per runtime. A second
    # uvicorn started by re-running this cell could not bind port 8000 and
    # would die in its thread while the first one keeps serving.
    if 'api_thread' in globals() and api_thread.is_alive():
        print("FastAPI server thread is already running; restart the runtime to load changed routes.")
    else:
        api_thread = threading.Thread(target=start_api, daemon=True)
        api_thread.start()
        print("‚úÖ FastAPI is running in the background.")

    # Open the tunnel after the server thread has been started so the public
    # URL is not advertised before anything is set to listen behind it.
    public_url = ngrok.connect(8000).public_url
    print(f"\nüöÄ API Live at: {public_url}/docs")
except Exception as e:
    print(f"‚ùå Startup failed: {e}")