# JurisCape Node A: The Vision Agent (Qwen2.5-VL + PaddleOCR)
**Role**: Analyze Images/Videos. Use OCR tool if timestamps are detected.
**Model**: `Qwen/Qwen2.5-VL-7B-Instruct` (loaded in bfloat16; `bitsandbytes` is installed, but no quantization config is applied in this notebook).
**Tool**: `PaddleOCR` (Multi-language Support).

In [None]:
# 1. Install Dependencies
# Web server, tunnelling and HTTP helper packages.
!pip install fastapi uvicorn pyngrok python-multipart nest_asyncio requests
# transformers is installed from the GitHub source tree rather than a PyPI release.
!pip install git+https://github.com/huggingface/transformers accelerate bitsandbytes
# OCR stack plus headless OpenCV for frame extraction.
!pip install paddlepaddle-gpu paddleocr opencv-python-headless
# System packages installed through apt.
!apt-get install ffmpeg libsm6 libxext6  -y

In [None]:
# 2. Load PaddleOCR
from paddleocr import PaddleOCR
print("Loading PaddleOCR...")
# The engine is created once at module level and reused by later cells.
# lang='en' requests the English recognition model.
# NOTE(review): `lang` appears to select one specific model rather than
# auto-detecting the language — confirm against the PaddleOCR documentation.
# For specific Indian languages, a dedicated lang code may be needed.
ocr_engine = PaddleOCR(use_angle_cls=True, lang='en') 
print("PaddleOCR Ready!")

In [None]:
# 3. Load Qwen2.5-VL (Vision Language Model)
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor

import importlib.util

model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
print(f"Loading {model_id}...")

# FlashAttention-2 needs the separate `flash_attn` package, which the install
# cell of this notebook does not install. Fall back to PyTorch's built-in
# scaled-dot-product attention when it is missing so the load does not fail.
attn_implementation = (
    "flash_attention_2"
    if importlib.util.find_spec("flash_attn") is not None
    else "sdpa"
)

# Weights are loaded in bfloat16 and placed on the available devices by
# accelerate (device_map="auto"); no quantization config is applied here.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_implementation,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
print("Qwen2.5-VL Loaded!")

In [None]:
# 4. Define Helper Functions (Video/Image Processing)
import cv2
import numpy as np
import re

def extract_frame_at_timestamp(video_path, timestamp_str):
    """Return the video frame at an ``HH:MM:SS`` or ``MM:SS`` timestamp.

    Args:
        video_path: Source handed straight to ``cv2.VideoCapture``.
        timestamp_str: Position to seek to, written as ``HH:MM:SS`` or
            ``MM:SS``.

    Returns:
        The frame produced by ``VideoCapture.read()``, or ``None`` when the
        video cannot be opened or no frame can be read at that position.

    Raises:
        ValueError: If ``timestamp_str`` is not two or three colon-separated
            non-negative integers. Previously such input silently seeked to
            position 0 (or a negative offset) and returned the wrong frame.
    """
    # Validate before touching the video so bad input never opens a capture.
    parts = [int(part) for part in timestamp_str.split(':')]
    if len(parts) == 3:
        hours, minutes, secs = parts
    elif len(parts) == 2:
        hours = 0
        minutes, secs = parts
    else:
        raise ValueError(
            f"Timestamp must be HH:MM:SS or MM:SS, got {timestamp_str!r}"
        )
    if min(parts) < 0:
        raise ValueError(
            f"Timestamp components must be non-negative, got {timestamp_str!r}"
        )
    seconds = hours * 3600 + minutes * 60 + secs

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None
        cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
        ret, frame = cap.read()
    finally:
        # Always free the capture handle, even if seeking/reading raises.
        cap.release()
    return frame if ret else None

def run_ocr_on_frame(frame):
    """Run the module-level PaddleOCR engine on one frame and return its text.

    Args:
        frame: Image passed unchanged to ``ocr_engine.ocr``.

    Returns:
        The recognized text of every detected line joined with newlines, or
        an empty string when nothing was recognized.
    """
    result = ocr_engine.ocr(frame, cls=True)
    # The text lines for the single input image live in result[0]. Some
    # PaddleOCR releases return [None] (or an empty list) when no text is
    # found, which previously crashed the comprehension below.
    if not result or not result[0]:
        return ""
    # Each line is indexed as line[1][0] to obtain its text.
    return "\n".join(line[1][0] for line in result[0])

def run_qwen(content_inputs, prompt_text):
    """Ask the loaded Qwen2.5-VL model a question about visual inputs.

    The visual entries and the text prompt are sent as a single user turn.
    Tokenization and image/video binding are delegated to the processor's
    chat template instead of passing placeholder ``images``/``videos``
    arguments, which could not work.

    Args:
        content_inputs: List of chat-content dicts describing the images or
            videos; they are placed before the text prompt.
            NOTE(review): presumably entries follow the Hugging Face
            chat-template schema (e.g. ``{"type": "image", "path": ...}`` or
            ``{"type": "video", "path": ...}``) — confirm against callers.
        prompt_text: Instruction text appended after the visual inputs.

    Returns:
        The decoded text of the newly generated tokens for the first (only)
        sequence in the batch, without the echoed prompt.
    """
    messages = [
        {
            "role": "user",
            "content": content_inputs + [{"type": "text", "text": prompt_text}],
        }
    ]
    # tokenize=True + return_dict=True makes the processor build the full
    # model inputs (token ids plus visual tensors) from the messages.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of assuming "cuda"

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # generate() returns prompt + completion; keep only the completion so the
    # caller does not get the prompt text back.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = generated_ids[:, prompt_length:]
    return processor.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [None]:
# 5. The AGENTIC LOOP
async def process_media_agentic(file_path):
    """Describe a media file, adding OCR text when a timestamp is flagged.

    Pass 1 currently uses a hard-coded mock description. If that description
    contains a ``<<<MM:SS>>>`` / ``<<<HH:MM:SS>>>`` marker, the frame at that
    timestamp is extracted from ``file_path`` and run through OCR, and the
    OCR text is appended to the description. No second model call is made.

    Args:
        file_path: Path of the media file to pull the frame from.

    Returns:
        The initial description, followed by an ``[OCR DATA at ...]`` line
        when a marker was found and the frame could be extracted.
    """
    # PASS 1: initial scan (placeholder text until the real model call is wired in).
    print("Running Pass 1 (Qwen Visual Scan)...")
    initial_response = "[MOCK] I see a car with license plate number visible at <<<12:05>>>."

    # A <<<timestamp>>> marker in the response is the request to run the OCR tool.
    trigger = re.search(r'<<<(\d{1,2}:\d{2}(?::\d{2})?)>>>', initial_response)
    if trigger is None:
        return initial_response

    timestamp = trigger.group(1)
    print(f"Tool Triggered: OCR at {timestamp}")

    frame = extract_frame_at_timestamp(file_path, timestamp)
    if frame is None:
        # No frame available: fall back to the unaugmented description.
        return initial_response

    ocr_text = run_ocr_on_frame(frame)
    print(f"OCR Result: {ocr_text}")

    # PASS 2: combine the first-pass description with the OCR context.
    print("Running Pass 2 (Qwen + Context)...")
    return f"{initial_response}\n[OCR DATA at {timestamp}]: {ocr_text}"

In [None]:
# 6. Start Server
from pyngrok import ngrok
import uvicorn
import os
from fastapi import FastAPI, Request
from pydantic import BaseModel

# Credentials are read from the environment when available so real values do
# not have to be pasted into the notebook; the original placeholders remain
# the fallback, so existing usage keeps working.
NGROK_TOKEN = os.environ.get("NGROK_TOKEN", "YOUR_NGROK_TOKEN_HERE")
ngrok.set_auth_token(NGROK_TOKEN)
# NOTE(review): SWARM_SECRET is not checked by any code visible in this
# notebook, so the endpoint below is reachable without it — confirm whether
# request authentication is enforced elsewhere.
SWARM_SECRET = os.environ.get("SWARM_SECRET", "change-me-in-prod-secure-swarm-key")

app = FastAPI()

# Request body for POST /analyze_vision.
class VisualRequest(BaseModel):
    # Location of the media to analyze, as supplied by the caller.
    file_url: str

@app.post("/analyze_vision")
async def analyze_vision(payload: VisualRequest):
    """Run the agentic media pipeline and return its description.

    The request's ``file_url`` is not used yet: download logic is mocked and
    the demo always analyzes the fixed local file ``test_video.mp4``.

    Returns:
        A dict with a single ``"description"`` key holding the pipeline output.
    """
    description = await process_media_agentic("test_video.mp4")
    return {"description": description}

# CLEANUP & RUN
# Stop any ngrok process left over from an earlier run of this cell before
# opening a new tunnel (first via pyngrok, then via the OS as a fallback).
ngrok.kill()
os.system("pkill ngrok")
# Open a tunnel to local port 8000, the same port uvicorn is configured with below.
tunnel = ngrok.connect(8000)
print(f"\n=== PUBLIC URL: {tunnel.public_url} ===\n")
config = uvicorn.Config(app, port=8000)
# Top-level await: only valid where an event loop is already running (e.g. a
# notebook cell); this call does not return until the server stops.
await uvicorn.Server(config).serve()