# Dolphin Gradio App
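Interactive web interface for Dolphin document parsing. Note: the cells in this notebook load and run the Qwen2.5-VL-3B-Instruct model to analyze the uploaded document image.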

In [None]:
#!pip3.10 install hf_xet

In [None]:
import gradio as gr
import os
from PIL import Image
import json
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Initialize the Qwen2.5-VL model and its processor.
# Pick CUDA when it is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Both names stay None when loading fails, so later cells can detect that
# the model is unavailable instead of hitting a NameError.
model = None
processor = None

# Single source of truth for the checkpoint used by the model and processor.
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

try:
    # Qwen2.5-VL checkpoints must be loaded with the Qwen2.5-VL class; the
    # Qwen2-VL class targets a different architecture. The import lives
    # inside the try so that a transformers release without this class is
    # reported through the same failure path as any other loading error.
    from transformers import Qwen2_5_VLForConditionalGeneration

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    processor = AutoProcessor.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
    )

    print(f"Gradio version: {gr.__version__}")
    print("✅ Qwen2.5-VL-3B-Instruct model loaded successfully!")

except Exception as e:
    print(f"❌ Error loading model: {e}")
    # repr() adds the exception type, which the message above does not show.
    print("Error details:", repr(e))
    # Reset both names: the model may have loaded before the processor failed.
    model = None
    processor = None

In [None]:
def process_document(image) -> str:
    """Analyze a document image with the loaded Qwen2.5-VL model.

    Args:
        image: The uploaded document image, or None when nothing was
            uploaded. It is passed as-is to the processor (the Gradio input
            in this notebook is configured with ``type="pil"``).

    Returns:
        The model's decoded answer; a short user-facing message when no
        image was given, the model/processor is not loaded, or nothing was
        generated; or an ``"Error processing image: ..."`` string when any
        exception is raised during processing.
    """
    if image is None:
        return "Please upload an image"

    # The initialization cell leaves these as None when loading failed.
    if model is None or processor is None:
        return "❌ Model or processor not loaded. Please check the model initialization above."

    try:
        # Chat-style request: one user turn holding the image and the prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Analiza esta imagen de documento. Describe qué ves, extrae todo el texto que puedas leer, identifica tablas, fórmulas, diagramas o cualquier elemento estructural. Proporciona una descripción detallada del contenido."}
                ]
            }
        ]

        # Render the conversation to the prompt string (not tokenized yet).
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Tokenize the prompt and preprocess the image together.
        inputs = processor(text=text, images=[image], return_tensors="pt")
        # Follow the model's actual placement (chosen by device_map="auto")
        # rather than a separately computed device string.
        inputs = inputs.to(model.device)

        # Greedy decoding. No temperature is passed: it only applies when
        # sampling is enabled and is otherwise ignored with a warning.
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id
            )

        # generate() returns prompt + completion; drop the prompt tokens so
        # only the newly generated answer is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        return output_text[0] if output_text else "No response generated"

    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error processing image: {str(e)}"

In [None]:
# Create Gradio interface
# Optional sample image shown under the input widget.
example_image_path = "demo/page_imgs/page_1.jpeg"

demo = gr.Interface(
    fn=process_document,
    inputs=gr.Image(type="pil", label="Upload Document Image"),
    outputs=gr.Textbox(label="Qwen2.5-VL Analysis Results", lines=10),
    title="Qwen2.5-VL-3B-Instruct Document Analyzer",
    description="Upload a document image to analyze with Qwen2.5-VL-3B-Instruct. The model will extract text, identify structures, and provide detailed analysis.",
    # Pass None (no examples) when the sample file is missing; a list that
    # contains None is not a valid list of example rows.
    examples=[[example_image_path]] if os.path.exists(example_image_path) else None,
)

if __name__ == "__main__":
    demo.launch(share=False)