In [None]:
!git clone https://github.com/microsoft/OmniParser.git
%cd OmniParser

In [None]:
%%capture
# Install the dependencies listed in the repository's requirements.txt.
# The path is relative, so the working directory must be the cloned repo
# (the previous cell ends with `%cd OmniParser`). %%capture hides pip's output.
%pip install -r requirements.txt

In [None]:
%%capture
!mkdir -p weights/icon_detect weights/icon_caption

!huggingface-cli download microsoft/OmniParser-v2.0 icon_detect/train_args.yaml --local-dir weights/icon_detect --local-dir-use-symlinks False
!huggingface-cli download microsoft/OmniParser-v2.0 icon_detect/model.pt --local-dir weights/icon_detect --local-dir-use-symlinks False
!huggingface-cli download microsoft/OmniParser-v2.0 icon_detect/model.yaml --local-dir weights/icon_detect --local-dir-use-symlinks False

!huggingface-cli download microsoft/OmniParser-v2.0 icon_caption/config.json --local-dir weights/icon_caption --local-dir-use-symlinks False
!huggingface-cli download microsoft/OmniParser-v2.0 icon_caption/generation_config.json --local-dir weights/icon_caption --local-dir-use-symlinks False
!huggingface-cli download microsoft/OmniParser-v2.0 icon_caption/model.safetensors --local-dir weights/icon_caption --local-dir-use-symlinks False

!mv weights/icon_caption weights/icon_caption_florence

In [None]:
%%capture
!pip install -U numpy

import os
os.kill(os.getpid(), 9)

In [None]:
import sys

# Put the cloned repository on the import path so its modules can be imported
# by the cells below. Guarded so that re-running the cell does not keep
# appending duplicate entries.
if '/content/OmniParser' not in sys.path:
    sys.path.append('/content/OmniParser')

In [None]:
from typing_extensions import TypedDict
from typing import List

# One detected UI element as returned by the /process_image endpoint.
# Despite the name, only the centre point of the element's box is carried,
# not its corners.
class BBox(TypedDict):
    # Horizontal centre of the box: mean of bbox[0] and bbox[2].
    # NOTE(review): unit (pixels vs. 0-1 ratio) depends on OmniParser's output - confirm.
    x: float
    # Vertical centre of the box: mean of bbox[1] and bbox[3] (same unit as x).
    y: float
    # Copied unchanged from the parser's 'interactivity' field.
    interactivity: bool
    # The parser's 'content' field with surrounding whitespace stripped.
    content: str

# Response schema of the /process_image endpoint.
class ImageProcessingResponse(TypedDict):
    # Annotated image: the first value returned by get_som_labeled_img.
    # NOTE(review): presumably a base64-encoded image string - confirm.
    img: str
    # One entry per element in the parser's parsed content list.
    bboxes: List[BBox]

In [None]:
%%capture
# Packages for the HTTP API defined below: FastAPI + uvicorn (web server),
# nest-asyncio (applied before uvicorn.run in this notebook), pyngrok (public
# tunnel), python-multipart (required by FastAPI for file uploads) and pillow
# (image decoding). %%capture hides pip's output.
%pip install fastapi uvicorn nest-asyncio pyngrok python-multipart pillow

In [None]:
import nest_asyncio
import uvicorn
from fastapi import FastAPI, UploadFile, File
from pydantic import BaseModel
from typing import List
from PIL import Image
import io
from pyngrok import ngrok
from OmniParser.util.utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_yolo_model
import torch
from ultralytics import YOLO
from rich import print

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on a runtime without CUDA (inference will just be slower).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Icon detector. The doubled "icon_detect" in the path comes from the download
# cell: the in-repo sub-folder is kept under --local-dir.
som_model = get_yolo_model(model_path='/content/OmniParser/weights/icon_detect/icon_detect/model.pt')
som_model.to(device)
print('model to {}'.format(device))

# Florence-2 captioning model and processor used to caption detected icons.
caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path='/content/OmniParser/weights/icon_caption_florence/icon_caption/')


In [None]:
# Patch asyncio before the server is started from inside the notebook.
nest_asyncio.apply()
app = FastAPI()


def _to_center_point(item) -> BBox:
    """Convert one parsed element into a BBox holding its centre point.

    `item['bbox']` is read as (x1, y1, x2, y2). A missing caption (`None`
    content) becomes an empty string instead of raising AttributeError.
    """
    box = item['bbox']
    return BBox(
        x=(box[0] + box[2]) / 2,
        y=(box[1] + box[3]) / 2,
        interactivity=item['interactivity'],
        content=(item['content'] or '').strip(),
    )


@app.post("/process_image", response_model=ImageProcessingResponse)
async def process_image(file: UploadFile = File(...)):
    """Run OCR, icon detection and captioning on an uploaded screenshot.

    Args:
        file: The uploaded image (multipart form field ``file``).

    Returns:
        A dict with ``img`` (the annotated image returned by
        ``get_som_labeled_img``) and ``bboxes`` (one centre-point entry per
        parsed element).

    Raises:
        HTTPException: 400 if the upload cannot be decoded as an image.
    """
    # Local imports keep this cell self-contained.
    import contextlib
    import os
    import tempfile

    from fastapi import HTTPException

    contents = await file.read()
    try:
        image = Image.open(io.BytesIO(contents)).convert("RGB")
    except (OSError, ValueError) as exc:
        # PIL raises OSError subclasses for unreadable or truncated data;
        # report it as a client error rather than an opaque 500.
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a readable image",
        ) from exc

    # The OmniParser helpers are given a file path, so the image is written to
    # a scratch file. A unique name avoids clobbering between requests and the
    # file is removed afterwards. The .jpg suffix keeps the JPEG encoding that
    # the previous fixed path used.
    fd, temp_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        image.save(temp_path)

        ocr_result, _ = check_ocr_box(
            temp_path,
            display_img=False,
            output_bb_format='xyxy',
            goal_filtering=None,
            easyocr_args={'paragraph': False, 'text_threshold': 0.9}
        )
        text, ocr_bbox = ocr_result

        labeled_img, _, parsed_content_list = get_som_labeled_img(
            temp_path,
            som_model,
            BOX_TRESHOLD=0.03,
            output_coord_in_ratio=False,
            ocr_bbox=ocr_bbox,
            draw_bbox_config={
                'text_scale': 0.5,
                'text_thickness': 1,
                'text_padding': 1,
                'thickness': 1
            },
            caption_model_processor=caption_model_processor,
            ocr_text=text,
            iou_threshold=0.1
        )
    finally:
        with contextlib.suppress(OSError):
            os.remove(temp_path)

    return {
        "img": labeled_img,
        "bboxes": [_to_center_point(item) for item in parsed_content_list]
    }


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never paste the ngrok authtoken into the notebook; it ends up in
# version control and in every shared copy. A token that was ever committed
# here must be treated as leaked and revoked in the ngrok dashboard.
# The token is read from the NGROK_AUTHTOKEN environment variable, or asked for
# interactively without echoing it.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

In [None]:
from pyngrok import ngrok
import time

# Single port shared by the ngrok tunnel and the local uvicorn server.
PORT = 8000

# Stop any ngrok process left over from an earlier run, then pause briefly
# before opening a new tunnel.
ngrok.kill()
time.sleep(2)

# Open the public tunnel and show the address clients should call.
public_url = ngrok.connect(PORT)
print(" URL:", public_url.public_url)

# Serve the FastAPI app; this call blocks the cell until the server stops.
uvicorn.run(app, host="0.0.0.0", port=PORT)