In [None]:
pip install accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import gradio as gr
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import requests
import logging
import os  # Added for environment variable
from accelerate import init_empty_weights

# Set a writable Hugging Face cache directory.
# NOTE: transformers / huggingface_hub are imported above and read these
# variables at import time, so setting them here alone does not redirect the
# cache for this process. The directory is therefore also passed explicitly
# via `cache_dir` to the from_pretrained() calls below.
HF_CACHE_DIR = "./hf_cache"
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR
os.environ["HF_HOME"] = HF_CACHE_DIR

# Disable Hugging Face telemetry
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# Logger setup
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# 1️⃣ Load UI-TARS-7B-DPO Model (CPU-Compatible for Colab)
model_name = "bytedance-research/UI-TARS-7B-DPO"
device = "cpu"  # Force CPU usage in Colab

# Load the weights directly onto `device`. Using device_map="auto" here would
# let Accelerate place the model on a GPU when one is present, which would
# contradict the forced-CPU setting above and the "on CPU" log message below.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    cache_dir=HF_CACHE_DIR,
    torch_dtype=torch.float32,  # Use FP32 for CPU compatibility
    low_cpu_mem_usage=True,  # Optimize memory usage while loading
    device_map=device,  # Put the whole model on the chosen device
)

# Load processor
processor = AutoProcessor.from_pretrained(model_name, cache_dir=HF_CACHE_DIR)
logger.info("UI-TARS-7B-DPO Model Loaded Successfully on CPU!")

def generate_response(prompt):
    """Generate a text completion for *prompt* with the loaded model.

    Args:
        prompt: The text prompt to feed to the model.

    Returns:
        The decoded text the model generated after the prompt (the prompt
        itself is not echoed back), with special tokens removed.
    """
    # Pass the prompt by keyword: the processor's first positional parameter
    # is `images`, so a positional string would be treated as image input.
    inputs = processor(text=[prompt], return_tensors="pt").to(model.device)

    # max_new_tokens bounds only the generated part; max_length would also
    # count the (potentially very long) prompt tokens.
    output = model.generate(**inputs, max_new_tokens=512)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True)

# 2️⃣ Web Automation Setup
def browse_website(url):
    """Open *url* in headless Chromium and save a screenshot of the page.

    Args:
        url: Address of the page to load.

    Returns:
        Path of the written screenshot file ("screenshot.png", relative to
        the current working directory; overwritten on every call).
    """
    screenshot_path = "screenshot.png"
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            page.screenshot(path=screenshot_path)
        finally:
            # Close the browser even when navigation or capture fails.
            browser.close()
    return screenshot_path

# 3️⃣ Extract Webpage Text
def extract_text_from_url(url):
    """Fetch *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.

    Returns:
        The text of every ``<p>`` element, joined by single spaces (an empty
        string when the page has none).

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.text for p in soup.find_all('p'))

def answer_question(url, question):
    """Answer *question* using the paragraph text scraped from *url*.

    The page text and the question are combined into a single prompt that is
    passed to ``generate_response``; its result is returned unchanged.
    """
    page_text = extract_text_from_url(url)
    return generate_response(
        f"Based on the following webpage content:\n\n{page_text}\n\nAnswer this question: {question}"
    )

# 4️⃣ UI with Gradio
def gradio_interface(url, question):
    """Gradio callback: screenshot the page, then answer the question.

    Returns:
        A ``(screenshot_path, answer)`` tuple matching the two output widgets.
    """
    # Tuple elements are evaluated left to right: screenshot first, then answer.
    return browse_website(url), answer_question(url, question)

# Input widgets: the page to inspect and the question to ask about it.
url_box = gr.Textbox(label="Enter Website URL")
question_box = gr.Textbox(label="Ask a Question")

# Output widgets: the captured screenshot and the model's answer.
screenshot_view = gr.Image(label="Captured UI")
answer_box = gr.Textbox(label="Model Answer")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[url_box, question_box],
    outputs=[screenshot_view, answer_box],
    title="UI TARS Web Agent",
    description="Enter a website URL and a question to analyze the webpage and get answers.",
)

# Start the web UI.
iface.launch()


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

INFO:httpx:HTTP Request: GET https://api.gradio.app/gradio-messaging/en "HTTP/1.1 200 OK"


model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]