In [None]:
pip install playwright

Collecting playwright
  Downloading playwright-1.51.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<13,>=12 (from playwright)
  Downloading pyee-12.1.1-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.51.0-py3-none-manylinux1_x86_64.whl (45.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-12.1.1-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.51.0 pyee-12.1.1


In [None]:
!playwright install

Downloading Chromium 134.0.6998.35 (playwright build v1161)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1161/chromium-linux.zip[22m
[1G164.9 MiB [] 0% 125.5s[0K[1G164.9 MiB [] 0% 23.8s[0K[1G164.9 MiB [] 0% 8.1s[0K[1G164.9 MiB [] 1% 4.0s[0K[1G164.9 MiB [] 2% 2.9s[0K[1G164.9 MiB [] 3% 2.4s[0K[1G164.9 MiB [] 4% 2.1s[0K[1G164.9 MiB [] 5% 1.9s[0K[1G164.9 MiB [] 6% 2.0s[0K[1G164.9 MiB [] 7% 2.0s[0K[1G164.9 MiB [] 8% 1.8s[0K[1G164.9 MiB [] 9% 1.8s[0K[1G164.9 MiB [] 10% 1.7s[0K[1G164.9 MiB [] 11% 1.6s[0K[1G164.9 MiB [] 13% 1.5s[0K[1G164.9 MiB [] 14% 1.5s[0K[1G164.9 MiB [] 15% 1.4s[0K[1G164.9 MiB [] 16% 1.4s[0K[1G164.9 MiB [] 18% 1.3s[0K[1G164.9 MiB [] 20% 1.2s[0K[1G164.9 MiB [] 21% 1.2s[0K[1G164.9 MiB [] 23% 1.1s[0K[1G164.9 MiB [] 24% 1.1s[0K[1G164.9 MiB [] 26% 1.1s[0K[1G164.9 MiB [] 27% 1.0s[0K[1G164.9 MiB [] 27% 1.1s[0K[1G164.9 MiB [] 28% 1.1s[0K[1G164.9 MiB [] 30% 1.0s[0K[1G164.9 MiB [] 32% 1.0s[0

In [None]:
import asyncio
import json
import logging
import os
from urllib.parse import urlparse

import cv2
import imageio
import numpy as np
import requests
import torch
from playwright.async_api import async_playwright
from transformers import AutoModelForVision2Seq, AutoProcessor, Blip2Processor

# Configure Logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Device that prompt tensors are moved to before generation
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load AI Model
model_name = "convergence-ai/proxy-lite-3b"
# Load the processor from the same checkpoint as the model: a processor taken from a
# different checkpoint tokenizes with a different vocabulary, so the ids fed to
# (and decoded from) the model would not correspond to the model's own tokens.
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)

# Define AI Agent
def query_proxy_lite(prompt: str):
    """Generate a text response for ``prompt`` using the module-level model.

    The prompt is encoded with the module-level ``processor``, moved to
    ``device``, passed to ``model.generate`` with gradient tracking disabled,
    and the first decoded sequence (special tokens stripped) is returned.
    """
    encoded = processor(text=prompt, return_tensors="pt")
    encoded = encoded.to(device)
    with torch.no_grad():
        generated_ids = model.generate(**encoded)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Capture Frames for GIF
async def capture_frames(frames, page):
    """Take a screenshot of ``page`` and append it to ``frames`` as an RGB array.

    Args:
        frames: List collecting the decoded frames; mutated in place.
        page: Object exposing an awaitable ``screenshot()`` that returns the
            encoded image bytes.

    ``cv2.imdecode`` yields pixels in BGR order, while the frames are later
    written with imageio, which treats arrays as RGB. The channels are
    therefore swapped before the frame is stored. A screenshot that cannot be
    decoded is skipped with a warning rather than appending ``None``.
    """
    screenshot = await page.screenshot()
    buffer = np.frombuffer(screenshot, dtype=np.uint8)
    image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    if image is None:
        logging.warning("Screenshot could not be decoded; frame skipped.")
        return
    frames.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

# Generate GIF
def generate_gif(frames, output_path="output.gif"):
    """Write ``frames`` to an animated GIF at ``output_path`` at 2 frames per second.

    Args:
        frames: Sequence of image arrays to encode, in playback order.
        output_path: Destination path of the GIF file.

    Returns:
        None. When ``frames`` is empty nothing is written and a warning is
        logged, because there is nothing to encode.
    """
    if len(frames) == 0:
        logging.warning("No frames captured; GIF not written.")
        return
    imageio.mimsave(output_path, frames, fps=2)
    logging.info(f"GIF saved as {output_path}")

# Real-time Element Tracking
async def track_elements(page):
    """Inject a DOM mutation observer into ``page`` that outlines interactive elements.

    The script watches ``document.body`` (including its subtree) for child-list
    mutations. On each one it gives every ``a``, ``button``, ``input`` and
    ``select`` element a red border and logs the mutation to the browser console.
    """
    highlight_script = """
        let observer = new MutationObserver(mutations => {
            mutations.forEach(mutation => {
                if (mutation.type === 'childList') {
                    document.querySelectorAll('a, button, input, select').forEach(el => {
                        el.style.border = '2px solid red';
                    });
                    console.log("Updated elements detected:", mutation);
                }
            });
        });
        observer.observe(document.body, { childList: true, subtree: true });
    """
    await page.evaluate(highlight_script)
    logging.info("Tracking elements in real-time.")

def _first_external_link(links):
    """Return the first http(s) URL in ``links`` not hosted on a Google domain, or None."""
    for link in links:
        try:
            parsed = urlparse(link)
            host_labels = (parsed.hostname or "").lower().split(".")
        except ValueError:
            # Malformed URL (e.g. a broken bracketed host); not a usable target.
            continue
        if parsed.scheme in ("http", "https") and "google" not in host_labels:
            return link
    return None


# Browser Automation with Playwright
async def browse_web(task: str):
    """
    Launches a browser session, searches Google for the task, opens the first
    non-Google result, records a GIF, and tracks elements in real-time.

    Args:
        task: Free-text query typed into the Google search box.

    Returns:
        The text extracted from the visited page, or "" when the results page
        offered no external link.

    The browser is closed and any frames captured so far are written to the
    GIF even when a step fails; the original exception still propagates.
    """
    frames = []
    extracted_data = ""
    try:
        async with async_playwright() as p:
            headless_mode = bool(os.environ.get("COLAB_GPU"))
            browser = await p.chromium.launch(headless=headless_mode)
            try:
                page = await browser.new_page()
                await page.goto("https://google.com")
                await track_elements(page)
                await capture_frames(frames, page)

                # Google renders its search box as a <textarea>; accept either tag so
                # the lookup does not time out waiting for an <input>.
                search_box = page.locator("textarea[name='q'], input[name='q']").first
                await search_box.fill(task)
                # Start waiting for the navigation before submitting, otherwise the
                # wait can return for the already-loaded home page.
                async with page.expect_navigation(wait_until="domcontentloaded"):
                    await search_box.press("Enter")
                await capture_frames(frames, page)

                links = await page.evaluate("""
                    () => Array.from(document.querySelectorAll('a')).map(a => a.href)
                """)

                # The first anchors on a results page are Google's own navigation
                # links, so pick the first link that leaves Google.
                target_link = _first_external_link(links)
                if target_link:
                    await page.goto(target_link)
                    await asyncio.sleep(3)
                    await capture_frames(frames, page)
                    await page.screenshot(path="screenshot.png", full_page=True)
                    logging.info(f"Navigated to {target_link} and captured a screenshot.")
                    extracted_data = await extract_page_data(page)
                else:
                    logging.warning("No relevant links found.")
            finally:
                await browser.close()
    finally:
        # Save whatever was recorded, which also helps diagnose a failed run.
        if frames:
            generate_gif(frames)
    return extracted_data

# Extract Text Data from Page
async def extract_page_data(page):
    """Return the value of ``document.body.innerText`` evaluated on ``page``.

    A preview of the first 500 characters is written to the log; the full
    text is returned to the caller.
    """
    inner_text_script = """
        () => document.body.innerText
    """
    page_text = await page.evaluate(inner_text_script)
    preview = page_text[:500]
    logging.info(f"Extracted Page Data:\n{preview}...")  # Log first 500 characters
    return page_text

# Format Results in JSON
def format_results(task, ai_response, extracted_data):
    """Bundle the run's outputs into a dict, save it as ``results.json``, and return it.

    Args:
        task: The task text the user entered.
        ai_response: Text produced by the model for the task.
        extracted_data: Text extracted from the visited page; only the first
            500 characters are kept in the results.

    Returns:
        Dict with the keys ``task``, ``ai_response``, ``extracted_data``,
        ``screenshot`` and ``gif``.
    """
    preview = extracted_data[:500]  # Limiting displayed text for clarity
    results = dict(
        task=task,
        ai_response=ai_response,
        extracted_data=preview,
        screenshot="screenshot.png",
        gif="output.gif",
    )
    serialized = json.dumps(results, indent=4)
    with open("results.json", "w") as out_file:
        out_file.write(serialized)
    logging.info("Results saved in results.json")
    return results

# Execute the Script
async def run_script():
    """Prompt for a task, query the model, browse the web for it, and save the results.

    The task text is read from standard input, answered by
    ``query_proxy_lite``, used as the search query for ``browse_web``, and the
    combined output is stored via ``format_results`` and printed as JSON.
    """
    task_text = input("Enter your task: ")

    logging.info("Fetching AI response...")
    model_answer = query_proxy_lite(task_text)
    logging.info(f"AI Response: {model_answer}")

    logging.info("Launching browser automation...")
    page_text = await browse_web(task_text)

    summary = format_results(task_text, model_answer, page_text)
    logging.info("Process Completed. Screenshot, GIF, and structured results saved.")
    print(json.dumps(summary, indent=4))

# Run the entire flow.
# Top-level `await` relies on the notebook kernel's already-running event loop; in a
# plain Python script this call would need `asyncio.run(run_script())` instead.
await run_script()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Enter your task: Please find the best mortgage rate available to me using bankrate.com. My zip code is 90210 and the purchase price is $400,000 with a down payment of $85,000. My credit score is 800.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


TimeoutError: Page.fill: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("input[name='q']")
