In [None]:
# Pinned to the version this notebook was last run with; %pip targets the kernel's environment.
%pip install -q --upgrade transformers==4.49.0

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed transformers-4.49.0


In [None]:
# Pinned to the versions this notebook was last run with; %pip targets the kernel's environment.
%pip install -q gradio==5.21.0 playwright==1.50.0 pytesseract==0.3.13

Collecting gradio
  Downloading gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting playwright
  Downloading playwright-1.50.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydu

In [None]:
# Only Chromium is launched by the scraper, so skip the other browser downloads.
!playwright install chromium

Downloading Chromium 133.0.6943.16 (playwright build v1155)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1155/chromium-linux.zip[22m
[1G163.5 MiB [] 0% 10.5s[0K[1G163.5 MiB [] 0% 17.8s[0K[1G163.5 MiB [] 0% 33.6s[0K[1G163.5 MiB [] 0% 20.8s[0K[1G163.5 MiB [] 0% 19.0s[0K[1G163.5 MiB [] 0% 13.5s[0K[1G163.5 MiB [] 1% 11.5s[0K[1G163.5 MiB [] 1% 10.5s[0K[1G163.5 MiB [] 1% 9.7s[0K[1G163.5 MiB [] 1% 9.2s[0K[1G163.5 MiB [] 2% 8.7s[0K[1G163.5 MiB [] 2% 8.2s[0K[1G163.5 MiB [] 2% 7.6s[0K[1G163.5 MiB [] 3% 6.9s[0K[1G163.5 MiB [] 4% 6.1s[0K[1G163.5 MiB [] 4% 5.5s[0K[1G163.5 MiB [] 5% 5.4s[0K[1G163.5 MiB [] 5% 5.1s[0K[1G163.5 MiB [] 6% 4.9s[0K[1G163.5 MiB [] 6% 4.8s[0K[1G163.5 MiB [] 6% 5.0s[0K[1G163.5 MiB [] 7% 4.9s[0K[1G163.5 MiB [] 7% 5.0s[0K[1G163.5 MiB [] 7% 4.7s[0K[1G163.5 MiB [] 8% 4.5s[0K[1G163.5 MiB [] 9% 4.2s[0K[1G163.5 MiB [] 10% 4.1s[0K[1G163.5 MiB [] 11% 4.0s[0K[1G163.5 MiB [] 12% 3.6s[0K[1G163.

In [None]:
import asyncio
import json
import re
import time
from collections import Counter
from urllib.parse import urlparse

import gradio as gr
import requests
import torch
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from transformers import AutoModelForVision2Seq, AutoProcessor, Blip2Processor

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define model name
model_name = "convergence-ai/proxy-lite-3b"

# Half precision is only used on GPU; fall back to float32 on CPU, where
# float16 kernels are slow or missing for some ops.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the processor & model.
# The processor is loaded from the SAME checkpoint as the model so that the
# tokenizer vocabulary matches the model's embeddings. Pairing this model with
# the BLIP-2 (OPT) tokenizer feeds it token ids from a different vocabulary.
# NOTE(review): trust_remote_code=True executes Python shipped in the Hub repo;
# keep it only if the checkpoint actually requires custom code.
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=model_dtype, device_map="auto", trust_remote_code=True)
model.eval()

def log_to_file(message, log_path="debug_log.txt"):
    """Append ``message`` followed by a newline to the debug log.

    Args:
        message: Text to record. Non-string values are formatted with ``str()``.
        log_path: File to append to. Defaults to ``debug_log.txt`` in the
            current working directory.
    """
    # Explicit UTF-8: scraped page text routinely contains non-ASCII characters,
    # and the platform default encoding is not guaranteed to handle them.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"{message}\n")

class GeneralizedScraper:
    """Load a web page with Playwright and extract the text most related to a query.

    Typical use is ``await GeneralizedScraper(url, query).run()``, which loads
    the page, returns the extracted text, and always releases the browser.

    Attributes:
        url: Address passed to ``page.goto``.
        query: Lower-cased user query.
        browser: Playwright browser handle, or ``None`` when not loaded.
        page: Playwright page handle, or ``None`` when not loaded.
        web_content: Text produced by the last successful extraction.
    """

    # Tags removed from the document before any text is scored.
    NOISE_TAGS = ("script", "style", "header", "footer", "aside", "nav")
    # Tags whose text is considered as a candidate section.
    CANDIDATE_TAGS = ("h1", "h2", "h3", "p", "li", "div")
    MIN_FRAGMENT_CHARS = 30      # fragments this short or shorter are ignored
    MAX_SECTIONS = 10            # number of top-ranked sections kept
    MAX_CONTENT_CHARS = 2000     # hard cap on the returned text
    BODY_TIMEOUT_MS = 5000       # how long to wait for <body> to appear

    _WORD_RE = re.compile(r"\w+")

    def __init__(self, url: str, query: str):
        self.url = url
        self.query = query.lower()  # Convert query to lowercase for matching
        # Whole-word query terms, so scoring compares words to words rather
        # than testing each word as a substring of the query string.
        self._query_terms = frozenset(self._WORD_RE.findall(self.query))
        self._playwright = None
        self.browser = None
        self.page = None
        self.web_content = None

    def _relevance_score(self, text: str) -> int:
        """Count the words in ``text`` that are also whole words of the query."""
        return sum(
            1 for word in self._WORD_RE.findall(text.lower())
            if word in self._query_terms
        )

    async def load_webpage(self):
        """Starts Playwright, launches headless Chromium and navigates to ``self.url``."""
        # Keep the driver handle so close() can stop it later.
        self._playwright = await async_playwright().start()
        self.browser = await self._playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()
        await self.page.goto(self.url)

    async def close(self):
        """Closes the browser and stops the Playwright driver; safe to call repeatedly."""
        try:
            if self.browser is not None:
                await self.browser.close()
        finally:
            self.browser = None
            self.page = None
            if self._playwright is not None:
                playwright, self._playwright = self._playwright, None
                await playwright.stop()

    async def extract_relevant_text(self):
        """Extracts the page sections that share the most words with the query.

        ``script``, ``style``, ``header``, ``footer``, ``aside`` and ``nav``
        elements are removed first. Each remaining heading, paragraph, list
        item and ``div`` longer than ``MIN_FRAGMENT_CHARS`` characters is scored
        by how many of its words appear in the query.

        Returns:
            The top ``MAX_SECTIONS`` sections joined by newlines and truncated
            to ``MAX_CONTENT_CHARS`` characters; ``"No relevant data found."``
            when nothing qualifies; or an ``"Error extracting relevant text:
            ..."`` string if any step raises.
        """
        try:
            await self.page.wait_for_selector("body", timeout=self.BODY_TIMEOUT_MS)
            page_content = await self.page.content()
            soup = BeautifulSoup(page_content, "html.parser")

            # Remove unnecessary tags
            for tag in soup(list(self.NOISE_TAGS)):
                tag.decompose()

            # Map each distinct text fragment to its score. A ``div`` is scored
            # on all of its nested text, so a wrapper can outrank the
            # paragraphs it contains.
            section_scores = {}
            for tag in soup.find_all(list(self.CANDIDATE_TAGS)):
                text = tag.get_text(separator=" ", strip=True)
                if len(text) > self.MIN_FRAGMENT_CHARS:
                    section_scores[text] = self._relevance_score(text)

            # Highest score first; the sort is stable, so ties keep document order.
            ranked = sorted(section_scores.items(), key=lambda item: item[1], reverse=True)
            top_sections = [text for text, _score in ranked[: self.MAX_SECTIONS]]

            self.web_content = "\n".join(top_sections)[: self.MAX_CONTENT_CHARS]
            return self.web_content if self.web_content else "No relevant data found."

        except Exception as e:
            # Best effort: report the failure as text instead of raising.
            return f"Error extracting relevant text: {str(e)}"

    async def run(self):
        """Loads the page, extracts the relevant text, and always releases the browser.

        Exceptions raised while loading the page propagate to the caller after
        cleanup has run.
        """
        try:
            await self.load_webpage()
            return await self.extract_relevant_text()
        finally:
            await self.close()

def generate_response(extracted_info: str, user_query: str) -> str:
    """Ask the loaded model to answer ``user_query`` from ``extracted_info``.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        extracted_info: Text scraped from the website; embedded in the prompt.
        user_query: The user's question; embedded in the prompt.

    Returns:
        The decoded model output for the single prompt, with special tokens
        removed and surrounding whitespace stripped. For decoder-only models
        the echoed prompt is not included.
    """
    formatted_prompt = f"""
    You are an AI assistant. Below is the extracted text from a website:

    {extracted_info}

    Based on the user's query: "{user_query}", provide a structured answer that includes:
    - The most **relevant details** found on the website.
    - A **summary of key insights**.
    - If applicable, extract **specific numbers, percentages, or important figures**.

    If no relevant data was found, return: "No useful information available from this site."
    """

    inputs = processor(text=[formatted_prompt], return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=250,
            num_beams=5,
            repetition_penalty=2.0,
            length_penalty=1.0,
            early_stopping=True
        )

    # Decoder-only models return the prompt tokens followed by the new tokens.
    # Drop the prompt so the user sees only the answer, not the instructions.
    if not getattr(model.config, "is_encoder_decoder", False):
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = generated_ids[:, prompt_length:]

    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

def gradio_ui(url: str, query: str):
    """Gradio handler: scrape ``url`` and answer ``query`` from the scraped text.

    Args:
        url: Website address typed by the user. A missing scheme defaults to
            ``https://``; only ``http`` and ``https`` URLs are accepted.
        query: The user's question.

    Returns:
        The generated answer, or a short message when the URL is missing or
        invalid, or when the page could not be loaded.
    """
    start_time = time.time()

    url = (url or "").strip()
    query = (query or "").strip()
    if not url:
        return "Please enter a website URL."
    if "://" not in url:
        url = f"https://{url}"
    # The URL is untrusted input that a real browser will navigate to, so
    # refuse schemes such as file:// that would expose local resources.
    parsed_url = urlparse(url)
    if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
        return "Please enter a valid http(s) URL."

    scraper = GeneralizedScraper(url, query)
    try:
        extracted_info = asyncio.run(scraper.run())
    except Exception as exc:
        # Navigation failures (DNS errors, timeouts, ...) are reported to the
        # user instead of surfacing as an unexplained interface error.
        log_to_file(f"Scrape failed for {url}: {exc}")
        return f"Could not load the website: {exc}"
    log_to_file(f"Extracted Content: {extracted_info[:500]}")
    response = generate_response(extracted_info, query)
    total_time = time.time() - start_time
    log_to_file(f"Total Time Taken: {total_time:.2f} sec")
    return response

# Build the web UI: two text inputs (URL, query) feeding gradio_ui, one text output.
iface = gr.Interface(
    gradio_ui,
    [gr.Textbox(label="Enter Website URL"), gr.Textbox(label="Enter Your Query")],
    gr.Textbox(label="Generated Response"),
    title="Generalized Web Scraper",
    description="Enter a website URL and a query, and the AI will scrape the content and generate an answer.",
)

# share=True requests a public link; debug=True keeps the cell attached to the server.
iface.launch(debug=True, share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.51G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://531f87efddc9a6ba4c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://531f87efddc9a6ba4c.gradio.live


