In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from firecrawl import Firecrawl


firecrawl_api_key = "fc-6a9dd63b67a64375889bb608bee9664a"

In [3]:
feeds_df = pd.DataFrame(
    [["Ambience Vimeo", "https://vimeo.com/ambiencehealthcare", 25]],
    columns=["name", "url", "competitor_id"]
)

feeds_df

Unnamed: 0,name,url
0,Ambience Vimeo,https://vimeo.com/ambiencehealthcare


In [10]:
# --- Config ---
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
TIMEOUT = 20

# Initialize Firecrawl
firecrawl = Firecrawl(api_key=firecrawl_api_key)

def scrape_with_bs4(url: str, timeout: int = TIMEOUT) -> dict:
    headers = {"User-Agent": USER_AGENT}
    resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
    resp.raise_for_status()

    html = resp.text or ""
    soup = BeautifulSoup(html, "html.parser")

    # Remove junk
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()

    text = soup.get_text(separator="\n")
    # Normalize whitespace a bit
    lines = [ln.strip() for ln in text.splitlines()]
    text_clean = "\n".join([ln for ln in lines if ln])

    return {"html": html, "text": text_clean}

def scrape_with_firecrawl(url: str) -> dict:
    doc = firecrawl.scrape(url, formats=["markdown", "html"])

    # Firecrawl responses can be dict-like; be defensive:
    markdown = None
    html = None

    if isinstance(doc, dict):
        # common patterns:
        markdown = doc.get("markdown") or doc.get("data", {}).get("markdown")
        html = doc.get("html") or doc.get("data", {}).get("html")
    else:
        # fallback: just store string repr
        markdown = str(doc)

    return {"markdown": markdown, "html": html, "raw": doc}


def scrape_url(url: str, min_text_chars: int = 200) -> dict:
    result = {
        "url": url,
        "method": None,
        "html": None,
        "text": None,
        "markdown": None,
        "error": None,
    }

    # 1) Try BS4
    try:
        bs = scrape_with_bs4(url)
        result["html"] = bs.get("html")
        result["text"] = bs.get("text")
        result["method"] = "bs4"

        # If we got very little text, treat as failure and fallback
        if not result["text"] or len(result["text"]) < min_text_chars:
            raise ValueError(f"BS4 extracted too little text ({len(result['text'] or '')} chars)")

        return result

    except Exception as e_bs4:
        # 2) Fallback Firecrawl
        try:
            fc = scrape_with_firecrawl(url)
            result["markdown"] = fc.get("markdown")
            # prefer Firecrawl html if BS4 html was empty
            result["html"] = result["html"] or fc.get("html")
            result["method"] = "firecrawl"
            return result

        except Exception as e_fc:
            result["method"] = "failed"
            result["error"] = f"bs4_error={repr(e_bs4)} | firecrawl_error={repr(e_fc)}"
            return result

# --- Loop through your feeds_df ---
rows = []
for _, row in feeds_df.iterrows():
    url = row.get("url")
    name = row.get("name")

    if not isinstance(url, str) or not url.strip():
        rows.append({"name": name, "url": url, "method": "skipped", "error": "Missing URL"})
        continue

    out = scrape_url(url.strip())
    out["name"] = name
    rows.append(out)

scraped_df = pd.DataFrame(rows)
scraped_df

Unnamed: 0,url,method,html,text,markdown,error,name
0,https://vimeo.com/ambiencehealthcare,firecrawl,"<!DOCTYPE html><html lang=""en""><head><meta cha...",Ambience Healthcare,"markdown=""![Ambience Healthcare](https://i.vim...",,Ambience Vimeo


In [13]:
# Copy the HTML from the first row of scraped_df to your clipboard

html_str = scraped_df.loc[0, "markdown"]

# If you're in a Jupyter notebook, this works reliably:
import pyperclip
pyperclip.copy(html_str)

print(f"✅ Copied {len(html_str):,} characters of HTML to clipboard.")


✅ Copied 27,175 characters of HTML to clipboard.
