In [None]:
"""
ChainAbuse BTC Scam‑Report Scraper – Jupyter Edition
===================================================

Fetches paginated scam reports for the **Bitcoin** chain from
<https://www.chainabuse.com> and stores each card (category, description,
author, BTC address, associated domain) into *scam_reports.csv*.

Key features
------------
* **Playwright + headless Chromium** – bypasses Cloudflare + dynamic JS.
* **Cookie injection** – supply your own `cf_clearance` (and related cookies)
  via the `CHAINABUSE_COOKIES` environment variable (preferred – keeps secrets
  out of the notebook) or by editing the `COOKIE_STR` constant.
* **Resumable** – progress is persisted in `last_page.txt`; restart the
  notebook to pick up where it left off.
* **Tunable politeness** – configurable delays, extra wait every N pages.
* **Structured logging** – progress and errors via the `logging` module.
* **Jupyter‑friendly** – `# %%` cell delimiters; run top‑to‑bottom.

> ⚠️ **Legal notice:** Scraping may violate ChainAbuse’s ToS.  Use at your own
> risk, honour robots.txt, and prefer the official API if one is available.
"""

In [None]:
from __future__ import annotations

import asyncio
import csv
import json
import logging
import os
import re
import sys
import time
from pathlib import Path
from typing import Dict, List, Tuple

import nest_asyncio
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from tqdm.auto import tqdm

nest_asyncio.apply()  # allow nested event loops inside notebooks

# -----------------------------------------------------------------------------
# Configuration – edit as needed
# -----------------------------------------------------------------------------
# URL template for one page of BTC reports; ``{page}`` is the 1-based page number.
BASE_URL = "https://www.chainabuse.com/chain/BTC?page={page}"
START_PAGE = 1          # inclusive
END_PAGE   = 100        # inclusive; set to 17672 for full scrape

# Output files (relative to the notebook's working directory).
CSV_PATH = Path("scam_reports.csv")     # scraped rows are appended here
STATE_PATH = Path("last_page.txt")      # number of the last page written, used to resume

# Wait settings ---------------------------------------------------------------
WAIT_BETWEEN_PAGES = 5       # seconds
EXTRA_WAIT_EVERY_N = 5       # add EXTRA_WAIT seconds every N pages
EXTRA_WAIT          = 10

# Cookies ---------------------------------------------------------------------
# Semicolon-separated "name=value" pairs (e.g. cf_clearance=...).  Read from the
# CHAINABUSE_COOKIES environment variable so that secrets stay out of the notebook.
COOKIE_STR = os.getenv("CHAINABUSE_COOKIES", "")
COOKIE_DOMAIN = ".chainabuse.com"

# Playwright viewport & UA -----------------------------------------------------
VIEWPORT = {"width": 1280, "height": 800}
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
)

# Logging ---------------------------------------------------------------------
# force=True replaces any handlers already attached to the root logger.  Without
# it, basicConfig() is a silent no-op whenever the root logger is already
# configured (re-running this cell, or a host that sets up logging itself), so
# INFO-level progress messages could be dropped.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)
log = logging.getLogger("chainabuse-scraper")

In [None]:
# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------

def parse_cookie_string(cookie_str: str, domain: str) -> List[Dict[str, str]]:
    """Convert a semicolon‑separated cookie string to Playwright's format.

    Args:
        cookie_str: Cookies as ``name=value`` pairs separated by ``;``, e.g.
            ``"cf_clearance=abc; session=xyz"``.
        domain: Domain assigned to every resulting cookie.

    Returns:
        One dict per valid pair with the keys ``name``, ``value``, ``domain``
        and ``path`` (always ``"/"``).  Fragments without ``=`` or with an
        empty name are skipped; an empty input yields an empty list.
    """
    cookies: List[Dict[str, str]] = []
    for part in cookie_str.split(";"):
        # Split on the first "=" only: cookie values may themselves contain "=".
        name, sep, value = part.partition("=")
        # Strip both sides so "name = value" does not leak spaces into the cookie.
        name, value = name.strip(), value.strip()
        if not sep or not name:
            # Not a name=value pair, or a nameless "=value" fragment.
            continue
        cookies.append({"name": name, "value": value, "domain": domain, "path": "/"})
    return cookies


def extract_cards(html: str) -> List[Tuple[str, str, str, str, str]]:
    """Parse scam‑report cards out of a page's HTML.

    Every ``div`` carrying the ``create-ScamReportCard`` class yields one tuple
    ``(category, description, author, btc_address, domain)``.  Each field is
    the stripped text of the first matching element inside the card, or
    ``"N/A"`` when the card has no such element.
    """
    # (tag name, CSS class) for each output column, in column order.
    field_selectors = (
        ("p", "create-ScamReportCard__category-label"),
        ("p", "create-ScamReportCard__preview-description"),
        ("a", "create-ScamReportCard__author-link"),
        ("div", "create-ResponsiveAddress__text"),
        ("p", "create-ReportedSection__domain"),
    )

    def text_or_placeholder(node) -> str:
        # ``find`` returns None when nothing matched.
        return node.text.strip() if node else "N/A"

    document = BeautifulSoup(html, "html.parser")
    rows = []
    for card in document.find_all("div", class_="create-ScamReportCard"):
        rows.append(
            tuple(
                text_or_placeholder(card.find(tag, class_=css_class))
                for tag, css_class in field_selectors
            )
        )
    return rows

In [None]:
# -----------------------------------------------------------------------------
# Async scraping routine
# -----------------------------------------------------------------------------

async def scrape_pages(
    start_page: int,
    end_page: int,
    max_attempts: int = 3,
    retry_wait: float = 30,
) -> None:
    """Scrape report pages ``start_page``..``end_page`` (inclusive) into ``CSV_PATH``.

    Launches headless Chromium, injects the cookies from ``COOKIE_STR`` (if
    any), then loads each page, extracts its cards with ``extract_cards`` and
    appends one CSV row per card, prefixed with the page number.  After every
    page that was written, its number is stored in ``STATE_PATH``; if that file
    holds a number on entry, scraping starts after it (never before
    ``start_page``).

    Args:
        start_page: First page to fetch, unless the state file points further.
        end_page: Last page to fetch.
        max_attempts: How often a page is tried before it is given up on and
            skipped (values below 1 are treated as 1).
        retry_wait: Seconds to sleep after each failed attempt.

    Pages that still fail after ``max_attempts`` tries are skipped and listed
    in a warning at the end.  The CSV file and the browser are closed even if
    the run is interrupted or an unexpected error escapes.
    """
    cookies = parse_cookie_string(COOKIE_STR, COOKIE_DOMAIN) if COOKIE_STR else []
    attempts_per_page = max(1, max_attempts)
    failed_pages: List[int] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(viewport=VIEWPORT, user_agent=USER_AGENT)
            if cookies:
                await context.add_cookies(cookies)
                log.info("Injected %s cookies", len(cookies))

            page = await context.new_page()

            # Prepare CSV ------------------------------------------------------
            # A file that exists but is empty (e.g. left by an aborted run)
            # still needs its header row.
            needs_header = not CSV_PATH.exists() or CSV_PATH.stat().st_size == 0
            with open(CSV_PATH, "a", newline="", encoding="utf-8") as csv_file:
                writer = csv.writer(csv_file)
                if needs_header:
                    writer.writerow(
                        ["page", "category", "description", "author", "btc_address", "domain"]
                    )

                # Determine resume point ---------------------------------------
                if STATE_PATH.exists():
                    raw_state = STATE_PATH.read_text().strip()
                    try:
                        start_page = max(start_page, int(raw_state) + 1)
                        log.info("Resuming from page %s", start_page)
                    except ValueError:
                        log.warning(
                            "Ignoring state file %s: %r is not a page number",
                            STATE_PATH,
                            raw_state,
                        )

                # Iterate pages ------------------------------------------------
                for pg in tqdm(range(start_page, end_page + 1), desc="Scraping pages"):
                    url = BASE_URL.format(page=pg)
                    succeeded = False
                    for attempt in range(1, attempts_per_page + 1):
                        try:
                            await page.goto(url)
                            await page.wait_for_load_state("networkidle")
                            cards = extract_cards(await page.content())
                            writer.writerows([(pg, *card) for card in cards])
                            csv_file.flush()
                            if cards:
                                log.info("Page %s – %s cards", pg, len(cards))
                            else:
                                # Often a sign of a Cloudflare challenge page or of
                                # having run past the last page of results.
                                log.warning("Page %s – 0 cards found", pg)

                            # Persist state
                            STATE_PATH.write_text(str(pg))
                        except Exception as exc:
                            log.error(
                                "Error on page %s (attempt %s/%s): %s",
                                pg,
                                attempt,
                                attempts_per_page,
                                exc,
                            )
                            await asyncio.sleep(retry_wait)
                            continue
                        succeeded = True
                        break

                    if not succeeded:
                        failed_pages.append(pg)
                        continue

                    # Politeness delay.  The truthiness check lets
                    # EXTRA_WAIT_EVERY_N = 0 disable the extra wait instead of
                    # raising ZeroDivisionError.
                    delay = WAIT_BETWEEN_PAGES
                    if EXTRA_WAIT_EVERY_N and pg % EXTRA_WAIT_EVERY_N == 0:
                        delay += EXTRA_WAIT
                    await asyncio.sleep(delay)
        finally:
            # Cleanup ----------------------------------------------------------
            # Closing the browser also closes its contexts and pages.
            await browser.close()

        if failed_pages:
            log.warning(
                "%s page(s) could not be scraped and were skipped: %s",
                len(failed_pages),
                failed_pages,
            )
        log.info("Scraping finished – data written to %s", CSV_PATH)

In [None]:
# -----------------------------------------------------------------------------
# Entry point – run this cell
# -----------------------------------------------------------------------------

async def main() -> None:
    """Run the scraper over the configured ``START_PAGE``..``END_PAGE`` range."""
    await scrape_pages(start_page=START_PAGE, end_page=END_PAGE)

# For Jupyter users: run `await main()` directly in the cell below
# For script execution: uncomment the following lines:
# if __name__ == "__main__":
#     asyncio.run(main())