In [None]:
"""
ChainAbuse BTC Scam‑Report Scraper – Jupyter Edition
===================================================

Fetches paginated scam reports for the **Bitcoin** chain from
<https://www.chainabuse.com> and stores each card (category, description,
author, BTC address, associated domain) into *scam_reports.csv*.

Key features
------------
* **Playwright + headless Chromium** – bypasses Cloudflare + dynamic JS.
* **Cookie injection** – copy/paste your own `cf_clearance` (and friends)
  into the `COOKIE_STR` constant or export `CHAINABUSE_COOKIES`.
* **Resumable** – progress is persisted in `last_page.txt`; restart the
  notebook to pick up where it left off.
* **Tunable politeness** – configurable delays, extra wait every N pages.
* **Structured logging** – progress and errors via the `logging` module.
* **Jupyter‑friendly** – `# %%` cell delimiters; run top‑to‑bottom.

> ⚠️ **Legal notice:** Scraping may violate ChainAbuse’s ToS. Use at your own
> risk and respect robots.txt / API if available.
"""

In [None]:
# -----------------------------------------------------------------------------
# Imports & constants
# -----------------------------------------------------------------------------

import pandas as pd
import re
from pathlib import Path

In [None]:

# -----------------------------------------------------------------------------
# 1. Load scraped data
# -----------------------------------------------------------------------------

CSV_PATH = Path("scam_reports.csv")

# Read the scraper output when it is present; otherwise stop right away with
# an actionable message. The file is decoded as UTF-8 -- change the `encoding`
# argument below if your CSV was written with a different one.
if CSV_PATH.exists():
    df = pd.read_csv(CSV_PATH, encoding="utf-8")
else:
    raise FileNotFoundError(
        "scam_reports.csv not found. Run the scraper first or place the file in the working directory."
    )

print(f"Loaded {len(df):,} rows from {CSV_PATH}")

# Rich preview of the first rows (`display` is supplied by IPython/Jupyter).
display(df.head())

In [None]:
# -----------------------------------------------------------------------------
# 2. Bitcoin address validator
# -----------------------------------------------------------------------------

# Base58Check addresses: P2PKH starts with "1", P2SH with "3". The Base58
# alphabet omits the look-alike characters 0, O, I and l.
# `\Z` is used instead of `$` because `$` also matches just before a trailing
# newline, which would let "address\n" through.
BITCOIN_LEGACY_REGEX = re.compile(r"^[13][a-km-zA-HJ-NP-Z1-9]{25,34}\Z")

# Native SegWit addresses (Bech32 / Bech32m, BIP-173 / BIP-350) on mainnet,
# i.e. the human-readable part "bc" followed by the separator "1".
# * The data part uses the 32-symbol Bech32 alphabet, which omits "1", "b",
#   "i" and "o".
# * BIP-173 bounds a SegWit address to 14-74 characters overall, i.e. 11-71
#   characters after "bc1". This admits P2WPKH (42 chars total) as well as
#   P2WSH and Taproot (62 chars total); a 39-character cap would reject the
#   latter two.
# * Only the lowercase form is accepted; the checksum is NOT verified.
BITCOIN_BECH32_REGEX = re.compile(r"^(bc1)[ac-hj-np-z02-9]{11,71}\Z")


def is_valid_bitcoin_address(address: str) -> bool:
    """Return **True** if *address* matches a plausible Bitcoin address pattern.

    This is a purely syntactic check (prefix, alphabet, length) against the
    legacy Base58 and the Bech32 patterns above; no checksum is computed, so
    a True result means "looks like an address", not "is spendable".

    Parameters
    ----------
    address
        Candidate value. Anything that is not a ``str`` (``None``, ``NaN``
        from pandas, numbers, ...) is rejected.

    Returns
    -------
    bool
        True when the *entire* string matches one of the patterns. Leading or
        trailing whitespace, including a trailing newline, makes it invalid.
    """
    if not isinstance(address, str):
        return False
    # fullmatch: the whole value must be the address, nothing before or after.
    return any(
        pattern.fullmatch(address) is not None
        for pattern in (BITCOIN_LEGACY_REGEX, BITCOIN_BECH32_REGEX)
    )

# Header of the CSV column that holds the reported address; edit this if your
# export uses a different column name.
ADDRESS_COL = "Indirizzo Bitcoin"

# Row-by-row check: `Series.apply` calls the validator once per cell, and the
# resulting boolean flags are stored in a helper column on `df`.
df["valid"] = df[ADDRESS_COL].apply(is_valid_bitcoin_address)

print(f"Found {df['valid'].sum():,} valid BTC addresses out of {len(df):,} total rows.")

In [None]:
# -----------------------------------------------------------------------------
# 3. Clean & de-duplicate
# -----------------------------------------------------------------------------

# Keep only rows flagged as valid and collapse repeated addresses to their
# first occurrence.
filtered_df = df[df["valid"]].drop_duplicates(subset=[ADDRESS_COL], keep="first")

# The helper flag is not worth persisting; also renumber the rows from 0.
filtered_df = filtered_df.drop(columns="valid").reset_index(drop=True)

print(f"Rows after filtering: {len(filtered_df):,}")
display(filtered_df.head())

In [None]:
# -----------------------------------------------------------------------------
# 4. Persist cleaned dataset
# -----------------------------------------------------------------------------

# Destination of the cleaned dataset, relative to the working directory.
OUTPUT_PATH = Path("scam_reports_filtered.csv")

# index=False: the freshly reset RangeIndex carries no information.
filtered_df.to_csv(path_or_buf=OUTPUT_PATH, index=False)

print(f"✅ Saved cleaned dataset to {OUTPUT_PATH.resolve()}")