<h3> imports <h3>

In [77]:
import os, yaml

import time, threading, re, random
from collections import defaultdict
import urllib.parse as urlparse
import requests
from urllib.robotparser import RobotFileParser
import tldextract

from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re



<h1> 1.	Crawler / Scraper <h1>

Defining basic settings

In [78]:
os.makedirs("configs", exist_ok=True)
os.makedirs("storage/raw", exist_ok=True)
os.makedirs("storage/clean", exist_ok=True)
os.makedirs("logs", exist_ok=True)

CONFIG = {
    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
    "default_timeout_sec": 15,
    "per_host_min_delay_sec": 2,
    "max_retries": 3,
    "sources": {
        "reddit": {
            "type": "reddit_html",
            "subreddits": ["worldnews", "news"],
            "limit_per_sub": 20
        },
        "news_sites": [
            {
                "name": "reuters_world",
                "start_urls": ["https://www.reuters.com/world/"],
                "article_selector_hint": "a"
            }
        ]
    }
}

with open("configs/crawl_config.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(CONFIG, f, allow_unicode=True)
print("configs/crawl_config.yaml saved")


configs/crawl_config.yaml saved


Review the robots.txt and control the rate

In [79]:
_session = requests.Session()
# _session.headers.update({"Accept-Language": "en;q=0.9"})
_session.headers.update({
    "Accept-Language": "en;q=0.9",
    "User-Agent": CONFIG["user_agent"],   # اضافه شود
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "keep-alive"
})

_robot_cache = {}
_host_next_time = defaultdict(float)
_lock = threading.Lock()

def get_robots_parser(base_url, ua):
    parsed = urlparse.urlparse(base_url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    if robots_url in _robot_cache:
        return _robot_cache[robots_url]
    rp = RobotFileParser()
    try:
        resp = _session.get(robots_url, timeout=8)
        if resp.status_code == 200:
            rp.parse(resp.text.splitlines())
            print("resp.status_code == 200")
        else:
            rp.parse([])
            print("resp.status_code != 200")
    except Exception:
        rp.parse([])
    _robot_cache[robots_url] = rp
    return rp

def host_key(url):
    p = urlparse.urlparse(url)
    ext = tldextract.extract(p.netloc)
    return ".".join([x for x in [ext.domain, ext.suffix] if x])

def rate_limit(url, min_delay_sec):
    hk = host_key(url)
    with _lock:
        now = time.time()
        nt = _host_next_time[hk]
        wait = nt - now
        if wait > 0:
            time.sleep(wait)
        _host_next_time[hk] = time.time() + min_delay_sec


Fetch function (with Retry & Exponential Backoff)

In [80]:
def fetch(url, user_agent, timeout=15, max_retries=3, min_delay_sec=2):
    rp = get_robots_parser(url, user_agent)
    if not rp.can_fetch(user_agent, url):
        raise PermissionError(f"Blocked by robots.txt for {url}")
    rate_limit(url, min_delay_sec)
    headers = {"User-Agent": user_agent}
    attempt = 0
    backoff = 1.6
    while attempt <= max_retries:
        try:
            resp = _session.get(url, headers=headers, timeout=timeout)
            if 200 <= resp.status_code < 300:
                return resp
            if resp.status_code in (429, 503):
                time.sleep((backoff ** attempt) + random.uniform(0, 0.5))
            else:
                time.sleep(0.6)
        except requests.RequestException:
            time.sleep((backoff ** attempt) + 0.4)
        attempt += 1
    raise TimeoutError(f"Fetch failed after retries for {url}")


HTML Crawler for Reddit

In [81]:
# def parse_reddit_listing(html, base_url):
#     soup = BeautifulSoup(html, "lxml")
#     items = []
#     # Generic selector for post container
#     for post in soup.select("[data-testid='post-container']"):
#         title_el = post.select_one("h3")
#         if not title_el:
#             continue
#         title = title_el.get_text(strip=True)
#         link_el = post.find("a", href=True)
#         url = urlparse.urljoin(base_url, link_el["href"]) if link_el else None

#         items.append({
#             "source_type": "reddit",
#             "source_name": base_url,
#             "subreddit": base_url.rstrip("/").split("/")[-1],
#             "url": url,
#             "canonical_url": url,
#             "title": title,
#             "text": None,
#             "author": None,
#             "published_at": None,
#             "score": None,
#             "comments": None,
#             "fetched_at": datetime.utcnow().isoformat(timespec="seconds")
#         })
#     return items

# def crawl_reddit_subreddit(sub, cfg):
#     ua = cfg["user_agent"]
#     limit = cfg["sources"]["reddit"]["limit_per_sub"]
#     url = f"https://www.reddit.com/r/{sub}/"
#     resp = fetch(
#         url, ua, timeout=cfg["default_timeout_sec"], 
#         max_retries=cfg["max_retries"], 
#         min_delay_sec=cfg["per_host_min_delay_sec"]
#     )
#     items = parse_reddit_listing(resp.text, url)
#     return items[:limit] if limit else items


In [82]:
import re
from bs4 import BeautifulSoup
from datetime import datetime
import urllib.parse as urlparse

UPVOTE_RE = re.compile(r"(\d+(?:\.\d+)?)([kK])?\s*upvote")  # برای استخراج عدد از aria-label

def to_int_k(v):
    if v is None:
        return None
    s = str(v).strip().lower()
    if s.endswith("k"):
        try:
            return int(float(s[:-1]) * 1000)
        except:
            return None
    try:
        return int(s)
    except:
        return None

def parse_reddit_listing(html, base_url):
    soup = BeautifulSoup(html, "lxml")
    items = []

    for post in soup.select("[data-testid='post-container'], div[data-test-id='post-content']"):
        # عنوان
        title_el = post.select_one("h3") or post.select_one("a[data-click-id='body'] h3")
        if not title_el:
            continue
        title = title_el.get_text(strip=True)

        # لینک پایدار پست
        link_el = post.select_one("a[data-click-id='body']") or post.find("a", href=True)
        url = urlparse.urljoin(base_url, link_el["href"]) if link_el and link_el.get("href") else None

        # نویسنده
        author_el = post.select_one("a[data-click-id='user']") or post.select_one("a[href^='/user/']")
        author = author_el.get_text(strip=True) if author_el else None

        # زمان انتشار
        time_el = post.select_one("a[data-click-id='timestamp'] time") or post.find("time")
        published_at = time_el.get("datetime") if time_el and time_el.has_attr("datetime") else None

        # شمار نظرها
        comments_el = post.select_one("a[data-click-id='comments']") or post.find("a", string=re.compile("comment", re.I))
        comments = None
        if comments_el:
            m = re.search(r"(\d+(?:\.\d+)?[kK]?)", comments_el.get_text(" ", strip=True))
            if m:
                comments = to_int_k(m.group(1))

        # امتیاز تقریبی
        score = None
        aria_up = post.find(attrs={"aria-label": re.compile("upvote", re.I)})
        if aria_up:
            m = UPVOTE_RE.search(aria_up.get("aria-label", ""))
            if m:
                val = float(m.group(1))
                if m.group(2):
                    val *= 1000
                score = int(val)

        items.append({
            "source_type": "reddit",
            "source_name": base_url,
            "subreddit": base_url.rstrip("/").split("/")[-1],
            "url": url,
            "canonical_url": url,  # بعدا در مرحله کاننیکال اصلاح می‌کنیم
            "title": title,
            "text": None,
            "author": author,
            "published_at": published_at,
            "score": score,
            "comments": comments,
            "fetched_at": datetime.utcnow().isoformat(timespec="seconds")
        })
    return items


In [83]:
# from hashlib import sha1

# STRIP_PARAMS = {"utm_source","utm_medium","utm_campaign","utm_term","utm_content","ref","utm_name"}

# def normalize_url(u):
#     if not u:
#         return None
#     p = urlparse.urlparse(u)
#     # حذف پارامترهای رهگیری
#     q = urlparse.parse_qsl(p.query, keep_blank_values=True)
#     q = [(k,v) for k,v in q if k not in STRIP_PARAMS]
#     new_q = urlparse.urlencode(q)
#     norm = urlparse.urlunparse((p.scheme, p.netloc, p.path, "", new_q, ""))
#     return norm

# def get_canonical_from_html(html, url):
#     soup = BeautifulSoup(html, "lxml")
#     link = soup.find("link", rel=lambda x: x and "canonical" in x.lower())
#     if link and link.get("href"):
#         return urlparse.urljoin(url, link["href"])
#     # پشتیبان، اگر داده ساخت یافته وجود داشت
#     meta = soup.find("meta", property="og:url")
#     if meta and meta.get("content"):
#         return urlparse.urljoin(url, meta["content"])
#     return None

# def dedupe_records(records):
#     seen = set()
#     unique = []
#     for r in records:
#         key = normalize_url(r.get("canonical_url") or r.get("url")) or r.get("url")
#         if not key:
#             continue
#         h = sha1(key.encode("utf-8")).hexdigest()
#         if h in seen:
#             continue
#         seen.add(h)
#         r["canonical_url"] = key
#         unique.append(r)
#     return unique


In [84]:
# ==== helpers for canonicalization and meta extraction ====
import re
from hashlib import sha1
from bs4 import BeautifulSoup
import urllib.parse as urlparse

STRIP_PARAMS = {"utm_source","utm_medium","utm_campaign","utm_term","utm_content","ref","utm_name","gclid","fbclid"}

def normalize_url(u: str) -> str | None:
    if not u:
        return None
    p = urlparse.urlparse(u)
    if p.scheme not in ("http","https"):
        return None
    q = [(k,v) for k,v in urlparse.parse_qsl(p.query, keep_blank_values=True) if k not in STRIP_PARAMS]
    new_q = urlparse.urlencode(q)
    return urlparse.urlunparse((p.scheme, p.netloc, p.path, "", new_q, ""))

def get_canonical_from_html(html: str, url: str) -> str | None:
    soup = BeautifulSoup(html, "lxml")
    link = soup.find("link", rel=lambda x: x and "canonical" in x.lower())
    if link and link.get("href"):
        return urlparse.urljoin(url, link["href"])
    meta = soup.find("meta", property="og:url")
    if meta and meta.get("content"):
        return urlparse.urljoin(url, meta["content"])
    return None

def extract_article_meta(html: str, url: str):
    """برگشت می دهد title و author و published_at و canonical"""
    soup = BeautifulSoup(html, "lxml")

    # title
    title = None
    for sel in [
        "meta[property='og:title']",
        "meta[name='twitter:title']",
        "title"
    ]:
        el = soup.select_one(sel)
        if el:
            title = el.get("content") if el.has_attr("content") else el.get_text(strip=True)
        if title:
            break

    # author
    author = None
    for sel in [
        "meta[name='author']",
        "meta[property='article:author']",
        "a[rel='author']"
    ]:
        el = soup.select_one(sel)
        if el:
            author = el.get("content") if el.has_attr("content") else el.get_text(strip=True)
        if author:
            break

    # published time
    published_at = None
    for sel in [
        "meta[property='article:published_time']",
        "meta[name='pubdate']",
        "time[datetime]"
    ]:
        el = soup.select_one(sel)
        if el:
            published_at = el.get("content") if el.has_attr("content") else el.get("datetime")
        if published_at:
            break

    # canonical
    can = get_canonical_from_html(html, url) or url
    can = normalize_url(can)

    return title, author, published_at, can

def dedupe_records(records):
    seen = set()
    unique = []
    for r in records:
        key = normalize_url(r.get("canonical_url") or r.get("url")) or r.get("url")
        if not key:
            continue
        h = sha1(key.encode("utf-8")).hexdigest()
        if h in seen:
            continue
        seen.add(h)
        r["canonical_url"] = key
        unique.append(r)
    return unique

In [85]:
def enrich_canonical_for_some(items, cfg, take=10):
    ua = cfg["user_agent"]
    for i, r in enumerate(items[:take]):
        if not r.get("url"):
            continue
        try:
            resp = fetch(r["url"], ua, timeout=cfg["default_timeout_sec"], 
                         max_retries=cfg["max_retries"], 
                         min_delay_sec=cfg["per_host_min_delay_sec"])
            can = get_canonical_from_html(resp.text, r["url"])
            if can:
                r["canonical_url"] = normalize_url(can)
            else:
                r["canonical_url"] = normalize_url(r["url"])
        except Exception:
            r["canonical_url"] = normalize_url(r["url"])
    # بقیه موارد حداقل نرمالایز شوند
    for r in items[take:]:
        r["canonical_url"] = normalize_url(r.get("url"))
    return items


In [None]:
import feedparser

def reddit_rss_url(sub: str) -> str:
    # old.reddit.com و www.reddit.com هر دو RSS می‌دهند
    return f"https://old.reddit.com/r/{sub}/.rss"

def crawl_reddit_via_rss(sub, cfg, limit=None):
    url = reddit_rss_url(sub)
    d = feedparser.parse(url)
    items = []
    now = datetime.utcnow().isoformat(timespec="seconds")
    for e in d.get("entries", [])[: (limit or 50)]:
        link = e.get("link")
        title = e.get("title")
        author = None
        if "author" in e:
            author = e["author"]
        published_at = None
        if "published" in e:
            published_at = e["published"]
        items.append({
            "source_type": "reddit",
            "source_name": url,
            "subreddit": sub,
            "url": link,
            "canonical_url": normalize_url(link) if link else link,
            "title": title,
            "text": None,
            "author": author,
            "published_at": published_at,
            "score": None,
            "comments": None,
            "fetched_at": now
        })
    # dedupe نهایی
    return dedupe_records(items)


In [86]:
# def crawl_reddit_subreddit(sub, cfg):
#     ua = cfg["user_agent"]
#     limit = cfg["sources"]["reddit"]["limit_per_sub"]
#     # url = f"https://www.reddit.com/r/{sub}/"
#     # از old.reddit.com استفاده 
#     url = f"https://old.reddit.com/r/{sub}/"
#     resp = fetch(url, ua, timeout=cfg["default_timeout_sec"], 
#                  max_retries=cfg["max_retries"], 
#                  min_delay_sec=cfg["per_host_min_delay_sec"])
#     items = parse_reddit_listing(resp.text, url)

#     # خواندن کاننیکال برای چند مورد اول
#     items = enrich_canonical_for_some(items, cfg, take=10)
#     # حذف تکراری
#     items = dedupe_records(items)

#     return items[:limit] if limit else items


In [None]:
def crawl_reddit_subreddit(sub, cfg):
    ua = cfg["user_agent"]
    limit = cfg["sources"]["reddit"]["limit_per_sub"]
    html_url = f"https://old.reddit.com/r/{sub}/"

    # اگر robots اجازه نداد به RSS سوییچ کن
    rp = get_robots_parser(html_url, ua)
    if not rp.can_fetch(ua, html_url):
        return crawl_reddit_via_rss(sub, cfg, limit)

    # در غیر این صورت HTML
    resp = fetch(html_url, ua,
                 timeout=cfg["default_timeout_sec"],
                 max_retries=cfg["max_retries"],
                 min_delay_sec=cfg["per_host_min_delay_sec"])
    items = parse_reddit_listing(resp.text, html_url)
    items = enrich_canonical_for_some(items, cfg, take=10)
    items = dedupe_records(items)
    return items[:limit] if limit else items


In [None]:
def looks_like_consent_page(html: str) -> bool:
    s = html.lower()
    return ("consent" in s) or ("gdpr" in s) or ("privacy preferences" in s) or ("iab" in s)

def crawl_news_site(entry, cfg, max_article_per_listing=30, min_text_len=300):
    ua = cfg["user_agent"]
    start_urls = entry["start_urls"]
    selector_hint = entry.get("article_selector_hint", "a")
    records = []
    seen = set()

    for su in start_urls:
        resp = fetch(su, ua,
                     timeout=cfg["default_timeout_sec"],
                     max_retries=cfg["max_retries"],
                     min_delay_sec=cfg["per_host_min_delay_sec"])
        links = parse_listing_find_links(resp.text, su, selector_hint)

        for lk in links[:max_article_per_listing]:
            try:
                art = fetch(lk, ua,
                            timeout=cfg["default_timeout_sec"],
                            max_retries=cfg["max_retries"],
                            min_delay_sec=cfg["per_host_min_delay_sec"])

                # اگر صفحه consent بود از آن عبور کن
                if looks_like_consent_page(art.text):
                    print("skip consent page:", lk)
                    continue

                text = extract_article_text(art.text) or ""
                if len(text) < min_text_len:
                    continue

                title, author, published_at, canonical = extract_article_meta(art.text, lk)
                canonical = canonical or normalize_url(lk)
                if not canonical:
                    continue
                h = sha1(canonical.encode("utf-8")).hexdigest()
                if h in seen:
                    continue
                seen.add(h)

                records.append({
                    "source_type": "news",
                    "source_name": entry["name"],
                    "subreddit": None,
                    "url": lk,
                    "canonical_url": canonical,
                    "title": title,
                    "text": text,
                    "author": author,
                    "published_at": published_at,
                    "score": None,
                    "comments": None,
                    "fetched_at": datetime.utcnow().isoformat(timespec="seconds")
                })
            except Exception as e:
                print("error on", lk, e)
    return records


In [None]:
# def looks_like_consent_page(html: str) -> bool:
#     s = html.lower()
#     return ("consent" in s) or ("gdpr" in s) or ("privacy preferences" in s) or ("iab" in s)

# # داخل crawl_news_site پس از fetch هر مقاله
# art = fetch(lk, ua,
#             timeout=cfg["default_timeout_sec"],
#             max_retries=cfg["max_retries"],
#             min_delay_sec=cfg["per_host_min_delay_sec"])
# if looks_like_consent_page(art.text):
#     continue


In [None]:
# def safe_print(*args):
#     try:
#         print(*args)
#     except Exception:
#         pass

# # نمونه استفاده
# if not rp.can_fetch(ua, html_url):
#     safe_print(f"robots disallows HTML for {html_url}, switching to RSS")


Reading Samples

In [87]:
# from readability import Document

# def extract_article_text(html):
#     try:
#         doc = Document(html)
#         content_html = doc.summary()
#         soup = BeautifulSoup(content_html, "lxml")
#         text = soup.get_text("\n", strip=True)
#         if len(text) < 200:
#             # fallback
#             soup_full = BeautifulSoup(html, "lxml")
#             paras = [p.get_text(" ", strip=True) for p in soup_full.select("p")]
#             text = "\n".join(paras[:60])
#         return text
#     except Exception:
#         soup = BeautifulSoup(html, "lxml")
#         paras = [p.get_text(" ", strip=True) for p in soup.select("p")]
#         return "\n".join(paras[:60])

In [88]:
# ==== article text extraction (بدون تغییر اساسی) ====
from readability import Document

def extract_article_text(html):
    try:
        doc = Document(html)
        content_html = doc.summary()
        soup = BeautifulSoup(content_html, "lxml")
        text = soup.get_text("\n", strip=True)
        if len(text) < 200:
            soup_full = BeautifulSoup(html, "lxml")
            paras = [p.get_text(" ", strip=True) for p in soup_full.select("p")]
            text = "\n".join(paras[:60])
        return text
    except Exception:
        soup = BeautifulSoup(html, "lxml")
        paras = [p.get_text(" ", strip=True) for p in soup.select("p")]
        return "\n".join(paras[:60])


In [89]:
# def parse_listing_find_links(html, base_url, selector_hint="a"):
#     soup = BeautifulSoup(html, "lxml")
#     links = []
#     for a in soup.select(selector_hint):
#         href = a.get("href")
#         if not href:
#             continue
#         full = urlparse.urljoin(base_url, href)
#         links.append(full)
#     # حذف تکراری
#     return list(dict.fromkeys(links))

In [90]:
# ==== smarter link discovery on listing pages ====
def parse_listing_find_links(html, base_url, selector_hint="a"):
    soup = BeautifulSoup(html, "lxml")
    base = urlparse.urlparse(base_url).netloc
    links = []
    for a in soup.select(selector_hint):
        href = a.get("href")
        if not href:
            continue
        full = urlparse.urljoin(base_url, href)
        norm = normalize_url(full)
        if not norm:
            continue
        # only same-site pages, skip anchors and mailto and javascript
        net = urlparse.urlparse(norm).netloc
        if net != base:
            continue
        if norm.endswith("#"):
            norm = norm[:-1]
        links.append(norm)
    # unique while keeping order
    return list(dict.fromkeys(links))


In [91]:
# def crawl_news_site(entry, cfg):
#     ua = cfg["user_agent"]
#     start_urls = entry["start_urls"]
#     selector_hint = entry.get("article_selector_hint", "a")
#     records = []
#     for su in start_urls:
#         resp = fetch(su, ua, timeout=cfg["default_timeout_sec"], max_retries=cfg["max_retries"], min_delay_sec=cfg["per_host_min_delay_sec"])
#         links = parse_listing_find_links(resp.text, su, selector_hint)
#         for lk in links[:30]:
#             try:
#                 art = fetch(lk, ua, timeout=cfg["default_timeout_sec"], max_retries=cfg["max_retries"], min_delay_sec=cfg["per_host_min_delay_sec"])
#                 text = extract_article_text(art.text)
#                 records.append({
#                     "source_type": "news",
#                     "source_name": entry["name"],
#                     "subreddit": None,
#                     "url": lk,
#                     "canonical_url": lk,
#                     "title": None,
#                     "text": text,
#                     "author": None,
#                     "published_at": None,
#                     "score": None,
#                     "comments": None,
#                     "fetched_at": datetime.utcnow().isoformat(timespec="seconds")
#                 })
#             except Exception as e:
#                 print("error on", lk, e)
#     return records


In [92]:
# ==== crawl a news site with meta, canonical and dedupe ====
from datetime import datetime

def crawl_news_site(entry, cfg, max_article_per_listing=30, min_text_len=300):
    ua = cfg["user_agent"]
    start_urls = entry["start_urls"]
    selector_hint = entry.get("article_selector_hint", "a")
    records = []
    seen = set()  # dedupe on canonical

    for su in start_urls:
        resp = fetch(su, ua,
                     timeout=cfg["default_timeout_sec"],
                     max_retries=cfg["max_retries"],
                     min_delay_sec=cfg["per_host_min_delay_sec"])
        links = parse_listing_find_links(resp.text, su, selector_hint)

        for lk in links[:max_article_per_listing]:
            try:
                art = fetch(lk, ua,
                            timeout=cfg["default_timeout_sec"],
                            max_retries=cfg["max_retries"],
                            min_delay_sec=cfg["per_host_min_delay_sec"])
                text = extract_article_text(art.text) or ""
                if len(text) < min_text_len:
                    continue  # likely not an article

                title, author, published_at, canonical = extract_article_meta(art.text, lk)
                canonical = canonical or normalize_url(lk)
                if not canonical:
                    continue
                h = sha1(canonical.encode("utf-8")).hexdigest()
                if h in seen:
                    continue
                seen.add(h)

                records.append({
                    "source_type": "news",
                    "source_name": entry["name"],
                    "subreddit": None,
                    "url": lk,
                    "canonical_url": canonical,
                    "title": title,
                    "text": text,
                    "author": author,
                    "published_at": published_at,
                    "score": None,
                    "comments": None,
                    "fetched_at": datetime.utcnow().isoformat(timespec="seconds")
                })
            except Exception as e:
                print("error on", lk, e)
    return records


CSV schema & saving

In [93]:
CSV_SCHEMA = [
    "source_type", "source_name", "subreddit", "url", "canonical_url",
    "title", "text", "author", "published_at", "score", "comments",
    "language", "token_count", "predicted_label", "label_scores", "fetched_at"
]

def to_dataframe(records):
    df = pd.DataFrame(records)
    for col in CSV_SCHEMA:
        if col not in df.columns:
            df[col] = None
    return df[CSV_SCHEMA]

def save_csv(df, path):
    df.to_csv(path, index=False, encoding="utf-8")
    print("saved", path)


Initial run and output production

In [94]:
from datetime import datetime

with open("configs/crawl_config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

all_records = []

# Reddit
for sub in cfg["sources"]["reddit"]["subreddits"]:
    try:
        recs = crawl_reddit_subreddit(sub, cfg)
        all_records.extend(recs)
        print(f"reddit {sub} records:", len(recs))
    except Exception as e:
        print("reddit error", sub, e)

# News
for site in cfg["sources"]["news_sites"]:
    try:
        recs = crawl_news_site(site, cfg)
        all_records.extend(recs)
        print(f"news {site['name']} records:", len(recs))
    except Exception as e:
        print("news error", site["name"], e)

df = to_dataframe(all_records)
ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
out_path = f"storage/raw/crawl_{ts}.csv"
save_csv(df, out_path)
df.head(10)

resp.status_code == 200
reddit error worldnews Blocked by robots.txt for https://old.reddit.com/r/worldnews/
reddit error news Blocked by robots.txt for https://old.reddit.com/r/news/
resp.status_code == 200
news error reuters_world Fetch failed after retries for https://www.reuters.com/world/
saved storage/raw/crawl_20250911_004252.csv


Unnamed: 0,source_type,source_name,subreddit,url,canonical_url,title,text,author,published_at,score,comments,language,token_count,predicted_label,label_scores,fetched_at
