In [1]:
# --- standard libs ---
import math, re, sys
from urllib.parse import urlparse, urljoin
from datetime import datetime, timezone
# --- light scraping deps (pip install if missing) ---
# pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

# =========================
# SCORING (unchanged logic)
# =========================
def score_articles(articles, weights=None):
    """
    Score a list of article dicts for credibility & objectivity.

    Each article can include ANY of these (all optional):
      title, url, domain, outlet_type, published_at, updated_at,
      has_byline, has_about_page, has_contact_info, corrections_policy,
      cites_sources_count, links_to_primary_sources, quotes_named_experts,
      advertising_density, uses_stock_images_only, is_press_release,
      clickbait_score, subjectivity_score, polarity_score, paywalled

    Returns list of dicts with:
      credibility_score_0_100, objectivity_stars_0_5, overall_stars_0_5,
      rationale, breakdown
    """

    # ---------- Weighting for rule-based components ----------
    W = {
        "domain": 0.23,        # Trustworthiness of the domain / outlet type
        "evidence": 0.22,      # Citations, primary sources, expert quotes
        "transparency": 0.18,  # Presence of bylines, about pages, contact info
        "quality": 0.15,       # Article quality signals (ads density, stock images, PR)
        "recency": 0.12,       # How recent/up-to-date the article is
        "objectivity": 0.10,   # Subjectivity, sentiment, and opinion cues
    }
    if weights:
        W.update(weights)

    TRUSTED_TLDS = (".gov", ".edu")
    SUSPECT_TLDS  = (".zip", ".top", ".click", ".work", ".country", ".gq", ".cf", ".ml")

    CLICKBAIT_PATTERNS = [
        r"\bshocking\b", r"\byou won'?t believe\b", r"\bwhat happened next\b",
        r"\b(exposed|leaked)\b", r"\b(slam|destroys|obliterates)\b", r"\b(OMG|wow)\b",
        r"\bsecret(s)?\b", r"\bno one is talking about\b"
    ]
    OPINION_CUES = [r"^opinion\b", r"^editorial\b", r"\bop-ed\b", r"\bcommentary\b"]

    def _safe_lower(s):
        return (s or "").strip().lower()

    def _domain(url, domain):
        if domain: return domain.lower()
        if url:
            try: return urlparse(url).netloc.lower()
            except Exception: return ""
        return ""

    def _parse_date(s):
        if not s: return None
        try:
            dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
            if not dt.tzinfo: dt = dt.replace(tzinfo=timezone.utc)
            return dt
        except Exception:
            pass
        for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y"):
            try: return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc)
            except Exception: continue
        return None

    def _years_since(dt):
        if not dt: return None
        now = datetime.now(timezone.utc)
        return (now - dt).days / 365.25

    # ---------- Component scoring ----------
    def score_domain(url, domain, outlet_type):
        d = _domain(url, domain)
        rep = 0.5
        if any(d.endswith(t) for t in TRUSTED_TLDS): rep = 0.85
        if any(d.endswith(t) for t in SUSPECT_TLDS): rep = min(rep, 0.35)
        t = _safe_lower(outlet_type)
        if t in ("university", "gov", "government", "journal"): rep = max(rep, 0.9)
        elif t in ("newspaper", "magazine", "wire", "public broadcaster"): rep = max(rep, 0.75)
        elif t in ("blog", "substack", "youtube", "social"): rep = min(rep, 0.55)
        return rep

    def score_recency(published_at, updated_at):
        pts = 0.5
        base = _parse_date(published_at) or _parse_date(updated_at)
        if base:
            y = _years_since(base) or 100
            if y <= (30/365.25): pts = 1.0
            elif y <= 0.5: pts = 0.85
            elif y <= 2: pts = 0.7
            elif y <= 5: pts = 0.5
            else: pts = 0.3
        return pts

    def score_transparency(x):
        pts = 0.3
        if x.get("has_byline") is True: pts += 0.25
        if x.get("has_about_page") is True: pts += 0.2
        if x.get("has_contact_info") is True: pts += 0.15
        if x.get("corrections_policy") is True: pts += 0.15
        return max(0.0, min(1.0, pts))

    def score_evidence(x):
        cites = x.get("cites_sources_count") or 0
        primary = x.get("links_to_primary_sources") or 0
        experts = x.get("quotes_named_experts") or 0
        cites_pts = 1 - math.exp(-cites/4)
        prim_pts  = 1 - math.exp(-primary/2)
        exp_pts   = 1 - math.exp(-experts/2)
        return (0.45*cites_pts + 0.35*prim_pts + 0.20*exp_pts)

    def score_quality(x):
        pts = 0.7
        ad = x.get("advertising_density")
        if ad is not None: pts -= max(0.0, min(0.5, float(ad) * 0.6))
        if x.get("uses_stock_images_only") is True: pts -= 0.1
        if x.get("is_press_release") is True: pts -= 0.15
        return max(0.0, min(1.0, pts))

    def score_objectivity(x):
        base = 0.7
        title_l = _safe_lower(x.get("title"))
        if any(re.search(p, title_l) for p in OPINION_CUES): base -= 0.2
        subj = x.get("subjectivity_score")  # optional
        pol  = x.get("polarity_score")      # optional
        if subj is not None: base += 0.25 * (1 - max(0.0, min(1.0, float(subj))))
        if pol is not None:  base += 0.15 * (1 - min(1.0, abs(float(pol))))
        return max(0.0, min(1.0, base))

    def penalties(x):
        p = 0.0
        title_l = _safe_lower(x.get("title"))
        for pat in CLICKBAIT_PATTERNS:
            if re.search(pat, title_l): p += 0.08
        cb = x.get("clickbait_score")
        if cb is not None: p += 0.15 * max(0.0, min(1.0, float(cb)))
        if x.get("paywalled") is True: p += 0.03
        return min(0.5, p)

    def stars_from_objectivity(obj_0_1, evid_0_1, pen_0_1):
        obj = max(0.0, min(1.0, obj_0_1 * 0.8 + 0.2*evid_0_1 - 0.3*pen_0_1))
        return round(obj * 5, 2)

    out = []
    for a in articles:
        domain = score_domain(a.get("url"), a.get("domain"), a.get("outlet_type"))
        rec = score_recency(a.get("published_at"), a.get("updated_at"))
        transp = score_transparency(a)
        evid = score_evidence(a)
        qual = score_quality(a)
        obj = score_objectivity(a)
        pen = penalties(a)

        cred_0_1 = (
            W["domain"]*domain + W["evidence"]*evid + W["transparency"]*transp +
            W["quality"]*qual + W["recency"]*rec + W["objectivity"]*obj
        )
        cred_0_1 = max(0.0, min(1.0, cred_0_1 - pen))

        credibility_score_0_100 = round(cred_0_1 * 100, 1)
        objectivity_stars_0_5 = stars_from_objectivity(obj, evid, pen)
        overall_0_1 = max(0.0, min(1.0, 0.7*cred_0_1 + 0.3*(objectivity_stars_0_5/5.0)))
        overall_stars_0_5 = round(overall_0_1 * 5, 2)

        rationale = (
            f"domain={domain:.2f}, evidence={evid:.2f}, transparency={transp:.2f}, "
            f"quality={qual:.2f}, recency={rec:.2f}, objectivity={obj:.2f}, penalties={pen:.2f}."
        )

        out.append({
            "title": a.get("title"),
            "url": a.get("url"),
            "credibility_score_0_100": credibility_score_0_100,
            "objectivity_stars_0_5": objectivity_stars_0_5,
            "overall_stars_0_5": overall_stars_0_5,
            "rationale": rationale,
            "breakdown": {
                "domain": round(domain, 2),
                "evidence": round(evid, 2),
                "transparency": round(transp, 2),
                "quality": round(qual, 2),
                "recency": round(rec, 2),
                "objectivity": round(obj, 2),
                "penalties": round(pen, 2),
            }
        })
    return out

# ==========================================
# URL → FEATURES (auto-extraction from HTML)
# ==========================================
def _fetch_html(url):
    """Fetch HTML with a safe User-Agent and short timeout."""
    resp = requests.get(url, timeout=12, headers={"User-Agent": "Mozilla/5.0 (cred-bot)"})
    resp.raise_for_status()
    return resp.text

def _text_or_attr(tag, *attrs):
    """Get text or attribute from a BeautifulSoup tag, safely."""
    if not tag: return None
    for a in attrs:
        if tag.has_attr(a):
            val = tag.get(a)
            if val: return val.strip()
    # Fallback to tag text
    txt = getattr(tag, "text", None)
    return txt.strip() if txt else None

def _guess_outlet_type(domain):
    """Rough outlet type guess from domain suffix."""
    d = domain.lower()
    if d.endswith(".edu"): return "university"
    if d.endswith(".gov"): return "gov"
    return None  # let scorer treat as neutral

def _count_outbound_links(soup, page_domain):
    """Count outbound links as a proxy for citations/evidence."""
    cnt = 0
    primary = 0
    primary_patterns = (".gov", ".edu", "doi.org", "pubmed.", "nature.com", "science.org")
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        # Normalize absolute
        n = urlparse(href)
        if not n.netloc:
            # relative link → same site (not outbound)
            continue
        # Outbound if domain differs
        if n.netloc and page_domain not in n.netloc:
            cnt += 1
            if any(p in href.lower() for p in primary_patterns):
                primary += 1
    return cnt, primary

def _detect_byline(soup):
    """Detect byline via common meta tags / classes."""
    if soup.find(attrs={"name": "author"}): return True
    if soup.find("meta", {"property": "article:author"}): return True
    if soup.find(class_=re.compile(r"byline|author", re.I)): return True
    return False

def _detect_contact_info(soup):
    """Crudely detect presence of contact info on page or footer links."""
    footer = soup.find("footer")
    hay = (footer or soup).get_text(" ", strip=True).lower()
    return any(k in hay for k in ["contact", "email", "feedback", "editor@", "newsroom"])

def _detect_about_page(soup):
    """Look for an About link in header/footer/nav."""
    for a in soup.find_all("a", href=True):
        txt = (a.get_text() or "").strip().lower()
        if "about" in txt or "masthead" in txt:
            return True
    return False

def _detect_corrections_policy(soup):
    hay = soup.get_text(" ", strip=True).lower()
    return any(k in hay for k in ["correction", "corrections policy", "errata", "clarification"])

def _detect_press_release(soup):
    hay = soup.get_text(" ", strip=True).lower()
    return ("press release" in hay) or ("newswire" in hay)

def _estimate_ads_density(soup):
    """Very rough: count iframes & common ad class hints normalized by length."""
    html_len = len(soup.get_text(" ", strip=True)) + 1
    ad_iframes = len(soup.find_all("iframe"))
    ad_divs = len(soup.find_all(class_=re.compile(r"advert|ad-container|sponsor", re.I)))
    score = min(1.0, (ad_iframes*2 + ad_divs) / max(800, html_len/5))
    return round(score, 3)

def _detect_paywall(soup):
    """Simple: look for common paywall flags."""
    txt = soup.get_text(" ", strip=True).lower()
    flags = ["subscribe to read", "subscriber-only", "meteredpaywall", "paywall", "sign in to continue"]
    return any(f in txt for f in flags)

def _extract_date(soup):
    """Try several standard meta/time locations for published/updated dates."""
    # OpenGraph / schema.org metas
    meta_keys = [
        ("meta", {"property": "article:published_time"}),
        ("meta", {"name": "datePublished"}),
        ("meta", {"itemprop": "datePublished"}),
        ("time", {"itemprop": "datePublished"}),
        ("time", {"datetime": True}),
    ]
    for tag, attrs in meta_keys:
        t = soup.find(tag, attrs)
        v = _text_or_attr(t, "content", "datetime")
        if v: return v
    return None

def _count_named_experts(text):
    """
    Heuristic: count quotes that include expert indicators like Dr./Prof./PhD/MD.
    Not perfect, but gives a weak signal.
    """
    quotes = re.findall(r"“([^”]+)”|\"([^\"]+)\"", text)
    quotes = ["".join(q) for q in quotes]
    expert_markers = re.compile(r"\b(Dr\.|Professor|Prof\.|PhD|MD|M\.D\.)\b")
    return sum(1 for q in quotes if expert_markers.search(q))

def _clickbait_score_from_title(title):
    """Lightweight score 0..1 from clickbait terms/punctuation."""
    if not title: return 0.0
    title_l = title.lower()
    pats = [
        r"you won'?t believe", r"what happened next", r"shocking", r"omg", r"wow",
        r"destroys", r"obliterates", r"leaked", r"exposed", r"secret", r"no one is talking about"
    ]
    hits = sum(1 for p in pats if re.search(p, title_l))
    excls = title.count("!")
    return min(1.0, 0.15*hits + 0.05*excls)

def extract_article_features_from_url(url):
    """
    Fetch `url` and auto-extract a best-effort feature dict compatible with `score_articles`.
    """
    try:
        html = _fetch_html(url)
    except Exception as e:
        # Return a minimal fallback with only URL and domain
        d = urlparse(url).netloc
        return {
            "title": url, "url": url, "domain": d,
            # leave other fields None/omitted; scorer handles defaults
        }

    soup = BeautifulSoup(html, "html.parser")
    domain = urlparse(url).netloc

    # Title
    title = None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        ogt = soup.find("meta", {"property": "og:title"})
        title = _text_or_attr(ogt, "content") or url

    # Dates
    published_at = _extract_date(soup)

    # Evidence signals
    cites, primary = _count_outbound_links(soup, domain)
    text_body = soup.get_text(" ", strip=True)
    experts = _count_named_experts(text_body)

    # Transparency/quality/paywall
    has_byline = _detect_byline(soup)
    has_contact = _detect_contact_info(soup)
    has_about = _detect_about_page(soup)
    corrections = _detect_corrections_policy(soup)
    is_press_rel = _detect_press_release(soup)
    ads_density = _estimate_ads_density(soup)
    paywalled = _detect_paywall(soup)

    # Clickbait / opinion cues
    clickbait_score = _clickbait_score_from_title(title)

    # Outlet type guess (very rough)
    outlet_type = _guess_outlet_type(domain)

    return {
        "title": title,
        "url": url,
        "domain": domain,
        "outlet_type": outlet_type,
        "published_at": published_at,
        "has_byline": has_byline,
        "has_about_page": has_about,
        "has_contact_info": has_contact,
        "corrections_policy": corrections,
        "cites_sources_count": cites,
        "links_to_primary_sources": primary,
        "quotes_named_experts": experts,
        "advertising_density": ads_density,
        "is_press_release": is_press_rel,
        "clickbait_score": clickbait_score,
        "paywalled": paywalled,
        # subjectivity_score & polarity_score left None by default
    }

# ==========================================
# PUBLIC API: pass only URLs (strings/list)
# ==========================================
def score_articles_from_urls(urls):
    """
    Accepts a single URL string or a list of URL strings.
    Returns scored articles (same structure as score_articles output).
    """
    if isinstance(urls, str):
        urls = [urls]
    articles = [extract_article_features_from_url(u) for u in urls]
    return score_articles(articles)

# ====================
# Example (quick test)
# ====================
if __name__ == "__main__":
    test_url = "https://www.university.edu/news/2025/09/air-quality-asthma-study"
    result = score_articles_from_urls(test_url)[0]
    print("Title:", result["title"])
    print("Credibility:", result["credibility_score_0_100"])
    print("Objectivity:", result["objectivity_stars_0_5"])
    print("Overall:", result["overall_stars_0_5"])
    print("Rationale:", result["rationale"])
    print("Breakdown:", result["breakdown"])


Title: https://www.university.edu/news/2025/09/air-quality-asthma-study
Credibility: 48.4
Objectivity: 2.8
Overall: 2.54
Rationale: domain=0.85, evidence=0.00, transparency=0.30, quality=0.70, recency=0.50, objectivity=0.70, penalties=0.00.
Breakdown: {'domain': 0.85, 'evidence': 0.0, 'transparency': 0.3, 'quality': 0.7, 'recency': 0.5, 'objectivity': 0.7, 'penalties': 0.0}


In [10]:
# Example URL (you can swap this for any article you want to test)
url = "https://apnews.com/article/charlie-kirk-shooting-political-violence-reaction-87f6755421938d0a0d7c5905be10767f"

# Run the scoring
result = score_articles_from_urls(url)[0]

# Show results
print("Title:", result["title"])
print("Credibility Score (0–100):", result["credibility_score_0_100"])
print("Objectivity Stars (0–5):", result["objectivity_stars_0_5"])
print("Overall Stars (0–5):", result["overall_stars_0_5"])
print("Rationale:", result["rationale"])
print("Breakdown:", result["breakdown"])


Title: Charlie Kirk shooting brings condemnation from victims of political violence | AP News
Credibility Score (0–100): 56.4
Objectivity Stars (0–5): 3.25
Overall Stars (0–5): 2.95
Rationale: domain=0.50, evidence=0.45, transparency=0.90, quality=0.55, recency=0.30, objectivity=0.70, penalties=0.00.
Breakdown: {'domain': 0.5, 'evidence': 0.45, 'transparency': 0.9, 'quality': 0.55, 'recency': 0.3, 'objectivity': 0.7, 'penalties': 0.0}
