In [None]:
# @title  Install required packages
!pip install readability-lxml beautifulsoup4 tqdm  chromadb fastembed lxml readability


In [None]:
# @title  Turns headlines.json into a db — use this script to download articles in turbo mode
import os
import requests
import sqlite3
import threading
import time
from urllib.parse import urlparse, unquote
from queue import Queue

from readability import Document
from bs4 import BeautifulSoup
from tqdm import tqdm

# -----------------------------------------
# CONFIG
# -----------------------------------------
# Endpoint fetched by run() to obtain the list of article URLs.
JSON_URL = "https://api.lexkynews.com/headlines.json"
# Directory where download_article() writes the extracted article HTML files.
DOWNLOAD_DIR = "/content/articles"
# SQLite file opened by init_db(); holds one row per processed URL.
DB_PATH = "/content/articles.db"

THREADS = 10               # number of worker threads started by run() ("turbo mode")
HEARTBEAT_INTERVAL = 30    # seconds heartbeat() sleeps between "Alive" messages

# Request headers sent with every article download.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0"
}

# Statuses that the resume check in download_article() treats as final.
TERMINAL_STATUSES = {"done", "blocked", "not_found"}

# Domains that are never fetched. download_article() matches each entry as a
# substring of the URL's netloc, so the bare domain already covers "www.".
SKIP_DOMAINS = [
    "fox56news.com",
    "www.fox56news.com",
]

# Globals shared between run(), the workers and the heartbeat thread
pbar = None        # tqdm progress bar; assigned by run(), None until then
queue = Queue()    # work queue of URLs; a None item tells a worker to exit


# -----------------------------------------
# HEARTBEAT
# -----------------------------------------
def heartbeat():
    """Periodically write an "Alive" line through the progress bar.

    Loops forever, sleeping HEARTBEAT_INTERVAL seconds between messages, so it
    is meant to run on a daemon thread. Nothing is printed while the global
    ``pbar`` is falsy (e.g. before run() has created it).
    """
    while True:
        time.sleep(HEARTBEAT_INTERVAL)
        if not pbar:
            continue
        pbar.write(f"[♥] Alive... {time.strftime('%H:%M:%S')}")


# -----------------------------------------
# DB FUNCTIONS
# -----------------------------------------
def init_db():
    """Open the SQLite database at DB_PATH and make sure ``articles`` exists.

    The connection is created with ``check_same_thread=False`` so that it can
    be handed to several threads.

    Returns:
        The open ``sqlite3.Connection``.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS articles (
            url TEXT PRIMARY KEY,
            filename TEXT,
            status TEXT,
            size INTEGER
        )
    """
    connection = sqlite3.connect(DB_PATH, check_same_thread=False)
    connection.cursor().execute(schema)
    connection.commit()
    return connection


# Serializes every use of the shared SQLite connection.
db_lock = threading.Lock()


def save_record(conn, url, filename, status, size):
    """Insert or overwrite the ``articles`` row for ``url`` and commit.

    Args:
        conn: Open SQLite connection that has the ``articles`` table.
        url: Article URL (the table's primary key).
        filename: Name of the saved file, or None when nothing was written.
        status: Outcome label stored for the URL.
        size: Size value stored for the URL.
    """
    upsert_sql = """
            INSERT OR REPLACE INTO articles (url, filename, status, size)
            VALUES (?, ?, ?, ?)
        """
    values = (url, filename, status, size)
    with db_lock:
        conn.cursor().execute(upsert_sql, values)
        conn.commit()


# -----------------------------------------
# FILENAME NORMALIZATION
# -----------------------------------------
def url_to_safe_name(url):
    """Derive a flat, filesystem-safe ``.html`` filename from a URL.

    Only the last segment of the URL path is used (percent-decoded); the host
    and query string are ignored, so distinct URLs that share a last path
    segment map to the same filename.

    Args:
        url: Absolute URL of the article.

    Returns:
        A filename ending in ".html" that contains no path separators.
        URLs without a path yield "index.html".
    """
    path = urlparse(url).path.rstrip("/")
    slug = unquote(path.rsplit("/", 1)[-1]) or "index"

    # Decoding can re-introduce separators (e.g. "%2F" -> "/"); neutralize them
    # so the result can never point outside the download directory.
    unsafe = "?&=/\\\0"
    slug = "".join("_" if ch in unsafe else ch for ch in slug)

    if not slug.lower().endswith(".html"):
        slug += ".html"
    return slug


# -----------------------------------------
# ARTICLE EXTRACTION
# -----------------------------------------
def extract_article(html):
    """Reduce a full HTML page to a heading plus the main article markup.

    Args:
        html: Raw HTML text of the downloaded page.

    Returns:
        A string made of an ``<h1>`` holding the (HTML-escaped) short title,
        or "Untitled" when there is none, followed by the readability summary
        with ``<script>``, ``<style>`` and ``<noscript>`` elements removed.
    """
    # The parameter shadows the stdlib ``html`` module, so import the helper by name.
    from html import escape

    doc = Document(html)
    title = doc.short_title() or "Untitled"
    cleaned = BeautifulSoup(doc.summary(), "html.parser")
    for tag in cleaned(["script", "style", "noscript"]):
        tag.decompose()
    # The title is plain text; escape it so "&" or "<" cannot corrupt the markup.
    return f"<h1>{escape(title, quote=False)}</h1>\n{str(cleaned)}"


# -----------------------------------------
# WORKER: downloads in parallel
# -----------------------------------------
def worker(conn, session):
    """Thread body: process URLs from the shared queue until a None sentinel.

    Each URL is marked done on the queue and counted on the progress bar even
    when download_article() raises.

    Args:
        conn: Shared SQLite connection passed on to download_article().
        session: Shared ``requests`` session passed on to download_article().
    """
    # iter(callable, sentinel) keeps calling queue.get() until it returns None.
    for url in iter(queue.get, None):
        try:
            download_article(conn, session, url)
        finally:
            queue.task_done()
            pbar.update(1)


# -----------------------------------------
# DOWNLOAD ONE ARTICLE (safe & resumable)
# -----------------------------------------
def download_article(conn, session, url):
    """Fetch one URL, extract the article, write it to disk and record the outcome.

    The outcome is stored through save_record() with one of these statuses:
    "blocked" (skip-listed domain or HTTP 403), "not_found" (HTTP 404),
    "done" (file written) or "error" (any other failure). URLs already in a
    terminal status are skipped so the run is resumable.

    Args:
        conn: Shared SQLite connection (guarded by ``db_lock``).
        session: ``requests`` session used for the HTTP request.
        url: Absolute URL of the article.
    """

    # HARD BLOCK
    domain = urlparse(url).netloc.lower()
    if any(d in domain for d in SKIP_DOMAINS):
        pbar.write(f"[skip-domain] {url}")
        save_record(conn, url, None, "blocked", 0)
        return

    filename = url_to_safe_name(url)
    filepath = os.path.join(DOWNLOAD_DIR, filename)

    # Resume check
    with db_lock:
        c = conn.cursor()
        c.execute("SELECT status FROM articles WHERE url = ?", (url,))
        row = c.fetchone()
    status = row[0] if row else None

    # "blocked" and "not_found" never produce a file, so only "done" must also
    # have its file on disk; otherwise those URLs would be re-fetched every run.
    if status in TERMINAL_STATUSES and (status != "done" or os.path.exists(filepath)):
        pbar.write(f"[skip] {filename}")
        return

    pbar.write(f"[↓] {url}")

    try:
        r = session.get(url, headers=HEADERS, timeout=20)

        if r.status_code == 403:
            pbar.write(f"[403] {url}")
            save_record(conn, url, None, "blocked", 0)
            return

        if r.status_code == 404:
            pbar.write(f"[404] {url}")
            save_record(conn, url, None, "not_found", 0)
            return

        r.raise_for_status()

        article_html = extract_article(r.text)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(article_html)

        size = len(article_html.encode("utf-8"))
        save_record(conn, url, filename, "done", size)
        pbar.write(f"[ok] {filename}")

    except Exception as e:
        # Best effort: one failing URL must not take down the worker thread.
        pbar.write(f"[ERR] {url}: {e}")
        save_record(conn, url, filename, "error", 0)


# -----------------------------------------
# MAIN
# -----------------------------------------
def run():
    """Download every article listed in the headlines JSON using worker threads.

    Fetches JSON_URL, collects the http(s) URLs from it, then starts THREADS
    workers that drain the shared queue through download_article(). Blocks
    until all queued URLs are processed, then stops the workers and releases
    the progress bar, HTTP session and database connection.

    Raises:
        requests.RequestException: If the headlines JSON cannot be fetched.
    """
    global pbar

    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    print("[*] Fetching JSON…")
    # A timeout and status check keep a dead or failing endpoint from hanging
    # the cell or being parsed as if it were the headline list.
    response = requests.get(JSON_URL, timeout=20)
    response.raise_for_status()
    data = response.json()
    items = data.get("items", []) if isinstance(data, dict) else data

    # normalize urls
    urls = []
    for item in items:
        if isinstance(item, str):
            url = item
        elif isinstance(item, dict):
            url = item.get("url") or item.get("link")
        else:
            continue
        if url and url.startswith("http"):
            urls.append(url)

    print(f"[*] URLs total: {len(urls)}")

    conn = init_db()
    session = requests.Session()

    # tqdm progress bar
    pbar = tqdm(total=len(urls), unit="url")

    # heartbeat (daemon thread; prints through pbar)
    threading.Thread(target=heartbeat, daemon=True).start()

    # create worker threads
    threads = []
    for _ in range(THREADS):
        t = threading.Thread(target=worker, args=(conn, session))
        t.daemon = True
        t.start()
        threads.append(t)

    # queue jobs
    for url in urls:
        queue.put(url)

    # wait for all jobs
    queue.join()

    # stop workers: one None sentinel per thread
    for _ in threads:
        queue.put(None)

    for t in threads:
        t.join()

    # All workers have exited, so the shared resources can be released safely.
    pbar.close()
    session.close()
    conn.close()
    print("[DONE] turbo mode complete.")


# Start the download only when this cell/script is executed directly,
# not when its definitions are imported from elsewhere.
if __name__ == "__main__":
    run()


In [5]:
# @title  Turns headlines.json into a db

import os
import json
import time
import sqlite3
import requests
from urllib.parse import urlparse
from tqdm import tqdm

# SQLite file used by this cell; a relative path, so it resolves against the
# current working directory.
# NOTE(review): if the working directory is /content this is the same file as
# the turbo-mode cell's "/content/articles.db", whose `articles` table has a
# different schema — confirm the two cells are not meant to share a database.
DB_PATH = "articles.db"
# Endpoint that load_json() fetches the headline list from.
JSON_URL = "https://api.lexkynews.com/headlines.json"
# User-Agent header sent by download_url().
USER_AGENT = "Mozilla/5.0 (compatible; Downloader/1.0)"

# Domains intended to be recorded as "blocked" instead of being fetched.
BLOCK_DOMAINS = ["fox56news.com", "www.fox56news.com"]


def init_db():
    """Connect to DB_PATH and create the ``articles`` table when it is missing.

    Returns:
        The open ``sqlite3.Connection``.
    """
    create_articles = """
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE,
            status TEXT,
            html TEXT,
            ts INTEGER
        )
    """
    db = sqlite3.connect(DB_PATH)
    db.execute(create_articles)
    db.commit()
    return db


def load_json():
    """Fetch the headlines JSON and return its list of items.

    Accepts either an object with an "items" key or a bare top-level list.

    Returns:
        The list of headline items (empty when the object has no "items").

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
        ValueError: If the payload is neither a JSON object nor a list.
    """
    print("[*] Fetching headlines.json…")
    r = requests.get(JSON_URL, timeout=10)
    r.raise_for_status()
    data = r.json()
    if isinstance(data, dict):
        return data.get("items", [])
    if isinstance(data, list):
        return data
    raise ValueError(
        f"Unexpected headlines.json payload type: {type(data).__name__}"
    )


def get_resume_index(conn, urls):
    """Work out where in ``urls`` a previous run stopped.

    Looks up the row with the highest ``id`` in ``articles`` and returns the
    position just after that URL's first occurrence in ``urls``.

    Args:
        conn: Open SQLite connection that has the ``articles`` table.
        urls: Ordered list of URLs for the current run.

    Returns:
        Index of the first URL still to process; 0 when the table is empty or
        the last stored URL is not in ``urls``.
    """
    latest = conn.cursor().execute(
        "SELECT url FROM articles ORDER BY id DESC LIMIT 1"
    ).fetchone()
    if not latest:
        return 0

    last_url = latest[0]
    print(f"[resume] Last processed URL: {last_url}")

    try:
        position = urls.index(last_url)
    except ValueError:
        # The stored URL is no longer part of the list: start from the top.
        return 0
    return position + 1


def save(conn, url, status, html):
    """Store (or overwrite) the result for ``url`` and commit immediately.

    Args:
        conn: Open SQLite connection that has the ``articles`` table.
        url: URL that was processed.
        status: Outcome label for the URL.
        html: Page HTML, or None when nothing was downloaded.
    """
    # Timestamp is the current Unix time in whole seconds.
    record = (url, status, html, int(time.time()))
    conn.execute(
        "INSERT OR REPLACE INTO articles (url, status, html, ts) VALUES (?, ?, ?, ?)",
        record,
    )
    conn.commit()


def download_url(url):
    """Download one page and classify the outcome.

    Args:
        url: Absolute URL to fetch.

    Returns:
        A ``(status, html)`` tuple: ("done", page text) on success, otherwise
        ("blocked", None) for a block-listed domain or HTTP 403,
        ("not_found", None) for HTTP 404, or ("error", None) for any other
        failure.
    """
    domain = urlparse(url).netloc.lower()
    # Use the configured block list rather than a hard-coded domain.
    if any(blocked in domain for blocked in BLOCK_DOMAINS):
        return "blocked", None

    try:
        r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
        if r.status_code == 403:
            return "blocked", None
        if r.status_code == 404:
            return "not_found", None
        r.raise_for_status()
        return "done", r.text
    except Exception:
        # Best effort per URL. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate so the loop can be stopped.
        return "error", None


def heartbeat(last, interval=10):
    """Print an "Alive" line when at least ``interval`` seconds have elapsed.

    Args:
        last: Timestamp (``time.time()`` value) of the previous heartbeat.
        interval: Minimum number of seconds between two messages.

    Returns:
        The current time if a message was printed, otherwise ``last``
        unchanged, so the caller can feed the result back into the next call.
    """
    current = time.time()
    is_due = current - last >= interval
    if not is_due:
        return last

    stamp = time.strftime('%H:%M:%S')
    print(f"[♥] Alive... {stamp}")
    return current


def run():
    """Download every headline URL sequentially, resuming after the last stored one.

    Loads the headline list, skips ahead to the index reported by
    get_resume_index(), then downloads and stores each remaining URL while
    updating a progress bar. The progress bar and database connection are
    released even when the run is interrupted.
    """
    conn = init_db()
    try:
        items = load_json()

        # Only dict items carrying a non-empty "url" can be downloaded.
        urls = [
            item["url"]
            for item in items
            if isinstance(item, dict) and item.get("url")
        ]
        total = len(urls)

        start = get_resume_index(conn, urls)
        print(f"[start] Resuming at index {start}/{total}")

        hb = time.time()
        with tqdm(total=total, initial=start, desc="Downloading", ncols=100) as pbar:
            for url in urls[start:]:
                hb = heartbeat(hb)

                status, html = download_url(url)
                save(conn, url, status, html)

                pbar.set_postfix({"status": status})
                pbar.update(1)

                # No sleep between requests (max speed).
                # For polite mode, add: time.sleep(0.01)
    finally:
        conn.close()

    print("[DONE] Completed all downloads.")


# Entry point: start the sequential downloader when this cell/script is
# executed directly.
if __name__ == "__main__":
    run()


In [10]:
# @title  clean downloaded articles into text
import sqlite3
import time
from bs4 import BeautifulSoup
from readability import Document
from urllib.parse import urlparse

# SQLite file opened by run(): raw pages are read from `articles` and cleaned
# text is written to `clean_articles`. Relative path, so it resolves against
# the current working directory.
DB_PATH = "articles.db"

# ---------------------------------------------
# INIT CLEAN TABLE
# ---------------------------------------------
def init_clean_table(conn):
    """Create the ``clean_articles`` table if it is missing, then commit.

    Args:
        conn: Open SQLite connection.
    """
    create_clean_articles = """
        CREATE TABLE IF NOT EXISTS clean_articles (
            id INTEGER PRIMARY KEY,
            url TEXT,
            source TEXT,
            title TEXT,
            published TEXT,
            text TEXT,
            html_len INTEGER,
            extracted_at INTEGER
        )
    """
    conn.execute(create_clean_articles)
    conn.commit()


# ---------------------------------------------
# SIMPLE DATE PARSER (BEST-EFFORT)
# ---------------------------------------------
def extract_date(soup):
    """Best-effort lookup of a publication date in a parsed page.

    Sources are tried in order and the first non-empty value wins:
    the text of the first ``<time>`` element (falling back to its
    ``datetime`` attribute), ``<meta property="article:published_time">``,
    then ``<meta name="pubdate">``.

    Args:
        soup: Parsed document (BeautifulSoup object).

    Returns:
        The raw date string as found in the page, or None when no source
        yields a value. The string is not parsed or normalized.
    """
    # Look for <time> element
    t = soup.find("time")
    if t:
        stamp = t.get_text(strip=True) or t.get("datetime")
        if stamp:
            return stamp

    # Metadata: keep trying the next source when a tag exists but is empty.
    for attrs in ({"property": "article:published_time"}, {"name": "pubdate"}):
        meta = soup.find("meta", attrs)
        content = meta.get("content") if meta else None
        if content:
            return content

    return None


# ---------------------------------------------
# CLEAN TEXT VIA READABILITY FIRST
# ---------------------------------------------
def extract_content(html):
    """Extract title and plain text of the main article using readability.

    Args:
        html: Raw HTML text of the page.

    Returns:
        A ``(title, text)`` tuple, with text fragments separated by newlines,
        or ``(None, None)`` when extraction fails for any reason.
    """
    try:
        doc = Document(html)
        title = doc.title()
        cleaned_html = doc.summary(html_partial=True)

        soup = BeautifulSoup(cleaned_html, "lxml")

        # text extraction
        text = soup.get_text("\n", strip=True)
        return title, text
    except Exception:
        # Best effort: the caller falls back to another extractor. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts working.
        return None, None


# ---------------------------------------------
# FALLBACK CLEANER (SOUP ONLY)
# ---------------------------------------------
def fallback_extract(html):
    """Extract title and text with BeautifulSoup only (no readability).

    Args:
        html: Raw HTML text of the page.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped ``<title>`` text,
        or None when the page has no non-empty title. ``text`` is the page
        text, newline-separated, with non-content elements removed.
    """
    soup = BeautifulSoup(html, "lxml")

    # Drop scripts, styles and page chrome (navigation, header, footer, sidebars)
    for t in soup(["script", "style", "nav", "header", "footer", "aside"]):
        t.decompose()

    text = soup.get_text("\n", strip=True)

    # `.string` is None for an empty <title> or one with nested markup, which
    # made `.strip()` raise; get_text() handles both cases.
    title_tag = soup.title
    title = (title_tag.get_text(strip=True) or None) if title_tag else None

    return title, text


# ---------------------------------------------
# PROCESS ONE ARTICLE
# ---------------------------------------------
def process_article(row):
    """Turn one raw ``articles`` row into a cleaned-article record.

    Args:
        row: 5-tuple ``(id, url, status, html, ts)`` from the ``articles``
            table.

    Returns:
        A dict with keys ``id``, ``url``, ``source``, ``title``,
        ``published``, ``text`` and ``html_len``, or None when the row has no
        HTML or no text could be extracted.
    """
    article_id, url, _status, html, _ts = row

    if not html:
        return None

    # Readability first; use the plain-soup extractor when it yields nothing
    # or fewer than 200 characters.
    title, text = extract_content(html)
    needs_fallback = not text or len(text) < 200
    if needs_fallback:
        title, text = fallback_extract(html)

    if not text:
        return None

    # Source is the URL's host part; the date comes from the full original page.
    source = urlparse(url).netloc
    published = extract_date(BeautifulSoup(html, "lxml"))

    record = {}
    record["id"] = article_id
    record["url"] = url
    record["source"] = source
    record["title"] = title
    record["published"] = published
    record["text"] = text
    record["html_len"] = len(html)
    return record


# ---------------------------------------------
# MAIN
# ---------------------------------------------
def run():
    """Clean every downloaded article that has not been cleaned yet.

    Reads all ``articles`` rows with status 'done', skips ids already present
    in ``clean_articles``, and stores the extracted record for each remaining
    row, committing after every insert so an interrupted run can resume.
    Rows with no extractable text are reported and left out.
    """
    insert_sql = """
        INSERT OR REPLACE INTO clean_articles
        (id, url, source, title, published, text, html_len, extracted_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """

    conn = sqlite3.connect(DB_PATH)
    try:
        init_clean_table(conn)

        # fetch ALL raw html rows
        raw_rows = conn.execute(
            "SELECT id, url, status, html, ts FROM articles WHERE status='done'"
        ).fetchall()

        # find already cleaned IDs (resume)
        cleaned_ids = {
            r[0] for r in conn.execute("SELECT id FROM clean_articles")
        }

        print(f"[info] Raw articles: {len(raw_rows)}")
        print(f"[info] Already cleaned: {len(cleaned_ids)}")

        # Counts only rows actually written, so the final message is accurate.
        count = 0

        for row in raw_rows:
            aid = row[0]
            if aid in cleaned_ids:
                continue

            result = process_article(row)

            if not result:
                print(f"[skip] ID {aid}: could not extract text")
                continue

            conn.execute(insert_sql, (
                result["id"],
                result["url"],
                result["source"],
                result["title"],
                result["published"],
                result["text"],
                result["html_len"],
                int(time.time()),
            ))
            conn.commit()
            count += 1

            print(f"[ok] Cleaned ID {aid} ({result['source']})")
    finally:
        conn.close()

    print(f"[DONE] Cleaned {count} articles.")


# Entry point: run the cleaning pass when this cell/script is executed directly.
if __name__ == "__main__":
    run()
