In [1]:
%pip install requests feedparser
%pip install pandas
%pip install googletrans==4.0.0-rc1
%pip install influxdb-client


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Cell 1 - imports + config (DB-only)
import os
import requests
import xml.etree.ElementTree as ET
from datetime import datetime, timezone

# InfluxDB (read from env if you want)
INFLUX_URL = os.getenv("INFLUX_URL", "http://localhost:8086")
INFLUX_TOKEN = os.getenv("INFLUX_TOKEN", "")
INFLUX_ORG = os.getenv("INFLUX_ORG", "")
INFLUX_BUCKET = os.getenv("INFLUX_BUCKET", "")

FEED_URL = "https://rss.orf.at/news.xml"

TARGET_OEWA = "urn:oewa:RedCont:Politik/PolitikAusland"
USER_AGENT = "orf-rss-tracker/1.0 (+local notebook)"


In [3]:
# Cell 2 - fetch XML
def fetch_feed_xml(url: str, timeout: int = 20) -> str:
    r = requests.get(url, timeout=timeout, headers={"User-Agent": USER_AGENT})
    r.raise_for_status()
    return r.text

xml_text = fetch_feed_xml(FEED_URL)
len(xml_text), xml_text[:200]


(15659,
 '<?xml version="1.0" encoding="UTF-8"?>\n<rdf:RDF\n  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n  xmlns:dc="http://purl.org/dc/elements/1.1/"\n  xmlns:sy="http://purl.org/rss/1.0/modules/synd')

In [4]:
# Cell 3 - detect namespaces robustly (so you don't have to guess)
import io

def detect_namespaces(xml_text: str) -> dict:
    ns = {}
    for event, elem in ET.iterparse(io.StringIO(xml_text), events=("start-ns",)):
        prefix, uri = elem
        ns[prefix if prefix is not None else ""] = uri
    return ns

NS = detect_namespaces(xml_text)
NS


{'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 'dc': 'http://purl.org/dc/elements/1.1/',
 'sy': 'http://purl.org/rss/1.0/modules/syndication/',
 'orfon': 'http://rss.orf.at/1.0/',
 '': 'http://purl.org/rss/1.0/'}

In [5]:
# Cell 4 - parse + sanity checks (THIS will show why your old code returned 0)
root = ET.fromstring(xml_text)

rss_ns = NS.get("rss", "http://purl.org/rss/1.0/")  # ORF uses RSS 1.0
items = root.findall(".//{%s}item" % rss_ns)

root.tag, len(items)


('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF', 25)

In [7]:
# Cell 5 - helper: load already-seen usids (dedupe) from InfluxDB (DB-only)
from influx_io import get_client, INFLUX_BUCKET, INFLUX_ORG

def load_seen_usids_from_influx(lookback: str = "30d") -> set[str]:
    """
    Pull distinct usid tag values stored in measurement 'orf_article'
    within a recent lookback window.
    """
    with get_client() as client:
        query_api = client.query_api()

        flux = f'''
from(bucket: "{INFLUX_BUCKET}")
  |> range(start: -{lookback})
  |> filter(fn: (r) => r._measurement == "orf_article")
  |> keep(columns: ["usid"])
  |> distinct(column: "usid")
'''

        tables = query_api.query(flux, org=INFLUX_ORG)

    seen = set()
    for table in tables:
        for record in table.records:
            val = record.values.get("usid")
            if val:
                seen.add(str(val))
    return seen

seen_usids = load_seen_usids_from_influx("30d")
len(seen_usids)


17

In [8]:
# Cell 6 - parse items + filter by oewaCategory
def text_of(el):
    return el.text.strip() if el is not None and el.text else None

def parse_filtered_items(root: ET.Element, ns: dict) -> list[dict]:
    rss_ns = ns.get("rss", "http://purl.org/rss/1.0/")
    rdf_ns = ns.get("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    dc_ns  = ns.get("dc",  "http://purl.org/dc/elements/1.1/")
    orf_ns = ns.get("orfon")  # must exist in feed; we'll rely on detected value

    if not orf_ns:
        raise RuntimeError("Could not detect 'orfon' namespace in the feed. Check NS dict output.")

    out = []
    for item in root.findall(".//{%s}item" % rss_ns):
        # orfon:oewaCategory rdf:resource="..."
        cat_el = item.find("{%s}oewaCategory" % orf_ns)
        if cat_el is None:
            continue

        cat_val = cat_el.attrib.get("{%s}resource" % rdf_ns)
        if cat_val != TARGET_OEWA:
            continue

        title_el = item.find("{%s}title" % rss_ns)
        link_el  = item.find("{%s}link" % rss_ns)
        date_el  = item.find("{%s}date" % dc_ns)
        usid_el  = item.find("{%s}usid" % orf_ns)

        out.append({
            "usid": text_of(usid_el),
            "date": text_of(date_el),
            "link": text_of(link_el),
            "title": text_of(title_el),
            "oewaCategory": cat_val,
            "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
        })

    return out

filtered_items = parse_filtered_items(root, NS)
len(filtered_items), filtered_items[:2]


(14,
 [{'usid': 'news:3416835',
   'date': '2026-01-11T17:32:03+01:00',
   'link': 'https://orf.at/stories/3416835/',
   'title': 'Trump will Kuba Ölhahn abdrehen',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-11T16:37:12.590976+00:00'},
  {'usid': 'news:3416833',
   'date': '2026-01-11T15:56:56+01:00',
   'link': 'https://orf.at/stories/3416833/',
   'title': 'Wieder Strom in ukrainischer Region Saporischschja',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-11T16:37:12.590976+00:00'}])

In [9]:
from influx_io import write_orf_articles

written = write_orf_articles(filtered_items)
print("Wrote ORF articles to InfluxDB:", written)

import pandas as pd
df = pd.DataFrame(filtered_items)


Wrote ORF articles to InfluxDB: 14


In [10]:
# Cell 7 - (optional) debug: what categories exist + counts
from collections import Counter

def category_counts(root: ET.Element, ns: dict) -> Counter:
    rss_ns = ns.get("rss", "http://purl.org/rss/1.0/")
    rdf_ns = ns.get("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    orf_ns = ns.get("orfon")
    c = Counter()
    for item in root.findall(".//{%s}item" % rss_ns):
        cat_el = item.find("{%s}oewaCategory" % orf_ns) if orf_ns else None
        if cat_el is None:
            continue
        cat_val = cat_el.attrib.get("{%s}resource" % rdf_ns)
        if cat_val:
            c[cat_val] += 1
    return c

counts = category_counts(root, NS)
counts.most_common(10)


[('urn:oewa:RedCont:Politik/PolitikAusland', 14),
 ('urn:oewa:RedCont:Politik/PolitikInland', 5),
 ('urn:oewa:RedCont:Nachrichten/Chronik', 4),
 ('urn:oewa:RedCont:KulturUndFreizeit/Literatur', 1),
 ('urn:oewa:RedCont:KulturUndFreizeit/KulturUeberblick', 1)]

In [11]:
# Cell 8 - DB-only: keep only new items + write to InfluxDB
from influx_io import write_orf_articles

new_items = [it for it in filtered_items if it.get("usid") and it["usid"] not in seen_usids]

written = write_orf_articles(new_items)
print("Wrote ORF articles to InfluxDB:", written)

# update local seen set (so rerunning later cells in this same session doesn't re-write)
for it in new_items:
    seen_usids.add(it["usid"])

len(new_items)


Wrote ORF articles to InfluxDB: 14


14

In [12]:
# Cell 9 - show latest rows quickly (DB-only, from current run)
import pandas as pd

df = pd.DataFrame(new_items)
df.tail(10)


Unnamed: 0,usid,date,link,title,oewaCategory,fetched_at_utc
4,news:3416817,2026-01-11T12:39:49+01:00,https://orf.at/stories/3416817/,Über 1.000 Wohnhäuser in Kiew weiter ohne Heizung,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
5,news:3416805,2026-01-11T11:43:19+01:00,https://orf.at/stories/3416805/,Landesweite Demos gegen US-Regierung,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
6,news:3416812,2026-01-11T11:38:17+01:00,https://orf.at/stories/3416812/,Selenskyj wirft Kreml gezielten Terror gegen B...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
7,news:3416809,2026-01-11T11:15:50+01:00,https://orf.at/stories/3416809/,Syrien gibt Festnahme Hunderter Kurden in Alep...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
8,news:3416796,2026-01-11T09:41:24+01:00,https://orf.at/stories/3416796/,Zahl der Toten bei Protesten im Iran steigt,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
9,news:3416788,2026-01-11T08:31:17+01:00,https://orf.at/stories/3416788/,Zehntausende bei Protesten gegen ICE in Minnea...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
10,news:3416787,2026-01-11T08:26:20+01:00,https://orf.at/stories/3416787/,Berichte: Israel und Hamas bereiten sich auf n...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
11,news:3416786,2026-01-11T08:18:32+01:00,https://orf.at/stories/3416786/,Eine Tote nach nächtlichen Drohnenangriffen in...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
12,news:3416785,2026-01-11T08:16:57+01:00,https://orf.at/stories/3416785/,Nobelinstitut: Nobelpreis kann nicht übertrage...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00
13,news:3416784,2026-01-11T07:54:47+01:00,https://orf.at/stories/3416784/,Nicaragua lässt Dutzende Häftlinge frei,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-11T16:37:12.590976+00:00


Reddit

In [13]:
# =========================
# RUN SCRIPT (full)
# =========================
import os
import re
import time
from datetime import datetime, timezone

import pandas as pd
import requests
from googletrans import Translator

# df already exists:
# df = pd.read_csv(CSV_PATH)
df = pd.DataFrame(filtered_items).head(2)


# =========================
# CONFIG
# =========================
OUT_CSV = "reddit_posts_minimal.csv"
USER_AGENT = "orf-reddit-minimal/1.0"

TITLE_COL = "title"
USID_COL = "usid"

# --- Rate limiting ---
REQUEST_SLEEP = 2.0      # <- normal sleep between EVERY reddit request
BACKOFF_SLEEP = 90       # <- sleep when 429 happens

# --- groups / matching ---
MAX_KEYWORDS_DE = 20          # extract more DE keywords from title
ADD_BIGRAM_GROUPS = True      # add phrase/bigram groups to reach ~7+ groups
MAX_BIGRAMS = 10              # cap bigram groups per article

TARGET_GROUPS = 7             # try to use 7 groups
MIN_GROUP_MATCHES = 3         # must hit at least 3 groups
MAX_POST_WORDS = 300          # only keep posts with <= 300 words in checked text

CHECK_TEXT_MODE = "title+selftext"  # or "selftext"

# --- reddit search (ranked) ---
MAX_PAGES_PER_SOURCE = 10
LIMIT_PER_PAGE = 100
SORT_MODE = "relevance"

# --- reddit subreddit feeds (/new.json) ---
USE_SUB_NEW_FEEDS = True
NEW_FEED_PAGES_PER_SUB = 8
NEW_FEED_LIMIT = 100

# --- where to search ---
SUBREDDITS = ["politics", "austria", "europe", "worldnews", "news"]
SEARCH_GLOBAL_TOO = True
RESTRICT_TO_SUBS_ONLY = False

# If you delete the CSV after each run, keep False
DEDUPE_WITH_EXISTING_CSV = False

# =========================
# INIT
# =========================
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT})
translator = Translator()

STOPWORDS = {
    # DE
    "der","die","das","und","oder","nicht","nur","auch","mit","von","für","über","unter","nach","vor",
    "ein","eine","einer","eines","dem","den","des","im","in","am","an","auf","aus","bei","zum","zur",
    "ist","sind","war","waren","wird","werden","hat","haben","kann","können","muss","müssen",
    # EN
    "the","and","or","not","only","also","with","from","for","about","this","that","these","those",
    "is","are","was","were","be","been","being","has","have","had","can","could","must","will","may",
    "of","by","to","in","on","at"
}

WORD_RE = re.compile(r"\b[\wÄÖÜäöüß]+\b", flags=re.UNICODE)


# =========================
# HELPERS
# =========================
def safe_str(x) -> str:
    if pd.isna(x):
        return ""
    return str(x)


def words_list(text: str) -> list[str]:
    return WORD_RE.findall((text or "").strip())


def tokenize_list(text: str) -> list[str]:
    """Ordered tokens, lowercased, stopwords removed, len>=3."""
    text = (text or "").lower()
    text = re.sub(r"[^0-9a-zäöüß]+", " ", text, flags=re.IGNORECASE)
    toks = []
    for t in text.split():
        if len(t) >= 3 and t not in STOPWORDS:
            toks.append(t)
    return toks


def tokenize_set(text: str) -> set[str]:
    return set(tokenize_list(text))


def normalize_term_variants(term: str) -> set[str]:
    """
    Lightweight variants:
    - lower
    - naive plural/singular + DE endings trimming
    """
    t = (term or "").strip().lower()
    if not t:
        return set()
    vars_ = {t}

    # EN plural -> singular heuristic
    if t.endswith("s") and len(t) > 3:
        vars_.add(t[:-1])

    # DE common endings heuristic
    for suf in ("en", "er", "e", "n", "s"):
        if t.endswith(suf) and len(t) > len(suf) + 2:
            vars_.add(t[: -len(suf)])

    return {v for v in vars_ if len(v) >= 3 and v not in STOPWORDS}


def build_keywords_de(title: str) -> list[str]:
    """
    Deterministic DE keywords:
    - unique tokens
    - prefer longer tokens
    """
    toks = tokenize_list(title)
    uniq = []
    seen = set()
    for t in toks:
        if t not in seen:
            uniq.append(t)
            seen.add(t)
    uniq_sorted = sorted(uniq, key=lambda x: (-len(x), x))
    return uniq_sorted[:MAX_KEYWORDS_DE]


def translate_text_de_to_en(text: str) -> str | None:
    """Best-effort translation; returns None on failure/empty."""
    try:
        t = translator.translate(text, src="de", dest="en").text
        t = (t or "").strip().lower()
        return t if t else None
    except Exception:
        return None


def translate_keywords_to_en(keywords_de: list[str]) -> dict[str, str]:
    """Mapping {de_kw: en_kw} best-effort."""
    m = {}
    for kw in keywords_de:
        t = translate_text_de_to_en(kw)
        if t:
            m[kw] = t
    return m


def build_bigram_phrases(title: str) -> list[str]:
    """
    Build phrase candidates from title tokens in order.
    """
    toks = tokenize_list(title)
    bigrams = []
    for i in range(len(toks) - 1):
        bigrams.append(f"{toks[i]} {toks[i+1]}")
    bigrams = sorted(list(dict.fromkeys(bigrams)), key=lambda x: (-len(x), x))
    return bigrams[:MAX_BIGRAMS]


def build_keyword_groups_from_title(title: str) -> tuple[list[set[str]], dict[str, str], list[str]]:
    """
    Returns:
      - groups (list[set[str]])  : synonym groups (unigrams + optional bigrams)
      - de_to_en_map (dict)      : mapping for unigrams and bigrams that were translated
      - debug_reps (list[str])   : representative term per group (for logging)
    """
    keywords_de = build_keywords_de(title)
    de_to_en = translate_keywords_to_en(keywords_de)

    groups: list[set[str]] = []

    # unigram groups
    for de_kw in keywords_de:
        g = set()
        g |= normalize_term_variants(de_kw)
        en_kw = de_to_en.get(de_kw)
        if en_kw:
            g |= normalize_term_variants(en_kw)
        if g:
            groups.append(g)

    # bigram groups
    if ADD_BIGRAM_GROUPS:
        for bg_de in build_bigram_phrases(title):
            g = set()
            g.add(bg_de.lower())
            for tok in tokenize_list(bg_de):
                g |= normalize_term_variants(tok)

            bg_en = translate_text_de_to_en(bg_de)
            if bg_en:
                de_to_en[bg_de] = bg_en
                g.add(bg_en.lower())
                for tok in tokenize_list(bg_en):
                    g |= normalize_term_variants(tok)

            groups.append(g)

    # dedupe identical groups
    uniq = []
    seen = set()
    for g in groups:
        key = tuple(sorted(g))
        if key not in seen:
            uniq.append(g)
            seen.add(key)
    groups = uniq

    # sort groups by specificity (longest representative)
    scored = []
    for g in groups:
        rep = sorted(g, key=lambda x: (-len(x), x))[0]
        scored.append((len(rep), rep, g))
    scored.sort(reverse=True)

    picked = [g for _, _, g in scored[:TARGET_GROUPS]]
    reps = [rep for _, rep, _ in scored[:TARGET_GROUPS]]

    return picked, de_to_en, reps


def group_hit_count(tokens: set[str], groups: list[set[str]]) -> int:
    return sum(1 for g in groups if (tokens & g))


def get_check_text(post: dict) -> str:
    """
    Include URL as extra matchable text (important for link posts).
    """
    title = safe_str(post.get("title", ""))
    selftext = safe_str(post.get("selftext", ""))
    url = safe_str(post.get("url", ""))

    base = selftext if CHECK_TEXT_MODE == "selftext" else f"{title} {selftext}".strip()
    return f"{base} {url}".strip()


# =========================
# REQUEST WRAPPER (rate limited)
# =========================
def reddit_get(url: str, *, params: dict | None = None, timeout: int = 30):
    """
    Always sleeps REQUEST_SLEEP after each request.
    On 429, sleeps BACKOFF_SLEEP and retries once.
    Returns response or None if still 429/failed.
    """
    try:
        r = session.get(url, params=params, timeout=timeout)
        time.sleep(REQUEST_SLEEP)

        if r.status_code == 429:
            print("⚠️ 429 rate limit. Backing off...")
            time.sleep(BACKOFF_SLEEP)
            r = session.get(url, params=params, timeout=timeout)
            time.sleep(REQUEST_SLEEP)

        if r.status_code == 429:
            print("⚠️ 429 again. Skipping this request.")
            return None

        r.raise_for_status()
        return r
    except requests.RequestException as e:
        print(f"Request error: {e}")
        return None


# =========================
# REDDIT FETCHERS
# =========================
def reddit_search(query: str, subreddit: str | None = None):
    """Search either globally or within a subreddit."""
    if subreddit:
        base = f"https://www.reddit.com/r/{subreddit}/search.json"
    else:
        base = "https://www.reddit.com/search.json"

    after = None
    for _ in range(MAX_PAGES_PER_SOURCE):
        params = {
            "q": query,
            "sort": SORT_MODE,
            "limit": LIMIT_PER_PAGE,
            "syntax": "lucene",
        }
        if subreddit:
            params["restrict_sr"] = 1
        if after:
            params["after"] = after

        r = reddit_get(base, params=params)
        if r is None:
            return

        data = r.json().get("data", {})
        children = data.get("children", [])
        if not children:
            return

        for ch in children:
            yield ch.get("data", {})

        after = data.get("after")
        if not after:
            return


def iter_posts_for_query(query: str):
    """Global + subreddit search; dedupe across sources by reddit_id."""
    seen = set()

    if SEARCH_GLOBAL_TOO and not RESTRICT_TO_SUBS_ONLY:
        for p in reddit_search(query, subreddit=None):
            rid = safe_str(p.get("id"))
            if rid and rid not in seen:
                seen.add(rid)
                yield p

    for sub in SUBREDDITS:
        for p in reddit_search(query, subreddit=sub):
            rid = safe_str(p.get("id"))
            if rid and rid not in seen:
                seen.add(rid)
                yield p


def subreddit_new_feed(subreddit: str, pages: int = NEW_FEED_PAGES_PER_SUB, limit: int = NEW_FEED_LIMIT):
    """Iterate newest posts from a subreddit feed (bypasses search limits)."""
    base = f"https://www.reddit.com/r/{subreddit}/new.json"
    after = None

    for _ in range(pages):
        params = {"limit": limit}
        if after:
            params["after"] = after

        r = reddit_get(base, params=params)
        if r is None:
            return

        data = r.json().get("data", {})
        children = data.get("children", [])
        if not children:
            return

        for ch in children:
            yield ch.get("data", {})

        after = data.get("after")
        if not after:
            return


def prefetch_new_feeds(subreddits: list[str]) -> dict[str, list[dict]]:
    """Prefetch new posts once per run so each article can filter locally."""
    cache: dict[str, list[dict]] = {}
    for sub in subreddits:
        posts = []
        seen = set()
        try:
            for p in subreddit_new_feed(sub):
                rid = safe_str(p.get("id"))
                if rid and rid not in seen:
                    seen.add(rid)
                    posts.append(p)
        except Exception as e:
            print(f"Prefetch error for r/{sub}: {e}")
        cache[sub] = posts
        print(f"Prefetched r/{sub} new: {len(posts)} posts")
    return cache


# =========================
# OPTIONAL DEDUPE WITH EXISTING CSV
# =========================
existing_pairs = set()
if DEDUPE_WITH_EXISTING_CSV and os.path.exists(OUT_CSV):
    existing = pd.read_csv(OUT_CSV, encoding="utf-8")
    if {"article_usid", "reddit_id"}.issubset(existing.columns):
        existing_pairs = set(zip(existing["article_usid"].astype(str), existing["reddit_id"].astype(str)))


# =========================
# PREFETCH /new.json POSTS
# =========================
new_feed_cache = {}
if USE_SUB_NEW_FEEDS:
    new_feed_cache = prefetch_new_feeds(SUBREDDITS)


# =========================
# MAIN
# =========================
rows = []

for idx, r in df.iterrows():
    article_usid = safe_str(r.get(USID_COL))
    article_title = safe_str(r.get(TITLE_COL))
    if not article_usid or not article_title:
        continue

    groups, de_to_en_map, group_reps = build_keyword_groups_from_title(article_title)
    if len(groups) < MIN_GROUP_MATCHES:
        print(f"[{idx+1}/{len(df)}] {article_usid} skipped (only {len(groups)} groups)")
        continue

    # Build DE & EN queries
    reps = group_reps[:]
    reps_de = [t for t in reps if (re.search(r"[äöüß]", t) or t in tokenize_set(article_title))]
    if len(reps_de) < 3:
        reps_de = reps[:]
    reps_de = reps_de[:TARGET_GROUPS]

    en_candidates = sorted({v for v in de_to_en_map.values() if v}, key=lambda x: (-len(x), x))
    reps_en = en_candidates[:TARGET_GROUPS] if len(en_candidates) >= 3 else reps[:TARGET_GROUPS]

    query_de = "(" + " OR ".join(f"\"{k}\"" for k in reps_de) + ")"
    query_en = "(" + " OR ".join(f"\"{k}\"" for k in reps_en) + ")"

    candidates: dict[str, tuple[dict, str]] = {}  # rid -> (post, source)

    def add_candidates_from_iter(it, source: str):
        for p in it:
            rid = safe_str(p.get("id"))
            if not rid:
                continue
            if rid not in candidates:
                candidates[rid] = (p, source)

    # searches
    add_candidates_from_iter(iter_posts_for_query(query_de), "search_de")
    add_candidates_from_iter(iter_posts_for_query(query_en), "search_en")

    # new feeds
    if USE_SUB_NEW_FEEDS:
        for sub, posts in new_feed_cache.items():
            for p in posts:
                rid = safe_str(p.get("id"))
                if rid and rid not in candidates:
                    candidates[rid] = (p, f"new_{sub}")

    scanned_total = len(candidates)
    kept = 0

    for rid, (post, source) in candidates.items():
        if DEDUPE_WITH_EXISTING_CSV and (article_usid, rid) in existing_pairs:
            continue

        text = get_check_text(post)
        words = words_list(text)
        if len(words) > MAX_POST_WORDS:
            continue

        window_text = " ".join(words[:MAX_POST_WORDS])
        tokens = tokenize_set(window_text)

        matches = group_hit_count(tokens, groups)
        if matches < MIN_GROUP_MATCHES:
            continue

        matched_reps = []
        for g, rep in zip(groups, group_reps):
            if tokens & g:
                matched_reps.append(rep)

        rows.append({
            "article_usid": article_usid,
            "reddit_id": rid,
            "reddit_title": safe_str(post.get("title", "")),
            "reddit_selftext": safe_str(post.get("selftext", "")),
            "post_url": safe_str(post.get("url", "")),
            "reddit_permalink": "https://www.reddit.com" + safe_str(post.get("permalink", "")),
            "source": source,
            "checked_word_count": len(words),
            "groups_used": len(groups),
            "group_matches_in_window": matches,
            "matched_group_reps": ",".join(matched_reps),
            "query_de": query_de,
            "query_en": query_en,
            "saved_at_utc": datetime.now(timezone.utc).isoformat(),
        })
        kept += 1

    print(
        f"[{idx+1}/{len(df)}] {article_usid} | "
        f"groups_used={len(groups)}/{TARGET_GROUPS} | "
        f"scanned_total={scanned_total} | kept={kept}"
    )

# Write CSV
# Write to InfluxDB instead of CSV (DB-only)
from influx_io import write_orf_articles

if rows:
    written = write_orf_articles(rows)
    print(f"Wrote {written} rows to InfluxDB")
else:
    print("No matching posts found.")


Prefetched r/politics new: 800 posts
Prefetched r/austria new: 800 posts
Prefetched r/europe new: 800 posts
Prefetched r/worldnews new: 800 posts
Prefetched r/news new: 800 posts
[1/2] news:3416835 | groups_used=7/7 | scanned_total=5545 | kept=164
[2/2] news:3416833 | groups_used=7/7 | scanned_total=6038 | kept=589
Wrote 0 rows to InfluxDB


In [None]:
from influx_io import write_reddit_matches

written = write_reddit_matches(rows)
print("Wrote Reddit matches to InfluxDB:", written)


Wrote Reddit matches to InfluxDB: 2540


In [None]:
# =========================
# TEST / AUDIT SCRIPT
# =========================
import pandas as pd

OUT_CSV = "reddit_posts_minimal.csv"

df_out = pd.read_csv(OUT_CSV, encoding="utf-8")

print("Rows:", len(df_out))
print("Unique articles:", df_out["article_usid"].nunique())
print("Unique reddit posts:", df_out["reddit_id"].nunique())
print("\nTop sources:")
print(df_out["source"].value_counts().head(10))

# --- per-article summary ---
per_article = (
    df_out.groupby("article_usid")
    .agg(
        kept=("reddit_id", "count"),
        avg_group_matches=("group_matches_in_window", "mean"),
        avg_word_count=("checked_word_count", "mean"),
    )
    .sort_values("kept", ascending=False)
)

display(per_article.head(20))

# --- quick noise inspection: top matches & random sample ---
best = df_out.sort_values(
    ["group_matches_in_window", "checked_word_count"],
    ascending=[False, True]
).head(15)

rand = df_out.sample(min(15, len(df_out)), random_state=42)

display(best[[
    "article_usid", "group_matches_in_window", "matched_group_reps",
    "reddit_title", "post_url", "source", "reddit_permalink"
]])

display(rand[[
    "article_usid", "group_matches_in_window", "matched_group_reps",
    "reddit_title", "post_url", "source", "reddit_permalink"
]])

# --- optional: look at one specific article id quickly ---
# example:
# aid = "news:3415936"
# display(df_out[df_out["article_usid"] == aid].head(50))


Rows: 18578
Unique articles: 28
Unique reddit posts: 16230

Top sources:
source
search_en        9346
search_de        7339
new_worldnews     534
new_politics      492
new_europe        334
new_austria       272
new_news          261
Name: count, dtype: int64


Unnamed: 0_level_0,kept,avg_group_matches,avg_word_count
article_usid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
news:3416318,1961,3.211117,34.694034
news:3415898,1576,4.286802,37.378173
news:3416296,1515,3.09835,44.458086
news:3415931,1298,3.025424,45.75963
news:3415934,1280,3.073437,44.971875
news:3416290,1232,3.362825,37.194805
news:3416307,1227,3.007335,42.270579
news:3415933,1201,3.129892,36.096586
news:3416305,995,3.023116,42.782915
news:3415915,933,3.096463,49.154341


Unnamed: 0,article_usid,group_matches_in_window,matched_group_reps,reddit_title,post_url,source,reddit_permalink
10101,news:3416242,7,"capitol anniversary,storm capitol,sich fünften...",Fifth anniversary of Jan. 6 brings fresh divis...,https://www.wral.com/news/ap/1ef8f-fifth-anniv...,new_politics,https://www.reddit.com/r/politics/comments/1q5...
12340,news:3416290,7,"plädiert gericht,gericht schuldig,maduro plädi...",President Nicolas Maduro pleads not guilty to ...,https://www.reuters.com/world/americas/venezue...,search_de,https://www.reddit.com/r/worldnews/comments/1q...
10835,news:3416296,7,"activists around,tote protesten,protesten iran...","Ban private jets, say scientists blocking airp...",https://prod.euronews.com/green/2023/02/14/act...,search_de,https://www.reddit.com/r/europe/comments/112iw...
10563,news:3416296,7,"activists around,tote protesten,protesten iran...",Jewish activists form protective barrier aroun...,http://www.independent.co.uk/news/world/americ...,search_de,https://www.reddit.com/r/politics/comments/6ey...
11002,news:3416296,7,"activists around,tote protesten,protesten iran...",Protesters in cities around the world joined t...,http://thehill.com/blogs/blog-briefing-room/39...,search_de,https://www.reddit.com/r/worldnews/comments/8v...
1401,news:3415933,7,"somalia military,schabab kämpfer,military kill...",Undeclared War: Obama's Robot Army Kills Dozen...,http://www.presstv.ir/detail/187978.html,search_de,https://www.reddit.com/r/politics/comments/ij2...
4291,news:3415915,7,"neuen protesten,protesten iran,dead new ones,m...",International media on the suspension of Nepsz...,https://www.reddit.com/r/europe/comments/56l27...,search_de,https://www.reddit.com/r/europe/comments/56l27...
10812,news:3416296,6,"activists around,tote protesten,protesten iran...",Brexit protest around Buckingham Palace.,https://i.redd.it/pwlt45na7qwy.jpg,search_de,https://www.reddit.com/r/europe/comments/6af3h...
15248,news:3416269,6,"transferred to court,yorker gericht,new yorker...",Luigi Mangione in New York Court today!,https://i.redd.it/5h3s797j1n4g1.jpeg,search_en,https://www.reddit.com/r/pics/comments/1pbl2ah...
15250,news:3416269,6,"transferred to court,yorker gericht,new yorker...",Luigi Mangione in New York Court today,https://i.redd.it/qfifn448616g1.jpeg,search_en,https://www.reddit.com/r/JoeRogan/comments/1ph...


Unnamed: 0,article_usid,group_matches_in_window,matched_group_reps,reddit_title,post_url,source,reddit_permalink
15285,news:3416269,3,"transferred to court,yorker gericht,gericht",This Is the Nastiest Opinion by a Supreme Cour...,https://slate.com/news-and-politics/2026/01/su...,new_politics,https://www.reddit.com/r/politics/comments/1q4...
8885,news:3416301,3,"new elections in venezuela,trump for now,venez...",Was Someone Insider Trading Right Before Trump...,https://newrepublic.com/post/204885/insider-tr...,search_de,https://www.reddit.com/r/politics/comments/1q3...
9970,news:3416242,3,"capitol anniversary,storm capitol,capitol",Purported neo-Nazis rally at South Dakota Stat...,https://www.thedakotascout.com/p/purported-neo...,search_en,https://www.reddit.com/r/news/comments/1dbisfi...
18569,news:3416305,3,"grönland kämpfen,usa greenland,greenland",Retired US general says US ‘needs Europe’ to ‘...,https://tvpworld.com/90899221/ben-hodges-europ...,new_europe,https://www.reddit.com/r/europe/comments/1q5cq...
11118,news:3416296,3,"activists around,around dead,around",Soaring ocean temperatures have already cut th...,https://www.independent.co.uk/environment/fish...,search_de,https://www.reddit.com/r/worldnews/comments/aw...
9542,news:3416242,3,"sich fünften,fünften mal,fünften",[Wien] Parken Möglichkeiten für Touristen für ...,https://www.reddit.com/r/Austria/comments/p3c4...,search_de,https://www.reddit.com/r/Austria/comments/p3c4...
970,news:3415934,3,"internationalen flugverkehr,stoppt internation...",Hungarian Parliament votes in favor of Hungary...,https://telex.hu/english/2025/05/20/hungarian-...,search_en,https://www.reddit.com/r/worldnews/comments/1k...
11124,news:3416296,3,"activists around,around dead,around",White House is expected to release a version o...,http://news.yahoo.com/on-the-torture-report--a...,search_de,https://www.reddit.com/r/worldnews/comments/2b...
7558,news:3416307,3,"paris sicherheitsgarantien,beraten paris,paris",US officially rejoins the Paris climate accord,https://www.cnn.com/2021/02/19/politics/us-rej...,search_de,https://www.reddit.com/r/politics/comments/lng...
13367,news:3416290,3,"plädiert gericht,gericht schuldig,gericht",Macrons to offer 'scientific evidence' to US c...,https://www.bbc.com/news/articles/ckg3llj5nxdo,search_en,https://www.reddit.com/r/news/comments/1njzxwa...


In [14]:
# --- Reddit write (CORRECT PLACE) ---
from influx_io import write_reddit_matches

reddit_rows = rows

# sanity check (optional)
if reddit_rows:
    print("Reddit row keys:", reddit_rows[0].keys())

written_reddit = write_reddit_matches(reddit_rows)
print("Wrote Reddit matches to InfluxDB:", written_reddit)


Reddit row keys: dict_keys(['article_usid', 'reddit_id', 'reddit_title', 'reddit_selftext', 'post_url', 'reddit_permalink', 'source', 'checked_word_count', 'groups_used', 'group_matches_in_window', 'matched_group_reps', 'query_de', 'query_en', 'saved_at_utc'])
Wrote Reddit matches to InfluxDB: 753
