In [1]:
%pip install requests feedparser
%pip install pandas
%pip install googletrans==4.0.0-rc1
%pip install influxdb-client
%pip install praw rapidfuzz

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Cell 1 - imports + config (DB-only)
import os
import requests
import xml.etree.ElementTree as ET
from datetime import datetime, timezone

# InfluxDB connection settings: each value is read from the environment,
# falling back to the default given here when the variable is unset.
# NOTE(review): a later cell does `from influx_io import ... INFLUX_BUCKET, INFLUX_ORG`,
# which rebinds two of these names — confirm which source is authoritative.
INFLUX_URL = os.getenv("INFLUX_URL", "http://localhost:8086")
INFLUX_TOKEN = os.getenv("INFLUX_TOKEN", "")
INFLUX_ORG = os.getenv("INFLUX_ORG", "")
INFLUX_BUCKET = os.getenv("INFLUX_BUCKET", "")

# RSS feed that is fetched and parsed by the following cells.
FEED_URL = "https://rss.orf.at/news.xml"

# Only items whose orfon:oewaCategory equals TARGET_OEWA are kept.
TARGET_OEWA = "urn:oewa:RedCont:Politik/PolitikAusland"
# Sent as the User-Agent header on the feed request.
USER_AGENT = "orf-rss-tracker/1.0 (+local notebook)"


In [3]:
# Cell 2 - fetch XML
def fetch_feed_xml(url: str, timeout: int = 20) -> str:
    """Download a feed and return its body as correctly decoded text.

    Args:
        url: Feed URL to fetch.
        timeout: Request timeout in seconds.

    Returns:
        The response body as ``str``.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    When the ``Content-Type`` header carries no ``charset``, requests falls
    back to ISO-8859-1 for ``text/*`` bodies, which garbles UTF-8 feeds.
    In that case the encoding named in the XML declaration is used, or
    UTF-8 (the XML default) when the document declares none.
    """
    import re  # local import: `re` is not imported before this cell runs

    r = requests.get(url, timeout=timeout, headers={"User-Agent": USER_AGENT})
    r.raise_for_status()

    if "charset=" not in r.headers.get("Content-Type", "").lower():
        declared = re.match(
            rb'''\s*<\?xml[^>]*?encoding\s*=\s*["']([\w.-]+)["']''',
            r.content[:256],
        )
        r.encoding = declared.group(1).decode("ascii") if declared else "utf-8"

    return r.text

# Fetch the live feed once; the displayed tuple is (payload length, first 200 characters).
xml_text = fetch_feed_xml(FEED_URL)
len(xml_text), xml_text[:200]


(13146,
 '<?xml version="1.0" encoding="UTF-8"?>\n<rdf:RDF\n  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n  xmlns:dc="http://purl.org/dc/elements/1.1/"\n  xmlns:sy="http://purl.org/rss/1.0/modules/synd')

In [4]:
# Cell 3 - detect namespaces robustly (so you don't have to guess)
import io

def detect_namespaces(xml_text: str) -> dict:
    """Collect the namespace declarations of an XML document.

    Returns a dict mapping each declared prefix to its URI. A ``None``
    prefix is stored under the empty string. If a prefix is declared more
    than once, the last declaration in document order wins.
    """
    declarations = ET.iterparse(io.StringIO(xml_text), events=("start-ns",))
    return {
        ("" if prefix is None else prefix): uri
        for _event, (prefix, uri) in declarations
    }

# Detect the feed's prefix -> URI map once; the bare `NS` displays it for inspection.
NS = detect_namespaces(xml_text)
NS


{'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 'dc': 'http://purl.org/dc/elements/1.1/',
 'sy': 'http://purl.org/rss/1.0/modules/syndication/',
 'orfon': 'http://rss.orf.at/1.0/',
 '': 'http://purl.org/rss/1.0/'}

In [5]:
# Cell 4 - parse the feed and sanity-check it: root tag and number of <item> elements found
# Parse the fetched XML text into an element tree.
root = ET.fromstring(xml_text)

# detect_namespaces() stores the default namespace under the "" key, so NS has no
# "rss" entry and this lookup resolves to the hard-coded RSS 1.0 URI given here.
rss_ns = NS.get("rss", "http://purl.org/rss/1.0/")  # ORF uses RSS 1.0
items = root.findall(".//{%s}item" % rss_ns)

# Display the root tag and how many <item> elements were found.
root.tag, len(items)


('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF', 21)

In [6]:
# Cell 5 - helper: load already-seen usids (dedupe) from InfluxDB (DB-only)
from influx_io import get_client, INFLUX_BUCKET, INFLUX_ORG

def load_seen_usids_from_influx(lookback: str = "30d") -> set[str]:
    """
    Return the set of `usid` values found in measurement 'orf_article'
    within the given lookback window (inserted into the Flux `range` as
    `-{lookback}`), for de-duplicating freshly fetched feed items.
    """
    flux = f'''
from(bucket: "{INFLUX_BUCKET}")
  |> range(start: -{lookback})
  |> filter(fn: (r) => r._measurement == "orf_article")
  |> keep(columns: ["usid"])
  |> distinct(column: "usid")
'''

    with get_client() as client:
        tables = client.query_api().query(flux, org=INFLUX_ORG)

    # Empty or missing values are skipped; everything else is normalised to str.
    return {
        str(value)
        for table in tables
        for record in table.records
        if (value := record.values.get("usid"))
    }

# Usids already stored within the last 30 days; a later cell uses this set to skip known articles.
seen_usids = load_seen_usids_from_influx("30d")
len(seen_usids)


17

In [7]:
# Cell 6 - parse items + filter by oewaCategory
def text_of(el):
    """Return the stripped text of an element, or None if the element is missing or has no text."""
    if el is None or not el.text:
        return None
    return el.text.strip()

def parse_filtered_items(root: ET.Element, ns: dict) -> list[dict]:
    rss_ns = ns.get("rss", "http://purl.org/rss/1.0/")
    rdf_ns = ns.get("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    dc_ns  = ns.get("dc",  "http://purl.org/dc/elements/1.1/")
    orf_ns = ns.get("orfon")  # must exist in feed; we'll rely on detected value

    if not orf_ns:
        raise RuntimeError("Could not detect 'orfon' namespace in the feed. Check NS dict output.")

    out = []
    for item in root.findall(".//{%s}item" % rss_ns):
        # orfon:oewaCategory rdf:resource="..."
        cat_el = item.find("{%s}oewaCategory" % orf_ns)
        if cat_el is None:
            continue

        cat_val = cat_el.attrib.get("{%s}resource" % rdf_ns)
        if cat_val != TARGET_OEWA:
            continue

        title_el = item.find("{%s}title" % rss_ns)
        link_el  = item.find("{%s}link" % rss_ns)
        date_el  = item.find("{%s}date" % dc_ns)
        usid_el  = item.find("{%s}usid" % orf_ns)

        out.append({
            "usid": text_of(usid_el),
            "date": text_of(date_el),
            "link": text_of(link_el),
            "title": text_of(title_el),
            "oewaCategory": cat_val,
            "fetched_at_utc": datetime.now(timezone.utc).isoformat(),
        })

    return out

# Keep only items in the TARGET_OEWA category; display the count and the first two items.
filtered_items = parse_filtered_items(root, NS)
len(filtered_items), filtered_items[:2]


(6,
 [{'usid': 'news:3417249',
   'date': '2026-01-15T14:37:31+01:00',
   'link': 'https://orf.at/stories/3417249/',
   'title': 'Wahl in Uganda mit Verz√∂gerungen begonnen',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-15T14:11:11.740879+00:00'},
  {'usid': 'news:3417218',
   'date': '2026-01-15T13:26:05+01:00',
   'link': 'https://orf.at/stories/3417218/',
   'title': 'Iran setzt angeblich Hinrichtungen aus',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-15T14:11:11.740879+00:00'}])

In [8]:
# Cell 7 - (optional) debug: what categories exist + counts
from collections import Counter

def category_counts(root: ET.Element, ns: dict) -> Counter:
    """Count feed items per orfon:oewaCategory value.

    Args:
        root: Parsed feed root element.
        ns: Prefix -> URI mapping of the feed (default namespace under "").

    Returns:
        A Counter keyed by the category's rdf:resource value. Items without
        a category element, or with an empty resource, are ignored. An empty
        Counter is returned when `ns` has no "orfon" entry.
    """
    counts = Counter()
    orf_ns = ns.get("orfon")
    if not orf_ns:
        # Without the orfon namespace no category element can be located.
        return counts

    rdf_ns = ns.get("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")

    # The namespace map stores the default namespace under "", never under "rss",
    # so try an explicit "rss" prefix, then the detected default, then the
    # RSS 1.0 URI, and use the first candidate that actually yields items.
    items = []
    for candidate in (ns.get("rss"), ns.get(""), "http://purl.org/rss/1.0/"):
        if not candidate:
            continue
        items = root.findall(".//{%s}item" % candidate)
        if items:
            break

    for item in items:
        cat_el = item.find("{%s}oewaCategory" % orf_ns)
        if cat_el is None:
            continue
        cat_val = cat_el.attrib.get("{%s}resource" % rdf_ns)
        if cat_val:
            counts[cat_val] += 1
    return counts

# Ten most frequent categories in the currently loaded feed.
counts = category_counts(root, NS)
counts.most_common(10)


[('urn:oewa:RedCont:Politik/PolitikAusland', 6),
 ('urn:oewa:RedCont:Politik/PolitikInland', 5),
 ('urn:oewa:RedCont:Nachrichten/Chronik', 3),
 ('urn:oewa:RedCont:AutoUndMotor/AutoUndMotor', 1),
 ('urn:oewa:RedCont:KulturUndFreizeit/Musik', 1),
 ('urn:oewa:RedCont:Wirtschaft/Unternehmensberichterstattung', 1),
 ('urn:oewa:RedCont:KulturUndFreizeit/Literatur', 1),
 ('urn:oewa:RedCont:ComputerUndTechnik/ComputerUndTechnikUeberblick', 1),
 ('urn:oewa:RedCont:Wirtschaft/Wirtschaftspolitik', 1),
 ('urn:oewa:RedCont:Wissenschaft/WissenschaftUeberblick', 1)]

In [9]:
# Cell 8 - DB-only: keep only new items + write to InfluxDB
from influx_io import write_orf_articles

# Items that carry a usid which is not yet in the set loaded from InfluxDB.
new_items = [it for it in filtered_items if it.get("usid") and it["usid"] not in seen_usids]

# Persist the new items; the return value is printed as the number written.
written = write_orf_articles(new_items)
print("Wrote ORF articles to InfluxDB:", written)

# update local seen set (so rerunning later cells in this same session doesn't re-write)
for it in new_items:
    seen_usids.add(it["usid"])

# Display how many items were new in this run.
len(new_items)


Wrote ORF articles to InfluxDB: 0


0

In [10]:
# Cell 9 - show latest rows quickly (DB-only, from current run)
import pandas as pd

# Tabulate the items that were new in this run (an empty frame when nothing was new).
df = pd.DataFrame(new_items)
df.tail(10)


Reddit

In [11]:
import os
import time
import re
from datetime import datetime, timezone

import requests

from influx_io import (
    load_orf_articles_from_influx,
    load_existing_reddit_ids_for_usid,
    write_reddit_posts,
)

# ---- Config ----
# Every setting is read from the environment, with the default shown here.
# User-Agent header sent with each Reddit search request.
REDDIT_UA = os.getenv("REDDIT_USER_AGENT", "orf-reddit-matcher/1.0 (contact: local)")
# Pause after each search page, in seconds; also scales the back-off after an HTTP 429.
REDDIT_SLEEP_S = float(os.getenv("REDDIT_SLEEP_S", "1.2"))  # be kind
# Timeout passed to each Reddit request.
REDDIT_TIMEOUT = int(os.getenv("REDDIT_TIMEOUT", "20"))
# Upper bound on result pages fetched for one search query.
REDDIT_MAX_PAGES_PER_QUERY = int(os.getenv("REDDIT_MAX_PAGES_PER_QUERY", "6"))  # 6*100=600 max per query
# Searching stops once this many posts were collected; it is also the cap on rows returned per article.
REDDIT_MIN_POSTS_PER_ARTICLE = int(os.getenv("REDDIT_MIN_POSTS_PER_ARTICLE", "20"))
# Passed as the `t` (time range) parameter of the Reddit search.
REDDIT_LOOKBACK = os.getenv("REDDIT_LOOKBACK", "all")  # "all", "year", "month", ...
# Passed as `lookback` when loading reddit ids that are already stored for a usid.
REDDIT_DEDUP_LOOKBACK = os.getenv("REDDIT_DEDUP_LOOKBACK", "365d")  # how far back to check for duplicates

# ---- Helpers (Reddit/Matching) ----
def _words(s: str) -> int:
    if not s:
        return 0
    return len(re.findall(r"\b\w+\b", s))

def _reddit_search(
    query: str,
    after: str | None = None,
    limit: int = 100,
    sort: str = "relevance",
    t: str = "all",
) -> dict:
    """Query Reddit's public search endpoint and return the decoded JSON body.

    `after` is the pagination cursor of the previous page (omitted when falsy).
    An HTTP 429 answer triggers one pause followed by a single retry; any
    remaining error status raises via `raise_for_status()`.
    """
    endpoint = "https://www.reddit.com/search.json"
    search_params = {
        "q": query,
        "limit": str(limit),
        "sort": sort,
        "t": t,
        "restrict_sr": "false",
        "include_over_18": "on",
        "type": "link",
        "raw_json": "1",
        **({"after": after} if after else {}),
    }

    def _send():
        # Single place that issues the request, shared by first try and retry.
        return requests.get(
            endpoint,
            params=search_params,
            timeout=REDDIT_TIMEOUT,
            headers={"User-Agent": REDDIT_UA},
        )

    response = _send()
    if response.status_code == 429:
        # Rate limited: wait at least 5 seconds, then try exactly once more.
        time.sleep(max(5.0, REDDIT_SLEEP_S * 4))
        response = _send()
    response.raise_for_status()
    return response.json()

def _article_queries(title: str, url: str | None) -> list[str]:
    """Build a small set of queries (URL/domain + title-based)."""
    qs: list[str] = []
    if url:
        qs.append(f'url:"{url}"')
        qs.append("site:orf.at " + " ".join(re.findall(r"\w+", title)[:8]))
    if title:
        qs.append(f'"{title}"')
        qs.append(" ".join(re.findall(r"\w+", title)[:10]) + " orf")

    seen = set()
    out = []
    for q in qs:
        q2 = q.strip()
        if q2 and q2 not in seen:
            out.append(q2)
            seen.add(q2)
    return out

def find_reddit_posts_for_article(usid: str, title: str, url: str | None) -> list[dict]:
    """Search Reddit for posts related to one ORF article.

    Runs the queries from `_article_queries` page by page and keeps a post
    when its title, selftext or URL contains the article URL, mentions
    "orf.at", or contains at least two of the first eight title words that
    are five or more characters long (case-insensitive substring match).

    Args:
        usid: Article id, copied into every returned row.
        title: Article headline used for queries and token matching.
        url: Article URL, or None.

    Returns:
        At most REDDIT_MIN_POSTS_PER_ARTICLE dicts, newest `created_utc`
        first, with the keys "usid", "source", "reddit_id", "reddit_title",
        "reddit_permalink", "post_url", "reddit_selftext",
        "checked_word_count", "group_matches_in_window" and "created_utc".
    """
    results: dict[str, dict] = {}
    queries = _article_queries(title, url)

    # Loop-invariant match inputs: computed once here instead of once per post.
    url_lc = url.lower() if url else ""
    title_tokens = [tok.lower() for tok in re.findall(r"\w+", title or "") if len(tok) >= 5][:8]

    for q in queries:
        after = None
        for _page in range(REDDIT_MAX_PAGES_PER_QUERY):
            data = _reddit_search(q, after=after, limit=100, sort="relevance", t=REDDIT_LOOKBACK)
            listing = data.get("data") or {}
            children = listing.get("children") or []
            after = listing.get("after")  # cursor for the next page; falsy on the last page

            for ch in children:
                d = ch.get("data") or {}
                reddit_id = d.get("id")
                if not reddit_id or reddit_id in results:
                    continue

                permalink = "https://www.reddit.com" + str(d.get("permalink") or "")
                post_url = str(d.get("url") or "")
                selftext = str(d.get("selftext") or "")
                title_r = str(d.get("title") or "")

                hay = (title_r + " " + selftext + " " + post_url).lower()

                if url_lc and url_lc in hay:
                    keep = True
                elif "orf.at" in hay:
                    # NOTE(review): keeps any post that mentions orf.at, even when it
                    # is about a different article — confirm this is intended.
                    keep = True
                else:
                    overlap = sum(1 for tok in title_tokens if tok in hay)
                    keep = overlap >= 2  # heuristic

                if not keep:
                    continue

                results[reddit_id] = {
                    "usid": usid,
                    "source": d.get("subreddit") or "",
                    "reddit_id": reddit_id,
                    "reddit_title": title_r,
                    "reddit_permalink": permalink,
                    "post_url": post_url,
                    "reddit_selftext": selftext,
                    "checked_word_count": _words(title_r) + _words(selftext),
                    "group_matches_in_window": 0,
                    "created_utc": d.get("created_utc"),
                }

            # Pause after every page, including the last, so consecutive
            # queries are spaced out as well.
            time.sleep(REDDIT_SLEEP_S)

            if len(results) >= REDDIT_MIN_POSTS_PER_ARTICLE:
                break
            if not after:
                break

        if len(results) >= REDDIT_MIN_POSTS_PER_ARTICLE:
            break

    out = list(results.values())
    out.sort(key=lambda r: (r.get("created_utc") or 0), reverse=True)
    return out[:REDDIT_MIN_POSTS_PER_ARTICLE]

# ---- Main loop ----
articles = load_orf_articles_from_influx(lookback="24h")
print("Loaded ORF articles (last 24h):", len(articles))
print(articles[:2])

all_written = 0

for a in articles:
    usid = a.get("usid")
    title = a.get("title") or ""
    link = a.get("link")

    if not usid or not title:
        continue

    existing_ids = load_existing_reddit_ids_for_usid(usid, lookback=REDDIT_DEDUP_LOOKBACK)

    reddit_rows = find_reddit_posts_for_article(usid=usid, title=title, url=link)
    if len(reddit_rows) < REDDIT_MIN_POSTS_PER_ARTICLE:
        print(f"[WARN] usid={usid}: found only {len(reddit_rows)} reddit posts for '{title[:70]}‚Ä¶'")

    # Dedup in-memory before writing
    to_write = [r for r in reddit_rows if str(r.get("reddit_id") or "") and str(r.get("reddit_id")) not in existing_ids]

    w = write_reddit_posts(to_write)
    all_written += w
    skipped = len(reddit_rows) - len(to_write)
    print(f"usid={usid}: wrote {w} reddit posts (skipped {skipped} already in DB)")

print("Total reddit posts written:", all_written)


Loaded ORF articles (last 24h): 16
[{'time': datetime.datetime(2026, 1, 14, 22, 38, 43, tzinfo=datetime.timezone.utc), 'usid': 'news:3416646', 'title': 'Popstar fordert Langzeitmachthaber', 'link': 'https://orf.at/stories/3416646/', 'date': None}, {'time': datetime.datetime(2026, 1, 14, 19, 40, 23, tzinfo=datetime.timezone.utc), 'usid': 'news:3417135', 'title': 'Weiter Differenzen mit USA wegen Gr√∂nland', 'link': 'https://orf.at/stories/3417135/', 'date': None}]
[WARN] usid=news:3416646: found only 0 reddit posts for 'Popstar fordert Langzeitmachthaber‚Ä¶'
usid=news:3416646: wrote 0 reddit posts (skipped 0 already in DB)
usid=news:3417135: wrote 1 reddit posts (skipped 19 already in DB)
usid=news:3417160: wrote 11 reddit posts (skipped 9 already in DB)
usid=news:3417166: wrote 1 reddit posts (skipped 19 already in DB)
usid=news:3417168: wrote 1 reddit posts (skipped 19 already in DB)
usid=news:3417170: wrote 9 reddit posts (skipped 11 already in DB)
usid=news:3417172: wrote 0 reddit p

In [12]:
# Cell Y - Inspect: show the 20 reddit posts per ORF article (read back from InfluxDB)
# Rows are read by key (row.get("_time")) from iterrows(), because a column named "_time" is awkward to access as an attribute.

import pandas as pd
from influx_io import get_client, INFLUX_BUCKET, INFLUX_ORG

def load_reddit_posts_from_influx(lookback: str = "30d") -> pd.DataFrame:
    """
    Read measurement 'reddit_post' back from InfluxDB for inspection.

    The query pivots fields into columns per `_time` and keeps the columns
    listed below. The two counter columns are coerced to int (unparseable or
    missing values become 0) and the frame is sorted by usid, newest `_time`
    first. An empty frame is returned, after printing a note, when the query
    yields no rows.
    """
    flux = f"""
from(bucket: "{INFLUX_BUCKET}")
  |> range(start: -{lookback})
  |> filter(fn: (r) => r._measurement == "reddit_post")
  |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")
  |> keep(columns: ["_time","usid","source","reddit_id","title","permalink","url","checked_word_count","group_matches_in_window","selftext"])
"""

    with get_client() as client:
        tables = client.query_api().query(flux, org=INFLUX_ORG)

    wanted_columns = (
        "_time",
        "usid",
        "source",
        "reddit_id",
        "title",
        "permalink",
        "url",
        "checked_word_count",
        "group_matches_in_window",
        "selftext",
    )
    rows = [
        {column: rec.values.get(column) for column in wanted_columns}
        for table in tables
        for rec in table.records
    ]

    df = pd.DataFrame(rows)
    if df.empty:
        print("No reddit_post points found in Influx for lookback =", lookback)
        return df

    # normalize numeric columns
    for counter_column in ("checked_word_count", "group_matches_in_window"):
        df[counter_column] = pd.to_numeric(df[counter_column], errors="coerce").fillna(0).astype(int)

    # sort: most recent first within each usid
    return df.sort_values(["usid", "_time"], ascending=[True, False])


def show_top_reddit_posts_per_article(df: pd.DataFrame, articles: list[dict], n: int = 20, exact_written_20: bool = True):
    """
    Print up to `n` Reddit posts for every usid in `df`, with the ORF title when known.

    Args:
        df: frame with the columns "usid", "_time", "title", "permalink", "url",
            "source", "checked_word_count", "group_matches_in_window" and "selftext".
        articles: dicts with "usid" and "title"; used only to label each group.
        n: maximum number of posts printed per usid.
        exact_written_20: if True, print the `n` rows with the latest `_time`
            in ascending time order; if False, print the first `n` rows in the
            order they appear in `df`. The flag applies to any `n`, not just 20.

    Missing cell values (None / NaN / NaT / pd.NA) are printed as empty text
    or 0 instead of "nan", and no longer raise when converted to int.
    """
    if df.empty:
        print("DataFrame is empty; nothing to show.")
        return

    def _missing(value) -> bool:
        # None and pandas' scalar missing markers all count as "no value".
        return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))

    def _as_text(value) -> str:
        return "" if _missing(value) else str(value or "")

    def _as_int(value) -> int:
        return 0 if _missing(value) else int(value or 0)

    # map usid -> orf title
    usid_to_orf_title = {a.get("usid"): (a.get("title") or "") for a in articles if a.get("usid")}

    # keep group order stable
    for usid, grp in df.groupby("usid", sort=False):
        orf_title = usid_to_orf_title.get(usid, "")

        print("\n" + "=" * 120)
        print(f"USID: {usid}")
        if orf_title:
            print(f"ORF:  {orf_title[:200]}")
        print(f"Found points in DB: {len(grp)} | showing {min(n, len(grp))}")
        print("-" * 120)

        if exact_written_20:
            # The n rows with the latest _time, oldest of them first; tail()
            # keeps the sorted order, so no second sort is needed.
            top = grp.sort_values("_time", ascending=True).tail(n)
        else:
            # First n rows in the frame's existing order.
            top = grp.head(n)

        for i, (_, row) in enumerate(top.iterrows(), 1):
            title = _as_text(row.get("title"))
            permalink = _as_text(row.get("permalink"))
            url = _as_text(row.get("url"))
            subreddit = _as_text(row.get("source"))
            tm = row.get("_time")
            words = _as_int(row.get("checked_word_count", 0))
            groups = _as_int(row.get("group_matches_in_window", 0))

            print(f"[{i:02d}] {title[:180]}")
            print(f"     subreddit: {subreddit} | words={words} | groups={groups} | time={tm}")
            print(f"     permalink: {permalink}")
            print(f"     url:       {url}")

            selftext = _as_text(row.get("selftext"))
            if selftext:
                # Single-line preview, truncated to 240 characters.
                preview = selftext.replace("\n", " ").strip()
                suffix = '‚Ä¶' if len(preview) > 240 else ''
                print(f"     selftext:  {preview[:240]}{suffix}")
            print()


# ---- run inspection ----
# Read back the reddit_post points of the last 30 days.
df_reddit = load_reddit_posts_from_influx(lookback="30d")
print("Loaded reddit_post points:", len(df_reddit))

# Prints, per usid, the 20 points with the latest _time (oldest of them first).
# `articles` is the list loaded by the Reddit matching cell and only supplies the ORF titles.
show_top_reddit_posts_per_article(df_reddit, articles, n=20, exact_written_20=True)

# Optional: interactive peek
df_reddit.head()


Loaded reddit_post points: 111

USID: news:3417133
Found points in DB: 8 | showing 8
------------------------------------------------------------------------------------------------------------------------
[01] Japan oder zum 1. Mal Korea
     subreddit: reisende | words=102 | groups=0 | time=2025-12-18 10:56:37+00:00
     permalink: https://www.reddit.com/r/reisende/comments/1ppo0bh/japan_oder_zum_1_mal_korea/
     url:       https://www.reddit.com/r/reisende/comments/1ppo0bh/japan_oder_zum_1_mal_korea/
     selftext:  Ich plane aktuell meinen Urlaub f√ºr n√§chstes Jahr. Ein Ziel welches ich immer bereisen k√∂nnte, und in dem ich bereits 3 Mal gewesen bin, ist Japan. F√ºr meinen n√§chsten Urlaub w√ºrde ich gerne Bangkok, Hanoi und als 3. Reiseziel entweder wied‚Ä¶

[02] √ñsterreich Temu Promo Code {"100 ‚Ç¨ Rabatt"} ‚ûî[^¬∞^aci789589^¬∞^] f√ºr die erste Bestellung
     subreddit: TEMUpact | words=1708 | groups=0 | time=2025-12-20 07:59:49+00:00
     permalink: https://www.reddit.com/r

Unnamed: 0,_time,usid,source,reddit_id,title,permalink,url,checked_word_count,group_matches_in_window,selftext
21,2026-01-14 17:04:36+00:00,news:3417133,Finanzen,1qcstcv,TK-Chef will Mondpreise bei Blockbuster-Medika...,https://www.reddit.com/r/Finanzen/comments/1qc...,https://www.reddit.com/r/Finanzen/comments/1qc...,412,0,Von McDonald‚Äôs lernen hei√üt siegen lernen: Kau...
96,2026-01-10 18:33:08+00:00,news:3417133,reisende,1q9bnrw,"Wie fange ich an meine Reise nach Thailand, Ja...",https://www.reddit.com/r/reisende/comments/1q9...,https://www.reddit.com/r/reisende/comments/1q9...,278,0,"Ich (w18) mache jetzt 2026 mein Abitur, m√∂chte..."
49,2026-01-07 22:16:02+00:00,news:3417133,TagesFakten,1q6thrk,üáØüáµ Japan verurteilt Chinas Exportverbot als wi...,https://www.reddit.com/r/TagesFakten/comments/...,https://www.reddit.com/r/TagesFakten/comments/...,339,0,Japan hat offiziell gegen Chinas j√ºngste Expor...
95,2026-01-05 09:16:15+00:00,news:3417133,reisende,1q4h29g,"SOA - Hochzeitsreise und Fernreise - Mai, 4 Wo...",https://www.reddit.com/r/reisende/comments/1q4...,https://www.reddit.com/r/reisende/comments/1q4...,441,0,"Gr√º√üe euch,\r\n\r\nmeine Verlobte und ich plan..."
48,2026-01-04 15:15:44+00:00,news:3417133,TagesFakten,1q3se07,üá∞üáµ Nordkorea feuert Raketen ab: US-Schl√§ge erh...,https://www.reddit.com/r/TagesFakten/comments/...,https://www.reddit.com/r/TagesFakten/comments/...,319,0,Nordkorea hat diese Woche mehrere ballistische...
