In [57]:
%pip install requests feedparser
%pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
# Cell 1 - imports + config
import os, csv
import requests
import xml.etree.ElementTree as ET
from datetime import datetime, timezone

# URL that fetch_feed_xml() is called with.
FEED_URL = "https://rss.orf.at/news.xml"
# CSV file the matching items are appended to (relative path).
CSV_PATH = "orf_politik_ausland.csv"

# rdf:resource value an item's orfon:oewaCategory must have to be kept.
TARGET_OEWA = "urn:oewa:RedCont:Politik/PolitikAusland"
# Sent as the User-Agent header of the feed request.
USER_AGENT = "orf-rss-tracker/1.0 (+local notebook)"


In [59]:
# Cell 2 - fetch XML
def fetch_feed_xml(url: str, timeout: int = 20) -> str:
    """Download `url` and return the response body as text.

    Args:
        url: Address of the feed to request.
        timeout: Timeout passed to `requests.get`.

    Returns:
        The decoded response body (`Response.text`).

    Raises:
        Whatever `requests.get` raises on connection problems, and whatever
        `Response.raise_for_status()` raises for an error status code.
    """
    request_headers = {"User-Agent": USER_AGENT}
    response = requests.get(url, timeout=timeout, headers=request_headers)
    # Fail loudly on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text

# Download the feed once; the following cells work on this string.
xml_text = fetch_feed_xml(FEED_URL)
# Quick sanity check: total length and the first 200 characters.
len(xml_text), xml_text[:200]


(17850,
 '<?xml version="1.0" encoding="UTF-8"?>\n<rdf:RDF\n  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n  xmlns:dc="http://purl.org/dc/elements/1.1/"\n  xmlns:sy="http://purl.org/rss/1.0/modules/synd')

In [60]:
# Cell 3 - detect namespaces robustly (so you don't have to guess)
import io

def detect_namespaces(xml_text: str) -> dict:
    """Collect the namespace declarations found in `xml_text`.

    Args:
        xml_text: XML document as a string.

    Returns:
        A dict mapping each declared prefix to its URI. A prefix reported as
        None is stored under the empty string "". When the same prefix is
        declared more than once, the last declaration wins.
    """
    declarations = ET.iterparse(io.StringIO(xml_text), events=("start-ns",))
    # Each "start-ns" event carries a (prefix, uri) pair as its payload.
    return {
        ("" if prefix is None else prefix): uri
        for _event, (prefix, uri) in declarations
    }

# Prefix -> URI mapping declared by the downloaded feed (default namespace under "").
NS = detect_namespaces(xml_text)
NS


{'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 'dc': 'http://purl.org/dc/elements/1.1/',
 'sy': 'http://purl.org/rss/1.0/modules/syndication/',
 'orfon': 'http://rss.orf.at/1.0/',
 '': 'http://purl.org/rss/1.0/'}

In [61]:
# Cell 4 - parse + sanity checks (THIS will show why your old code returned 0)
root = ET.fromstring(xml_text)

# NOTE(review): detect_namespaces() files the default namespace under "" rather
# than "rss", so unless the feed declares an explicit "rss" prefix this lookup
# returns the literal fallback, not a detected value.
rss_ns = NS.get("rss", "http://purl.org/rss/1.0/")  # ORF uses RSS 1.0
items = root.findall(".//{%s}item" % rss_ns)

# Root tag and the number of item elements found anywhere below the root.
root.tag, len(items)


('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF', 26)

In [62]:
# Cell 5 - helper: load already-seen usids (dedupe)
def load_seen_usids(csv_path: str) -> set[str]:
    """Return the non-empty `usid` values already stored in `csv_path`.

    Args:
        csv_path: Path of the CSV file; its first row is used as the header.

    Returns:
        A set with every non-empty value of the `usid` column, or an empty
        set when the file does not exist.
    """
    if not os.path.exists(csv_path):
        return set()
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark. With plain "utf-8" a BOM would turn the first header into
    # "\ufeffusid", every lookup of "usid" would miss, and dedupe would
    # silently stop working.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        return {row["usid"] for row in csv.DictReader(f) if row.get("usid")}

# usids already present in the CSV; used further down to skip items that are already stored.
seen_usids = load_seen_usids(CSV_PATH)
len(seen_usids)


0

In [63]:
# Cell 6 - parse items + filter by oewaCategory
def text_of(el):
    """Return the stripped text of `el`, or None when there is nothing usable.

    None is returned when `el` is None, when it has no text, and when its text
    consists only of whitespace, so callers see a single "missing" value.
    """
    if el is None or el.text is None:
        return None
    # `or None` folds whitespace-only text (which strips to "") into None.
    return el.text.strip() or None


def parse_filtered_items(
    root: ET.Element,
    ns: dict,
    target_category: "str | None" = None,
) -> list[dict]:
    """Extract the feed items whose oewaCategory matches the wanted category.

    Args:
        root: Parsed feed; item elements are searched anywhere below it.
        ns: Prefix -> URI mapping. "orfon" is required; "rss", "rdf" and "dc"
            fall back to their standard URIs when absent.
        target_category: Value the rdf:resource attribute of an item's
            orfon:oewaCategory must equal. Defaults to the module-level
            TARGET_OEWA when None.

    Returns:
        One dict per matching item with the keys usid, date, link, title,
        oewaCategory and fetched_at_utc. Missing or blank fields are None.
        All dicts returned by one call share the same fetched_at_utc value.

    Raises:
        RuntimeError: If `ns` has no usable "orfon" entry.
    """
    # NOTE(review): a mapping that stores the default namespace under "" (and
    # has no "rss" key) makes this lookup fall through to the literal URI.
    rss_ns = ns.get("rss", "http://purl.org/rss/1.0/")
    rdf_ns = ns.get("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    dc_ns = ns.get("dc", "http://purl.org/dc/elements/1.1/")
    orf_ns = ns.get("orfon")  # no fallback: must come from the mapping

    if not orf_ns:
        raise RuntimeError("Could not detect 'orfon' namespace in the feed. Check NS dict output.")

    wanted = TARGET_OEWA if target_category is None else target_category

    category_tag = "{%s}oewaCategory" % orf_ns
    resource_attr = "{%s}resource" % rdf_ns

    # Stamp the batch once so every row from this call carries the same
    # fetch time and rows can be grouped by run.
    fetched_at = datetime.now(timezone.utc).isoformat()

    out = []
    for item in root.findall(".//{%s}item" % rss_ns):
        # <orfon:oewaCategory rdf:resource="..."/>
        cat_el = item.find(category_tag)
        if cat_el is None:
            continue

        cat_val = cat_el.attrib.get(resource_attr)
        if cat_val != wanted:
            continue

        out.append({
            "usid": text_of(item.find("{%s}usid" % orf_ns)),
            "date": text_of(item.find("{%s}date" % dc_ns)),
            "link": text_of(item.find("{%s}link" % rss_ns)),
            "title": text_of(item.find("{%s}title" % rss_ns)),
            "oewaCategory": cat_val,
            "fetched_at_utc": fetched_at,
        })

    return out

# Items of the wanted category from the parsed feed.
filtered_items = parse_filtered_items(root, NS)
# Count plus the first two entries as a spot check.
len(filtered_items), filtered_items[:2]


(10,
 [{'usid': 'news:3415936',
   'date': '2026-01-01T18:43:26+01:00',
   'link': 'https://orf.at/stories/3415936/',
   'title': 'Trump bestreitet Nickerchen vor Kameras',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-01T19:02:50.058914+00:00'},
  {'usid': 'news:3415934',
   'date': '2026-01-01T18:06:56+01:00',
   'link': 'https://orf.at/stories/3415934/',
   'title': 'Jemen stoppt internationalen Flugverkehr in Aden',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-01T19:02:50.058914+00:00'}])

In [64]:
# Cell 6 - parse items + filter by oewaCategory
# NOTE(review): this cell repeats the definitions of the cell above. Being the
# later one, these are the definitions in effect when the notebook runs top to
# bottom; consider deleting one of the two cells.
def text_of(el):
    """Return the stripped text of `el`, or None when there is nothing usable.

    None is returned when `el` is None, when it has no text, and when its text
    consists only of whitespace, so callers see a single "missing" value.
    """
    if el is None or el.text is None:
        return None
    # `or None` folds whitespace-only text (which strips to "") into None.
    return el.text.strip() or None


def parse_filtered_items(
    root: ET.Element,
    ns: dict,
    target_category: "str | None" = None,
) -> list[dict]:
    """Extract the feed items whose oewaCategory matches the wanted category.

    Args:
        root: Parsed feed; item elements are searched anywhere below it.
        ns: Prefix -> URI mapping. "orfon" is required; "rss", "rdf" and "dc"
            fall back to their standard URIs when absent.
        target_category: Value the rdf:resource attribute of an item's
            orfon:oewaCategory must equal. Defaults to the module-level
            TARGET_OEWA when None.

    Returns:
        One dict per matching item with the keys usid, date, link, title,
        oewaCategory and fetched_at_utc. Missing or blank fields are None.
        All dicts returned by one call share the same fetched_at_utc value.

    Raises:
        RuntimeError: If `ns` has no usable "orfon" entry.
    """
    # NOTE(review): a mapping that stores the default namespace under "" (and
    # has no "rss" key) makes this lookup fall through to the literal URI.
    rss_ns = ns.get("rss", "http://purl.org/rss/1.0/")
    rdf_ns = ns.get("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    dc_ns = ns.get("dc", "http://purl.org/dc/elements/1.1/")
    orf_ns = ns.get("orfon")  # no fallback: must come from the mapping

    if not orf_ns:
        raise RuntimeError("Could not detect 'orfon' namespace in the feed. Check NS dict output.")

    wanted = TARGET_OEWA if target_category is None else target_category

    category_tag = "{%s}oewaCategory" % orf_ns
    resource_attr = "{%s}resource" % rdf_ns

    # Stamp the batch once so every row from this call carries the same
    # fetch time and rows can be grouped by run.
    fetched_at = datetime.now(timezone.utc).isoformat()

    out = []
    for item in root.findall(".//{%s}item" % rss_ns):
        # <orfon:oewaCategory rdf:resource="..."/>
        cat_el = item.find(category_tag)
        if cat_el is None:
            continue

        cat_val = cat_el.attrib.get(resource_attr)
        if cat_val != wanted:
            continue

        out.append({
            "usid": text_of(item.find("{%s}usid" % orf_ns)),
            "date": text_of(item.find("{%s}date" % dc_ns)),
            "link": text_of(item.find("{%s}link" % rss_ns)),
            "title": text_of(item.find("{%s}title" % rss_ns)),
            "oewaCategory": cat_val,
            "fetched_at_utc": fetched_at,
        })

    return out

# Items of the wanted category from the parsed feed (rebinds filtered_items from the previous cell).
filtered_items = parse_filtered_items(root, NS)
# Count plus the first two entries as a spot check.
len(filtered_items), filtered_items[:2]


(10,
 [{'usid': 'news:3415936',
   'date': '2026-01-01T18:43:26+01:00',
   'link': 'https://orf.at/stories/3415936/',
   'title': 'Trump bestreitet Nickerchen vor Kameras',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-01T19:02:50.064421+00:00'},
  {'usid': 'news:3415934',
   'date': '2026-01-01T18:06:56+01:00',
   'link': 'https://orf.at/stories/3415934/',
   'title': 'Jemen stoppt internationalen Flugverkehr in Aden',
   'oewaCategory': 'urn:oewa:RedCont:Politik/PolitikAusland',
   'fetched_at_utc': '2026-01-01T19:02:50.064421+00:00'}])

In [65]:
# Cell 7 - (optional) debug: what categories exist + counts
from collections import Counter

def category_counts(root: ET.Element, ns: dict) -> Counter:
    """Tally the oewaCategory values of all feed items.

    Args:
        root: Parsed feed; item elements are searched anywhere below it.
        ns: Prefix -> URI mapping. "rss" and "rdf" fall back to their
            standard URIs; without a usable "orfon" entry nothing is counted.

    Returns:
        Counter keyed by the rdf:resource value of each item's
        orfon:oewaCategory element. Items without that element, or with a
        missing or empty attribute value, are not counted.
    """
    item_path = ".//{%s}item" % ns.get("rss", "http://purl.org/rss/1.0/")
    resource_attr = "{%s}resource" % ns.get("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    orf_uri = ns.get("orfon")
    category_tag = "{%s}oewaCategory" % orf_uri if orf_uri else None

    category_nodes = (
        entry.find(category_tag) if category_tag is not None else None
        for entry in root.findall(item_path)
    )
    # Compare against None explicitly: the truth value of an Element must not decide this.
    resources = (
        node.attrib.get(resource_attr)
        for node in category_nodes
        if node is not None
    )
    return Counter(value for value in resources if value)

# Debug aid: how many items of each category the current feed contains.
counts = category_counts(root, NS)
counts.most_common(10)


[('urn:oewa:RedCont:Politik/PolitikAusland', 10),
 ('urn:oewa:RedCont:Nachrichten/Chronik', 6),
 ('urn:oewa:RedCont:Wirtschaft/Wirtschaftspolitik', 2),
 ('urn:oewa:RedCont:Wirtschaft/Unternehmensberichterstattung', 2),
 ('urn:oewa:RedCont:KulturUndFreizeit/Musik', 2),
 ('urn:oewa:RedCont:Politik/PolitikInland', 1),
 ('urn:oewa:RedCont:MedienUndWerbung/Medien', 1),
 ('urn:oewa:RedCont:KulturUndFreizeit/KulturUeberblick', 1),
 ('urn:oewa:RedCont:Wissenschaft/WissenschaftUeberblick', 1)]

In [66]:
# Cell 8 - append only new items to CSV
# Column order of the CSV file; also the header row.
FIELDNAMES = ["usid", "date", "link", "title", "oewaCategory", "fetched_at_utc"]

def append_new_items(csv_path: str, items: list[dict], seen: set[str]) -> int:
    """Append the items not yet recorded to the CSV file.

    An item is skipped when it has no usid, when its usid is in `seen`, or
    when an earlier item of the same call had the same usid.

    Args:
        csv_path: File to append to. It is created when missing; the header
            row is written when the file is missing or empty.
        items: Candidate rows; keys must be a subset of FIELDNAMES.
        seen: usids already stored. The usids written by this call are added
            to it, but only after the write succeeded.

    Returns:
        Number of rows appended.

    Raises:
        OSError: If the file cannot be opened or written; `seen` is left
            untouched in that case.
    """
    batch_usids = set()
    new_rows = []
    for it in items:
        usid = it.get("usid")
        if not usid or usid in seen or usid in batch_usids:
            continue
        batch_usids.add(usid)
        new_rows.append(it)

    if not new_rows:
        return 0

    # A zero-byte file needs the header just like a missing one; otherwise the
    # first data row would later be read as the header.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        if needs_header:
            w.writeheader()
        w.writerows(new_rows)

    # Record the usids only now, so a failed write does not mark rows as
    # stored that never reached the file.
    seen.update(batch_usids)
    return len(new_rows)

# Write the items not yet in the CSV. append_new_items() also adds their usids
# to seen_usids, so re-running this cell in the same kernel appends nothing.
added = append_new_items(CSV_PATH, filtered_items, seen_usids)
added, CSV_PATH


(10, 'orf_politik_ausland.csv')

In [67]:
# Cell 9 - show latest rows quickly
import pandas as pd

# Read the CSV back from disk and show its last rows.
df = pd.read_csv(CSV_PATH)
df.tail(10)


Unnamed: 0,usid,date,link,title,oewaCategory,fetched_at_utc
0,news:3415936,2026-01-01T18:43:26+01:00,https://orf.at/stories/3415936/,Trump bestreitet Nickerchen vor Kameras,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
1,news:3415934,2026-01-01T18:06:56+01:00,https://orf.at/stories/3415934/,Jemen stoppt internationalen Flugverkehr in Aden,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
2,news:3415933,2026-01-01T17:45:45+01:00,https://orf.at/stories/3415933/,Somalia: Militär tötet 29 Al-Schabab-Kämpfer,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
3,news:3415931,2026-01-01T17:34:49+01:00,https://orf.at/stories/3415931/,Bolsonaro muss nach Operationen wieder in Haft,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
4,news:3415923,2026-01-01T16:49:09+01:00,https://orf.at/stories/3415923/,Knapp 41.500 Migranten überquerten 2025 Ärmelk...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
5,news:3415915,2026-01-01T15:26:39+01:00,https://orf.at/stories/3415915/,Medien: Tote bei neuen Protesten im Iran,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
6,news:3415913,2026-01-01T14:52:32+01:00,https://orf.at/stories/3415913/,Israel entzog Dutzenden Hilfsorganisationen di...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
7,news:3415907,2026-01-01T13:33:22+01:00,https://orf.at/stories/3415907/,Saudi-Arabien richtete 2025 mehr als 350 Mensc...,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
8,news:3415898,2026-01-01T11:02:33+01:00,https://orf.at/stories/3415898/,Russland: Drohnenangriffe auf russische Ölanlagen,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
9,news:3415894,2026-01-01T10:02:52+01:00,https://orf.at/stories/3415894/,Mamdani tritt Amt als New Yorker Bürgermeister an,urn:oewa:RedCont:Politik/PolitikAusland,2026-01-01T19:02:50.064421+00:00
