In [1]:
# ===== Influx -> DataFrame helpers (v2) – FULL CELL (uses influx_io config) =====

import pandas as pd
from influxdb_client import InfluxDBClient

# Reuse EXACT same config your writer uses
from influx_io import INFLUX_URL, INFLUX_TOKEN, INFLUX_ORG, INFLUX_BUCKET

# Connect
client = InfluxDBClient(url=INFLUX_URL, token=INFLUX_TOKEN, org=INFLUX_ORG)
qapi = client.query_api()

def flux_df(flux: str) -> pd.DataFrame:
    """Run Flux and return a single merged pandas DataFrame."""
    dfs = qapi.query_data_frame(flux)
    if isinstance(dfs, list):
        dfs = [d for d in dfs if d is not None and len(d) > 0]
        if not dfs:
            return pd.DataFrame()
        return pd.concat(dfs, ignore_index=True)
    return dfs if dfs is not None else pd.DataFrame()

def list_measurements(bucket: str, start: str = "-30d") -> list[str]:
    flux = f'''
import "influxdata/influxdb/schema"
schema.measurements(bucket: "{bucket}", start: {start})
'''
    dfm = flux_df(flux)
    if dfm.empty:
        return []
    return sorted(dfm["_value"].dropna().astype(str).unique().tolist())

def load_measurement_pivot(measurement: str, bucket: str, start: str = "-30d") -> pd.DataFrame:
    """Load a measurement and pivot fields into columns."""
    flux = f'''
from(bucket: "{bucket}")
  |> range(start: {start})
  |> filter(fn: (r) => r._measurement == "{measurement}")
  |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")
'''
    d = flux_df(flux)
    for c in ("result", "table"):
        if c in d.columns:
            d = d.drop(columns=[c])
    return d

# ---- quick test: list measurements ----
meas = list_measurements(INFLUX_BUCKET)
print("INFLUX_URL:", INFLUX_URL)
print("INFLUX_ORG:", INFLUX_ORG)
print("INFLUX_BUCKET:", INFLUX_BUCKET)
print("Measurements:", meas)
print("Count:", len(meas))


INFLUX_URL: http://localhost:8086
INFLUX_ORG: bigdata
INFLUX_BUCKET: bigdata_bucket
Measurements: ['orf_article', 'reddit_post']
Count: 2



The result will not be shaped to optimal processing by pandas.DataFrame. Use the pivot() function by:

    
import "influxdata/influxdb/schema"
schema.measurements(bucket: "bigdata_bucket", start: -30d)
 |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")



For more info see:
    - https://docs.influxdata.com/resources/videos/pivots-in-flux/
    - https://docs.influxdata.com/flux/latest/stdlib/universe/pivot/
    - https://docs.influxdata.com/flux/latest/stdlib/influxdata/influxdb/schema/fieldsascols/



In [2]:
# ===== auto-detect measurement names =====
def pick_measurement(measurements: list[str], includes_any: list[str]) -> str | None:
    m2 = [m for m in measurements if any(k.lower() in m.lower() for k in includes_any)]
    return m2[0] if m2 else None

MEAS_REDDIT = pick_measurement(meas, ["reddit", "match"])
MEAS_ORF    = pick_measurement(meas, ["orf", "article", "rss"])

MEAS_REDDIT, MEAS_ORF


('reddit_post', 'orf_article')

In [3]:
# ===== load reddit matches (pivot to columns) =====
START_RANGE = "-30d"

def load_measurement_pivot(measurement: str, bucket: str, start: str = "-30d") -> pd.DataFrame:
    flux = f'''
from(bucket: "{bucket}")
  |> range(start: {start})
  |> filter(fn: (r) => r._measurement == "{measurement}")
  |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")
'''
    d = flux_df(flux)
    # Influx adds these housekeeping cols sometimes
    for c in ["result", "table"]:
        if c in d.columns:
            d = d.drop(columns=[c])
    return d

reddit_df = load_measurement_pivot(MEAS_REDDIT, INFLUX_BUCKET, START_RANGE) if MEAS_REDDIT else pd.DataFrame()
orf_df    = load_measurement_pivot(MEAS_ORF, INFLUX_BUCKET, START_RANGE) if MEAS_ORF else pd.DataFrame()

reddit_df.head(3), orf_df.head(3)


(                            _start                            _stop  \
 0 2025-12-14 14:46:00.701554+00:00 2026-01-13 14:46:00.701554+00:00   
 1 2025-12-14 14:46:00.701554+00:00 2026-01-13 14:46:00.701554+00:00   
 2 2025-12-14 14:46:00.701554+00:00 2026-01-13 14:46:00.701554+00:00   
 
                       _time _measurement          source          usid  \
 0 2026-01-13 13:17:35+00:00  reddit_post  10xPennyStocks  news:3416966   
 1 2026-01-13 13:15:13+00:00  reddit_post          5_9_14  news:3416997   
 2 2026-01-12 00:56:14+00:00  reddit_post            ADHS  news:3416968   
 
    checked_word_count  group_matches_in_window  \
 0                 435                        2   
 1                 306                        2   
 2                 426                        2   
 
                                            permalink reddit_id  \
 0  https://www.reddit.com/r/10xPennyStocks/commen...   1qbr1zj   
 1  https://www.reddit.com/r/5_9_14/comments/1qbr0...   1qbr00y   
 

In [4]:
# ===== join reddit posts with ORF titles/claims via article_usid =====
# Heuristik: ORF-Titel-Spalte finden
def find_first_col(df: pd.DataFrame, candidates: list[str]) -> str | None:
    for c in candidates:
        if c in df.columns:
            return c
    return None

# typische Felder aus deinem rows.append / write_orf_articles:
col_usid_orf = find_first_col(orf_df, ["usid", "article_usid"])
col_title_orf = find_first_col(orf_df, ["title", "article_title"])

# typische Felder aus reddit matches:
col_usid_red = find_first_col(reddit_df, ["article_usid", "usid"])
col_rtitle   = find_first_col(reddit_df, ["reddit_title", "title"])
col_self     = find_first_col(reddit_df, ["reddit_selftext", "selftext"])
col_perm     = find_first_col(reddit_df, ["reddit_permalink", "permalink"])
col_url      = find_first_col(reddit_df, ["post_url", "url"])
col_relevance= find_first_col(reddit_df, ["group_matches_in_window", "matches", "relevance"])

(col_usid_orf, col_title_orf, col_usid_red, col_rtitle, col_self, col_relevance)


('usid', 'title', 'usid', 'title', 'selftext', 'group_matches_in_window')

In [5]:
# Build claim/title map
assert col_usid_orf and col_title_orf, "Konnte ORF usid/title nicht finden – schau dir orf_df.columns an."
assert col_usid_red and col_rtitle, "Konnte Reddit article_usid/reddit_title nicht finden – schau dir reddit_df.columns an."

orf_map = (
    orf_df[[col_usid_orf, col_title_orf]]
    .dropna()
    .astype({col_usid_orf: str, col_title_orf: str})
    .drop_duplicates(subset=[col_usid_orf])
    .rename(columns={col_usid_orf: "article_usid", col_title_orf: "article_claim"})
)

work = reddit_df.copy()
work["article_usid"] = work[col_usid_red].astype(str)
work["reddit_text"] = (
    work[col_rtitle].fillna("").astype(str)
    + "\n\n"
    + (work[col_self].fillna("").astype(str) if col_self else "")
)

work = work.merge(orf_map, on="article_usid", how="left")
work[["article_usid", "article_claim", col_rtitle]].head(10)


Unnamed: 0,article_usid,article_claim,title
0,news:3416966,Trump droht Handelspartnern des Iran,$AIBT AIBotics Ushers in a New Era as Intellig...
1,news:3416997,Syrisches Militär errichtet neue Sperrzonen be...,"Iran Update, January 12, 2026"
2,news:3416968,Selenskyj warnt erneut vor großem russischem A...,Schulsystem hat mich Arbeitsunfähig gemacht. W...
3,news:3416989,Gewalt im Iran: Spanien und Finnland bestellen...,Neue Familien-Statistik veröffentlicht: Geburt...
4,news:3416997,Syrisches Militär errichtet neue Sperrzonen be...,Neue Familien-Statistik veröffentlicht: Geburt...
5,news:3417000,Morgen Treffen von USA und Dänemark zu Grönland,Land erlässt Feuerwerksverbot für Nordtirol
6,news:3417002,Bereits Tausende Tote bei Protesten im Iran,Unmut über Wiener Gastpatienten-Regelung
7,news:3417006,Staatliche Angabe: Rund 2.000 Tote bei Protest...,Ernüchternde Erkenntnis: Wohl kaum eine Gesell...
8,news:3416987,„Kriegsverbrechen“ mit getarntem Flugzeug,Translation?
9,news:3416990,Iranische Justiz klagt erste Demonstranten an,Forum-Turnier: Qualifikationen


In [None]:
# ===== stance detection (offline baseline) =====
import re
import numpy as np

NEG_WORDS = {
    # EN
    "bad","worse","worst","awful","terrible","disaster","corrupt","corruption","scandal","fail","failure",
    "ridiculous","stupid","idiotic","dangerous","hate","fraud","lies","lying",
    # DE
    "schlecht","schlimm","katastrophe","korrupt","skandal","versagen","lächerlich","dumm","gefährlich",
    "hass","betrug","lüge","lügen"
}
POS_WORDS = {
    # EN
    "good","great","excellent","right","correct","finally","welcome","benefit","improve","success",
    # DE
    "gut","super","richtig","endlich","willkommen","vorteil","verbessern","erfolg"
}

CON_PATTERNS = [r"resign", r"step down", r"ban", r"illegal", r"should be fired",
                r"rücktritt", r"zurücktreten", r"verbieten", r"illegal", r"muss weg"]
PRO_PATTERNS = [r"good idea", r"makes sense", r"well done", r"about time",
                r"gute idee", r"macht sinn", r"gut gemacht", r"wurde zeit"]

def stance_heuristic(claim: str, text: str):
    t = (text or "").lower()
    claim_toks = set(re.findall(r"[\wäöüß]{3,}", (claim or "").lower()))
    text_toks  = set(re.findall(r"[\wäöüß]{3,}", t))
    overlap = len(claim_toks & text_toks)

    neg = sum(1 for w in NEG_WORDS if w in t)
    pos = sum(1 for w in POS_WORDS if w in t)
    con_hits = sum(1 for p in CON_PATTERNS if re.search(p, t))
    pro_hits = sum(1 for p in PRO_PATTERNS if re.search(p, t))

    score = (pos + 2*pro_hits) - (neg + 2*con_hits)

    if overlap == 0:
        return "UNRELATED", 0.1

    if score >= 2:
        return "PRO", 0.6 + min(0.35, 0.05*score)
    if score <= -2:
        return "CON", 0.6 + min(0.35, 0.05*abs(score))

    return "NEUTRAL", 0.45


# apply
labels, confs = [], []
for _, r in work.iterrows():
    claim = str(r.get("article_claim","") or "")
    text  = str(r.get("reddit_text","") or "")
    lab, cf = stance_heuristic(claim, text)
    labels.append(lab); confs.append(cf)



work["stance_label"] = labels
work["stance_conf"] = confs
work = work[work["stance_label"] != "UNRELATED"]
work[["article_usid", "article_claim", "stance_label", "stance_conf"]].head(10)


Unnamed: 0,article_usid,article_claim,stance_label,stance_conf
0,news:3416966,Trump droht Handelspartnern des Iran,NEUTRAL,0.45
1,news:3416997,Syrisches Militär errichtet neue Sperrzonen be...,NEUTRAL,0.45
2,news:3416968,Selenskyj warnt erneut vor großem russischem A...,NEUTRAL,0.45
3,news:3416989,Gewalt im Iran: Spanien und Finnland bestellen...,NEUTRAL,0.45
4,news:3416997,Syrisches Militär errichtet neue Sperrzonen be...,NEUTRAL,0.45
5,news:3417000,Morgen Treffen von USA und Dänemark zu Grönland,NEUTRAL,0.45
6,news:3417002,Bereits Tausende Tote bei Protesten im Iran,NEUTRAL,0.45
7,news:3417006,Staatliche Angabe: Rund 2.000 Tote bei Protest...,CON,0.9
8,news:3416987,„Kriegsverbrechen“ mit getarntem Flugzeug,NEUTRAL,0.45
9,news:3416990,Iranische Justiz klagt erste Demonstranten an,NEUTRAL,0.45


In [None]:
from influx_io import write_reddit_stance_updates

updates_df = work[["_time", "article_usid", "source", "stance_label", "stance_conf"]].copy()

# enforce exact tag match
updates_df["source"] = updates_df["source"].fillna("").astype(str)

# reuse original timestamp
updates_df["saved_at_utc"] = updates_df["_time"].astype(str)

rows_updates = updates_df.to_dict(orient="records")

n = write_reddit_stance_updates(rows_updates)
print("stance fields upserted:", n)


In [7]:
# ===== aggregate per article =====
STANCE_TO_SCORE = {"PRO": 1.0, "CON": -1.0, "NEUTRAL": 0.0}
work["stance_score"] = work["stance_label"].map(STANCE_TO_SCORE).fillna(0.0)

# weights: confidence * optional relevance
if col_relevance and col_relevance in work.columns:
    rel = pd.to_numeric(work[col_relevance], errors="coerce").fillna(0)
    work["weight"] = work["stance_conf"] * (1.0 + np.maximum(0, rel - 3))
else:
    work["weight"] = work["stance_conf"]

def agg_article(g: pd.DataFrame) -> pd.Series:
    w = g["weight"].values
    s = g["stance_score"].values
    total_w = float(np.sum(w)) if np.sum(w) > 0 else 1.0
    mean_score = float(np.sum(w*s) / total_w)

    counts = g["stance_label"].value_counts(dropna=False).to_dict()
    pro = counts.get("PRO", 0)
    con = counts.get("CON", 0)
    neu = counts.get("NEUTRAL", 0)
    n = len(g)

    polar = float((pro/n) * (con/n) * 4) if n else 0.0  # 0..1

    return pd.Series({
        "n_posts": n,
        "mean_stance_score": mean_score,
        "share_pro": pro/n if n else 0,
        "share_con": con/n if n else 0,
        "share_neutral": neu/n if n else 0,
        "polarisation": polar
    })

summary = work.groupby("article_usid", as_index=False).apply(agg_article).reset_index(drop=True)

# Titel dazumappen
title_map = work.groupby("article_usid")["article_claim"].first()
summary["article_title"] = summary["article_usid"].map(title_map)

summary.sort_values("mean_stance_score").head(20)


  summary = work.groupby("article_usid", as_index=False).apply(agg_article).reset_index(drop=True)


Unnamed: 0,article_usid,n_posts,mean_stance_score,share_pro,share_con,share_neutral,polarisation,article_title
10,news:3417010,7.0,-0.908163,0.0,0.857143,0.142857,0.0,Ungarns Parlamentspräsident attackiert von der...
4,news:3416990,4.0,-0.288462,0.25,0.5,0.25,0.5,Iranische Justiz klagt erste Demonstranten an
1,news:3416968,16.0,-0.190476,0.0625,0.1875,0.75,0.046875,Selenskyj warnt erneut vor großem russischem A...
0,news:3416966,17.0,-0.143921,0.058824,0.294118,0.647059,0.069204,Trump droht Handelspartnern des Iran
5,news:3416997,18.0,-0.09901,0.111111,0.166667,0.722222,0.074074,Syrisches Militär errichtet neue Sperrzonen be...
6,news:3416998,1.0,0.0,0.0,0.0,1.0,0.0,BBC will Abweisung von Trumps Klage beantragen
3,news:3416989,3.0,0.0,0.0,0.0,1.0,0.0,Gewalt im Iran: Spanien und Finnland bestellen...
7,news:3417000,20.0,0.034063,0.1,0.05,0.85,0.02,Morgen Treffen von USA und Dänemark zu Grönland
9,news:3417006,16.0,0.048387,0.25,0.1875,0.5625,0.1875,Staatliche Angabe: Rund 2.000 Tote bei Protest...
2,news:3416987,4.0,0.061776,0.25,0.0,0.75,0.0,„Kriegsverbrechen“ mit getarntem Flugzeug


In [9]:
# show 5 raw rows (proof that content is there)
cols_show = ["article_usid", "article_claim", "stance_label", "stance_conf"]
if col_rtitle: cols_show.append(col_rtitle)
if col_perm and col_perm in work.columns: cols_show.append(col_perm)

work[cols_show].head(20)


Unnamed: 0,article_usid,article_claim,stance_label,stance_conf,title,permalink
0,news:3416966,Trump droht Handelspartnern des Iran,NEUTRAL,0.45,$AIBT AIBotics Ushers in a New Era as Intellig...,https://www.reddit.com/r/10xPennyStocks/commen...
1,news:3416997,Syrisches Militär errichtet neue Sperrzonen be...,NEUTRAL,0.45,"Iran Update, January 12, 2026",https://www.reddit.com/r/5_9_14/comments/1qbr0...
2,news:3416968,Selenskyj warnt erneut vor großem russischem A...,NEUTRAL,0.45,Schulsystem hat mich Arbeitsunfähig gemacht. W...,https://www.reddit.com/r/ADHS/comments/1qagpa9...
3,news:3416989,Gewalt im Iran: Spanien und Finnland bestellen...,NEUTRAL,0.45,Neue Familien-Statistik veröffentlicht: Geburt...,https://www.reddit.com/r/Austria/comments/1pye...
4,news:3416997,Syrisches Militär errichtet neue Sperrzonen be...,NEUTRAL,0.45,Neue Familien-Statistik veröffentlicht: Geburt...,https://www.reddit.com/r/Austria/comments/1pye...
5,news:3417000,Morgen Treffen von USA und Dänemark zu Grönland,NEUTRAL,0.45,Land erlässt Feuerwerksverbot für Nordtirol,https://www.reddit.com/r/Austria/comments/1pyp...
6,news:3417002,Bereits Tausende Tote bei Protesten im Iran,NEUTRAL,0.45,Unmut über Wiener Gastpatienten-Regelung,https://www.reddit.com/r/Austria/comments/1q83...
7,news:3417006,Staatliche Angabe: Rund 2.000 Tote bei Protest...,CON,0.9,Ernüchternde Erkenntnis: Wohl kaum eine Gesell...,https://www.reddit.com/r/Austria/comments/1q98...
8,news:3416987,„Kriegsverbrechen“ mit getarntem Flugzeug,NEUTRAL,0.45,Translation?,https://www.reddit.com/r/Battlefield6/comments...
9,news:3416990,Iranische Justiz klagt erste Demonstranten an,NEUTRAL,0.45,Forum-Turnier: Qualifikationen,https://www.reddit.com/r/Battlerapde/comments/...
