In [4]:
# Bar chart: emotions side by side
# - works with either of:
#   a) ALL_emotions_means_by_id.tsv  (contains columns mean_Agitation, ...)
#   b) ALL_emotions_summary.tsv      (columns emotion, mean, share)
#
# Choose whether to plot absolute means ("mean") or proportions ("share"):
MODE = "share"   # "mean" or "share"

import sys, subprocess, csv, re
from pathlib import Path

# Adjust this path to the input file:
TSV = Path(r"C:\Users\sam97xs\Stabi_Hackathon\Results\ALL_emotions_means_by_id.tsv")  # e.g. "/mnt/data/ALL_emotions_means_by_id.tsv"

# Make sure Plotly is installed (no NumPy/Matplotlib needed).
# check=False: a failing pip call does not raise here; a missing package only
# surfaces at the import on the next line.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "plotly"], check=False)
import plotly.graph_objects as go

# Emotion names; the matching data columns are named mean_<emotion>.
EMOS = ["Agitation","Anger","Fear","Joy","Love","Sadness"]

def to_float(x):
    """Parse a TSV cell into a float, accepting a decimal comma.

    Args:
        x: Raw cell value. ``None`` and non-string values are accepted.

    Returns:
        The parsed number, or 0.0 when the cell is ``None``, blank, or not
        parseable as a number.
    """
    if x is None:
        return 0.0
    # str() keeps non-string cells from raising AttributeError on .strip().
    text = str(x).strip()
    if not text:
        return 0.0
    try:
        return float(text.replace(",", "."))
    except ValueError:
        # Only a malformed number is expected here; anything else (e.g.
        # KeyboardInterrupt) must not be swallowed.
        return 0.0

# Detect whether this is a summary file (columns: emotion, mean, share).
# "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" the BOM stays
# glued to the first column name ("\ufeffemotion") and a summary file would be
# misread as a means_by_id file. Files without a BOM are read unchanged.
with TSV.open("r", encoding="utf-8-sig", errors="ignore", newline="") as f:
    r = csv.reader(f, delimiter="\t")
    header = next(r, [])
    is_summary = ("emotion" in header and "mean" in header)

emotions, values = [], []

if is_summary:
    # The selected MODE has to exist as a column; otherwise every bar would
    # silently be drawn as 0.
    if MODE not in header:
        raise RuntimeError(f"Spalte '{MODE}' fehlt in der Summary-Datei. Gefunden: {header}")
    # Read straight from the summary (one row per emotion).
    with TSV.open("r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        r = csv.DictReader(f, delimiter="\t")
        for row in r:
            e = row.get("emotion","")
            if e in EMOS:
                emotions.append(e)
                values.append(to_float(row.get(MODE, "")))
else:
    # means_by_id file: average every mean_* column over all rows.
    sums = {e: 0.0 for e in EMOS}
    n = 0
    with TSV.open("r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        r = csv.DictReader(f, delimiter="\t")
        mean_cols = [f"mean_{e}" for e in EMOS if f"mean_{e}" in (r.fieldnames or [])]
        if not mean_cols:
            raise RuntimeError("Keine 'mean_*'-Spalten gefunden.")
        for row in r:
            n += 1
            for e in EMOS:
                col = f"mean_{e}"
                if col in row:
                    sums[e] += to_float(row[col])
    # max(1, n) avoids a division by zero for a file that has only a header.
    means = {e: (sums[e] / max(1, n)) for e in EMOS}
    if MODE == "share":
        # Shares are each emotion's mean divided by the sum of all means.
        total = sum(means.values())
        vals = {e: (means[e] / total if total > 0 else 0.0) for e in EMOS}
    else:
        vals = means
    emotions = EMOS
    values   = [vals[e] for e in EMOS]

# Bar labels: percentages in "share" mode, three decimals otherwise.
show_share = (MODE == "share")
label_format = "{:.1%}" if show_share else "{:.3f}"
labels = [label_format.format(v) for v in values]

title_kind = "Anteile" if show_share else "Mittelwerte"
bars = go.Bar(x=emotions, y=values, text=labels, textposition="auto")
fig = go.Figure(bars)
fig.update_layout(
    title=f"Emotionen – {title_kind} (ein Jahr)",
    xaxis_title="Emotion",
    yaxis_title="Anteil" if show_share else "Mittelwert",
    yaxis_tickformat=".0%" if show_share else None,
    template="plotly_white",
)
fig.show()

# Optionally save the figure:
# fig.write_html("emotionen_balken.html", include_plotlyjs="cdn")


In [5]:
# Line chart: emotions over the years (Plotly, without NumPy/Pandas)
import sys, subprocess, csv, re
from collections import defaultdict, OrderedDict

# Make sure Plotly is installed (check=False: a pip failure does not raise here).
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "plotly"], check=False)
import plotly.graph_objects as go

TSV = r"C:\Users\sam97xs\Stabi_Hackathon\Results\ALL_emotions_means_by_id.tsv"   # <--- adjust this path

# Emotions expected in the file; each one is looked up as a mean_<emotion> column.
EMOS = ["Joy","Love","Fear","Anger","Sadness","Agitation"]

def to_float(x):
    """Convert a TSV cell to float; a decimal comma is accepted.

    Args:
        x: Raw cell value (string, ``None``, or any other object).

    Returns:
        The numeric value, or 0.0 for ``None``, blank, or unparseable cells.
    """
    if x is None:
        return 0.0
    # Coerce first so a non-string cell cannot raise AttributeError.
    cell = str(x).strip()
    if not cell:
        return 0.0
    try:
        return float(cell.replace(",", "."))
    except ValueError:
        # Catch only the parse failure, not every possible exception.
        return 0.0

def extract_year_from_id(s):
    """Return the year of the first ``YYYY-MM-DD`` date in *s*, or None.

    ``None`` or an empty *s* is treated as a string without a date.
    """
    found = re.search(r"(\d{4})-\d{2}-\d{2}", s or "")
    if not found:
        return None
    return int(found.group(1))

# --- Read the file and average per year ---
sums = defaultdict(lambda: defaultdict(float))  # sums[year][emo]
cnts = defaultdict(int)                         # cnts[year] (rows per year)

with open(TSV, "r", encoding="utf-8", errors="ignore", newline="") as f:
    r = csv.DictReader(f, delimiter="\t")
    headers = r.fieldnames or []
    # Prefer the 'year' column; fall back to the date inside 'id'.
    has_year_col = "year" in headers
    # Emotion columns present in the file (mean_<Emotion>).
    emo_cols = [f"mean_{e}" for e in EMOS if f"mean_{e}" in headers]
    if not emo_cols:
        raise RuntimeError("Keine Spalten 'mean_*' gefunden (z. B. mean_Joy, mean_Fear, ...).")

    for row in r:
        year = None
        if has_year_col:
            # Tolerant year parsing: accepts "1913", "1913.0" and "1913,0".
            try:
                year = int(float(str(row.get("year","")).replace(",", ".")))
            except (ValueError, OverflowError):
                # ValueError: blank / non-numeric / NaN; OverflowError: inf.
                # Anything else is a real bug and must not be hidden.
                year = None
        if year is None:
            year = extract_year_from_id(row.get("id",""))
        if year is None:
            continue  # rows without any usable year are ignored

        cnts[year] += 1
        for e in EMOS:
            col = f"mean_{e}"
            if col in row:
                sums[year][e] += to_float(row.get(col))

# Per-year means: sum of the row values divided by the number of rows.
years = sorted(cnts.keys())
means_by_year = OrderedDict()
for y in years:
    n = max(1, cnts[y])
    means_by_year[y] = {e: (sums[y].get(e, 0.0) / n) for e in EMOS}

# --- Colours: positive emotions in greens, negative ones in reds ---
pos_palette = {
    "Joy":  "#2ca25f",   # strong green
    "Love": "#66c2a4",   # lighter green
}
neg_palette = {
    "Anger":     "#b2182b",  # dark red
    "Fear":      "#d6604d",  # reddish
    "Sadness":   "#f4a582",  # salmon (negative, but lighter)
    "Agitation": "#d7301f",  # orange-red
}
color_map = {**pos_palette, **neg_palette}

# Build the plot
fig = go.Figure()

for e in EMOS:
    # Skip emotions whose mean_* column is missing from the file. The per-year
    # dicts always contain every emotion key (filled with 0.0), so testing for
    # key membership there can never detect a missing column; the header list
    # is the reliable source. With no years at all nothing is drawn.
    if not years or f"mean_{e}" not in headers:
        continue
    yvals = [means_by_year[y].get(e, 0.0) for y in years]
    fig.add_trace(go.Scatter(
        x=years, y=yvals,
        mode="lines+markers",
        name=e,
        line=dict(width=2, color=color_map.get(e)),
        marker=dict(size=6, color=color_map.get(e))
    ))

fig.update_layout(
    title="Emotionen über die Jahre (Jahresmittel, pro Ausgabe gemittelt)",
    xaxis_title="Jahr",
    yaxis_title="Mittelwert",
    template="plotly_white",
    legend_title="Emotion",
    hovermode="x unified"
)

# Start the y axis at 0
fig.update_yaxes(rangemode="tozero")

fig.show()

# Optionally save the figure:
# fig.write_html("emotions_line_by_year.html", include_plotlyjs="cdn")


In [7]:
# Box plots per emotion: pre war vs. war vs. post war (Plotly, without Pandas/NumPy)
import sys, subprocess, csv

# Make sure Plotly is installed (check=False: a pip failure does not raise here).
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "plotly"], check=False)
import plotly.graph_objects as go
from plotly.subplots import make_subplots

TSV = r"C:\Users\sam97xs\Stabi_Hackathon\Results\ALL_emotions_means_by_id.tsv"  # <--- adjust this path

EMOS = ["Joy","Love","Fear","Anger","Sadness","Agitation"]
ERA_LEVELS = ["pre war", "war", "post war"]  # order used in the plot
ERA_COLORS = {
    "pre war":  "#2b8cbe",  # blue
    "war":      "#f46d43",  # orange/red
    "post war": "#74add1",  # light blue
}

def to_float(x):
    """Turn a TSV cell into a float, treating ',' as the decimal separator.

    Args:
        x: Raw cell value; may be ``None`` or a non-string object.

    Returns:
        The parsed value, or 0.0 if the cell is ``None``, blank, or not a number.
    """
    if x is None:
        return 0.0
    # str() first: .strip() on a non-string cell would raise AttributeError.
    cleaned = str(x).strip().replace(",", ".")
    if not cleaned:
        return 0.0
    try:
        return float(cleaned)
    except ValueError:
        # Narrow handler: only unparseable text is mapped to 0.0.
        return 0.0

# Data structure: values[emotion][era] -> list of numbers
values = {}
for e in EMOS:
    values[e] = {era: [] for era in ERA_LEVELS}

with open(TSV, "r", encoding="utf-8", errors="ignore", newline="") as f:
    r = csv.DictReader(f, delimiter="\t")
    headers = r.fieldnames or []
    # Every mean_* column plus 'era1' is required; report the first one missing.
    needed_means = [f"mean_{e}" for e in EMOS]
    first_missing = next(
        (name for name in needed_means + ["era1"] if name not in headers),
        None,
    )
    if first_missing is not None:
        raise RuntimeError(f"Spalte '{first_missing}' fehlt in {TSV}. Gefunden: {headers}")

    for row in r:
        era = (row.get("era1") or "").strip()
        # Only rows labelled with one of the known eras are collected.
        if era in ERA_LEVELS:
            for emo in EMOS:
                values[emo][era].append(to_float(row.get(f"mean_{emo}", "")))

# Figure with 6 subplots (2 rows x 3 columns), titled with the emotion names
fig = make_subplots(rows=2, cols=3, subplot_titles=EMOS,
                    shared_yaxes=True, horizontal_spacing=0.08, vertical_spacing=0.12)

# Helper: map a flat subplot index to its 1-based (row, col) cell in a 3-wide grid.
def pos(idx):
    """Return the 1-based ``(row, col)`` for the 0-based subplot index *idx*."""
    full_rows, offset = divmod(idx, 3)
    return (full_rows + 1, offset + 1)

# One box per (emotion, era); each era gets exactly one legend entry.
legend_shown = set()
for i, emo in enumerate(EMOS):
    grid_row, grid_col = pos(i)
    for era in ERA_LEVELS:
        first_use = era not in legend_shown
        box = go.Box(
            y=values[emo][era],
            name=era,
            marker_color=ERA_COLORS.get(era),
            boxpoints="outliers",
            jitter=0.3,
            whiskerwidth=0.8,
            legendgroup=era,
            showlegend=first_use,
        )
        fig.add_trace(box, row=grid_row, col=grid_col)
        legend_shown.add(era)

fig.update_layout(
    title="Emotionen: Verteilung pro Epoche (era1)",
    template="plotly_white",
    height=700,
    boxmode="group",
    legend_title="Epoche (era1)",
)
# The axis title is set on the first subplot only.
fig.update_yaxes(title_text="Wert (mean_*)", rangemode="tozero", row=1, col=1)

fig.show()

# Optionally save the figure:
# fig.write_html("boxplots_era1_per_emotion.html", include_plotlyjs="cdn")


In [9]:
# === Box plots (era2: pre vs post) per emotion + significance comparison era1 vs era2 ===
import sys, subprocess, csv, re, math
from collections import defaultdict, OrderedDict

# Make sure Plotly is installed (no NumPy/Pandas needed).
# check=False: a pip failure does not raise here.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "plotly"], check=False)
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# <<< adjust the path to your file >>>
TSV = r"C:\Users\sam97xs\Stabi_Hackathon\Results\ALL_emotions_means_by_id.tsv"

EMOS = ["Joy","Love","Fear","Anger","Sadness","Agitation"]
# Accepted labels of the 'era1' and 'era2' columns.
ERA1_LEVELS = ["pre war","war","post war"]
ERA2_LEVELS = ["pre","post"]

def to_float(x):
    """Parse a TSV cell as float (decimal comma allowed).

    Args:
        x: Raw cell value; ``None`` and non-string values are accepted.

    Returns:
        The parsed number, or 0.0 for ``None``, blank, or unparseable input.
    """
    if x is None:
        return 0.0
    text = str(x).strip().replace(",", ".")
    if not text:
        return 0.0
    try:
        return float(text)
    except ValueError:
        # float() on a string only fails with ValueError; a bare except would
        # also hide KeyboardInterrupt and genuine programming errors.
        return 0.0

# --- Mann-Whitney U (2-seitig), rein in Python, mit Tie-Korrektur ---
def _rankdata(values):
    """Average ranks ab 1 (wie SciPy)."""
    sorted_pairs = sorted((v,i) for i,v in enumerate(values))
    ranks = [0.0]*len(values)
    i = 0
    while i < len(sorted_pairs):
        j = i+1
        # Tie-Block
        while j < len(sorted_pairs) and sorted_pairs[j][0] == sorted_pairs[i][0]:
            j += 1
        # mittlere Ränge für Block
        avg_rank = (i+1 + j) / 2.0
        for k in range(i, j):
            ranks[sorted_pairs[k][1]] = avg_rank
        i = j
    return ranks

def _mw_u_test(x, y):
    """Return dict: {'U':..., 'z':..., 'p':..., 'n1':..., 'n2':..., 'delta':...}"""
    n1, n2 = len(x), len(y)
    if n1 == 0 or n2 == 0:
        return {'U': 0.0, 'z': 0.0, 'p': 1.0, 'n1': n1, 'n2': n2, 'delta': 0.0}

    all_vals = x + y
    ranks = _rankdata(all_vals)
    R1 = sum(ranks[:n1])
    U1 = R1 - n1*(n1+1)/2.0
    # Tie-Korrektur
    # T = sum(t_i^3 - t_i) über alle Tie-Gruppen im kombinierten Sample
    T = 0.0
    i = 0
    sp = sorted(all_vals)
    N = n1 + n2
    while i < N:
        j = i+1
        while j < N and sp[j] == sp[i]:
            j += 1
        t = j - i
        if t > 1:
            T += t**3 - t
        i = j
    meanU = n1*n2/2.0
    # Varianz mit Tie-Korrektur
    sigma_sq = (n1*n2/12.0) * (N+1 - T/(N*(N-1))) if N > 1 else 0.0
    sigma = math.sqrt(max(sigma_sq, 1e-12))

    # Kontinuitätskorrektur
    if U1 > meanU:
        z = (U1 - meanU - 0.5)/sigma
    elif U1 < meanU:
        z = (U1 - meanU + 0.5)/sigma
    else:
        z = 0.0
    # zweiseitiger p-Wert
    p = 2.0 * (1.0 - 0.5*(1.0 + math.erf(abs(z)/math.sqrt(2))))

    # Cliff's delta (Effektgröße), O(n log n)
    def cliffs_delta(a, b):
        a_sorted = sorted(a)
        b_sorted = sorted(b)
        i = j = wins = ties = 0
        nb = len(b_sorted)
        for av in a_sorted:
            # count b < av
            while j < nb and b_sorted[j] < av:
                j += 1
            k = j
            # count b == av
            while k < nb and b_sorted[k] == av:
                k += 1
            wins += j
            ties += (k - j)
        total = n1*n2
        losses = total - wins - ties
        delta = (wins - losses) / total if total else 0.0
        return delta

    delta = cliffs_delta(x, y)
    return {'U': U1, 'z': z, 'p': p, 'n1': n1, 'n2': n2, 'delta': delta}

# --- Read the data ---
# vals_eraN[emotion][level] -> list of mean_<emotion> values of matching rows
vals_era2 = {emo: {lvl: [] for lvl in ERA2_LEVELS} for emo in EMOS}
vals_era1 = {emo: {lvl: [] for lvl in ERA1_LEVELS} for emo in EMOS}

with open(TSV, "r", encoding="utf-8", errors="ignore", newline="") as f:
    r = csv.DictReader(f, delimiter="\t")
    headers = r.fieldnames or []
    need = ["era1","era2"] + [f"mean_{e}" for e in EMOS]
    missing = [c for c in need if c not in headers]
    if missing:
        raise RuntimeError(f"Fehlende Spalten in TSV: {missing}\nGefunden: {headers}")

    for row in r:
        e1 = (row.get("era1") or "").strip()
        e2 = (row.get("era2") or "").strip()
        known_era1 = e1 in ERA1_LEVELS
        known_era2 = e2 in ERA2_LEVELS
        if not (known_era1 or known_era2):
            continue
        # A row feeds each segmentation whose label it carries.
        for emo in EMOS:
            score = to_float(row.get(f"mean_{emo}",""))
            if known_era1:
                vals_era1[emo][e1].append(score)
            if known_era2:
                vals_era2[emo][e2].append(score)

# --- 1) Box plots era2: "pre" vs "post" per emotion ---
# 2 x 3 grid of subplots, titled with the emotion names.
fig = make_subplots(rows=2, cols=3, subplot_titles=EMOS,
                    shared_yaxes=True, horizontal_spacing=0.08, vertical_spacing=0.12)

era2_colors = {"pre": "#4c78a8", "post": "#f58518"}  # blue vs orange
# Levels that already have a legend entry (filled while traces are added).
legend_once = set()

def grid_pos(i):  # 0..5 -> (row,col)
    """Map a 0-based subplot index to its 1-based (row, col) in a 3-column grid."""
    above, across = divmod(i, 3)
    return above + 1, across + 1

# Styling shared by every box trace.
box_style = dict(boxpoints="outliers", jitter=0.3, whiskerwidth=0.8)

for i, emo in enumerate(EMOS):
    rrow, ccol = grid_pos(i)
    for lvl in ERA2_LEVELS:
        # Each level gets a legend entry only the first time it is drawn.
        needs_legend = lvl not in legend_once
        trace = go.Box(
            y=vals_era2[emo][lvl],
            name=lvl,
            marker_color=era2_colors[lvl],
            legendgroup=lvl,
            showlegend=needs_legend,
            **box_style,
        )
        fig.add_trace(trace, row=rrow, col=ccol)
        legend_once.add(lvl)

fig.update_layout(
    title="Emotionen: era2 (pre vs post) – Verteilungen je Emotion",
    template="plotly_white",
    height=700,
    boxmode="group",
    legend_title="era2",
)
fig.update_yaxes(title_text="Wert (mean_*)", rangemode="tozero", row=1, col=1)
fig.show()

# --- 2) Significance: era2 (pre vs post) per emotion ---
era2_stats = {
    emo: _mw_u_test(vals_era2[emo]["pre"], vals_era2[emo]["post"])
    for emo in EMOS
}

# --- 3) Significance: era1 - best two-group split per emotion (smallest p) ---
# NOTE(review): the minimum over three pairwise tests is kept without any
# multiple-comparison adjustment - confirm this is intended before comparing
# it with the single era2 test.
pairs_era1 = [("pre war","war"), ("war","post war"), ("pre war","post war")]
era1_best = {}
for emo in EMOS:
    # Default that is kept when no pair reaches p < 1.0.
    best = {"p": 1.0, "pair": None, "z": 0.0, "delta": 0.0}
    for pair in pairs_era1:
        first, second = pair
        stat = _mw_u_test(vals_era1[emo][first], vals_era1[emo][second])
        if stat["p"] >= best["p"]:
            continue
        best = {"p": stat["p"], "pair": pair, "z": stat["z"], "delta": stat["delta"]}
    era1_best[emo] = best

# --- 4) Comparison plot: -log10(p) for era1 (best pair) vs era2 (pre vs post) ---
def nlog10(p):
    """Return ``-log10(p)`` after clamping *p* into the range [1e-300, 1.0]."""
    import math
    clamped = min(p, 1.0)
    # Lower bound keeps log10 away from 0 (and from negative input).
    clamped = max(clamped, 1e-300)
    return -math.log10(clamped)

emotions = EMOS
era1_bars = [nlog10(era1_best[name]["p"]) for name in emotions]
era2_bars = [nlog10(era2_stats[name]["p"]) for name in emotions]

cmp_fig = go.Figure()
# Two bar series: (legend name, bar heights, colour)
for series_name, heights, colour in (
    ("era1 (beste Paarung)", era1_bars, "#9ecae1"),
    ("era2 (pre vs post)", era2_bars, "#fb9a99"),
):
    cmp_fig.add_bar(name=series_name, x=emotions, y=heights, marker_color=colour)

# Threshold lines: (p level, dash style, label, label position)
for p_level, dash, label, label_pos in (
    (0.05, "dash", "p=0.05", "top right"),
    (0.01, "dot", "p=0.01", "bottom right"),
):
    cmp_fig.add_hline(y=nlog10(p_level), line_dash=dash, line_color="gray", opacity=0.6,
                      annotation_text=label, annotation_position=label_pos)

cmp_fig.update_layout(
    title="-log10(p): era1 vs. era2 pro Emotion (höher = stärkere Trennung)",
    barmode="group",
    template="plotly_white",
    yaxis_title="-log10(p)  (zweiseitig, MWU)",
    xaxis_title="Emotion",
    legend_title="Segmentierung",
)
cmp_fig.show()

# --- 5) Short text report on the console ---
print("\n=== ERA2 (pre vs post) – MWU p-Werte & Cliff's delta ===")
for emo in EMOS:
    stat = era2_stats[emo]
    n_pre, n_post = stat["n1"], stat["n2"]
    print(f"{emo:10s}  n={n_pre + n_post:4d}  pre={n_pre:3d} post={n_post:3d}  p={stat['p']:.3g}  z={stat['z']:.2f}  delta={stat['delta']:.3f}")

print("\n=== ERA1 beste Paarung pro Emotion (niedrigster p) ===")
for emo in EMOS:
    top = era1_best[emo]
    # "pair" stays None when no pairing reached p < 1.0.
    pair = " vs ".join(top["pair"]) if top["pair"] else "n/a"
    print(f"{emo:10s}  best: {pair:23s}  p={top['p']:.3g}  z={top['z']:.2f}  delta={top['delta']:.3f}")



=== ERA2 (pre vs post) – MWU p-Werte & Cliff's delta ===
Joy         n=  14  pre=  6 post=  8  p=0.107  z=1.61  delta=0.542
Love        n=  14  pre=  6 post=  8  p=0.138  z=1.48  delta=0.500
Fear        n=  14  pre=  6 post=  8  p=0.561  z=-0.58  delta=-0.208
Anger       n=  14  pre=  6 post=  8  p=0.401  z=-0.84  delta=-0.292
Sadness     n=  14  pre=  6 post=  8  p=0.561  z=-0.58  delta=-0.208
Agitation   n=  14  pre=  6 post=  8  p=0.651  z=0.45  delta=0.167

=== ERA1 beste Paarung pro Emotion (niedrigster p) ===
Joy         best: war vs post war          p=0.514  z=-0.65  delta=-0.375
Love        best: pre war vs war           p=0.27  z=1.10  delta=0.438
Fear        best: war vs post war          p=0.151  z=1.44  delta=0.750
Anger       best: pre war vs war           p=0.35  z=-0.93  delta=-0.375
Sadness     best: pre war vs war           p=0.203  z=-1.27  delta=-0.500
Agitation   best: war vs post war          p=0.695  z=0.39  delta=0.250


In [10]:
# Line chart: emotions per month (monthly means, month parsed from 'id'), Plotly without NumPy/Pandas
import sys, subprocess, csv, re, datetime as dt
from collections import defaultdict, OrderedDict

# Make sure Plotly is installed (check=False: a pip failure does not raise here).
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "plotly"], check=False)
import plotly.graph_objects as go

TSV = r"C:\Users\sam97xs\Stabi_Hackathon\Results\ALL_emotions_means_by_id.tsv"   # <--- adjust this path

# Emotion names; the data columns are looked up as mean_<emotion>.
EMOS = ["Joy","Love","Fear","Anger","Sadness","Agitation"]

def to_float(x):
    """Read a TSV cell as float; ',' is accepted as the decimal mark.

    Args:
        x: Raw cell value; ``None`` and non-strings are accepted.

    Returns:
        The number, or 0.0 for ``None``, blank, or unparseable cells.
    """
    if x is None:
        return 0.0
    raw = str(x).strip()
    if not raw:
        return 0.0
    try:
        return float(raw.replace(",", "."))
    except ValueError:
        # Only the expected parse error is mapped to 0.0.
        return 0.0

# Pull year and month out of the first YYYY-MM-DD date inside the id
DATE_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
def extract_year_month_from_id(s):
    """Return ``(year, month)`` parsed from the first date in *s*.

    Args:
        s: Id string expected to contain ``YYYY-MM-DD``; ``None`` is allowed.

    Returns:
        ``(year, month)`` as ints, or ``(None, None)`` when no date is found
        or the digits cannot form a calendar month (month outside 1..12 or
        year 0). Rejecting those here keeps ``datetime.date(year, month, 1)``
        from raising ValueError further down.
    """
    m = DATE_RE.search(s or "")
    if not m:
        return None, None
    year, month = int(m.group(1)), int(m.group(2))
    if year < 1 or not 1 <= month <= 12:
        return None, None
    return year, month

# --- Read the file and average per month ---
sums = defaultdict(lambda: defaultdict(float))  # sums[(year,month)][emo]
cnts = defaultdict(int)                         # cnts[(year,month)]

with open(TSV, "r", encoding="utf-8", errors="ignore", newline="") as f:
    r = csv.DictReader(f, delimiter="\t")
    headers = r.fieldnames or []

    # Emotion columns (mean_<Emotion>) that are present; at least one is required.
    emo_cols = [name for name in (f"mean_{e}" for e in EMOS) if name in headers]
    if len(emo_cols) == 0:
        raise RuntimeError("Keine Spalten 'mean_*' gefunden (z. B. mean_Joy, mean_Fear, ...).")

    for row in r:
        y, m = extract_year_month_from_id(row.get("id",""))
        if y is None or m is None:
            continue  # rows whose id carries no date are skipped
        key = (y, m)
        cnts[key] += 1
        for e in EMOS:
            col = f"mean_{e}"
            if col not in row:
                continue
            sums[key][e] += to_float(row.get(col))

# Monthly means; tuple keys sort chronologically as (year, month).
ym_keys = sorted(cnts.keys())
means_by_ym = OrderedDict()
for ym in ym_keys:
    divisor = max(1, cnts[ym])
    month_sums = sums[ym]
    means_by_ym[ym] = {e: (month_sums.get(e, 0.0) / divisor) for e in EMOS}

# X axis as real date values (first day of each month)
x_dates = [dt.date(year, month, 1) for year, month in ym_keys]

# --- Colours: positive emotions in greens, negative ones in reds ---
pos_palette = {
    "Joy":  "#2ca25f",   # strong green
    "Love": "#66c2a4",   # lighter green
}
neg_palette = {
    "Anger":     "#b2182b",  # dark red
    "Fear":      "#d6604d",  # reddish
    "Sadness":   "#f4a582",  # salmon
    "Agitation": "#d7301f",  # orange-red
}
color_map = dict(pos_palette)
color_map.update(neg_palette)

# Plot: one line per emotion
fig = go.Figure()
for e in EMOS:
    colour = color_map.get(e)
    yvals = [means_by_ym[ym].get(e, 0.0) for ym in ym_keys]
    trace = go.Scatter(
        x=x_dates,
        y=yvals,
        mode="lines+markers",
        name=e,
        line=dict(width=2, color=colour),
        marker=dict(size=5, color=colour),
    )
    fig.add_trace(trace)

fig.update_layout(
    title="Emotionen über die Monate (Monatsmittel, aus ID geparst)",
    xaxis_title="Monat",
    yaxis_title="Mittelwert",
    template="plotly_white",
    legend_title="Emotion",
    hovermode="x unified",
)

# Monthly ticks and tick label format
fig.update_xaxes(dtick="M1", tickformat="%Y-%m")
fig.update_yaxes(rangemode="tozero")

fig.show()

# Optionally save the figure:
# fig.write_html("emotions_line_by_month.html", include_plotlyjs="cdn")


In [12]:
# Stream chart (relative shares) - Plotly without NumPy/Pandas
import sys, subprocess, csv, re, datetime as dt
from collections import defaultdict, OrderedDict

# Make sure Plotly is installed (check=False: a pip failure does not raise here).
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "plotly"], check=False)
import plotly.graph_objects as go

# ---- Settings ----
TSV  = r"C:\Users\sam97xs\Stabi_Hackathon\Results\ALL_emotions_means_by_id.tsv"  # <- adjust this path
FREQ = "month"   # "month" or "year"

# Emotion names; the data columns are looked up as mean_<emotion>.
EMOS = ["Joy","Love","Fear","Anger","Sadness","Agitation"]

def to_float(x):
    """Parse a TSV cell to float, accepting ',' as decimal separator.

    Args:
        x: Raw cell value; ``None`` and non-string values are accepted.

    Returns:
        The parsed float, or 0.0 for ``None``, blank, or unparseable input.
    """
    if x is None:
        return 0.0
    s = str(x).strip()
    if not s:
        return 0.0
    try:
        return float(s.replace(",", "."))
    except ValueError:
        # A bare except would also swallow KeyboardInterrupt; catch only the
        # parse failure.
        return 0.0

DATE_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
def extract_ymd(s):
    """Return ``(year, month, day)`` from the first ``YYYY-MM-DD`` in *s*.

    Args:
        s: Id string; ``None`` is treated as an empty string.

    Returns:
        Three ints, or ``(None, None, None)`` when no date is present or the
        year/month digits are not a usable calendar month (month outside
        1..12 or year 0). Callers build ``datetime.date(year, month, 1)`` from
        the result, which would raise ValueError for such values. The day is
        returned as parsed and is not range-checked.
    """
    m = DATE_RE.search(s or "")
    if not m:
        return None, None, None
    year, month, day = (int(part) for part in m.groups())
    if year < 1 or not 1 <= month <= 12:
        return None, None, None
    return year, month, day

# ---- Aggregation per period (month or year) ----
# sums[period_key][emo] = sum of mean_emotion values; cnts[period_key] = rows counted
sums = defaultdict(lambda: defaultdict(float))
cnts = defaultdict(int)

with open(TSV, "r", encoding="utf-8", errors="ignore", newline="") as f:
    r = csv.DictReader(f, delimiter="\t")
    headers = r.fieldnames or []
    have_year_col = "year" in headers
    # At least one mean_* column has to exist.
    have_means = any((f"mean_{e}" in headers) for e in EMOS)
    if not have_means:
        raise RuntimeError("Keine Spalten 'mean_*' gefunden (z. B. mean_Joy).")

    for row in r:
        # Determine the period this row belongs to.
        if FREQ == "year":
            # Prefer the 'year' column, otherwise take the year from the id.
            try:
                year = int(float(str(row.get("year","")).replace(",", "."))) if have_year_col else None
            except (ValueError, OverflowError):
                # ValueError: blank / non-numeric / NaN; OverflowError: inf.
                # Other exceptions indicate real bugs and are not hidden.
                year = None
            if year is None:
                y, m, d = extract_ymd(row.get("id",""))
                year = y
            if year is None:
                continue
            key = (year,)
        else:
            # Monthly periods: year and month come from the date in the id.
            y, m, d = extract_ymd(row.get("id",""))
            if y is None or m is None:
                continue
            key = (y, m)

        cnts[key] += 1
        for e in EMOS:
            col = f"mean_{e}"
            if col in headers:
                sums[key][e] += to_float(row.get(col))

# ---- Shares per period ----
# share = emotion sum / sum over all emotions. Dividing by the row count first
# is unnecessary because it cancels out in the ratio.
periods = sorted(sums.keys())  # chronological
shares = OrderedDict()
for k in periods:
    total = sum(sums[k].get(e, 0.0) for e in EMOS)
    if total <= 0:
        shares[k] = {e: 0.0 for e in EMOS}
    else:
        shares[k] = {e: (sums[k].get(e, 0.0) / total) for e in EMOS}

# X axis: real dates for months, plain integers for years
if FREQ == "year":
    x_vals = [k[0] for k in periods]
else:
    x_vals = [dt.date(k[0], k[1], 1) for k in periods]

# ---- Colours (positive emotions alike, negative emotions alike) ----
pos_palette = {"Joy": "#2ca25f", "Love": "#66c2a4"}       # green family
neg_palette = {
    "Anger": "#b2182b", "Fear": "#d6604d",
    "Sadness": "#f4a582", "Agitation": "#d7301f"          # red/orange tones
}
color_map = {**pos_palette, **neg_palette}

# ---- Stream / stacked area (normalised) ----
fig = go.Figure()
# The stacking order shapes the visual "flow":
# negative emotions first, then positive ones (change to taste).
plot_order = ["Agitation","Anger","Fear","Sadness","Love","Joy"]

for e in plot_order:
    y_vals = [shares[k].get(e, 0.0) for k in periods]
    fig.add_trace(go.Scatter(
        x=x_vals, y=y_vals, name=e,
        mode="lines",
        line=dict(width=0.5, color=color_map.get(e)),
        stackgroup="one",             # enables area stacking
        groupnorm="fraction",         # normalise each period to 1
        # The doubled braces emit the literal Plotly placeholder %{y:.1%}.
        # With single braces the f-string would format the Python variable
        # `y` itself (or raise NameError) and break the hover text.
        hovertemplate=f"{e}: %{{y:.1%}}<extra></extra>"
    ))

title_freq = "Monate" if FREQ == "month" else "Jahre"
fig.update_layout(
    title=f"Stream Chart – relative Anteile der Emotionen über die {title_freq}",
    template="plotly_white",
    legend_title="Emotion",
    yaxis=dict(tickformat=".0%", title="Anteil"),
    xaxis_title=("Monat" if FREQ == "month" else "Jahr"),
    hovermode="x unified"
)

# Nicer month ticks
if FREQ == "month":
    fig.update_xaxes(dtick="M2", tickformat="%Y-%m")  # every 2 months (adjust to data density)

fig.show()

# Optionally save the figure:
# fig.write_html("emotions_stream_relative.html", include_plotlyjs="cdn")


In [13]:
# Outlier finder without NumPy/Pandas: MAD-based z-scores and IQR fences per emotion
import csv, re, math, sys, subprocess
from pathlib import Path
from collections import defaultdict

# ---- Adjust the inputs ----
TSV  = r"C:\Users\sam97xs\Stabi_Hackathon\Results\ALL_emotions_means_by_id.tsv"   # or ...\ALL_emotions_merged.tsv
EMOS = ["Joy","Love","Fear","Anger","Sadness","Agitation"]
TOPN = 10   # number of outliers listed per emotion

# ---- (optional) load Plotly for the scatter plots ----
# check=False means a failing pip call does not raise; PLOT_OK becomes False
# only if the import itself (or launching pip) raises.
try:
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "plotly"], check=False)
    import plotly.graph_objects as go
    PLOT_OK = True
except Exception:
    PLOT_OK = False

# ---- Helpers ----
# Matches a YYYY-MM-DD date anywhere in a string (three capture groups).
DATE_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2})")

def to_float(x):
    """Convert a TSV cell to float; a decimal comma is understood.

    Args:
        x: Raw cell value; ``None`` and non-string values are accepted.

    Returns:
        The parsed float, or 0.0 for ``None``, blank, or unparseable cells.
    """
    if x is None:
        return 0.0
    s = str(x).strip()
    if not s:
        return 0.0
    try:
        return float(s.replace(",", "."))
    except ValueError:
        # Narrowed from a bare except so unrelated errors are not hidden.
        return 0.0

def get_date_parts(id_str):
    """Split the first ``YYYY-MM-DD`` date found in *id_str*.

    Returns:
        ``(iso_string, year, month, day)``, with the string rebuilt
        zero-padded from the parsed ints, or four ``None`` values when
        *id_str* is empty/``None`` or contains no date.
    """
    found = DATE_RE.search(id_str or "")
    if found is None:
        return None, None, None, None
    y, mo, d = (int(part) for part in found.groups())
    return f"{y:04d}-{mo:02d}-{d:02d}", y, mo, d

def median(sorted_vals):
    """Return the median of an already sorted sequence (0.0 if it is empty).

    For an even number of values the two middle values are averaged.
    """
    count = len(sorted_vals)
    if not count:
        return 0.0
    half, is_odd = divmod(count, 2)
    if is_odd:
        return sorted_vals[half]
    return 0.5 * (sorted_vals[half - 1] + sorted_vals[half])

def mad_z_scores(values):
    """Compute robust z-scores based on the median absolute deviation.

    Args:
        values: Sequence of numbers (any order).

    Returns:
        ``(med, scale, zlist)`` where ``med`` is the median, ``scale`` is the
        robust spread used as the z denominator and ``zlist[i]`` is
        ``(values[i] - med) / scale``. ``scale`` is ``MAD * 1.4826`` (the
        factor that makes the MAD comparable to a normal standard deviation).
        When the MAD is 0 - which happens as soon as more than half of the
        values are identical - the mean absolute deviation times 1.253314 is
        used instead, so a clear outlier in otherwise constant data still gets
        a non-zero score. Only if every value is identical are all scores 0.0
        (with ``scale`` 0.0). Empty input yields ``(0.0, 0.0, [])``.
    """
    # Function-scope import keeps the helper self-contained.
    import statistics

    if not values:
        return 0.0, 0.0, []
    med = statistics.median(values)
    abs_dev = [abs(v - med) for v in values]
    scale = statistics.median(abs_dev) * 1.4826
    if scale == 0.0:
        # MAD collapsed: fall back to the mean absolute deviation.
        scale = (sum(abs_dev) / len(abs_dev)) * 1.253314
    if scale == 0.0:
        # All values are equal, nothing can stand out.
        return med, 0.0, [0.0 for _ in values]
    return med, scale, [(v - med) / scale for v in values]

def quantiles(sorted_vals, q):
    """Return the *q* quantile (q in [0, 1]) of a sorted sequence.

    The value at fractional index ``(n - 1) * q`` is linearly interpolated.
    An empty sequence yields 0.0 and a single value is returned as is.
    """
    n = len(sorted_vals)
    if n < 2:
        return sorted_vals[0] if n else 0.0
    pos = (n - 1) * q
    lo = int(math.floor(pos))
    hi = int(math.ceil(pos))
    if lo == hi:
        # pos falls exactly on an element, no interpolation needed
        return sorted_vals[lo]
    frac = pos - lo
    below, above = sorted_vals[lo], sorted_vals[hi]
    return below * (1 - frac) + above * frac

def iqr_flags(values):
    """Flag values outside the 1.5 * IQR fences.

    Returns:
        ``(Q1, Q3, low_fence, high_fence, flags)`` where ``flags[i]`` is True
        when ``values[i]`` lies below the low fence or above the high fence.
        Empty input yields ``(0.0, 0.0, 0.0, 0.0, [])``.
    """
    if not values:
        return 0.0, 0.0, 0.0, 0.0, []
    ordered = sorted(values)
    q1, q3 = quantiles(ordered, 0.25), quantiles(ordered, 0.75)
    spread = q3 - q1
    low = q1 - 1.5 * spread
    high = q3 + 1.5 * spread
    flags = [v < low or v > high for v in values]
    return q1, q3, low, high, flags

# ---- Load the data ----
rows = []  # one dict per row: id, date, year, era1, era2, text, mean_* values
headers = None
with open(TSV, "r", encoding="utf-8", errors="ignore", newline="") as f:
    r = csv.DictReader(f, delimiter="\t")
    headers = r.fieldnames or []
    have_era1 = "era1" in headers
    have_era2 = "era2" in headers
    have_text = "text" in headers
    # mean_* columns that exist in the file, in EMOS order
    present_cols = [f"mean_{emo}" for emo in EMOS if f"mean_{emo}" in headers]
    for raw in r:
        rid = raw.get("id","")
        date_str, year, mo, d = get_date_parts(rid)
        row = {
            "id": rid,
            "date": date_str or "",
            "year": year or "",
            "era1": raw.get("era1","") if have_era1 else "",
            "era2": raw.get("era2","") if have_era2 else "",
            "text": raw.get("text","") if have_text else "",
        }
        # Collect the emotion values
        for col in present_cols:
            row[col] = to_float(raw.get(col, ""))
        # Rows are only kept when the file has at least one mean_* column.
        if present_cols:
            rows.append(row)

if not rows:
    raise RuntimeError("Keine Datenzeilen mit mean_* gefunden. Bitte Datei/Spalten prüfen.")

# ---- Compute the outlier scores per emotion ----
results = []  # collected output rows for the TSV
per_emo_values = {emo: [] for emo in EMOS}
for emo in EMOS:
    col = f"mean_{emo}"
    if col not in headers:
        continue
    vals = [rec[col] for rec in rows]
    med, madn, zlist = mad_z_scores(vals)
    Q1, Q3, low, high, iqrflag = iqr_flags(vals)

    # Row indices ordered by |z|, largest first
    idx_sorted = sorted(range(len(rows)), key=lambda i: abs(zlist[i]), reverse=True)
    print(f"\n=== {emo} — Top {TOPN} nach |z_MAD| (Median={med:.4f}, MADn={madn:.4f}, IQR=[{Q1:.4f},{Q3:.4f}]) ===")
    for rank, i in enumerate(idx_sorted[:TOPN], 1):
        rec = rows[i]
        val, z, flag = vals[i], zlist[i], iqrflag[i]
        # Texts longer than 80 characters are cut and marked with an ellipsis.
        full_text = rec["text"] or ""
        text_snip = full_text[:80] + "…" if len(full_text) > 80 else full_text
        when = rec['date'] or rec['id']
        print(f"{rank:2d}. {when}  val={val:.4f}  z={z:+.2f}  IQR_outlier={flag}  era1={rec['era1']}  era2={rec['era2']}")
        entry = {
            "emotion": emo,
            "rank": rank,
            "value": f"{val:.6f}",
            "z_mad": f"{z:.3f}",
            "iqr_outlier": "1" if flag else "0",
            "text_snippet": text_snip,
        }
        # Identifying fields are copied over unchanged.
        for field in ("id", "date", "year", "era1", "era2"):
            entry[field] = rec[field]
        results.append(entry)

# ---- Write everything into one TSV next to the input file ----
out_columns = ["emotion","rank","id","date","year","era1","era2","value","z_mad","iqr_outlier","text_snippet"]
out_path = Path(TSV).with_suffix(".outliers_topN.tsv")
with out_path.open("w", encoding="utf-8", newline="") as fout:
    w = csv.DictWriter(fout, fieldnames=out_columns, delimiter="\t", lineterminator="\n")
    w.writeheader()
    w.writerows(results)
print(f"\n[OK] geschrieben: {out_path}")

# ---- (optional) small scatter plot per emotion (only when Plotly is available) ----
if PLOT_OK:
    for emo in EMOS:
        col = f"mean_{emo}"
        if col not in headers:
            continue
        # Recompute the scores for this emotion
        vals = [rec[col] for rec in rows]
        _, _, zlist = mad_z_scores(vals)
        _, _, _, _, iqrflag = iqr_flags(vals)

        # x: date string from the id; falls back to the year, then to the raw id
        xs = [rec["date"] or str(rec["year"]) or rec["id"] for rec in rows]
        ys = list(vals)
        labels = [
            f"{rec['date'] or rec['id']}<br>era1={rec['era1']} | era2={rec['era2']}"
            for rec in rows
        ]
        # Highlight a point when |z| >= 3 or it lies outside the IQR fences.
        conspicuous = [abs(z) >= 3 or out for z, out in zip(zlist, iqrflag)]
        colors = ["#d62728" if hit else "#1f77b4" for hit in conspicuous]  # red / blue
        sizes = [10 if hit else 6 for hit in conspicuous]

        fig = go.Figure()
        fig.add_scatter(
            x=xs,
            y=ys,
            mode="markers",
            marker=dict(size=sizes, color=colors, opacity=0.8),
            text=labels,
            hovertemplate="%{text}<br>value=%{y:.4f}<extra></extra>",
            name=emo,
        )
        fig.update_layout(
            title=f"Outliers: {emo} (rot = auffällig nach z_MAD≥3 oder IQR)",
            template="plotly_white",
            xaxis_title="Datum (aus id)",
            yaxis_title=f"mean_{emo}",
            height=420,
        )
        fig.show()



=== Joy — Top 10 nach |z_MAD| (Median=0.0391, MADn=0.0142, IQR=[0.0307,0.0479]) ===
 1. 1913-11-01  val=0.0913  z=+3.69  IQR_outlier=True  era1=pre war  era2=pre
 2. 1915-04-01  val=0.0648  z=+1.82  IQR_outlier=False  era1=war  era2=pre
 3. 1918-10-01  val=0.0195  z=-1.38  IQR_outlier=False  era1=war  era2=post
 4. 1913-02-01  val=0.0208  z=-1.29  IQR_outlier=False  era1=pre war  era2=pre
 5. 1914-01-01  val=0.0511  z=+0.85  IQR_outlier=False  era1=pre war  era2=pre
 6. 1915-12-01  val=0.0491  z=+0.71  IQR_outlier=False  era1=war  era2=pre
 7. 1916-02-01  val=0.0292  z=-0.70  IQR_outlier=False  era1=war  era2=post
 8. 1917-12-01  val=0.0299  z=-0.65  IQR_outlier=False  era1=war  era2=post
 9. 1918-02-01  val=0.0329  z=-0.44  IQR_outlier=False  era1=war  era2=post
10. 1919-07-01  val=0.0443  z=+0.37  IQR_outlier=False  era1=post war  era2=post

=== Love — Top 10 nach |z_MAD| (Median=0.0226, MADn=0.0096, IQR=[0.0192,0.0319]) ===
 1. 1915-04-01  val=0.0440  z=+2.23  IQR_outlier=False  er