
# 02 · Harmonisierung

Dieses Notebook lädt die Statista-CSV-Dateien (2021, 2022, 2024), harmonisiert die Kategorien
gemäß Mapping (aus Notebook 01), **konsolidiert Teilkategorien**, und exportiert die Daten als
**long**- und **wide**-Format für das Reporting.


In [6]:

# 02_harmonisierung — Cell 1: Imports, Pfade, Config laden
from __future__ import annotations

from pathlib import Path
import pandas as pd
import re, json
from datetime import datetime

# Resolve the project layout from the current working directory: when the
# notebook runs from a folder named "notebooks" (case-insensitive), the project
# root is that folder's parent; otherwise the working directory is the root.
NB_DIR  = Path.cwd().resolve()
BASE    = NB_DIR.parents[0] if NB_DIR.name.lower() == "notebooks" else NB_DIR
DATA    = BASE / "data"
RAW     = DATA / "raw"
OUT     = DATA / "processed"
# Create the output folder (including missing parents); no-op if it exists.
OUT.mkdir(parents=True, exist_ok=True)

# The project configuration is read from data/processed/project_config.json
# (presumably written by notebook 01 — TODO confirm). It must provide the keys
# "kanon" and "paths" -> "mapping_csv"; a missing file or key raises here.
CONFIG = json.loads((OUT / "project_config.json").read_text(encoding="utf-8"))
KANON  = CONFIG["kanon"]
# NOTE(review): the recorded output shows an absolute, machine-specific path
# stored in the config — confirm the notebook is not expected to run elsewhere.
MAP_CSV = Path(CONFIG["paths"]["mapping_csv"])
# The mapping table is expected to contain the columns "source_normalized" and
# "kanon", which the lookup built in the helper cell reads.
mapping_df = pd.read_csv(MAP_CSV)
print("Mapping geladen aus:", MAP_CSV)


Mapping geladen aus: D:\Q3_2025\data-analytics\project\data\processed\mapping_statista_to_kanon.csv


In [7]:

# 02_harmonisierung — Cell 2: Hilfsfunktionen
def read_csv_robust(
    path: Path,
    encodings: tuple[str, ...] | list[str] = ("utf-8-sig", "cp1252", "latin1"),
) -> pd.DataFrame:
    """Read a CSV file, trying several text encodings in order.

    The delimiter is sniffed by pandas (``sep=None`` with the python engine),
    so comma- and semicolon-separated exports are both accepted.

    Parameters
    ----------
    path:
        Location of the CSV file.
    encodings:
        Candidate encodings, tried from first to last. The default ends with
        ``latin1``, which can decode any byte sequence, so with the default
        list a decode failure never propagates.

    Returns
    -------
    pd.DataFrame
        The frame parsed with the first encoding that decodes the file.

    Raises
    ------
    ValueError
        If ``encodings`` is empty.
    UnicodeDecodeError
        If none of the candidate encodings can decode the file (only possible
        with a custom ``encodings`` list).
    """
    if not encodings:
        raise ValueError("read_csv_robust: 'encodings' darf nicht leer sein.")

    last_err: UnicodeDecodeError | None = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, sep=None, engine="python")
        except UnicodeDecodeError as err:
            # Remember the failure and move on to the next candidate.
            last_err = err
    # Every candidate failed: surface the real decode error instead of
    # silently re-reading the file with a hard-coded encoding.
    raise last_err

def norm_text(s: str) -> str:
    """Normalise a category label into the key format used for mapping lookups.

    Missing values become the empty string. Otherwise the value is converted to
    a string, trimmed and lower-cased; "&" is replaced by "und" and "-" by a
    space; whitespace runs are collapsed to a single space; finally two known
    spelling variants ("accressoires", "hi tech") are corrected.
    """
    if pd.isna(s):
        return ""

    text = str(s).strip().lower()

    # Symbol substitutions happen before whitespace is collapsed.
    for old, new in (("&", "und"), ("-", " ")):
        text = text.replace(old, new)
    text = re.sub(r"\s+", " ", text)

    # Spelling fixes rely on the single-space form produced above.
    for old, new in (("accressoires", "accessoires"), ("hi tech", "high tech")):
        text = text.replace(old, new)
    return text

# Lookup table built from the mapping CSV: stringified "source_normalized"
# value -> "kanon" value. For duplicate source keys the last row wins.
MAP_NORM = {
    source_key: kanon_label
    for source_key, kanon_label in zip(
        mapping_df["source_normalized"].map(str), mapping_df["kanon"]
    )
}

def pick_statista_cols(df: pd.DataFrame) -> tuple[str, str | None]:
    """Locate the category column and, if present, the value column of ``df``.

    Column labels are compared after stripping and lower-casing. When several
    columns qualify for the same role, the last one in column order is used.

    Returns
    -------
    tuple
        ``(category_column, value_column)`` using the original column labels;
        ``value_column`` is ``None`` when no value-like column exists.

    Raises
    ------
    ValueError
        If no category column is found.
    """
    category_keys = {"category","kategorie","warengruppe","produktkategorie","bereich"}
    value_keys = {"value","wert","prozent","anteil","share","%","rate","anzahl"}

    # Normalised label -> original label (later duplicates overwrite the value).
    by_key = {str(label).strip().lower(): label for label in df.columns}

    cat_hits = [label for key, label in by_key.items() if key in category_keys]
    val_hits = [label for key, label in by_key.items() if key in value_keys]

    if not cat_hits:
        raise ValueError("CSV: Kategorien-Spalte (z.B. 'Kategorie'/'Category') fehlt.")

    val_col = val_hits[-1] if val_hits else None
    return cat_hits[-1], val_col

def clean_value_col(s: pd.Series) -> pd.Series:
    """Convert a column of percent-like strings into numbers.

    Values are cast to strings, "%" signs are removed, decimal commas become
    decimal points and surrounding whitespace is stripped. Anything that still
    cannot be parsed as a number (including missing values) becomes 0.0.
    """
    text = s.astype(str)
    for token, replacement in (("%", ""), (",", ".")):
        text = text.str.replace(token, replacement, regex=False)

    numbers = pd.to_numeric(text.str.strip(), errors="coerce")
    return numbers.fillna(0.0)

def map_category_to_kanon(cat: pd.Series) -> pd.Series:
    """Translate raw category labels into their canonical category.

    Each label is normalised with ``norm_text`` and looked up in ``MAP_NORM``;
    labels without an entry are marked as "IGNORE".
    """
    def _lookup(raw_label):
        return MAP_NORM.get(norm_text(raw_label), "IGNORE")

    return cat.map(_lookup)


In [8]:

# 02_harmonisierung — Cell 3: Loader pro Jahr
def load_statista_csv(year: int, fname: str) -> pd.DataFrame:
    """Load one raw Statista CSV and aggregate it per canonical category.

    The file ``RAW / fname`` is read, its category and value columns are
    detected, values are cleaned to numbers and each source category is mapped
    to its canonical category. Rows mapped to "IGNORE" (explicitly, via a
    missing mapping target, or because no mapping entry exists) are dropped.
    Values of source categories sharing a canonical category are summed.

    Returns
    -------
    pd.DataFrame
        One row per canonical category with columns "year", "Kategorie" and
        "value".
    """
    raw = read_csv_robust(RAW / fname)
    cat_col, val_col = pick_statista_cols(raw)

    keep = [cat_col, val_col] if val_col else [cat_col]
    # Without a recognised value column every row gets the value 0.0.
    values = 0.0 if val_col is None else clean_value_col(raw[val_col])

    tidy = (
        raw[keep]
        .rename(columns={cat_col: "source_category"})
        .assign(
            value=values,
            year=year,
            Kategorie=lambda frame: map_category_to_kanon(
                frame["source_category"]
            ).fillna("IGNORE"),
        )
    )

    mapped = tidy[tidy["Kategorie"] != "IGNORE"]
    # Consolidate sub-categories: sum all values that share a canonical category.
    return mapped.groupby(["year", "Kategorie"], as_index=False)["value"].sum()


In [9]:

# 02_harmonisierung — Cell 4: load, merge, pivot, plausibility checks
# Survey year -> file name expected inside the RAW folder.
available = {
    2021: "statista_2021.csv",
    2022: "statista_2022.csv",
    2024: "statista_2024.csv",
}
# Load every file that exists; a missing file only prints a warning, so the
# result may cover fewer years than listed above.
frames = []
for yr, fn in available.items():
    p = RAW / fn
    if p.exists():
        print(f"Lade {yr} → {p}")
        frames.append(load_statista_csv(yr, fn))
    else:
        print(f"Warnung: Datei fehlt → {p}")
# Abort only when not a single input file was found.
if not frames:
    raise FileNotFoundError("Keine Statista-Dateien gefunden.")

# Long format: one row per (year, Kategorie).
stat_all = pd.concat(frames, ignore_index=True)
# Pivot: categories x years
# Category/year combinations without data are filled with 0.0.
# NOTE(review): this makes "no data" indistinguishable from a real 0 — confirm
# that the reporting step expects this.
stat_pivot = stat_all.pivot(index="Kategorie", columns="year", values="value").fillna(0.0)
# Put the rows into the order given by KANON: categories not listed in KANON
# are dropped, listed categories without any data become all-zero rows.
stat_pivot = stat_pivot.reindex(KANON).fillna(0.0)

display(stat_all.sort_values(["year","Kategorie"]).head(10))
display(stat_pivot.reset_index().head(10))

# Plausibility check: totals per year (note: need not be 100, since the data
# may cover only a subset)
# NOTE(review): the recorded totals are well above 100 because values of
# consolidated sub-categories are added up — confirm that summing these shares
# is the intended consolidation rule.
sums = stat_pivot.sum(axis=0)
print("Summen je Jahr (in %):")
print(sums)


Lade 2021 → D:\Q3_2025\data-analytics\project\data\raw\statista_2021.csv
Lade 2022 → D:\Q3_2025\data-analytics\project\data\raw\statista_2022.csv
Lade 2024 → D:\Q3_2025\data-analytics\project\data\raw\statista_2024.csv


Unnamed: 0,year,Kategorie,value
0,2021,"Elektronik (z. B. Smartphones, Haushaltsgeräte)",38
1,2021,Hobby- & Freizeitartikel,83
2,2021,Kleidung / Schuhe,57
3,2021,Lebensmittel / Getränke,33
4,2021,Medikamente / Drogerieartikel,78
5,2021,Möbel / Wohnaccessoires,38
6,2022,Bücher / Medien / Software,92
7,2022,"Elektronik (z. B. Smartphones, Haushaltsgeräte)",145
8,2022,Hobby- & Freizeitartikel,71
9,2022,Kleidung / Schuhe,78


year,Kategorie,2021,2022,2024
0,Kleidung / Schuhe,57.0,78.0,106.0
1,"Elektronik (z. B. Smartphones, Haushaltsgeräte)",38.0,145.0,38.0
2,Lebensmittel / Getränke,33.0,28.0,16.0
3,Bücher / Medien / Software,0.0,92.0,25.0
4,Medikamente / Drogerieartikel,78.0,92.0,51.0
5,Hobby- & Freizeitartikel,83.0,71.0,57.0
6,Möbel / Wohnaccessoires,38.0,32.0,15.0


Summen je Jahr (in %):
year
2021    327.0
2022    538.0
2024    308.0
dtype: float64


In [10]:

# 02_harmonisierung — Cell 5: exports
# Both files are written as UTF-8 CSV into the OUT folder; existing files with
# the same name are overwritten.
long_path = OUT / "statista_long_2021_2022_2024.csv"
wide_path = OUT / "statista_harmonisiert_2021_2022_2024.csv"
# Long format: one row per (year, Kategorie), sorted by year and category.
stat_all.sort_values(["year","Kategorie"]).to_csv(long_path, index=False, encoding="utf-8")
# Wide format: one row per category, one column per year; reset_index turns the
# category index into a regular "Kategorie" column.
stat_pivot.reset_index().to_csv(wide_path, index=False, encoding="utf-8")
print("Exportiert:")
print("-", long_path)
print("-", wide_path)


Exportiert:
- D:\Q3_2025\data-analytics\project\data\processed\statista_long_2021_2022_2024.csv
- D:\Q3_2025\data-analytics\project\data\processed\statista_harmonisiert_2021_2022_2024.csv
