In [3]:
%pip install pandas seaborn

from pathlib import Path
import pandas as pd
import seaborn as sns
import json

data_dir = Path(r"C:\Users\alexa\Desktop\Untitled Folder")  # Pfad ggf. anpassen
files = sorted(data_dir.glob("*.json"))



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import json
import gc

batch_size = 20  # Anzahl Dateien pro Batch, kann je nach RAM angepasst werden
dfs_batch = []
dfs_final = []

for i, f in enumerate(files):
    try:
        dfs_batch.append(pd.read_json(f, lines=True))
    except ValueError:
        obj = json.loads(f.read_text(encoding="utf-8"))
        if isinstance(obj, list):
            dfs_batch.append(pd.DataFrame(obj))
        elif isinstance(obj, dict):
            dfs_batch.append(pd.json_normalize(obj))
        else:
            raise ValueError(f"Unbekanntes JSON-Format in {f.name}")

    # Sobald batch_size erreicht ist oder letzte Datei, zusammenfügen und aufsammeln
    if (i + 1) % batch_size == 0 or (i + 1) == len(files):
        temp_df = pd.concat(dfs_batch, ignore_index=True)
        dfs_final.append(temp_df)
        dfs_batch.clear()
        gc.collect()  # Speicher freigeben

# Endgültig alle Batches zusammenfügen
df = pd.concat(dfs_final, ignore_index=True)


ValueError: No objects to concatenate

In [11]:
# Beispielhafte Umbenennung auf erwartete Spalten
rename_map = {
    "duration_ms": "exec_time_ms",
    "duration": "exec_time_ms",
    "latency": "latency_ms",
    "cpu_time_ms": "cpu_ms",
    "memory_gb_s": "mem_mb_s",  # ggf. skaliert konvertieren
    "transfer_mb": "data_transfer_mb"
}
df = df.rename(columns={k:v for k,v in rename_map.items() if k in df.columns})

# Metadaten aus Dateinamen extrahieren (optional, je nach Namensschema)
def extract_meta(path: Path):
    # Beispiel: aws_eu_110_cold_results_256.json
    parts = path.stem.split("_")
    meta = {}
    if len(parts) >= 6:
        meta["provider"] = parts
        meta["scenario"] = parts[12]
        meta["load"] = int(parts[13]) if parts[13].isdigit() else parts[13]
        meta["start_type"] = parts[14]
        meta["platform"] = "lambda"  # Beispiel; ggf. aus Inhalt ableiten
        meta["settings"] = {"memory": parts[15]}
    return meta

meta_rows = []
for f in files:
    meta_rows.append(extract_meta(f))
import pandas as pd
meta_df = pd.DataFrame(meta_rows)
df[[c for c in meta_df.columns if c not in df.columns]] = meta_df[[c for c in meta_df.columns if c not in df.columns]].values


NameError: name 'df' is not defined

In [None]:
from pathlib import Path
import pandas as pd, json

data_dir = Path(r"C:\Users\alexa\Desktop\Untitled Folder")
print("Suche in:", data_dir.resolve())

# 1) Dateien sammeln: .json und .jsonl, notfalls rekursiv
files = sorted(list(data_dir.glob("*.json"))) + sorted(list(data_dir.glob("*.jsonl")))
if not files:
    files = sorted(list(data_dir.rglob("*.json"))) + sorted(list(data_dir.rglob("*.jsonl")))
print(f"{len(files)} Dateien gefunden")
print([f.name for f in files[:5]])  # Stichprobe

def load_one(f: Path) -> pd.DataFrame:
    # Versuch A: JSON-Lines
    try:
        dfA = pd.read_json(f, lines=True)
        if not dfA.empty:
            return dfA
    except ValueError:
        pass
    # Versuch B: Normales JSON
    obj = json.loads(f.read_text(encoding="utf-8"))
    if isinstance(obj, list):
        return pd.DataFrame(obj)
    if isinstance(obj, dict):
        # Heuristik: erst Liste von Records in einem Key suchen
        for k, v in obj.items():
            if isinstance(v, list) and v and isinstance(v, dict):
                return pd.json_normalize(v)
        # Sonst Top-Level-Dict flatten
        return pd.json_normalize(obj)
    # Fallback: leere Tabelle
    return pd.DataFrame()

parts = []
for f in files:
    dfi = load_one(f)
    print(f"{f.name}: {dfi.shape}")
    parts.append(dfi)

df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
print("Gesamt:", df.shape)
df.head()


In [None]:
expected = {"provider","platform","scenario","load","start_type","exec_time_ms","latency_ms",
            "cpu_ms","mem_mb_s","data_transfer_mb","cost","settings"}
missing = expected - set(df.columns)
if missing:
    # Beispiel: fehlende Spalten mit NaN auffüllen (besser: korrekt mappen/ableiten)
    for col in missing:
        df[col] = pd.NA
# harte Prüfung
missing = expected - set(df.columns)
assert not missing, f"Fehlende Spalten: {missing}"


In [None]:
metrics = ["exec_time_ms","latency_ms","cost"]

def iqr_filter(group, cols):
    for c in cols:
        q1 = group[c].quantile(0.25)
        q3 = group[c].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5*iqr
        upper = q3 + 1.5*iqr
        group = group[(group[c] >= lower) & (group[c] <= upper)]
    return group

df_clean = (df
    .groupby(["provider","platform","scenario","load","start_type"], group_keys=False)
    .apply(lambda g: iqr_filter(g, metrics)))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")
g = sns.catplot(
    data=df_clean, kind="box",
    x="provider", y="latency_ms",
    col="load", row="start_type",
    sharey=False, height=3, aspect=1.2
)
g.set_xticklabels(rotation=30)
plt.show()
