In [17]:
# SIMPLE AGGREGATOR

import pandas as pd
from pathlib import Path
import re

# Folder holding the per-run metric CSVs; created here if it does not exist yet.
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Both file families carry the same factor layout after their prefix:
#   <algo>_<balance>_<fraction>_<failure>_<iid>_<seed>.csv
# algo/balance may be anything without an underscore; the rest are digit runs.
_FACTOR_SUFFIX = (
    r"(?P<algo>[^_]+)"
    r"_(?P<balance>[^_]+)"
    r"_(?P<fraction>\d+)"
    r"_(?P<failure>\d+)"
    r"_(?P<iid>\d+)"
    r"_(?P<seed>\d+)"
    r"\.csv$"
)

# Matches names such as "global_metrics_<...>.csv".
GLOBAL_RE = re.compile(r"^global_metrics_" + _FACTOR_SUFFIX)
# Matches names such as "train_metrics_<...>.csv".
TRAIN_RE = re.compile(r"^train_metrics_" + _FACTOR_SUFFIX)

In [18]:
# Function to parse factors from filename
def parse_factors(filename):
    """Decode the experiment factors embedded in a metrics CSV file name.

    The name is tried against ``GLOBAL_RE`` first and ``TRAIN_RE`` second.

    Parameters
    ----------
    filename : str
        File name to decode; the patterns are anchored at the start, so this
        should be the bare name without any directory part.

    Returns
    -------
    dict or None
        ``None`` when neither pattern matches. Otherwise a dict with the keys
        ``algo`` and ``balance`` (strings) followed by ``fraction_percent``,
        ``failure_percent``, ``iid`` and ``seed`` (ints), in that order.
    """
    for pattern in (GLOBAL_RE, TRAIN_RE):
        hit = pattern.match(filename)
        if hit is not None:
            break
    else:
        # Neither naming scheme recognised this file.
        return None

    # String-valued factors are copied verbatim.
    factors = {"algo": hit.group("algo"), "balance": hit.group("balance")}

    # (output key, regex group) pairs for the digit-only factors.
    int_fields = (
        ("fraction_percent", "fraction"),
        ("failure_percent", "failure"),
        ("iid", "iid"),
        ("seed", "seed"),
    )
    for out_key, group_name in int_fields:
        factors[out_key] = int(hit.group(group_name))
    return factors

In [19]:
# ------------------------
# Shared settings for loading / joining
# ------------------------
# A merged row is identified by the run factors (parsed from the file name)
# plus the round number read from the CSV.
JOIN_KEYS = ["algo","balance","fraction_percent","failure_percent","iid","seed","round"]
RUN_KEYS = [key for key in JOIN_KEYS if key != "round"]
COMM_COLS = ["comm_download_bytes", "comm_upload_bytes"]
KEPT_FRACTION_PERCENT = 100  # only runs whose file name carries this fraction are aggregated


def _load_metric_frames(pattern, keep_cols=None):
    """Read every CSV in RESULTS_DIR matching ``pattern`` into one DataFrame.

    Each file's factors (from ``parse_factors``) are written as constant
    columns, overwriting same-named columns from the CSV. Files whose name
    ``parse_factors`` rejects are skipped. Only rows with
    ``fraction_percent == KEPT_FRACTION_PERCENT`` are returned.

    Parameters
    ----------
    pattern : str
        Glob pattern relative to ``RESULTS_DIR``.
    keep_cols : list of str, optional
        If given, only these CSV columns are kept; any that a file lacks are
        created and filled with NaN so every frame has the same schema.

    Returns
    -------
    pandas.DataFrame
        Concatenated rows with a fresh 0..n-1 index. When no file qualifies,
        an empty frame that still has ``keep_cols`` and ``JOIN_KEYS`` as
        columns, so the filter and merge below do not raise ``KeyError``.
    """
    frames = []
    # Sorted so the row order does not depend on filesystem listing order.
    for path in sorted(RESULTS_DIR.glob(pattern)):
        factors = parse_factors(path.name)
        if factors is None:
            continue
        frame = pd.read_csv(path)
        if keep_cols is not None:
            # reindex selects the wanted columns and adds missing ones as NaN,
            # which keeps them numeric after concatenation.
            frame = frame.reindex(columns=keep_cols)
        frames.append(frame.assign(**factors))

    if not frames:
        empty_cols = list(dict.fromkeys([*(keep_cols or []), *JOIN_KEYS]))
        return pd.DataFrame(columns=empty_cols)

    loaded = pd.concat(frames, ignore_index=True)
    wanted = loaded["fraction_percent"] == KEPT_FRACTION_PERCENT
    return loaded[wanted].reset_index(drop=True)


# ------------------------
# Load GLOBAL metrics
# ------------------------
df_global = _load_metric_frames("global_metrics_*.csv")
print("Global metrics rows:", len(df_global))


# ------------------------
# Load TRAIN communication metrics
# ------------------------
# keep only round + comm metrics
df_train_comm = _load_metric_frames("train_metrics_*.csv", keep_cols=["round", *COMM_COLS])
print("Train communication rows:", len(df_train_comm))

# Left join: keep ALL global rows
df_merged = df_global.merge(
    df_train_comm,
    on=JOIN_KEYS,
    how="left"
)

# Fill gaps in the comm columns, one run at a time.
# Global rows without a matching train row come out of the left join as NaN.
# They are filled from the NEXT round of the same run (backward fill), then any
# gap left at the end of a run takes that run's last known value (forward fill).
# Grouping by run is essential: a frame-wide fill would copy values from a
# neighbouring run with a different algo/seed into this one.
# NOTE(review): presumably the unmatched rows are mostly the initial evaluation
# round — confirm that next-round fill is the intended semantics.
ordered = df_merged.sort_values(JOIN_KEYS, kind="stable")  # rounds ascending within each run
run_of_row = [ordered[key] for key in RUN_KEYS]
comm_filled = ordered[COMM_COLS].groupby(run_of_row, sort=False).bfill()
comm_filled = comm_filled.groupby(run_of_row, sort=False).ffill()
# Column assignment aligns on the index, so df_merged keeps its own row order.
df_merged[COMM_COLS] = comm_filled

# Save final merged dataframe
df_merged.to_csv(RESULTS_DIR / "0_final_merged_global_with_comm.csv", index=False)

df_merged.head()

Global metrics rows: 32320
Train communication rows: 31074


Unnamed: 0,round,global_accuracy,global_logloss,global_precision_bin,global_recall_bin,global_f1_bin,global_precision_macro,global_recall_macro,global_f1_macro,global_tn,...,fraction,failure_rate,iid,seed,algo,balance,fraction_percent,failure_percent,comm_download_bytes,comm_upload_bytes
0,0,0.5,0.693147,0.5,1.0,0.666667,0.25,0.5,0.333333,0,...,1.0,0.0,0,1145661099,FedAdagrad,balanced,100,0,48.0,48.0
1,1,0.668306,0.66218,0.610567,0.929407,0.73698,0.73141,0.668306,0.644038,3934,...,1.0,0.0,0,1145661099,FedAdagrad,balanced,100,0,48.0,48.0
2,2,0.659714,0.645028,0.602757,0.93686,0.733558,0.730547,0.659714,0.631403,3696,...,1.0,0.0,0,1145661099,FedAdagrad,balanced,100,0,48.0,48.0
3,3,0.721354,0.626537,0.680876,0.833247,0.749395,0.733024,0.721354,0.717821,5888,...,1.0,0.0,0,1145661099,FedAdagrad,balanced,100,0,48.0,48.0
4,4,0.739986,0.613254,0.717516,0.791636,0.752756,0.742574,0.739986,0.73929,6650,...,1.0,0.0,0,1145661099,FedAdagrad,balanced,100,0,48.0,48.0
