In [None]:
"""
Mixture-of-Experts pipeline for LED_a

STAGE 1  (first 50 % of the stream)
    • split 80 / 20  → expert-train / expert-val
    • train 10 one-vs-rest Hoeffding-Tree experts
    • keep all 10 experts (we do NOT pre-discard any)

STAGE 2  (second 50 % of the stream)
    • split 80 / 20  → router-train / router-val
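    • for every sample in router-train, find **which expert(s) predict correctly**
      (an expert counts as correct when it is the expert of the true class and it outputs 1,
       so at most one expert can be correct per sample)
      – if ≥1 experts correct → pick the first correct ID as the target label
      – if 0 experts correct   → skip the sample (router can’t learn from it)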
    • train a 10-way soft-max MLP router on those (x, expert-id) pairs
    • router-val:  router chooses an expert ► expert predicts ► measure accuracy
"""

# ────────────────────────────────────────────────────────────────────────────
#  Dependencies
# ────────────────────────────────────────────────────────────────────────────

from river import tree, naive_bayes
from river.datasets import synth
import matplotlib.pyplot as plt
from river import metrics
from river import forest
from river.datasets import synth
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict
import numpy as np
# ────────────────────────────────────────────────────────────────────────────
#  CONFIG
# ────────────────────────────────────────────────────────────────────────────
# Stream size and the train share used for both the expert and router halves.
TOTAL_SAMPLES = 100_000   # change for quick tests
TRAIN_RATIO = 0.8

# Problem dimensions.
NUM_CLASSES = 10
INPUT_DIM = 24            # 7 relevant + 17 irrelevant

# Router optimisation settings.
BATCH = 256
EPOCHS = 5
LR = 0.003

# Seeds: one for the synthetic stream, one for PyTorch (weight init, shuffling).
SEED_STREAM = 112
SEED_TORCH = 42
torch.manual_seed(SEED_TORCH)

# ────────────────────────────────────────────────────────────────────────────
#  LOAD STREAM  &  SPLIT
# ────────────────────────────────────────────────────────────────────────────
# Materialise the first TOTAL_SAMPLES (features, label) pairs so the stream
# can be sliced into contiguous blocks below.
led_generator = synth.LEDDrift(
    seed=SEED_STREAM,
    noise_percentage=0.10,
    irrelevant_features=True,
    n_drift_features=7,
)
stream = list(led_generator.take(TOTAL_SAMPLES))

# First half feeds the experts, second half feeds the router.
half = TOTAL_SAMPLES // 2
expert_block = stream[:half]
router_block = stream[half:]

# Inside each half the leading TRAIN_RATIO share is train, the rest is val.
exp_train_sz = int(len(expert_block) * TRAIN_RATIO)
rtr_train_sz = int(len(router_block) * TRAIN_RATIO)

exp_train = expert_block[:exp_train_sz]
exp_val = expert_block[exp_train_sz:]
rtr_train = router_block[:rtr_train_sz]
rtr_val = router_block[rtr_train_sz:]

print("── SPLITS ───────────────────────────────────────────")
print(f" Total samples         : {TOTAL_SAMPLES:,}")
print(f" Expert  train / val   : {len(exp_train):,} / {len(exp_val):,}")
print(f" Router  train / val   : {len(rtr_train):,} / {len(rtr_val):,}")

# helper: feature dict → float32 vector of length INPUT_DIM
def d2v(d):
    """Pack the values of feature dict *d* into a 1-D float32 array.

    Values are read in the dict's iteration order and exactly INPUT_DIM
    of them are consumed.
    """
    return np.fromiter(d.values(), dtype=np.float32, count=INPUT_DIM)

# ────────────────────────────────────────────────────────────────────────────
#  STAGE 1  – TRAIN 10 EXPERTS
# ────────────────────────────────────────────────────────────────────────────
# One binary ("is it class cid?") Hoeffding tree per class, plus one accuracy
# tracker per expert for the validation pass.
experts = {}
exp_val_acc = {}
for cid in range(NUM_CLASSES):
    experts[cid] = tree.HoeffdingTreeClassifier()
    exp_val_acc[cid] = metrics.Accuracy()

# Train: every expert sees every sample, labelled 1 only for its own class.
for features, label in exp_train:
    for cid, expert in experts.items():
        expert.learn_one(features, int(label == cid))

# Validate: score each expert on its own binary task (no further learning).
for features, label in exp_val:
    for cid, expert in experts.items():
        binary_target = int(label == cid)
        exp_val_acc[cid].update(binary_target, expert.predict_one(features))

print("\n── EXPERT VALIDATION ACC (first 50 %) ──────────────")
for cid, acc in exp_val_acc.items():
    print(f" Expert {cid}: {acc.get():.4f}")

# ────────────────────────────────────────────────────────────────────────────
#  STAGE 2  – BUILD ROUTER TRAIN SET
# ────────────────────────────────────────────────────────────────────────────
# Build the (feature-vector, expert-id) pairs the router is trained on.
#
# An expert counts as "correct" for a sample only when it is the expert of the
# sample's true class AND it outputs 1.  Hence the only expert that can ever
# qualify is experts[y_true]; querying the other nine trees cannot change the
# outcome, so only that one expert is evaluated per sample.
router_X, router_y = [], []

for x_dict, y_true in rtr_train:
    true_expert = experts.get(y_true)
    # Skip the sample when its true-class expert is missing or does not vote 1
    # (the router cannot learn a target from it).
    if true_expert is None or true_expert.predict_one(x_dict) != 1:
        continue
    router_X.append(d2v(x_dict))
    router_y.append(int(y_true))      # target label = id of the correct expert

if not router_X:
    # np.stack would fail on an empty list with a less helpful message.
    raise ValueError(
        "no usable router-train samples: no true-class expert predicted 1"
    )

router_X = np.stack(router_X)
router_y = np.array(router_y, dtype=np.int64)
print(f"\nRouter-train usable samples            : {len(router_y):,}")

# ────────────────────────────────────────────────────────────────────────────
#  ROUTER MLP
# ────────────────────────────────────────────────────────────────────────────
class TorchDS(Dataset):
    """Map-style dataset over a feature array ``X`` and a target array ``y``.

    Both inputs are copied into tensors once, at construction time.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X)
        self.y = torch.tensor(y)

    def __len__(self):
        """Return the number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair stored at index ``i``."""
        features = self.X[i]
        target = self.y[i]
        return features, target

# Shuffled mini-batches over the router training pairs.
router_train_ds = TorchDS(router_X, router_y)
train_dl = DataLoader(router_train_ds, shuffle=True, batch_size=BATCH)

class RouterMLP(nn.Module):
    """Two-hidden-layer MLP with SiLU activations that emits one raw score
    (logit) per expert."""

    def __init__(self, in_dim=INPUT_DIM, hidden=128, out_dim=NUM_CLASSES):
        super().__init__()
        # Layers are created in forward order and wrapped in one Sequential.
        layers = [
            nn.Linear(in_dim, hidden),
            nn.SiLU(),
            nn.Linear(hidden, hidden),
            nn.SiLU(),
            nn.Linear(hidden, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised expert scores for input ``x``."""
        return self.net(x)

# Router, optimiser and loss for the 10-way expert-selection task.
router = RouterMLP()
opt = torch.optim.Adam(router.parameters(), lr=LR)
ce = nn.CrossEntropyLoss()
# NOTE: the header below says "ACC", but the per-epoch figure printed is the
# mean cross-entropy over the router-train set.
print("\n── ROUTER TRAINING ACC (Second 50 %) ──────────────")
router.train()
for epoch in range(1, EPOCHS + 1):
    loss_sum = 0.0
    for batch_x, batch_y in train_dl:
        opt.zero_grad()
        batch_loss = ce(router(batch_x), batch_y)
        batch_loss.backward()
        opt.step()
        # Weight by batch size so the epoch figure is a per-sample mean.
        loss_sum += batch_loss.item() * batch_x.shape[0]
    mean_ce = loss_sum / len(train_dl.dataset)
    print(f"Epoch {epoch}/{EPOCHS} | router-train CE: {mean_ce:.4f}")

# ────────────────────────────────────────────────────────────────────────────
#  FINAL EVALUATION ON router-val
# ────────────────────────────────────────────────────────────────────────────
# Router picks an expert; the pipeline predicts that expert's class only if
# the expert itself votes 1, otherwise it predicts -1.
router.eval()
pipeline_acc = metrics.Accuracy()

with torch.no_grad():
    for x_dict, y_true in rtr_val:
        features = torch.tensor(d2v(x_dict))
        probs = torch.softmax(router(features), dim=0)
        eid = int(probs.argmax().item())          # expert chosen
        expert_accepts = experts[eid].predict_one(x_dict) == 1
        if expert_accepts:
            final = eid
        else:
            final = -1        # -1 is never a real label, so this counts as a miss
        pipeline_acc.update(y_true, final)

print(f"\n🏁  Pipeline accuracy on router-val slice: "
      f"{pipeline_acc.get():.4f}")

── SPLITS ───────────────────────────────────────────
 Total samples         : 100,000
 Expert  train / val   : 40,000 / 10,000
 Router  train / val   : 40,000 / 10,000

── EXPERT VALIDATION ACC (first 50 %) ──────────────
 Expert 0: 0.9583
 Expert 1: 0.9654
 Expert 2: 0.9712
 Expert 3: 0.9438
 Expert 4: 0.9663
 Expert 5: 0.9497
 Expert 6: 0.9581
 Expert 7: 0.9554
 Expert 8: 0.9370
 Expert 9: 0.9346

Router-train usable samples            : 26,575
Epoch 1/5 | router-train CE: 0.4322
Epoch 2/5 | router-train CE: 0.0063
Epoch 3/5 | router-train CE: 0.0082
Epoch 4/5 | router-train CE: 0.0050
Epoch 5/5 | router-train CE: 0.0045

🏁  Pipeline accuracy on router-val slice: 0.6725


In [31]:
"""
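	•	Router-training labels are now multi-hot – if several experts predict correctly, every one of those IDs is marked 1 (multi-label). Note: as implemented, only the true-class expert can be marked, so each label row contains at most one 1.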
	•	BCEWithLogitsLoss replaces cross-entropy so the router can “light up” multiple experts.
	•	At inference we simply take the router’s top-score expert no matter what (even if no expert was correct during training) – no more -1.
	•	If no expert happened to be correct for a router-train sample, we still keep it with all-zeros label; the BCE loss nudges logits toward 0 for that input.
"""
# ─────────────────────────────────────────────────────────────────────────────
#  Mixture-of-Experts pipeline  (LED_a, multi-label router)
# ─────────────────────────────────────────────────────────────────────────────
# ───────── CONFIG ───────────────────────────────────────────────────────────
# Stream size.  Written as 100_000: the previous grouping `1_000_00` is the
# same number but reads like one million at a glance.
TOTAL_SAMPLES  = 100_000          # change to a smaller value for quick runs
TRAIN_RATIO    = 0.80             # train share inside each half of the stream
NUM_CLASSES    = 10
INPUT_DIM      = 24               # 7 relevant + 17 irrelevant
BATCH_SIZE     = 256
EPOCHS         = 5
LR             = 3e-3
SEED_STREAM    = 112              # seed of the synthetic stream generator
SEED_TORCH     = 42               # named like SEED_STREAM instead of a bare literal
torch.manual_seed(SEED_TORCH)

# ───────── STREAM & SPLIT ───────────────────────────────────────────────────
# Materialise the first TOTAL_SAMPLES (features, label) pairs of the stream.
led_generator = synth.LEDDrift(
    seed=SEED_STREAM,
    noise_percentage=0.10,
    irrelevant_features=True,
    n_drift_features=7,
)
stream = list(led_generator.take(TOTAL_SAMPLES))

# First half → experts, second half → router.
half = TOTAL_SAMPLES // 2
expert_block = stream[:half]
router_block = stream[half:]

# Leading TRAIN_RATIO share of each half is train, the remainder is val.
exp_train_sz = int(len(expert_block) * TRAIN_RATIO)
rtr_train_sz = int(len(router_block) * TRAIN_RATIO)

exp_train = expert_block[:exp_train_sz]
exp_val = expert_block[exp_train_sz:]
rtr_train = router_block[:rtr_train_sz]
rtr_val = router_block[rtr_train_sz:]

print("── SPLITS ───────────────────────────────────────────")
print(f" total samples         : {TOTAL_SAMPLES:,}")
print(f" expert  train / val   : {len(exp_train):,} / {len(exp_val):,}")
print(f" router  train / val   : {len(rtr_train):,} / {len(rtr_val):,}")

def d2v(d):
    """Pack the values of feature dict *d* into a 1-D float32 array.

    Values are read in the dict's iteration order and exactly INPUT_DIM
    of them are consumed.
    """
    return np.fromiter(d.values(), dtype=np.float32, count=INPUT_DIM)

# ───────── 1) TRAIN TEN ONE-VS-REST EXPERTS ─────────────────────────────────
# One binary Hoeffding tree per class and one accuracy tracker per tree.
experts = {}
exp_val_acc = {}
for cid in range(NUM_CLASSES):
    experts[cid] = tree.HoeffdingTreeClassifier()
    exp_val_acc[cid] = metrics.Accuracy()

# Train each expert on its own one-vs-rest target (1 iff label == cid).
for features, label in exp_train:
    for cid, expert in experts.items():
        expert.learn_one(features, int(label == cid))

# Score each expert on the same binary target over the validation slice.
for features, label in exp_val:
    for cid, expert in experts.items():
        binary_target = int(label == cid)
        exp_val_acc[cid].update(binary_target, expert.predict_one(features))

print("\n── EXPERT VALID ACC (first 50 %) ───────────────────")
for cid, acc in exp_val_acc.items():
    print(f" expert {cid}: {acc.get():.4f}")

# ───────── 2) BUILD ROUTER-TRAIN  (multi-label targets) ────────────────────
# Build the router training set: one feature vector and one length-NUM_CLASSES
# float target row per router-train sample (rows with no positive are kept).
#
# Entry `cid` of a row is set only when expert `cid` outputs 1 AND `cid` is the
# true class.  So the only entry that can ever be 1 is index `y_true`, and
# only experts[y_true] needs to be queried; the other nine predictions cannot
# affect the row.
router_X, router_Y = [], []              # Y shape = [N, NUM_CLASSES]

for x_dict, y_true in rtr_train:
    multi = np.zeros(NUM_CLASSES, dtype=np.float32)
    true_expert = experts.get(y_true)
    if true_expert is not None and true_expert.predict_one(x_dict) == 1:
        multi[y_true] = 1.0
    router_X.append(d2v(x_dict))
    router_Y.append(multi)               # keep even if all zeros

if not router_X:
    # np.stack would fail on an empty list with a less helpful message.
    raise ValueError("router-train split is empty: nothing to train the router on")

router_X = np.stack(router_X)
router_Y = np.stack(router_Y)

print(f"\nrouter-train samples              : {len(router_Y):,}")
print(f"positive label density            : {router_Y.sum()/router_Y.size:.4f}")

# ───────── 3) ROUTER MLP  (multi-label BCE loss) ───────────────────────────
class TorchDS(Dataset):
    """Map-style dataset over a feature array ``X`` and a target array ``Y``.

    Both inputs are copied into tensors once, at construction time.
    """

    def __init__(self, X, Y):
        self.X = torch.tensor(X)
        self.Y = torch.tensor(Y)

    def __len__(self):
        """Return the number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, targets)`` pair stored at index ``i``."""
        features = self.X[i]
        targets = self.Y[i]
        return features, targets

# Shuffled mini-batches over the router training pairs.
router_train_ds = TorchDS(router_X, router_Y)
train_dl = DataLoader(router_train_ds, shuffle=True, batch_size=BATCH_SIZE)

class RouterMLP(nn.Module):
    """Two-hidden-layer MLP with SiLU activations that emits one raw score
    (logit) per expert."""

    def __init__(self, in_dim=INPUT_DIM, h=128, out=NUM_CLASSES):
        super().__init__()
        # Layers are created in forward order and wrapped in one Sequential.
        layers = [
            nn.Linear(in_dim, h),
            nn.SiLU(),
            nn.Linear(h, h),
            nn.SiLU(),
            nn.Linear(h, out),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised expert scores for input ``x``."""
        return self.net(x)

# Router, optimiser and per-output binary cross-entropy on raw logits.
router = RouterMLP()
opt = torch.optim.Adam(router.parameters(), lr=LR)
bce = nn.BCEWithLogitsLoss()

router.train()
for epoch in range(1, EPOCHS + 1):
    loss_sum = 0.0
    for batch_x, batch_y in train_dl:
        opt.zero_grad()
        batch_loss = bce(router(batch_x), batch_y)
        batch_loss.backward()
        opt.step()
        # Weight by batch size so the epoch figure is a per-sample mean.
        loss_sum += batch_loss.item() * batch_x.shape[0]
    mean_bce = loss_sum / len(train_dl.dataset)
    print(f"epoch {epoch}/{EPOCHS} | router-train BCE: "
          f"{mean_bce:.4f}")

# ───────── 4) PIPELINE EVAL on router-val (pick top expert) ────────────────
# The index of the router's highest-scoring output is used directly as the
# predicted class; the experts themselves are not consulted in this loop.
router.eval()
pipe_acc = metrics.Accuracy()

with torch.no_grad():
    for x_dict, y_true in rtr_val:
        features = torch.tensor(d2v(x_dict))
        scores = torch.sigmoid(router(features))
        eid = int(scores.argmax().item())        # expert with highest prob
        pipe_acc.update(y_true, eid)             # trust that expert’s class

print(f"\n🏁  pipeline accuracy on router-val slice: "
      f"{pipe_acc.get():.4f}")

── SPLITS ───────────────────────────────────────────
 total samples         : 100,000
 expert  train / val   : 40,000 / 10,000
 router  train / val   : 40,000 / 10,000

── EXPERT VALID ACC (first 50 %) ───────────────────
 expert 0: 0.9583
 expert 1: 0.9654
 expert 2: 0.9712
 expert 3: 0.9438
 expert 4: 0.9663
 expert 5: 0.9497
 expert 6: 0.9581
 expert 7: 0.9554
 expert 8: 0.9370
 expert 9: 0.9346

router-train samples              : 40,000
positive label density            : 0.0664
epoch 1/5 | router-train BCE: 0.1618
epoch 2/5 | router-train BCE: 0.0398
epoch 3/5 | router-train BCE: 0.0353
epoch 4/5 | router-train BCE: 0.0348
epoch 5/5 | router-train BCE: 0.0340

🏁  pipeline accuracy on router-val slice: 0.7709
