In [16]:
# ────────────────────────────────────────────────────────────────────────────
#  Dependencies
# ────────────────────────────────────────────────────────────────────────────

from river import tree, naive_bayes
from river.datasets import synth
import matplotlib.pyplot as plt
from river import metrics
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict
import numpy as np
import random

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import calinski_harabasz_score
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# ────────────────────────────────────────────────────────────────────────────
#  CONFIG - same as pipeline 2
# ────────────────────────────────────────────────────────────────────────────
TOTAL_SAMPLES   = 100000          # change for quick tests
TRAIN_RATIO     = 0.80
NUM_CLASSES     = 10
INPUT_DIM       = 24                 # 7 relevant + 17 irrelevant
BATCH           = 256
EPOCHS          = 75
LR              = 3e-3
SEED_STREAM     = 112
SEED_TORCH      = 42
torch.manual_seed(SEED_TORCH)
random.seed(SEED_TORCH)

In [17]:
# ────────────────────────────────────────────────────────────────────────────
#  LOAD STREAM
# ────────────────────────────────────────────────────────────────────────────
stream = list(
    synth.LEDDrift(
        seed                = SEED_STREAM,
        noise_percentage    = 0.10,
        irrelevant_features = True,
        n_drift_features    = 7
    ).take(TOTAL_SAMPLES)
)

half               = TOTAL_SAMPLES // 2
exp_cluster = TOTAL_SAMPLES // 10

# helper: dict→24-float vector
d2v = lambda d: np.fromiter(d.values(), dtype=np.float32, count=INPUT_DIM)

In [18]:
SAMPLE_FRAC  = 0.10                     # 10 % of the training window

# 0-a  Split once:  90 % train | 10 % hold-out
stream        = list(stream)            # make slice-able
TOTAL         = len(stream)
split_at      = int(0.9 * TOTAL)
train_stream  = stream[:split_at]
hold_stream   = stream[split_at:]       # untouched until final eval

# 0-b  Draw a small uniform sample from the *train* part
sample_len    = int(SAMPLE_FRAC * len(train_stream))
random.seed(SEED_TORCH)
sample_idx    = random.sample(range(len(train_stream)), sample_len)
cluster_block = [train_stream[i] for i in sample_idx]

# 0-c  Embed  ➜  scale  ➜  whiten
X   = np.stack([d2v(x) for x, _ in cluster_block])
scaler = StandardScaler().fit(X)
X_std  = scaler.transform(X)
pca    = PCA(whiten=True).fit(X_std)
X_wht  = pca.transform(X_std)

# 0-d  Fit candidate GMMs and pick k with lowest BIC
bic_scores, gmms = [], []
for k in range(1, NUM_CLASSES + 1):
    g = GaussianMixture(n_components=k, covariance_type="diag",
                        random_state=SEED_TORCH).fit(X_wht)
    bic_scores.append(g.bic(X_wht))
    gmms.append(g)

best_k      = int(np.argmin(bic_scores)) + 1   # +1 because range starts at 1
gmm         = gmms[best_k - 1]                 # keep if you still want cluster IDs
n_experts   = best_k                           # ← the number you will use

print(f"Stage 0 ✔  selected n_experts = {n_experts} (BIC)")

Stage 0 ✔  selected n_experts = 9 (BIC)


In [19]:
# ──────────────────────────────────────────────────────────────
# 1.  Hyper-params & boiler-plate
# ──────────────────────────────────────────────────────────────

TOP_K         = 2            # update the K heaviest-weighted experts
PRINT_EVERY   = 1_000
CLASSES       = list(range(NUM_CLASSES))

def to_tensor(x):
    return torch.tensor(x, dtype=torch.float32)

# ──────────────────────────────────────────────────────────────
# 2.  Initialise experts and router
# ──────────────────────────────────────────────────────────────
experts = {i: tree.HoeffdingTreeClassifier() for i in range(n_experts)}

class RouterMLP(nn.Module):
    def __init__(self, in_dim=INPUT_DIM, h=256, out_dim=n_experts):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, h), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(h, h // 2), nn.ReLU(),
            nn.Linear(h // 2, out_dim)
        )
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)
    def forward(self, x): return self.net(x)

router = RouterMLP()
opt    = torch.optim.Adam(router.parameters(), lr=LR)
nll    = nn.NLLLoss(reduction="mean")

pipeline_acc = metrics.Accuracy()
running_loss = 0.0

# ──────────────────────────────────────────────────────────────
# 3.  Online joint-training loop  (90 % train slice)
# ──────────────────────────────────────────────────────────────
router.train()
micro_X, micro_y = [], []

for t, (x_dict, y_true) in enumerate(train_stream, 1):
    # 3-A  Embed sample
    x_vec = d2v(x_dict)
    x_t   = to_tensor(x_vec).unsqueeze(0)         # 1×24

    # 3-B  Router forward
    logits  = router(x_t)                         # 1×n_experts
    weights = torch.softmax(logits, dim=1)        # 1×n_experts

    # 3-C  Gather experts’ probability vectors
    exp_probs = []
    for e in experts.values():
        pdict = e.predict_proba_one(x_dict) or {c: 1/NUM_CLASSES for c in CLASSES}
        exp_probs.append([pdict.get(c, 0.0) for c in CLASSES])
    exp_probs = torch.tensor(exp_probs)           # n_experts × C

    mix_prob = torch.mm(weights, exp_probs) + 1e-9
    log_mix  = (mix_prob / mix_prob.sum()).log()  # 1×C log-probs

    # 3-D  Accumulate mini-batch for router update
    micro_X.append(log_mix)
    micro_y.append(y_true)
    if len(micro_X) == BATCH:
        batch_X = torch.cat(micro_X, dim=0)       # B×C
        batch_y = torch.tensor(micro_y)
        loss = nll(batch_X, batch_y)
        opt.zero_grad(); loss.backward(); opt.step()
        running_loss += loss.item() * BATCH
        micro_X.clear(); micro_y.clear()

    # 3-E  Top-K expert updates
    with torch.no_grad():
        topk_ids = torch.topk(weights, k=TOP_K, dim=1).indices.squeeze(0)
    for eid in topk_ids.tolist():
        experts[eid].learn_one(x_dict, y_true)

    # 3-F  Running metrics
    y_hat = CLASSES[int(torch.argmax(mix_prob))]
    pipeline_acc.update(y_true, y_hat)

    if t % PRINT_EVERY == 0:
        avg_ce = running_loss / max(1, (t // BATCH))
        print(f"[{t:,} samples]  router CE: {avg_ce:.4f}   "
              f"pipeline acc: {pipeline_acc.get():.4f}")
        running_loss = 0.0

print("🏁 train-window accuracy:", pipeline_acc.get())

# ──────────────────────────────────────────────────────────────
# 4.  Hold-out evaluation  (last 10 %)
# ──────────────────────────────────────────────────────────────
router.eval()
hold_acc = metrics.Accuracy()

with torch.no_grad():
    for x_dict, y_true in hold_stream:
        x_vec = d2v(x_dict)
        logits  = router(to_tensor(x_vec).unsqueeze(0))
        weights = torch.softmax(logits, dim=1)
        exp_probs = []
        for e in experts.values():
            pdict = e.predict_proba_one(x_dict) or {c: 1/NUM_CLASSES for c in CLASSES}
            exp_probs.append([pdict.get(c, 0.0) for c in CLASSES])
        exp_probs = torch.tensor(exp_probs)
        mix_prob  = torch.mm(weights, exp_probs)
        y_hat     = CLASSES[int(torch.argmax(mix_prob))]
        hold_acc.update(y_true, y_hat)

print("🏁 hold-out (10 %) accuracy:", hold_acc.get())

[1,000 samples]  router CE: 425.0343   pipeline acc: 0.5630
[2,000 samples]  router CE: 133.7576   pipeline acc: 0.6535
[3,000 samples]  router CE: 91.3693   pipeline acc: 0.6853
[4,000 samples]  router CE: 59.4474   pipeline acc: 0.7030
[5,000 samples]  router CE: 46.8206   pipeline acc: 0.7124
[6,000 samples]  router CE: 38.0124   pipeline acc: 0.7200
[7,000 samples]  router CE: 32.0989   pipeline acc: 0.7260
[8,000 samples]  router CE: 27.3526   pipeline acc: 0.7305
[9,000 samples]  router CE: 24.1677   pipeline acc: 0.7352
[10,000 samples]  router CE: 23.6133   pipeline acc: 0.7365
[11,000 samples]  router CE: 15.7349   pipeline acc: 0.7397
[12,000 samples]  router CE: 18.0157   pipeline acc: 0.7422
[13,000 samples]  router CE: 17.6991   pipeline acc: 0.7449
[14,000 samples]  router CE: 14.9804   pipeline acc: 0.7471
[15,000 samples]  router CE: 15.0886   pipeline acc: 0.7476
[16,000 samples]  router CE: 12.7112   pipeline acc: 0.7492
[17,000 samples]  router CE: 13.1319   pipeline