In [231]:
# ────────────────────────────────────────────────────────────────────────────
#  Dependencies
# ────────────────────────────────────────────────────────────────────────────

from river import tree, naive_bayes
from river.datasets import synth
import matplotlib.pyplot as plt
from river import metrics
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict
import numpy as np
import random

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from capymoa.stream.generator import LEDGeneratorDrift, SEA, RandomRBFGeneratorDrift
from capymoa.stream.drift import DriftStream, AbruptDrift, GradualDrift
# ────────────────────────────────────────────────────────────────────────────
#  CONFIG - same as pipeline 2
# ────────────────────────────────────────────────────────────────────────────
TOTAL_SAMPLES   = 1_000_000          # change for quick tests
TRAIN_RATIO     = 0.80
NUM_CLASSES     = 10
INPUT_DIM       = 24                 # 7 relevant + 17 irrelevant
BATCH           = 256
EPOCHS          = 75
LR              = 2e-3
SEED_STREAM     = 112
SEED_TORCH      = 42
torch.manual_seed(SEED_TORCH)
random.seed(SEED_TORCH)

In [220]:
from capymoa.stream import MOAStream, Stream
from moa.streams import ConceptDriftStream

In [221]:
led_g_stream = DriftStream(
    stream=[LEDGeneratorDrift(number_of_attributes_with_drift=1),
    
    GradualDrift(width=50_000, position=225_000),
   
    LEDGeneratorDrift(number_of_attributes_with_drift=3),
    GradualDrift(width=50_000, position=475_000),
    
    LEDGeneratorDrift(number_of_attributes_with_drift=5),
    GradualDrift(width=50_000, position=725_000),
    
    LEDGeneratorDrift(number_of_attributes_with_drift=7)
           ]
)

In [222]:
led_a_stream = DriftStream(
    stream=[LEDGeneratorDrift(number_of_attributes_with_drift=1),
    
    GradualDrift(width=50, position=249_975),
   
    LEDGeneratorDrift(number_of_attributes_with_drift=3),
    GradualDrift(width=50, position=499_975),
    
    LEDGeneratorDrift(number_of_attributes_with_drift=5),
    GradualDrift(width=50, position=749_975),
    
    LEDGeneratorDrift(number_of_attributes_with_drift=7)
           ]
)

In [223]:
sea_a_stream = DriftStream(
    stream=[SEA(function=1),
            GradualDrift(width=50, position=249_975),
            SEA(function=2),
            GradualDrift(width=50, position=499_975),
            SEA(function=4),
            GradualDrift(width=50, position=749_975),
            SEA(function=1)
           ]
)

In [224]:
sea_g_stream = DriftStream(
    stream=[SEA(function=1),
    
    GradualDrift(width=50_000, position=225_000),
   
    SEA(function=2),
    GradualDrift(width=50_000, position=475_000),
    
    SEA(function=4),
    GradualDrift(width=50_000, position=725_000),
    SEA(function=1)
    
           ]
)

In [232]:
rbf_m_stream = RandomRBFGeneratorDrift(number_of_drifting_centroids=50, magnitude_of_change=0.0001)
rbf_f_stream = RandomRBFGeneratorDrift(number_of_drifting_centroids=50, magnitude_of_change=0.001)

In [236]:
from capymoa.datasets import Electricity, Covtype

elec_stream = Electricity()
covt_stream = Covtype()

Downloading covtype.arff
100% [........................................................................] 11241262 / 11241262

In [225]:
schema = led_g_stream.get_schema()
schema

@relation 'generators.LEDGeneratorDrift '

@attribute att1 {0,1}
@attribute att2 {0,1}
@attribute att3 {0,1}
@attribute att4 {0,1}
@attribute att5 {0,1}
@attribute att6 {0,1}
@attribute att7 {0,1}
@attribute att8 {0,1}
@attribute att9 {0,1}
@attribute att10 {0,1}
@attribute att11 {0,1}
@attribute att12 {0,1}
@attribute att13 {0,1}
@attribute att14 {0,1}
@attribute att15 {0,1}
@attribute att16 {0,1}
@attribute att17 {0,1}
@attribute att18 {0,1}
@attribute att19 {0,1}
@attribute att20 {0,1}
@attribute att21 {0,1}
@attribute att22 {0,1}
@attribute att23 {0,1}
@attribute att24 {0,1}
@attribute class {0,1,2,3,4,5,6,7,8,9}

@data

In [226]:
sea_schema = sea_g_stream.get_schema()
sea_schema

@relation 'generators.SEAGenerator '

@attribute attrib1 numeric
@attribute attrib2 numeric
@attribute attrib3 numeric
@attribute class {groupA,groupB}

@data

In [184]:
led_g_stream.next_instance()

LabeledInstance(
    Schema(generators.LEDGeneratorDrift ),
    x=[1. 1. 0. ... 0. 0. 0.],
    y_index=5,
    y_label='5'
)

In [227]:
def to_tensor(x):
    return torch.tensor(x, dtype=torch.float32)

In [228]:
# ──────────────────────────────────────────────────────────────
# 1.  Hyper-params & boiler-plate
# ──────────────────────────────────────────────────────────────

from capymoa.classifier import HoeffdingTree


n_experts = 15

TOP_K         = 3            # update the K heaviest-weighted experts
PRINT_EVERY   = 10_000
CLASSES       = list(range(NUM_CLASSES))

def to_tensor(x):
    return torch.tensor(x, dtype=torch.float32)

# ──────────────────────────────────────────────────────────────
# 2.  Initialise experts and router
# ──────────────────────────────────────────────────────────────

class RouterMLP(nn.Module):
    def __init__(self, in_dim=INPUT_DIM, h=256, out_dim=n_experts):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, h), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(h, h // 2), nn.ReLU(),
            nn.Linear(h // 2, out_dim)
        )
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)
    def forward(self, x): return self.net(x)



In [229]:


# ──────────────────────────────────────────────────────────────
# 3.  Online joint-training loop (basic version)
# ──────────────────────────────────────────────────────────────
experts = {i: HoeffdingTree(schema=schema, grace_period=50, confidence=1e-07, binary_split=False, stop_mem_management=False) for i in range(n_experts)}

router = RouterMLP()
opt    = torch.optim.Adam(router.parameters(), lr=LR)
nll    = nn.NLLLoss(reduction="mean")

pipeline_acc = metrics.Accuracy()
running_loss = 0.0

led_g_stream.restart()
sea_g_stream.restart()
#NUM_CLASSES = 2
router.train()
micro_X, micro_y = [], []

for t in range(TOTAL_SAMPLES):
    # 3-A  Embed sample

    instance = led_g_stream.next_instance()
    x_vec = instance.x
    y_true = instance.y_index
    x_t   = to_tensor(x_vec).unsqueeze(0)         # 1×24

    # 3-B  Router forward
    logits  = router(x_t) # 1×n_experts
    #tau0, tau_min, decay_steps = 2.0, 0.7, 80_000   
    #tau = max(tau_min, tau0 * (1 - t / decay_steps)) # linear-cosine also works
    #weights = torch.softmax(logits / tau, dim=1)     # replaces previous softmax                       
    weights = torch.softmax(logits, dim=1)        # 1×n_experts

    # 3-C  Gather experts’ probability vectors
    exp_probs = []
    for e in experts.values():
        p_list = e.predict_proba(instance) or [1/NUM_CLASSES for c in CLASSES]
        if p_list is None:                            # brand-new leaf
            padded_p_list = [1 / NUM_CLASSES] * NUM_CLASSES  # uniform prior
        elif len(p_list) < NUM_CLASSES:               # seen some classes
            
            padded_p_list = list(p_list) + [0.0] * (NUM_CLASSES - len(list(p_list)))
        else:                                      # already full length
            padded_p_list = list(p_list)
        exp_probs.append(padded_p_list)
    exp_probs = torch.tensor(exp_probs)           # n_experts × C

    mix_prob = torch.mm(weights, exp_probs) + 1e-9
    log_mix  = (mix_prob / mix_prob.sum()).log()  # 1×C log-probs

    # 3-D  Accumulate mini-batch for router update
    micro_X.append(log_mix)
    micro_y.append(y_true)
    if len(micro_X) == BATCH:
        batch_X = torch.cat(micro_X, dim=0)       # B×C
        batch_y = torch.tensor(micro_y)
        loss = nll(batch_X, batch_y)
        opt.zero_grad(); loss.backward(); opt.step()
        running_loss += loss.item() * BATCH
        micro_X.clear(); micro_y.clear()

    # 3-E  Top-K expert updates
    with torch.no_grad():
        topk_ids = torch.topk(weights, k=TOP_K, dim=1).indices.squeeze(0)
    for eid in topk_ids.tolist():
        experts[eid].train(instance)

    # 3-F  Running metrics
    y_hat = CLASSES[int(torch.argmax(mix_prob))]
    pipeline_acc.update(y_true, y_hat)

    if t % PRINT_EVERY == 0:
        avg_ce = running_loss / max(1, (t // BATCH))
        print(f"[{t:,} samples]  router CE: {avg_ce:.4f}   "
              f"pipeline acc: {pipeline_acc.get():.4f}")
        running_loss = 0.0

print("🏁 train-window accuracy:", pipeline_acc.get())

"""
# ──────────────────────────────────────────────────────────────
# 4.  Hold-out evaluation  (last 10 %)
# ──────────────────────────────────────────────────────────────
router.eval()
hold_acc = metrics.Accuracy()

with torch.no_grad():
    for x_dict, y_true in hold_stream:
        x_vec = d2v(x_dict)
        logits  = router(to_tensor(x_vec).unsqueeze(0))
        weights = torch.softmax(logits, dim=1)
        exp_probs = []
        for e in experts.values():
            pdict = e.predict_proba_one(x_dict) or {c: 1/NUM_CLASSES for c in CLASSES}
            exp_probs.append([pdict.get(c, 0.0) for c in CLASSES])
        exp_probs = torch.tensor(exp_probs)
        mix_prob  = torch.mm(weights, exp_probs)
        y_hat     = CLASSES[int(torch.argmax(mix_prob))]
        hold_acc.update(y_true, y_hat)

print("🏁 hold-out (10 %) accuracy:", hold_acc.get())
"""

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x3 and 24x256)

In [191]:
# ──────────────────────────────────────────────────────────────
# 3.  Online joint-training loop (BCE loss version)
# ──────────────────────────────────────────────────────────────

experts        = {i: HoeffdingTree(schema=schema, grace_period=50, confidence=1e-07, binary_split=False, stop_mem_management=False)
                  for i in range(n_experts)}
expert_metrics = {i: metrics.Accuracy() for i in range(n_experts)}

router        = RouterMLP()
opt_router    = torch.optim.Adam(router.parameters(), lr=LR)
bce_loss      = nn.BCEWithLogitsLoss()
pipeline_acc  = metrics.Accuracy()
running_loss = 0.0

# Buffer is not needed for KL, but we keep it if you want to inspect router‐weights
router_buffer = deque(maxlen=10_000)

led_g_stream.restart()
router.train()
micro_logits, micro_multi = [], []

for t in range(TOTAL_SAMPLES):
    instance = led_g_stream.next_instance()
    x_list, y_true = instance.x, instance.y_index
    x_t = to_tensor(x_list).unsqueeze(0)   # [1×24]

    # 4.1 – Router forward (raw logits, no temperature)
    logits  = router(x_t)                   # [1×n_experts]
    weights = torch.softmax(logits, dim=1)  # [1×n_experts]
    router_buffer.append(weights.squeeze(0))

    # 4.2 – Gather expert probabilities
    exp_probs = []
    for eid in range(n_experts):
        p_list = experts[eid].predict_proba(instance)
        if p_list is None:
            padded = [1.0 / NUM_CLASSES] * NUM_CLASSES
        else:
            padded = list(p_list)
            if len(padded) < NUM_CLASSES:
                padded = padded + [0.0] * (NUM_CLASSES - len(padded))
        exp_probs.append(padded)
    exp_probs = torch.tensor(np.stack(exp_probs, axis=0), dtype=torch.float32)  # [n_experts × C]

    # 4.3 – Build multi-hot “which experts predict y_true correctly?”
    correct_mask = torch.zeros(n_experts, dtype=torch.float32)
    for eid in range(n_experts):
        p_list = exp_probs[eid].numpy().tolist()
        pred_cls = int(np.argmax(p_list))
        if pred_cls == y_true:
            correct_mask[eid] = 1.0
    if correct_mask.sum() == 0.0:
        # fallback: pick expert with highest P(y_true)
        best_e = int(torch.argmax(exp_probs[:, y_true]).item())
        correct_mask[best_e] = 1.0

    # 4.4 – Accumulate minibatch for BCE update
    micro_logits.append(logits)                    # [1×n_experts]
    micro_multi.append(correct_mask.unsqueeze(0))  # [1×n_experts]
    if len(micro_logits) == BATCH:
        batch_logits = torch.cat(micro_logits, dim=0)  # [B×n_experts]
        batch_multi  = torch.cat(micro_multi, dim=0)   # [B×n_experts]
        loss_bce     = bce_loss(batch_logits, batch_multi)
        opt_router.zero_grad()
        loss_bce.backward()
        opt_router.step()
        micro_logits.clear()
        micro_multi.clear()

    # 4.5 – Top-K=3 expert updates
    topk_ids = torch.topk(weights, k=TOP_K, dim=1).indices.squeeze(0)
    for eid in topk_ids.tolist():
        experts[eid].train(instance)
        p_list = experts[eid].predict_proba(instance)
        if p_list is None:
            pred_cls = -1
        else:
            pred_cls = int(np.argmax(p_list))
        expert_metrics[eid].update(y_true, pred_cls)

    # 4.6 – Pipeline accuracy (argmax weight)
    chosen_eid = int(torch.argmax(weights).item())
    p_list = experts[chosen_eid].predict_proba(instance)
    if p_list is None:
        y_hat = -1
    else:
        y_hat = int(np.argmax(p_list))
    pipeline_acc.update(y_true, y_hat)

    # 4.7 – Logging
    if t % 10_000 == 0:
        print(f"[{t:,} samples]  PipeAcc = {pipeline_acc.get():.4f}")

print("🏁 Final pipeline accuracy:", pipeline_acc.get())

[0 samples]  PipeAcc = 1.0000
[10,000 samples]  PipeAcc = 0.7602
[20,000 samples]  PipeAcc = 0.7550
[30,000 samples]  PipeAcc = 0.7515
[40,000 samples]  PipeAcc = 0.7495
[50,000 samples]  PipeAcc = 0.7499
[60,000 samples]  PipeAcc = 0.7488
[70,000 samples]  PipeAcc = 0.7481
[80,000 samples]  PipeAcc = 0.7477
[90,000 samples]  PipeAcc = 0.7478
[100,000 samples]  PipeAcc = 0.7483
[110,000 samples]  PipeAcc = 0.7479
[120,000 samples]  PipeAcc = 0.7485
[130,000 samples]  PipeAcc = 0.7481
[140,000 samples]  PipeAcc = 0.7474
[150,000 samples]  PipeAcc = 0.7481
[160,000 samples]  PipeAcc = 0.7479
[170,000 samples]  PipeAcc = 0.7478
[180,000 samples]  PipeAcc = 0.7475
[190,000 samples]  PipeAcc = 0.7470
[200,000 samples]  PipeAcc = 0.7457
[210,000 samples]  PipeAcc = 0.7435
[220,000 samples]  PipeAcc = 0.7402
[230,000 samples]  PipeAcc = 0.7361
[240,000 samples]  PipeAcc = 0.7319
[250,000 samples]  PipeAcc = 0.7296
[260,000 samples]  PipeAcc = 0.7292
[270,000 samples]  PipeAcc = 0.7297
[280,00