# 探索：在线学习

我希望当新一批 embeddings 进入时，只进行少量的训练。既让模型适应新数据，又尽量不使原来的 embedding - label 映射发生偏移。

我的计划是：

1. 先训练一次 DEC 模型
2. 再将原本一半样本丢弃，加入与丢弃数量相同的新样本

观察模型在新数据集上准确率是否有改善，以及聚类中心的变动是否平缓。

In [1]:
import copy
import torch
import numpy as np
import torch.nn.functional as F
import torch.optim as optim

import dec

from torch.utils.data import DataLoader, TensorDataset
from sklearn.cluster import KMeans
from scipy.optimize import linear_sum_assignment

In [2]:
# 参数配置
TRAIN_CSV_PATH = './data/train_embed_label.csv'
TEST_CSV_PATH = './data/test_embed_label.csv'

# 模型超参数配置
config = {
    "dims": [768, 256, 32],
    "n_clusters": 100,
    "pretrain_epochs": 50,
    "soft_dist_epochs": 100,
    "update_interval": 10,
    "batch_size": 256,
    "tol": 0.001,
    "alpha": 1.0,
    "save_dir": "./model",
    "args_model_file": "dec_args.pth",
    "full_model_file": "dec_full.pth"
}

## 1. 初次训练 DEC 模型

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

# 数据准备
X, y_true = dec.load_embed_data(TRAIN_CSV_PATH)
dataset = TensorDataset(torch.arange(len(X)), torch.from_numpy(X.astype(np.float32)))
pretrain_loader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True)

# ======= 阶段一：训练降噪自编码器 =======
# 实例化编码器
auto_encoder = dec.Autoencoder(config["dims"]).to(device)

# 执行编码器预训练代码
auto_encoder = dec.pretrain(autoencoder=auto_encoder,
                            data_loader=pretrain_loader,
                            epochs=50,
                            device=device,
                            interval=config["update_interval"])

# ======= 阶段二：初始化聚类中心 =======
full_loader = DataLoader(dataset, batch_size=1024, shuffle=False)
kmeans, y_pred, init_acc = dec.init_cluster_centers(encoder=auto_encoder.encoder,
                                                    data_loader=full_loader,
                                                    n_clusters=config["n_clusters"],
                                                    device=device,
                                                    y_true=y_true)
print(f'init_acc: {init_acc}')

# 代表聚类中心的特征向量
cluster_centers = torch.tensor(kmeans.cluster_centers_,
                               dtype=torch.float,
                               requires_grad=True,
                               device=device)

# ======= 阶段三：训练 DEC =======
# 实例化 DEC
dec_model = dec.DEC(
    cluster_number=config["n_clusters"],  # 预设的聚类数
    hidden_dimension=config["dims"][-1],  # 编码器输出维度
    encoder=auto_encoder.encoder,
    alpha=config["alpha"],
    cluster_centers=cluster_centers
)
data_loader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=False)
dec_model, dec_acc = dec.train_dec(model=dec_model,
                                   data_loader=data_loader,
                                   epochs=config["soft_dist_epochs"],
                                   device=device,
                                   X=X,
                                   y_true=y_true,
                                   interval=config["update_interval"])

# 保存最优模型
dec.save_full_model(dec_model, config)
dec.save_args_model(dec_model, config)

# 计算指标
y_pred = dec.infer_embeddings(dec_model, X, device=device)
if y_true is not None:
    print("\nFinal Clustering Results:")
    print(f"ACC: {dec.acc(y_true, y_pred):.4f}")
    print(f"NMI: {dec.nmi(y_true, y_pred):.4f}")
    print(f"ARI: {dec.ari(y_true, y_pred):.4f}")

Using device: cuda
Pretrain Epoch 10/50, Loss: 0.0009
Pretrain Epoch 20/50, Loss: 0.0008
Pretrain Epoch 30/50, Loss: 0.0007
Pretrain Epoch 40/50, Loss: 0.0007
Pretrain Epoch 50/50, Loss: 0.0007
init_acc: 0.696
===== best_acc: 0.6958 =====
===== best_acc: 0.6959 =====
DEC Train Epoch 10/100, Loss: 0.0057
DEC Train Epoch 20/100, Loss: 0.0065
DEC Train Epoch 30/100, Loss: 0.0074
DEC Train Epoch 40/100, Loss: 0.0085
DEC Train Epoch 50/100, Loss: 0.0098
DEC Train Epoch 60/100, Loss: 0.0112
DEC Train Epoch 70/100, Loss: 0.0128
DEC Train Epoch 80/100, Loss: 0.0147
DEC Train Epoch 90/100, Loss: 0.0167
DEC Train Epoch 100/100, Loss: 0.0189

Final Clustering Results:
ACC: 0.6959
NMI: 0.8014
ARI: 0.5668


## 2. 生成新样本

丢弃原来一半的数据，加入新数据

In [4]:
def randomly_discard_half(X, y, seed=None):
    """随机丢弃一半的 X 和 y"""
    assert len(X) == len(y)

    if seed is not None:
        np.random.seed(seed)

    # 转换为 numpy 数组
    X = np.array(X)
    y = np.array(y)

    # 生成一个随机排列的索引序列
    num_samples = len(X)
    random_indices = np.random.permutation(num_samples)

    # 选择前一半的索引
    half_indices = random_indices[:num_samples // 2]

    # 根据索引选择数据
    return X[half_indices], y[half_indices]

In [5]:
# 旧数据丢弃一半
half_X, half_y_true = randomly_discard_half(X, y_true, seed=42)

# 新数据也丢弃一半
test_X, test_y_true = dec.load_embed_data(TEST_CSV_PATH)
half_test_X, half_test_y_true = randomly_discard_half(test_X, test_y_true, seed=37)

# 新旧数据合并
new_X = np.concatenate((half_X, half_test_X))
new_y_true = np.concatenate((half_y_true, half_test_y_true))

In [6]:
new_X.shape, new_y_true.shape

((10000, 768), (10000,))

In [7]:
# 数据准备
new_dataset = TensorDataset(torch.arange(len(new_X)), torch.from_numpy(new_X.astype(np.float32)))
new_data_loader = DataLoader(new_dataset, batch_size=config["batch_size"], shuffle=False)

## 3. 增量训练

使用新数据集对原模型做增量训练。在训练前，先评估一下原模型在新数据集上的效果。

In [8]:
# 保存原模型重要参数

# 原模型的聚类中心
old_cluster_centers = dec_model.assignment.cluster_centers

# 原模型的 encoder
old_encoder = dec_model.encoder

1）原模型在新数据集上的效果

In [9]:
new_y_pred = dec.infer_embeddings(dec_model, new_X, device=device)
print("Final Clustering Results:")
print(f"ACC: {dec.acc(new_y_true, new_y_pred):.4f}")
print(f"NMI: {dec.nmi(new_y_true, new_y_pred):.4f}")
print(f"ARI: {dec.ari(new_y_true, new_y_pred):.4f}")

Final Clustering Results:
ACC: 0.6888
NMI: 0.7935
ARI: 0.5515


接下来开始训练增量模型。

2）思路一：移动聚类中心

1. 计算新聚类中心
2. 用 linear_sum_assignment 函数匹配新老聚类中心
3. 在老聚类中心的基础上，向新中心做一点平移

In [10]:
def match_and_translate_centers(old_centers, new_centers, translation_ratio=0.1):
    """
    匹配新老聚类中心并进行平移
    :param old_centers: 老的聚类中心，形状为 (n_clusters, n_features)
    :param new_centers: 新的聚类中心，形状为 (n_clusters, n_features)
    :param translation_ratio: 平移比例，取值范围为 [0, 1]
    :return: 平移后的聚类中心 标签映射字典
    """
    # 计算新老中心之间的距离矩阵
    distance_matrix = np.linalg.norm(old_centers[:, np.newaxis] - new_centers, axis=-1)
    # 使用 linear_sum_assignment 匹配新老中心
    row_indices, col_indices = linear_sum_assignment(distance_matrix)

    # 构建标签映射
    mapping = {old_label: new_label for old_label, new_label in zip(row_indices, col_indices)}

    # 进行平移
    translated_centers = old_centers.copy()
    for old_idx, new_idx in zip(row_indices, col_indices):
        translated_centers[old_idx] += translation_ratio * (new_centers[new_idx] - old_centers[old_idx])

    return translated_centers, mapping

In [11]:
# 计算新的聚类中心
nc_kmeans, nc_y_pred, nc_init_acc = dec.init_cluster_centers(encoder=old_encoder,
                                                             data_loader=new_data_loader,
                                                             n_clusters=config["n_clusters"],
                                                             device=device,
                                                             y_true=new_y_true)
nc_centers = nc_kmeans.cluster_centers_

# 匹配并平移聚类中心
translated_centers, mapping = match_and_translate_centers(old_centers=old_cluster_centers.cpu().detach().numpy(),
                                                          new_centers=nc_centers,
                                                          translation_ratio=0.1)

In [12]:
# mapping

In [13]:
# 评估聚类中心移动距离
old_vectors = dec_model.assignment.cluster_centers.cpu().detach()
new_vectors = torch.Tensor(translated_centers)

# 计算每对向量的欧氏距离
euclidean_distances = torch.norm(old_vectors - new_vectors, dim=1)

# 计算欧氏距离的均值和最大值
mean_distance = torch.mean(euclidean_distances)
max_distance = torch.max(euclidean_distances)

# print(f"euclidean_distances: {euclidean_distances}")
print(f"mean_distance: {mean_distance:.4f}")
print(f"max_distance: {max_distance:.4f}")

mean_distance: 0.0069
max_distance: 0.0636


In [14]:
# 加入 L2 正则化，以及更严格的梯度裁剪
def train_dec(model, data_loader, epochs, device, X, y_true=None, interval=10):
    """通过目标分布引导聚类优化"""

    # 记录最优模型
    best_model, best_acc = None, None

    optimizer = optim.Adam([
        {'params': model.encoder.parameters(), 'lr': 1e-6},
        {'params': model.assignment.parameters(), 'lr': 1e-5}
    ])

    criterion = F.kl_div
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for idx, x in data_loader:
            x = x.to(device)
            optimizer.zero_grad()
            output = model(x)
            target = dec.target_distribution(output).detach()

            # 加入 L2 正则化
            lambda_reg = 1e-4  # 正则化系数
            loss = criterion(output.log(), target, reduction='batchmean') + \
                lambda_reg * model.assignment.cluster_centers.norm(2)
            loss.backward()

            # 更严格的梯度裁剪
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=0.5,
                                           norm_type=2)

            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        if (epoch + 1) % interval == 0:
            print(f"DEC Train Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        if y_true is not None:
            # 计算准确率
            with torch.no_grad():
                input = torch.from_numpy(X).float().to(device)
                y_pred = model(input).argmax(1).cpu().numpy()
            current_acc = dec.acc(y_true, y_pred)

            # 更新最优模型
            if best_acc is None or current_acc > best_acc:
                best_model = copy.deepcopy(model)
                best_acc = current_acc
                print(f'===== best_acc: {best_acc:.4f} =====')

    return model if best_model is None else best_model, best_acc

In [15]:
# 新的超参数
new_soft_dist_epochs = 10
new_update_interval = 1

# 重新实例化一个 DEC
new_dec_model = dec.DEC(
    cluster_number=config["n_clusters"],  # 预设的聚类数
    hidden_dimension=config["dims"][-1],  # 编码器输出维度
    encoder=old_encoder,
    alpha=config["alpha"],
    cluster_centers=torch.Tensor(translated_centers).to(device)  # 替换聚类中心
)
new_dec_model, new_dec_acc = train_dec(model=new_dec_model,
                                       data_loader=new_data_loader,
                                       epochs=new_soft_dist_epochs,
                                       device=device,
                                       X=new_X,
                                       y_true=new_y_true,
                                       interval=new_update_interval)

DEC Train Epoch 1/10, Loss: 0.0058
===== best_acc: 0.6905 =====
DEC Train Epoch 2/10, Loss: 0.0059
DEC Train Epoch 3/10, Loss: 0.0059
DEC Train Epoch 4/10, Loss: 0.0060
DEC Train Epoch 5/10, Loss: 0.0061
DEC Train Epoch 6/10, Loss: 0.0062
DEC Train Epoch 7/10, Loss: 0.0062
DEC Train Epoch 8/10, Loss: 0.0063
DEC Train Epoch 9/10, Loss: 0.0064
DEC Train Epoch 10/10, Loss: 0.0065


In [16]:
# 计算指标
nn_y_pred = dec.infer_embeddings(new_dec_model, new_X, device=device)
print("Final Clustering Results:")
print(f"ACC: {dec.acc(new_y_true, nn_y_pred):.4f}")
print(f"NMI: {dec.nmi(new_y_true, nn_y_pred):.4f}")
print(f"ARI: {dec.ari(new_y_true, nn_y_pred):.4f}")

Final Clustering Results:
ACC: 0.6905
NMI: 0.7945
ARI: 0.5543


3）思路二：重新拟合目标分布

In [17]:
# dir(dec_model)

In [18]:
# 加入 L2 正则化，以及更严格的梯度裁剪
def train_dec(model, data_loader, epochs, device, X, y_true=None, interval=10):
    """通过目标分布引导聚类优化"""

    # 记录最优模型
    best_model, best_acc = None, None

    optimizer = optim.Adam([
        {'params': model.encoder.parameters(), 'lr': 1e-6},
        {'params': model.assignment.parameters(), 'lr': 1e-5}
    ])

    criterion = F.kl_div
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for idx, x in data_loader:
            x = x.to(device)
            optimizer.zero_grad()
            output = model(x)
            target = dec.target_distribution(output).detach()

            # 加入 L2 正则化
            lambda_reg = 1e-4  # 正则化系数
            loss = criterion(output.log(), target, reduction='batchmean') + \
                lambda_reg * model.assignment.cluster_centers.norm(2)
            loss.backward()

            # 更严格的梯度裁剪
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=0.5,
                                           norm_type=2)

            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        if (epoch + 1) % interval == 0:
            print(f"DEC Train Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        if y_true is not None:
            # 计算准确率
            with torch.no_grad():
                input = torch.from_numpy(X).float().to(device)
                y_pred = model(input).argmax(1).cpu().numpy()
            current_acc = dec.acc(y_true, y_pred)

            # 更新最优模型
            if best_acc is None or current_acc > best_acc:
                best_model = copy.deepcopy(model)
                best_acc = current_acc
                print(f'===== best_acc: {best_acc:.4f} =====')

    return model if best_model is None else best_model, best_acc

In [19]:
# 新的超参数
new_soft_dist_epochs = 10
new_update_interval = 1

# 重新实例化一个 DEC
new_dec_model = dec.DEC(
    cluster_number=config["n_clusters"],  # 预设的聚类数
    hidden_dimension=config["dims"][-1],  # 编码器输出维度
    encoder=old_encoder,
    alpha=config["alpha"],
    cluster_centers=old_cluster_centers
)
new_dec_model, new_dec_acc = train_dec(model=new_dec_model,
                                       data_loader=new_data_loader,
                                       epochs=new_soft_dist_epochs,
                                       device=device,
                                       X=new_X,
                                       y_true=new_y_true,
                                       interval=new_update_interval)

DEC Train Epoch 1/10, Loss: 0.0061
===== best_acc: 0.6920 =====
DEC Train Epoch 2/10, Loss: 0.0062
DEC Train Epoch 3/10, Loss: 0.0063
DEC Train Epoch 4/10, Loss: 0.0064
DEC Train Epoch 5/10, Loss: 0.0064
DEC Train Epoch 6/10, Loss: 0.0065
DEC Train Epoch 7/10, Loss: 0.0066
DEC Train Epoch 8/10, Loss: 0.0067
DEC Train Epoch 9/10, Loss: 0.0067
DEC Train Epoch 10/10, Loss: 0.0068


In [20]:
# 计算指标
nn_y_pred = dec.infer_embeddings(new_dec_model, new_X, device=device)
print("Final Clustering Results:")
print(f"ACC: {dec.acc(new_y_true, nn_y_pred):.4f}")
print(f"NMI: {dec.nmi(new_y_true, nn_y_pred):.4f}")
print(f"ARI: {dec.ari(new_y_true, nn_y_pred):.4f}")

Final Clustering Results:
ACC: 0.6920
NMI: 0.7953
ARI: 0.5561


EWC的弹性约束？