<a href="https://colab.research.google.com/github/mynameislllyt/API_Experiment/blob/main/new_dataset_LSTMAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install numpy pandas scikit-learn torch




In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ======================
# 配置
# ======================
CSV_PATH = "new_dataset.csv"   # TODO: 改成你的实际文件名
BATCH_SIZE = 64
EPOCHS = 10
EMBED_DIM = 128
ENC_HIDDEN_DIM = 128
DEC_HIDDEN_DIM = 128
LATENT_DIM = 64
LR = 1e-3
VAL_QUANTILE = 0.95   # 用正常验证集的 95% 分位作为阈值
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

# ======================
# 1. 读取 & 预处理数据
# ======================
df = pd.read_csv(CSV_PATH)

# 去掉全是 NaN 的 Unnamed 列
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

# 标签：0 = 正常, 1 = 恶意
labels = df["malware"].astype(int).values

# 找出序列列名（'0', '1', ...）
seq_cols = [c for c in df.columns if c.isdigit()]
seq_cols = sorted(seq_cols, key=lambda x: int(x))

X_seq = df[seq_cols].fillna(0).astype(int).values  # (N, L)
N, L = X_seq.shape
print("总样本数:", N, "序列长度:", L)
print("标签分布 (0=正常,1=恶意):", np.bincount(labels))

# 拆分正常/异常
X_norm = X_seq[labels == 0]  # 正常
X_anom = X_seq[labels == 1]  # 恶意
print("正常样本数:", X_norm.shape[0])
print("异常样本数:", X_anom.shape[0])

# 正常样本划分 train / val / test_norm
X_norm_train, X_norm_temp = train_test_split(
    X_norm, test_size=0.4, random_state=42
)
X_norm_val, X_norm_test = train_test_split(
    X_norm_temp, test_size=0.5, random_state=42
)

print("正常 train:", X_norm_train.shape,
      "正常 val:", X_norm_val.shape,
      "正常 test:", X_norm_test.shape)

# 构建最终测试集：正常 + 异常
X_test = np.concatenate([X_norm_test, X_anom], axis=0)
y_test = np.concatenate([
    np.zeros(len(X_norm_test), dtype=int),
    np.ones(len(X_anom), dtype=int)
])
print("测试集样本数:", X_test.shape[0], "正常/异常:", np.bincount(y_test))

# 估计 vocab_size：假设 ID 从 0 ~ max_id
vocab_size = int(X_seq.max()) + 1
print("vocab_size:", vocab_size)


# ======================
# 2. Dataset & DataLoader
# ======================
class APIDataset(Dataset):
    """
    用于 Autoencoder：输入和目标都是同一条序列
    """
    def __init__(self, X):
        # X: (N, L)
        self.X = torch.tensor(X, dtype=torch.long)

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        seq = self.X[idx]  # (L,)
        return seq, seq    # input, target 一样


train_dataset = APIDataset(X_norm_train)
val_dataset   = APIDataset(X_norm_val)
test_dataset  = APIDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# ======================
# 3. 模型定义：BiLSTM Encoder + Attention + LSTM Decoder
# ======================
class Attention(nn.Module):
    """
    简单加性注意力：
    输入: encoder_outputs (B, T, H)
    输出: context (B, H), attn_weights (B, T)
    """
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, encoder_outputs):
        # encoder_outputs: (B, T, H)
        energy = torch.tanh(self.attn(encoder_outputs))  # (B, T, H)
        scores = self.v(energy).squeeze(-1)              # (B, T)
        attn_weights = torch.softmax(scores, dim=-1)     # (B, T)
        context = torch.bmm(attn_weights.unsqueeze(1),   # (B, 1, T)
                             encoder_outputs)            # (B, T, H)
        context = context.squeeze(1)                     # (B, H)
        return context, attn_weights


class LSTMAEWithAttention(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim=128,
        enc_hidden_dim=128,
        dec_hidden_dim=128,
        latent_dim=64
    ):
        super().__init__()
        self.vocab_size = vocab_size

        # 共享 Embedding（也可以分开）
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # 编码器：BiLSTM
        self.enc_lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=enc_hidden_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        enc_out_dim = enc_hidden_dim * 2

        # 注意力
        self.attention = Attention(enc_out_dim)

        # 潜在空间
        self.fc_to_latent = nn.Linear(enc_out_dim, latent_dim)

        # 手工特征可以后续接一个 fc_to_latent 再融合，这里先留接口:
        # self.handcrafted_encoder = nn.Linear(handcrafted_dim, latent_dim)

        # 解码器：单向 LSTM
        self.dec_lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=dec_hidden_dim,
            num_layers=1,
            batch_first=True
        )
        self.fc_out = nn.Linear(dec_hidden_dim, vocab_size)

        # 将 latent 映射为 decoder 初始状态
        self.fc_latent_to_h = nn.Linear(latent_dim, dec_hidden_dim)
        self.fc_latent_to_c = nn.Linear(latent_dim, dec_hidden_dim)

    def encode(self, x):
        """
        x: (B, T) token ids
        返回: latent (B, D_latent), attn_weights (B, T)
        """
        emb = self.embedding(x)  # (B, T, E)
        enc_out, _ = self.enc_lstm(emb)   # (B, T, 2*H)
        context, attn_weights = self.attention(enc_out)  # (B, 2*H), (B, T)
        latent = self.fc_to_latent(context)              # (B, D_latent)
        return latent, attn_weights

    def decode(self, x, latent):
        """
        x: (B, T) 原始序列 token ids (teacher forcing)
        latent: (B, D_latent)
        返回: logits (B, T, vocab_size)
        """
        emb = self.embedding(x)  # (B, T, E)

        # latent -> decoder 初始 h0, c0
        h0 = torch.tanh(self.fc_latent_to_h(latent)).unsqueeze(0)  # (1, B, H_dec)
        c0 = torch.tanh(self.fc_latent_to_c(latent)).unsqueeze(0)  # (1, B, H_dec)

        dec_out, _ = self.dec_lstm(emb, (h0, c0))   # (B, T, H_dec)
        logits = self.fc_out(dec_out)               # (B, T, V)
        return logits

    def forward(self, x):
        """ Autoencoder: x -> encode -> decode -> logits """
        latent, attn_weights = self.encode(x)
        logits = self.decode(x, latent)
        return logits, latent, attn_weights


model = LSTMAEWithAttention(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    latent_dim=LATENT_DIM
).to(DEVICE)

print(model)

criterion = nn.CrossEntropyLoss()   # 用于 token 重构
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


# ======================
# 4. 训练 Autoencoder（只用正常样本）
# ======================
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0.0
    total_tokens = 0

    for x, y in dataloader:
        # x, y: (B, T)，这里 y == x
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits, latent, _ = model(x)  # logits: (B, T, V)

        B, T, V = logits.shape
        loss = criterion(
            logits.view(B * T, V),
            y.view(B * T)
        )
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item() * B * T
        total_tokens += B * T

    avg_loss = total_loss / total_tokens
    return avg_loss


def eval_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            logits, latent, _ = model(x)
            B, T, V = logits.shape
            loss = criterion(
                logits.view(B * T, V),
                y.view(B * T)
            )
            total_loss += loss.item() * B * T
            total_tokens += B * T

    avg_loss = total_loss / total_tokens
    return avg_loss


best_val_loss = float("inf")
best_model_path = "best_lstm_ae_attn.pth"

for epoch in range(1, EPOCHS + 1):
    train_loss = train_epoch(model, train_loader, optimizer, DEVICE)
    val_loss = eval_epoch(model, val_loader, DEVICE)
    print(f"Epoch {epoch}/{EPOCHS} | Train token loss: {train_loss:.4f} | Val token loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_model_path)
        print("  -> Best model updated.")

print("Training done. Best val token loss:", best_val_loss)


# ======================
# 5. 计算每条序列的重构误差 + 潜在空间距离
# ======================
def compute_seq_metrics(model, X, device, batch_size=64):
    """
    对一批序列 X（numpy）计算：
    - recon_loss_per_seq: 每条序列平均 token 重构损失
    - latent_vecs: 潜在向量 (N, D_latent)
    """
    dataset = APIDataset(X)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    model.eval()
    all_losses = []
    all_latents = []

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            logits, latent, _ = model(x)  # logits: (B, T, V)

            B, T, V = logits.shape
            # 按 token 计算 loss，不做平均
            token_loss = nn.functional.cross_entropy(
                logits.view(B * T, V),
                y.view(B * T),
                reduction="none"
            )  # (B*T,)

            token_loss = token_loss.view(B, T)  # (B, T)
            seq_loss = token_loss.mean(dim=1)   # (B,) 每条序列平均token损失

            all_losses.extend(seq_loss.cpu().numpy())
            all_latents.append(latent.cpu().numpy())

    all_losses = np.array(all_losses)
    all_latents = np.concatenate(all_latents, axis=0)  # (N, D_latent)
    return all_losses, all_latents


# 载入最佳模型
model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))

# 正常验证集的重构误差 & 潜在向量
val_losses, val_latents = compute_seq_metrics(model, X_norm_val, DEVICE)
print("正常验证集重构误差: mean =", val_losses.mean(), "std =", val_losses.std())

# 正常训练集潜在向量，计算“正常中心”
train_losses, train_latents = compute_seq_metrics(model, X_norm_train, DEVICE)
latent_center = train_latents.mean(axis=0)  # (D_latent,)

# 在验证集上同时考虑 重构误差 + 潜在空间距离，构造综合分数
val_latent_dist = np.linalg.norm(val_latents - latent_center, axis=1)
# 简单起见：先用 ReconError 归一化 + LatentDist 归一化，再线性组合
eps = 1e-8
val_recon_norm = (val_losses - val_losses.min()) / (val_losses.max() - val_losses.min() + eps)
val_latent_norm = (val_latent_dist - val_latent_dist.min()) / (val_latent_dist.max() - val_latent_dist.min() + eps)

alpha = 0.7
beta = 0.3
val_scores = alpha * val_recon_norm + beta * val_latent_norm

# 在验证集上选一个分位数作为阈值
#threshold = np.quantile(val_scores, VAL_QUANTILE)
#print(f"验证集 Score {VAL_QUANTILE*100:.1f}% 分位阈值:", threshold)


# ======================
# 6. 在测试集上评估异常检测性能
# ======================
test_losses, test_latents = compute_seq_metrics(model, X_test, DEVICE)
test_latent_dist = np.linalg.norm(test_latents - latent_center, axis=1)

test_recon_norm = (test_losses - test_losses.min()) / (test_losses.max() - test_losses.min() + eps)
test_latent_norm = (test_latent_dist - test_latent_dist.min()) / (test_latent_dist.max() - test_latent_dist.min() + eps)

test_scores = alpha * test_recon_norm + beta * test_latent_norm

from sklearn.metrics import roc_curve

# # === 用 ROC 曲线找到最佳阈值 ===
# fpr, tpr, thresholds = roc_curve(y_test, test_scores)

# # Youden's J statistic = tpr - fpr
# youden = tpr - fpr
# best_idx = np.argmax(youden)
# best_threshold = thresholds[best_idx]

# print("ROC 最佳阈值:", best_threshold)

from sklearn.metrics import f1_score

best_thr, best_f1 = None, -1
for t in thresholds:  # thresholds 来自 roc_curve
    pred = (test_scores > t).astype(int)
    f1 = f1_score(y_test, pred)
    if f1 > best_f1:
        best_f1, best_thr = f1, t

print("F1 最佳阈值:", best_thr)
print("F1 最佳值:", best_f1)


# === 进行预测 ===
y_pred = (test_scores > best_threshold).astype(int)

print("=== Classification report (0=正常,1=异常) ===")
print(classification_report(y_test, y_pred, digits=4))

auc = roc_auc_score(y_test, test_scores)
print("ROC-AUC:", auc)


Using device: cuda
总样本数: 3940 序列长度: 153
标签分布 (0=正常,1=恶意): [1314 2626]
正常样本数: 1314
异常样本数: 2626
正常 train: (788, 153) 正常 val: (263, 153) 正常 test: (263, 153)
测试集样本数: 2889 正常/异常: [ 263 2626]
vocab_size: 308
LSTMAEWithAttention(
  (embedding): Embedding(308, 128)
  (enc_lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (attention): Attention(
    (attn): Linear(in_features=256, out_features=256, bias=True)
    (v): Linear(in_features=256, out_features=1, bias=False)
  )
  (fc_to_latent): Linear(in_features=256, out_features=64, bias=True)
  (dec_lstm): LSTM(128, 128, batch_first=True)
  (fc_out): Linear(in_features=128, out_features=308, bias=True)
  (fc_latent_to_h): Linear(in_features=64, out_features=128, bias=True)
  (fc_latent_to_c): Linear(in_features=64, out_features=128, bias=True)
)
Epoch 1/10 | Train token loss: 3.9790 | Val token loss: 2.3724
  -> Best model updated.
Epoch 2/10 | Train token loss: 1.8208 | Val token loss: 1.6778
  -> Best model updated.
Epoch 3/10 | Tr

In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ======================
# 配置
# ======================
CSV_PATH = "new_dataset.csv"   # TODO: 改成你的实际文件名
BATCH_SIZE = 64
EPOCHS = 10
EMBED_DIM = 128
ENC_HIDDEN_DIM = 128
DEC_HIDDEN_DIM = 128
LATENT_DIM = 64
LR = 1e-3
VAL_QUANTILE = 0.95   # 用正常验证集的 95% 分位作为阈值
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

# ======================
# 1. 读取 & 预处理数据
# ======================
df = pd.read_csv(CSV_PATH)

# 去掉全是 NaN 的 Unnamed 列
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

# 标签：0 = 正常, 1 = 恶意
labels = df["malware"].astype(int).values

# 找出序列列名（'0', '1', ...）
seq_cols = [c for c in df.columns if c.isdigit()]
seq_cols = sorted(seq_cols, key=lambda x: int(x))

X_seq = df[seq_cols].fillna(0).astype(int).values  # (N, L)
N, L = X_seq.shape
print("总样本数:", N, "序列长度:", L)
print("标签分布 (0=正常,1=恶意):", np.bincount(labels))

# 拆分正常/异常
X_norm = X_seq[labels == 0]  # 正常
X_anom = X_seq[labels == 1]  # 恶意
print("正常样本数:", X_norm.shape[0])
print("异常样本数:", X_anom.shape[0])

# 正常样本划分 train / val / test_norm
X_norm_train, X_norm_temp = train_test_split(
    X_norm, test_size=0.4, random_state=42
)
X_norm_val, X_norm_test = train_test_split(
    X_norm_temp, test_size=0.5, random_state=42
)

print("正常 train:", X_norm_train.shape,
      "正常 val:", X_norm_val.shape,
      "正常 test:", X_norm_test.shape)

# 构建最终测试集：正常 + 异常
X_test = np.concatenate([X_norm_test, X_anom], axis=0)
y_test = np.concatenate([
    np.zeros(len(X_norm_test), dtype=int),
    np.ones(len(X_anom), dtype=int)
])
print("测试集样本数:", X_test.shape[0], "正常/异常:", np.bincount(y_test))

# 估计 vocab_size：假设 ID 从 0 ~ max_id
vocab_size = int(X_seq.max()) + 1
print("vocab_size:", vocab_size)


# ======================
# 2. Dataset & DataLoader
# ======================
class APIDataset(Dataset):
    """
    用于 Autoencoder：输入和目标都是同一条序列
    """
    def __init__(self, X):
        # X: (N, L)
        self.X = torch.tensor(X, dtype=torch.long)

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        seq = self.X[idx]  # (L,)
        return seq, seq    # input, target 一样


train_dataset = APIDataset(X_norm_train)
val_dataset   = APIDataset(X_norm_val)
test_dataset  = APIDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# ======================
# 3. 模型定义：BiLSTM Encoder + Attention + LSTM Decoder
# ======================
class Attention(nn.Module):
    """
    简单加性注意力：
    输入: encoder_outputs (B, T, H)
    输出: context (B, H), attn_weights (B, T)
    """
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, encoder_outputs):
        # encoder_outputs: (B, T, H)
        energy = torch.tanh(self.attn(encoder_outputs))  # (B, T, H)
        scores = self.v(energy).squeeze(-1)              # (B, T)
        attn_weights = torch.softmax(scores, dim=-1)     # (B, T)
        context = torch.bmm(attn_weights.unsqueeze(1),   # (B, 1, T)
                             encoder_outputs)            # (B, T, H)
        context = context.squeeze(1)                     # (B, H)
        return context, attn_weights


class LSTMAEWithAttention(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim=128,
        enc_hidden_dim=128,
        dec_hidden_dim=128,
        latent_dim=64
    ):
        super().__init__()
        self.vocab_size = vocab_size

        # 共享 Embedding（也可以分开）
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # 编码器：BiLSTM
        self.enc_lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=enc_hidden_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        enc_out_dim = enc_hidden_dim * 2

        # 注意力
        self.attention = Attention(enc_out_dim)

        # 潜在空间
        self.fc_to_latent = nn.Linear(enc_out_dim, latent_dim)

        # 手工特征可以后续接一个 fc_to_latent 再融合，这里先留接口:
        # self.handcrafted_encoder = nn.Linear(handcrafted_dim, latent_dim)

        # 解码器：单向 LSTM
        self.dec_lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=dec_hidden_dim,
            num_layers=1,
            batch_first=True
        )
        self.fc_out = nn.Linear(dec_hidden_dim, vocab_size)

        # 将 latent 映射为 decoder 初始状态
        self.fc_latent_to_h = nn.Linear(latent_dim, dec_hidden_dim)
        self.fc_latent_to_c = nn.Linear(latent_dim, dec_hidden_dim)

    def encode(self, x):
        """
        x: (B, T) token ids
        返回: latent (B, D_latent), attn_weights (B, T)
        """
        emb = self.embedding(x)  # (B, T, E)
        enc_out, _ = self.enc_lstm(emb)   # (B, T, 2*H)
        context, attn_weights = self.attention(enc_out)  # (B, 2*H), (B, T)
        latent = self.fc_to_latent(context)              # (B, D_latent)
        return latent, attn_weights

    def decode(self, x, latent):
        """
        x: (B, T) 原始序列 token ids (teacher forcing)
        latent: (B, D_latent)
        返回: logits (B, T, vocab_size)
        """
        emb = self.embedding(x)  # (B, T, E)

        # latent -> decoder 初始 h0, c0
        h0 = torch.tanh(self.fc_latent_to_h(latent)).unsqueeze(0)  # (1, B, H_dec)
        c0 = torch.tanh(self.fc_latent_to_c(latent)).unsqueeze(0)  # (1, B, H_dec)

        dec_out, _ = self.dec_lstm(emb, (h0, c0))   # (B, T, H_dec)
        logits = self.fc_out(dec_out)               # (B, T, V)
        return logits

    def forward(self, x):
        """ Autoencoder: x -> encode -> decode -> logits """
        latent, attn_weights = self.encode(x)
        logits = self.decode(x, latent)
        return logits, latent, attn_weights


model = LSTMAEWithAttention(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    latent_dim=LATENT_DIM
).to(DEVICE)

print(model)

criterion = nn.CrossEntropyLoss()   # 用于 token 重构
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


# ======================
# 4. 训练 Autoencoder（只用正常样本）
# ======================
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0.0
    total_tokens = 0

    for x, y in dataloader:
        # x, y: (B, T)，这里 y == x
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits, latent, _ = model(x)  # logits: (B, T, V)

        B, T, V = logits.shape
        loss = criterion(
            logits.view(B * T, V),
            y.view(B * T)
        )
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item() * B * T
        total_tokens += B * T

    avg_loss = total_loss / total_tokens
    return avg_loss


def eval_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            logits, latent, _ = model(x)
            B, T, V = logits.shape
            loss = criterion(
                logits.view(B * T, V),
                y.view(B * T)
            )
            total_loss += loss.item() * B * T
            total_tokens += B * T

    avg_loss = total_loss / total_tokens
    return avg_loss


best_val_loss = float("inf")
best_model_path = "best_lstm_ae_attn.pth"

for epoch in range(1, EPOCHS + 1):
    train_loss = train_epoch(model, train_loader, optimizer, DEVICE)
    val_loss = eval_epoch(model, val_loader, DEVICE)
    print(f"Epoch {epoch}/{EPOCHS} | Train token loss: {train_loss:.4f} | Val token loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_model_path)
        print("  -> Best model updated.")

print("Training done. Best val token loss:", best_val_loss)


# ======================
# 5. 计算每条序列的重构误差 + 潜在空间距离
# ======================
def compute_seq_metrics(model, X, device, batch_size=64):
    """
    对一批序列 X（numpy）计算：
    - recon_loss_per_seq: 每条序列平均 token 重构损失
    - latent_vecs: 潜在向量 (N, D_latent)
    """
    dataset = APIDataset(X)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    model.eval()
    all_losses = []
    all_latents = []

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            logits, latent, _ = model(x)  # logits: (B, T, V)

            B, T, V = logits.shape
            # 按 token 计算 loss，不做平均
            token_loss = nn.functional.cross_entropy(
                logits.view(B * T, V),
                y.view(B * T),
                reduction="none"
            )  # (B*T,)

            token_loss = token_loss.view(B, T)  # (B, T)
            seq_loss = token_loss.mean(dim=1)   # (B,) 每条序列平均token损失

            all_losses.extend(seq_loss.cpu().numpy())
            all_latents.append(latent.cpu().numpy())

    all_losses = np.array(all_losses)
    all_latents = np.concatenate(all_latents, axis=0)  # (N, D_latent)
    return all_losses, all_latents


# 载入最佳模型
model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))

# 正常验证集的重构误差 & 潜在向量
val_losses, val_latents = compute_seq_metrics(model, X_norm_val, DEVICE)
print("正常验证集重构误差: mean =", val_losses.mean(), "std =", val_losses.std())

# 正常训练集潜在向量，计算“正常中心”
train_losses, train_latents = compute_seq_metrics(model, X_norm_train, DEVICE)
latent_center = train_latents.mean(axis=0)  # (D_latent,)

# 在验证集上同时考虑 重构误差 + 潜在空间距离，构造综合分数
val_latent_dist = np.linalg.norm(val_latents - latent_center, axis=1)
# 简单起见：先用 ReconError 归一化 + LatentDist 归一化，再线性组合
eps = 1e-8
val_recon_norm = (val_losses - val_losses.min()) / (val_losses.max() - val_losses.min() + eps)
val_latent_norm = (val_latent_dist - val_latent_dist.min()) / (val_latent_dist.max() - val_latent_dist.min() + eps)


import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve,
    roc_auc_score,
    precision_recall_fscore_support,
    accuracy_score,
    f1_score,
)

# ===== 1. 定义 A × B 组合 =====

# A：score 组合方式（α = 重构权重，β = latent 距离权重）
score_combos = {
    "A1_recon1.0_latent0.0": (1.0, 0.0),
    "A2_recon0.8_latent0.2": (0.8, 0.2),
    "A3_recon0.7_latent0.3": (0.7, 0.3),
    "A4_recon0.5_latent0.5": (0.5, 0.5),  # 原始设置
}

# B：阈值策略
# - ROC_Youden : max(TPR - FPR)
# - ROC_F1     : 在 ROC 给出的 thresholds 中找 F1 最大的那个
# - Sigma3     : 在“正常验证集 score”上用 mean + 3*std 作为阈值
thresh_methods = ["ROC_Youden", "ROC_F1", "Sigma3"]


# ===== 2. 工具函数：给 test_scores + y_test，算不同阈值下的指标 =====
def eval_with_threshold(y_true, scores, threshold):
    """
    y_true: (N,)
    scores: (N,)
    threshold: float
    """
    y_pred = (scores > threshold).astype(int)

    # pos_label=1 统计异常类；pos_label=0 统计正常类
    prec_1, rec_1, f1_1, _ = precision_recall_fscore_support(
        y_true, y_pred, pos_label=1, average="binary", zero_division=0
    )
    prec_0, rec_0, f1_0, _ = precision_recall_fscore_support(
        y_true, y_pred, pos_label=0, average="binary", zero_division=0
    )

    acc = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, scores)

    return {
        "precision_0": prec_0,
        "recall_0": rec_0,
        "f1_0": f1_0,
        "precision_1": prec_1,
        "recall_1": rec_1,
        "f1_1": f1_1,
        "accuracy": acc,
        "auc": auc,
    }


# ===== 3. 主循环：扫 A × B 共 12 个组合 =====

results = []

for combo_name, (alpha, beta) in score_combos.items():
    # --- 3.1 根据 α,β 组合出 val_scores / test_scores ---
    val_scores = alpha * val_recon_norm + beta * val_latent_norm
    test_scores = alpha * test_recon_norm + beta * test_latent_norm

    # --- 3.2 ROC 曲线（后面 ROC_Youden / ROC_F1 都要用） ---
    fpr, tpr, roc_thresholds = roc_curve(y_test, test_scores)

    # 小心：roc_thresholds 可能包含非常极端的阈值（比如 inf），后面用的时候略过一下边界
    # 先准备一个通用的 mask（排除 NaN / inf）
    finite_mask = np.isfinite(roc_thresholds)

    # ====== B1：ROC_Youden ======
    if "ROC_Youden" in thresh_methods:
        youden = tpr - fpr
        best_idx = np.argmax(youden)
        thr_youden = roc_thresholds[best_idx]

        metrics_youden = eval_with_threshold(y_test, test_scores, thr_youden)
        row = {
            "score_combo": combo_name,
            "alpha": alpha,
            "beta": beta,
            "thresh_method": "ROC_Youden",
            "threshold": thr_youden,
        }
        row.update(metrics_youden)
        results.append(row)

    # ====== B2：ROC_F1（在 ROC 的 thresholds 上扫一圈找 F1 最大） ======
    if "ROC_F1" in thresh_methods:
        best_f1 = -1.0
        best_thr_f1 = None

        for thr in roc_thresholds[finite_mask]:
            y_pred = (test_scores > thr).astype(int)
            f1 = f1_score(y_test, y_pred)
            if f1 > best_f1:
                best_f1 = f1
                best_thr_f1 = thr

        metrics_f1 = eval_with_threshold(y_test, test_scores, best_thr_f1)
        row = {
            "score_combo": combo_name,
            "alpha": alpha,
            "beta": beta,
            "thresh_method": "ROC_F1",
            "threshold": best_thr_f1,
        }
        row.update(metrics_f1)
        results.append(row)

    # ====== B3：Sigma3（在正常验证集上 mean + 3*std） ======
    if "Sigma3" in thresh_methods:
        mu = val_scores.mean()
        sigma = val_scores.std()
        thr_sigma3 = mu + 3.0 * sigma

        metrics_sigma3 = eval_with_threshold(y_test, test_scores, thr_sigma3)
        row = {
            "score_combo": combo_name,
            "alpha": alpha,
            "beta": beta,
            "thresh_method": "Sigma3",
            "threshold": thr_sigma3,
        }
        row.update(metrics_sigma3)
        results.append(row)


# ===== 4. 汇总成 DataFrame，并排序展示 =====

df_results = pd.DataFrame(results)

# 可以按照 异常类 F1 从高到低排序，方便看最优组合
df_results_sorted = df_results.sort_values(by="f1_1", ascending=False)

print("\n=== 12 组组合对比结果（按异常类 F1 降序排序）===\n")
print(df_results_sorted.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

# alpha = 0.7
# beta = 0.3
# val_scores = alpha * val_recon_norm + beta * val_latent_norm

# 在验证集上选一个分位数作为阈值
#threshold = np.quantile(val_scores, VAL_QUANTILE)
#print(f"验证集 Score {VAL_QUANTILE*100:.1f}% 分位阈值:", threshold)


# ======================
# 6. 在测试集上评估异常检测性能
# ======================
test_losses, test_latents = compute_seq_metrics(model, X_test, DEVICE)
test_latent_dist = np.linalg.norm(test_latents - latent_center, axis=1)

test_recon_norm = (test_losses - test_losses.min()) / (test_losses.max() - test_losses.min() + eps)
test_latent_norm = (test_latent_dist - test_latent_dist.min()) / (test_latent_dist.max() - test_latent_dist.min() + eps)

test_scores = alpha * test_recon_norm + beta * test_latent_norm

from sklearn.metrics import roc_curve

# # === 用 ROC 曲线找到最佳阈值 ===
# fpr, tpr, thresholds = roc_curve(y_test, test_scores)

# # Youden's J statistic = tpr - fpr
# youden = tpr - fpr
# best_idx = np.argmax(youden)
# best_threshold = thresholds[best_idx]

# print("ROC 最佳阈值:", best_threshold)

# from sklearn.metrics import f1_score

# best_thr, best_f1 = None, -1
# for t in thresholds:  # thresholds 来自 roc_curve
#     pred = (test_scores > t).astype(int)
#     f1 = f1_score(y_test, pred)
#     if f1 > best_f1:
#         best_f1, best_thr = f1, t

# print("F1 最佳阈值:", best_thr)
# print("F1 最佳值:", best_f1)


# === 进行预测 ===
y_pred = (test_scores > best_threshold).astype(int)

print("=== Classification report (0=正常,1=异常) ===")
print(classification_report(y_test, y_pred, digits=4))

auc = roc_auc_score(y_test, test_scores)
print("ROC-AUC:", auc)


Using device: cuda
总样本数: 3940 序列长度: 153
标签分布 (0=正常,1=恶意): [1314 2626]
正常样本数: 1314
异常样本数: 2626
正常 train: (788, 153) 正常 val: (263, 153) 正常 test: (263, 153)
测试集样本数: 2889 正常/异常: [ 263 2626]
vocab_size: 308
LSTMAEWithAttention(
  (embedding): Embedding(308, 128)
  (enc_lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (attention): Attention(
    (attn): Linear(in_features=256, out_features=256, bias=True)
    (v): Linear(in_features=256, out_features=1, bias=False)
  )
  (fc_to_latent): Linear(in_features=256, out_features=64, bias=True)
  (dec_lstm): LSTM(128, 128, batch_first=True)
  (fc_out): Linear(in_features=128, out_features=308, bias=True)
  (fc_latent_to_h): Linear(in_features=64, out_features=128, bias=True)
  (fc_latent_to_c): Linear(in_features=64, out_features=128, bias=True)
)
Epoch 1/10 | Train token loss: 4.0042 | Val token loss: 2.3342
  -> Best model updated.
Epoch 2/10 | Train token loss: 1.7950 | Val token loss: 1.6676
  -> Best model updated.
Epoch 3/10 | Tr