<a href="https://colab.research.google.com/github/mynameislllyt/API_Experiment/blob/main/train_API_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1、FP 较高（误报偏多）模型对异常很敏感
Confusion matrix (行=真实  列=预测):
[[ 1060  4330]
 [ 1664 23391]]
Precision=0.8438, Recall=0.9336, F1=0.8864
(Top-k = 5)

In [1]:
# 在 Colab 中下载 csicFinal.csv （基于 CSIC 2010）
!wget -O csicFinal.csv https://raw.githubusercontent.com/thpablo/Notebook_KNN_CSIC_Data/main/csicFinal.csv

import pandas as pd

df = pd.read_csv("csicFinal.csv")
print(df.head())
print(df.columns)


--2025-11-25 11:53:47--  https://raw.githubusercontent.com/thpablo/Notebook_KNN_CSIC_Data/main/csicFinal.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21147547 (20M) [text/plain]
Saving to: ‘csicFinal.csv’


2025-11-25 11:53:48 (53.6 MB/s) - ‘csicFinal.csv’ saved [21147547/21147547]

   Class Method                              URI Host-Header            Host  \
0  Valid    GET               /tienda1/index.jsp    HTTP/1.1  localhost:8080   
1  Valid    GET      /tienda1/publico/anadir.jsp    HTTP/1.1  localhost:8080   
2  Valid   POST      /tienda1/publico/anadir.jsp    HTTP/1.1  localhost:8080   
3  Valid    GET  /tienda1/publico/autenticar.jsp    HTTP/1.1  localhost:8080   
4  Valid   POST  /tienda1/publico/autenticar.jsp    HTTP/1.1  localhost:8080 

In [2]:
# ===========================
# 1. 导入库 & 读取数据
# ===========================
import pandas as pd
import numpy as np
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# 如果是在 Colab，可以用相对路径（假设已经 wget 下来了）
CSV_PATH = "csicFinal.csv"   # 按你实际路径改

df = pd.read_csv(CSV_PATH)
print("数据维度:", df.shape)
print("列名:", df.columns.tolist())

print("\nClass 列取值统计：")
print(df["Class"].value_counts())


数据维度: (61065, 16)
列名: ['Class', 'Method', 'URI', 'Host-Header', 'Host', 'Connection', 'Accept', 'Accept-Charset', 'Accept-Language', 'Cache-control', 'Cookie', 'Pragma', 'Content-Length', 'Content-Type', 'POST-Data', 'GET-Query']

Class 列取值统计：
Class
Valid        36000
Anomalous    25065
Name: count, dtype: int64


In [3]:
# ===========================
# 2. 划分正常 / 异常 & 数据集
# ===========================
from sklearn.model_selection import train_test_split

LABEL_COL = "Class"

# 假设 normal 是样本最多的那个类别（一般就是 "normal"）
normal_label = df[LABEL_COL].value_counts().idxmax()
print("推测正常标签为:", normal_label)

normal_df   = df[df[LABEL_COL] == normal_label].copy()
abnormal_df = df[df[LABEL_COL] != normal_label].copy()

print("normal 样本数:", len(normal_df))
print("abnormal 样本数:", len(abnormal_df))

# 在 normal 中划分 train / val / test_normal
train_norm, temp_norm = train_test_split(
    normal_df, test_size=0.3, random_state=42, shuffle=True
)
val_norm, test_norm = train_test_split(
    temp_norm, test_size=0.5, random_state=42, shuffle=True
)

# 异常全部做 test_abn
test_abn = abnormal_df

print("train_norm:", len(train_norm))
print("val_norm  :", len(val_norm))
print("test_norm :", len(test_norm))
print("test_abn  :", len(test_abn))


推测正常标签为: Valid
normal 样本数: 36000
abnormal 样本数: 25065
train_norm: 25200
val_norm  : 5400
test_norm : 5400
test_abn  : 25065


In [4]:
# ===========================
# 3. 事件抽象函数（Method + URI 模板）
# ===========================
def template_uri(uri: str) -> str:
    """
    把 URI 归一化：
    - 去掉 query (? 后面)
    - 把数字替换成 <num>
    """
    if pd.isna(uri):
        uri = ""
    uri = str(uri)

    # 去掉 query
    if "?" in uri:
        uri = uri.split("?", 1)[0]

    # 把连续数字替换为 <num>
    uri = re.sub(r"\d+", "<num>", uri)

    return uri

def row_to_event_str(row) -> str:
    method = str(row.get("Method", "")).upper()
    uri_raw = row.get("URI", "")
    uri_t = template_uri(uri_raw)
    event_str = f"{method}|{uri_t}"
    return event_str

# 看看示例
print("\n示例事件：")
for i in range(5):
    print(row_to_event_str(df.iloc[i]))



示例事件：
GET|/tienda<num>/index.jsp
GET|/tienda<num>/publico/anadir.jsp
POST|/tienda<num>/publico/anadir.jsp
GET|/tienda<num>/publico/autenticar.jsp
POST|/tienda<num>/publico/autenticar.jsp


In [5]:
# ===========================
# 4. 构建事件字典（vocab）
# ===========================
# 只用 train_norm 中的事件来建词表
train_events_str = [row_to_event_str(row) for _, row in train_norm.iterrows()]

counter = Counter(train_events_str)
print("\n不同事件个数(训练集):", len(counter))

# 预留特殊 token
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

event2id = {
    PAD_TOKEN: 0,
    UNK_TOKEN: 1,
}
id2event = {
    0: PAD_TOKEN,
    1: UNK_TOKEN,
}

for ev in counter:
    idx = len(event2id)
    event2id[ev] = idx
    id2event[idx] = ev

vocab_size = len(event2id)
print("事件字典大小（含 PAD/UNK）:", vocab_size)



不同事件个数(训练集): 34
事件字典大小（含 PAD/UNK）: 36


In [6]:
# ===========================
# 5. DataFrame → 事件 ID 序列
# ===========================
def df_to_event_ids(df_in, event2id):
    ids = []
    for _, row in df_in.iterrows():
        ev_str = row_to_event_str(row)
        ev_id = event2id.get(ev_str, event2id[UNK_TOKEN])
        ids.append(ev_id)
    return np.array(ids, dtype=np.int64)

train_ids = df_to_event_ids(train_norm, event2id)
val_ids   = df_to_event_ids(val_norm,   event2id)
test_norm_ids = df_to_event_ids(test_norm, event2id)
test_abn_ids  = df_to_event_ids(test_abn,  event2id)

print("train_ids shape:", train_ids.shape)
print("val_ids   shape:", val_ids.shape)
print("test_norm_ids shape:", test_norm_ids.shape)
print("test_abn_ids  shape:", test_abn_ids.shape)


train_ids shape: (25200,)
val_ids   shape: (5400,)
test_norm_ids shape: (5400,)
test_abn_ids  shape: (25065,)


In [7]:
# ===========================
# 6. 滑动窗口 Dataset
# ===========================
class EventWindowDataset(Dataset):
    """
    input:  长度为 L 的事件 ID 序列
    target: 第 L+1 个事件 ID
    （训练、验证时 label=None；测试时可带上 0/1 标签）
    """
    def __init__(self, event_ids: np.ndarray, window_size: int, label: int = None):
        self.window_size = window_size
        X_list = []
        y_list = []
        labels = []

        N = len(event_ids)
        for i in range(N - window_size):
            X_list.append(event_ids[i:i+window_size])
            y_list.append(event_ids[i+window_size])
            if label is not None:
                labels.append(label)

        self.X = torch.tensor(np.stack(X_list), dtype=torch.long)
        self.y = torch.tensor(np.array(y_list), dtype=torch.long)
        self.has_label = label is not None
        if self.has_label:
            self.labels = torch.tensor(np.array(labels), dtype=torch.long)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        if self.has_label:
            return self.X[idx], self.y[idx], self.labels[idx]
        else:
            return self.X[idx], self.y[idx]

window_size = 10

train_dataset = EventWindowDataset(train_ids, window_size=window_size, label=None)
val_dataset   = EventWindowDataset(val_ids,   window_size=window_size, label=None)

print("train windows:", len(train_dataset))
print("val windows  :", len(val_dataset))

batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)


train windows: 25190
val windows  : 5390


In [8]:
# ===========================
# 7. DeepLog 风格 LSTM 模型
# ===========================
class DeepLogLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=64, hidden_size=64, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """
        x: [B, L] 事件 ID 序列
        return: logits [B, vocab_size]，对应下一个事件的概率分布（未 softmax）
        """
        emb = self.embedding(x)           # [B, L, emb]
        out, _ = self.lstm(emb)           # [B, L, hidden]
        last_h = out[:, -1, :]            # [B, hidden]
        logits = self.fc(last_h)          # [B, vocab_size]
        return logits

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = DeepLogLSTM(vocab_size=vocab_size, embedding_dim=64, hidden_size=64, num_layers=1)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

total_params = sum(p.numel() for p in model.parameters())
print("模型参数量:", total_params)


Using device: cpu
模型参数量: 37924


In [9]:
# ===========================
# 8. 训练 & 验证
# ===========================
def train_one_epoch():
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        logits = model(X_batch)          # [B, vocab_size]
        loss = criterion(logits, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * X_batch.size(0)
    return total_loss / len(train_dataset)

def eval_one_epoch():
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            total_loss += loss.item() * X_batch.size(0)
    return total_loss / len(val_dataset)

num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_one_epoch()
    val_loss = eval_one_epoch()
    print(f"Epoch {epoch+1}/{num_epochs} - train_loss={train_loss:.6f}, val_loss={val_loss:.6f}")


Epoch 1/10 - train_loss=3.529273, val_loss=3.509246
Epoch 2/10 - train_loss=3.492899, val_loss=3.506516
Epoch 3/10 - train_loss=3.485915, val_loss=3.507505
Epoch 4/10 - train_loss=3.479956, val_loss=3.512433
Epoch 5/10 - train_loss=3.474415, val_loss=3.511867
Epoch 6/10 - train_loss=3.468172, val_loss=3.516045
Epoch 7/10 - train_loss=3.461126, val_loss=3.518617
Epoch 8/10 - train_loss=3.453674, val_loss=3.522980
Epoch 9/10 - train_loss=3.445647, val_loss=3.527018
Epoch 10/10 - train_loss=3.435652, val_loss=3.534277


In [10]:
# ===========================
# 9. 测试集窗口（正常=0 / 异常=1）
# ===========================
test_norm_dataset = EventWindowDataset(test_norm_ids, window_size=window_size, label=0)
test_abn_dataset  = EventWindowDataset(test_abn_ids,  window_size=window_size, label=1)

# 合并两个测试集
test_X = torch.cat([test_norm_dataset.X, test_abn_dataset.X], dim=0)
test_y_next = torch.cat([test_norm_dataset.y, test_abn_dataset.y], dim=0)
test_labels = torch.cat([test_norm_dataset.labels, test_abn_dataset.labels], dim=0)

test_dataset = torch.utils.data.TensorDataset(test_X, test_y_next, test_labels)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

print("test windows:", len(test_dataset))


test windows: 30445


In [11]:
# ===========================
# 10. DeepLog Top-k 异常检测 & 指标
# ===========================
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

k = 5  # Top-k

all_true_labels = []
all_pred_labels = []  # 0 正常, 1 异常

model.eval()
with torch.no_grad():
    for X_batch, y_next_batch, label_batch in test_loader:
        X_batch = X_batch.to(device)
        y_next_batch = y_next_batch.to(device)   # 真实下一事件 ID
        label_batch = label_batch.to(device)     # 0/1

        logits = model(X_batch)                  # [B, vocab_size]
        # 取 top-k
        topk_probs, topk_indices = torch.topk(F.softmax(logits, dim=1), k=k, dim=1)
        # 对于每个样本，判断真实 y_next 是否在 top-k 里
        # bool tensor: True 表示 "在 top-k 内" -> 正常; False -> 异常
        in_topk = (topk_indices == y_next_batch.unsqueeze(1)).any(dim=1)

        # DeepLog 规则：不在 top-k → 异常
        pred_is_anomaly = (~in_topk).long()   # 1 异常, 0 正常

        all_true_labels.append(label_batch.cpu().numpy())
        all_pred_labels.append(pred_is_anomaly.cpu().numpy())

all_true_labels = np.concatenate(all_true_labels, axis=0)
all_pred_labels = np.concatenate(all_pred_labels, axis=0)

cm = confusion_matrix(all_true_labels, all_pred_labels)
p, r, f1, _ = precision_recall_fscore_support(all_true_labels, all_pred_labels, average='binary')

print("Confusion matrix (行=真实  列=预测):")
print(cm)
print(f"Precision={p:.4f}, Recall={r:.4f}, F1={f1:.4f}")
print(f"(Top-k = {k})")


Confusion matrix (行=真实  列=预测):
[[ 1060  4330]
 [ 1664 23391]]
Precision=0.8438, Recall=0.9336, F1=0.8864
(Top-k = 5)
