<a href="https://colab.research.google.com/github/mynameislllyt/API_Experiment/blob/main/baseline2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# ============ 1. 读数据 + 拆 benign-only ============
def load_csv_expanded(path):
    df = pd.read_csv(path)
    seq_cols = [c for c in df.columns if c.startswith("t_")]
    seqs = df[seq_cols].values.astype(int)      # shape: [N, 100]
    labels = df["malware"].values.astype(int)   # 1=malware, 0=benign
    return seqs, labels

def split_benign_only(seqs, labels, seed=42):
    benign = seqs[labels == 0]
    malware = seqs[labels == 1]

    rng = np.random.default_rng(seed)
    idx_b = rng.permutation(len(benign))
    idx_m = rng.permutation(len(malware))

    n_b = len(benign)
    n_m = len(malware)

    n_train = int(0.7*n_b)
    n_val   = int(0.1*n_b)

    benign_train = benign[idx_b[:n_train]]
    benign_val   = benign[idx_b[n_train:n_train+n_val]]
    benign_test  = benign[idx_b[n_train+n_val:]]

    # 比如 20% malware 做 val，用来选阈值，剩下做 test
    n_m_val = int(0.2*n_m)
    malware_val  = malware[idx_m[:n_m_val]]
    malware_test = malware[idx_m[n_m_val:]]

    return benign_train, benign_val, benign_test, malware_val, malware_test

# ============ 2. 滑动窗口 ============
def make_windows(seqs, window_size=10):
    X, y = [], []
    for s in seqs:
        s = s.tolist()
        for i in range(len(s) - window_size):
            X.append(s[i:i+window_size])
            y.append(s[i+window_size])
    return np.array(X, dtype=int), np.array(y, dtype=int)

# ============ 3. Dataset ============
class WindowDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.y[i]

# ============ 4. LSTM LM ============
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256, num_layers=1, dropout=0.2):
        super().__init__()
        # self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.emb = nn.Embedding(vocab_size, emb_dim)  # 去掉 padding_idx
        self.lstm = nn.LSTM(
            emb_dim, hidden_dim, num_layers=num_layers, batch_first=True,
            dropout=dropout if num_layers>1 else 0.0
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        e = self.emb(x)        # [B, W, E]
        o, _ = self.lstm(e)    # [B, W, H]
        last = o[:, -1, :]
        return self.fc(last)   # [B, V]

# ============ 5. 训练/验证 ============
def eval_loss(model, loader, device="cuda"):
    model.eval()
    crit = nn.CrossEntropyLoss(reduction="sum")
    total = 0.0
    with torch.no_grad():
        for Xb, yb in loader:
            Xb, yb = Xb.to(device), yb.to(device)
            total += crit(model(Xb), yb).item()
    return total / len(loader.dataset)

def train_model(model, train_loader, val_loader, epochs=20, lr=1e-3, device="cuda"):
    model.to(device)
    #opt = torch.optim.Adam(model.parameters(), lr=lr)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    crit = nn.CrossEntropyLoss()

    best_val, best_state = 1e9, None
    patience, bad_count = 5, 0
    for ep in range(1, epochs+1):
        model.train()
        total = 0.0
        for Xb, yb in tqdm(train_loader, desc=f"Epoch {ep}"):
            Xb, yb = Xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(Xb), yb)
            loss.backward()
            opt.step()
            total += loss.item() * Xb.size(0)

        val_loss = eval_loss(model, val_loader, device)
        print(f"ep{ep}: train={total/len(train_loader.dataset):.4f}, val={val_loss:.4f}")
        # if val_loss < best_val:
        #     best_val = val_loss
        #     best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
        if val_loss < best_val - 1e-4:
            best_val = val_loss
            best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
            bad_count = 0
        else:
            bad_count += 1
            if bad_count >= patience:
                print(f"Early stop at epoch {ep}")
                break

    model.load_state_dict(best_state)
    return model

# ============ 6. 序列 NLL 异常分数 ============
def sequence_scores_nll(model, seqs, window_size=10, device="cuda"):#score 越大 = 越不符合 benign 模式 = 越可疑
    crit = nn.CrossEntropyLoss(reduction="none")
    model.eval()
    scores = []

    with torch.no_grad():
        for s in seqs:
            X, y = make_windows([s], window_size)
            X = torch.tensor(X, dtype=torch.long).to(device)
            y = torch.tensor(y, dtype=torch.long).to(device)
            nll = crit(model(X), y)
            scores.append(nll.mean().item())
    return np.array(scores)

def search_best_threshold(benign_scores, malware_scores):
    all_scores = np.concatenate([benign_scores, malware_scores])
    cand_th = np.quantile(all_scores, np.linspace(0.7, 0.99, 20))  # 可以调范围

    best_f1, best_th, best_metrics = -1, None, None
    for th in cand_th:
        m = evaluate(th, benign_scores, malware_scores)
        if m["f1"] > best_f1:
            best_f1, best_th, best_metrics = m["f1"], th, m
    return best_th, best_metrics

def pick_threshold(val_scores, q=0.99):#在验证集上，大约 99% 的 benign score 都 低于 这个阈值
    return float(np.quantile(val_scores, q))

def evaluate(th, benign_scores, malware_scores):
    y_true = np.array([0]*len(benign_scores) + [1]*len(malware_scores))
    y_pred = np.array(
        [1 if s>th else 0 for s in benign_scores] +
        [1 if s>th else 0 for s in malware_scores]
    )
    tp = ((y_true==1)&(y_pred==1)).sum()
    tn = ((y_true==0)&(y_pred==0)).sum()
    fp = ((y_true==0)&(y_pred==1)).sum()
    fn = ((y_true==1)&(y_pred==0)).sum()

    precision = tp/(tp+fp+1e-9)
    recall    = tp/(tp+fn+1e-9)
    f1        = 2*precision*recall/(precision+recall+1e-9)
    acc       = (tp+tn)/(tp+tn+fp+fn+1e-9)
    fpr       = fp/(fp+tn+1e-9)
    return dict(acc=acc, precision=precision, recall=recall, f1=f1, fpr=fpr,
                tp=int(tp), tn=int(tn), fp=int(fp), fn=int(fn))

# ============ 7. 主流程 ============
# ============ 7. 主流程 ============

def main():
    path = "./dynamic_api_call_sequence_per_malware_100_0_306.csv"
    seqs, labels = load_csv_expanded(path)
    benign_train, benign_val, benign_test, malware_val, malware_test = split_benign_only(seqs, labels)

    # vocab_size 需要从 token 最大值推出来（+1，因为从0计数）
    vocab_size = int(seqs.max()) + 1
    print("vocab_size:", vocab_size)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # 如果你只想试一个 window_size，可以把 for 删掉，直接 window_size = 20
    for window_size in [10, 20, 30, 40]:
        print("=" * 50)
        print(f"window_size = {window_size}")

        # 1) 只用 benign_train / benign_val 训练 LM
        Xtr, ytr = make_windows(benign_train, window_size)
        Xva, yva = make_windows(benign_val, window_size)

        train_loader = DataLoader(WindowDataset(Xtr, ytr), batch_size=256, shuffle=True)
        val_loader   = DataLoader(WindowDataset(Xva, yva), batch_size=256)

        model = LSTMLM(
            vocab_size=vocab_size,
            emb_dim=128,
            hidden_dim=256,
            num_layers=2,
            dropout=0.3
        )

        model = train_model(model, train_loader, val_loader, device=device)

        # 2) 在验证集上算 NLL 分数（benign_val + malware_val），找最优阈值
        benign_val_scores  = sequence_scores_nll(model, benign_val,  window_size, device=device)
        malware_val_scores = sequence_scores_nll(model, malware_val, window_size, device=device)

        best_th, best_val_metrics = search_best_threshold(benign_val_scores, malware_val_scores)
        print("best_th on val:", best_th)
        print("val metrics:", best_val_metrics)

        # 3) 在测试集上评估（benign_test + malware_test）
        benign_test_scores  = sequence_scores_nll(model, benign_test,  window_size, device=device)
        malware_test_scores = sequence_scores_nll(model, malware_test, window_size, device=device)

        test_metrics = evaluate(best_th, benign_test_scores, malware_test_scores)
        print("test metrics:", test_metrics)


if __name__ == "__main__":
    main()

vocab_size: 307
window_size = 10


Epoch 1: 100%|██████████| 266/266 [00:03<00:00, 67.84it/s]


ep1: train=2.1840, val=1.4242


Epoch 2: 100%|██████████| 266/266 [00:03<00:00, 83.05it/s] 


ep2: train=1.1522, val=1.0886


Epoch 3: 100%|██████████| 266/266 [00:01<00:00, 146.54it/s]


ep3: train=0.9086, val=0.9594


Epoch 4: 100%|██████████| 266/266 [00:01<00:00, 147.19it/s]


ep4: train=0.7658, val=0.8914


Epoch 5: 100%|██████████| 266/266 [00:02<00:00, 126.19it/s]


ep5: train=0.6683, val=0.8448


Epoch 6: 100%|██████████| 266/266 [00:01<00:00, 137.58it/s]


ep6: train=0.5946, val=0.8247


Epoch 7: 100%|██████████| 266/266 [00:01<00:00, 144.45it/s]


ep7: train=0.5332, val=0.8067


Epoch 8: 100%|██████████| 266/266 [00:01<00:00, 145.94it/s]


ep8: train=0.4819, val=0.7932


Epoch 9: 100%|██████████| 266/266 [00:02<00:00, 132.40it/s]


ep9: train=0.4372, val=0.7927


Epoch 10: 100%|██████████| 266/266 [00:01<00:00, 142.89it/s]


ep10: train=0.4029, val=0.7944


Epoch 11: 100%|██████████| 266/266 [00:01<00:00, 136.13it/s]


ep11: train=0.3674, val=0.7943


Epoch 12: 100%|██████████| 266/266 [00:02<00:00, 129.85it/s]


ep12: train=0.3369, val=0.8042


Epoch 13: 100%|██████████| 266/266 [00:01<00:00, 144.17it/s]


ep13: train=0.3110, val=0.8030


Epoch 14: 100%|██████████| 266/266 [00:01<00:00, 145.04it/s]


ep14: train=0.2902, val=0.8239
Early stop at epoch 14
best_th on val: 2.3061466217041016
val metrics: {'acc': np.float64(0.30913916455108365), 'precision': np.float64(0.9969088098914232), 'recall': np.float64(0.3014370837714334), 'f1': np.float64(0.46290481708305253), 'fpr': np.float64(0.07476635513948816), 'tp': 2580, 'tn': 99, 'fp': 8, 'fn': 5979}
test metrics: {'acc': np.float64(0.30454215643592214), 'precision': np.float64(0.9984478075280366), 'recall': np.float64(0.3006016706583241), 'f1': np.float64(0.46208413738952187), 'fpr': np.float64(0.07373271889366943), 'tp': 10292, 'tn': 201, 'fp': 16, 'fn': 23946}
window_size = 20


Epoch 1: 100%|██████████| 236/236 [00:02<00:00, 83.48it/s]


ep1: train=2.2531, val=1.4742


Epoch 2: 100%|██████████| 236/236 [00:02<00:00, 83.22it/s]


ep2: train=1.1856, val=1.1169


Epoch 3: 100%|██████████| 236/236 [00:02<00:00, 83.19it/s]


ep3: train=0.9244, val=0.9604


Epoch 4: 100%|██████████| 236/236 [00:03<00:00, 75.79it/s]


ep4: train=0.7778, val=0.8918


Epoch 5: 100%|██████████| 236/236 [00:02<00:00, 82.85it/s]


ep5: train=0.6758, val=0.8396


Epoch 6: 100%|██████████| 236/236 [00:02<00:00, 82.77it/s]


ep6: train=0.5972, val=0.8120


Epoch 7: 100%|██████████| 236/236 [00:02<00:00, 83.02it/s]


ep7: train=0.5372, val=0.7956


Epoch 8: 100%|██████████| 236/236 [00:03<00:00, 75.46it/s]


ep8: train=0.4827, val=0.7780


Epoch 9: 100%|██████████| 236/236 [00:02<00:00, 82.44it/s]


ep9: train=0.4398, val=0.7705


Epoch 10: 100%|██████████| 236/236 [00:02<00:00, 82.75it/s]


ep10: train=0.4003, val=0.7608


Epoch 11: 100%|██████████| 236/236 [00:02<00:00, 82.08it/s]


ep11: train=0.3658, val=0.7675


Epoch 12: 100%|██████████| 236/236 [00:03<00:00, 75.17it/s]


ep12: train=0.3324, val=0.7837


Epoch 13: 100%|██████████| 236/236 [00:02<00:00, 82.42it/s]


ep13: train=0.3033, val=0.7844


Epoch 14: 100%|██████████| 236/236 [00:02<00:00, 82.47it/s]


ep14: train=0.2812, val=0.7865


Epoch 15: 100%|██████████| 236/236 [00:02<00:00, 80.73it/s]


ep15: train=0.2557, val=0.7979
Early stop at epoch 15
best_th on val: 2.512969732284546
val metrics: {'acc': np.float64(0.3109854604199964), 'precision': np.float64(0.997692307691924), 'recall': np.float64(0.3030727888771699), 'f1': np.float64(0.4649162107725287), 'fpr': np.float64(0.05607476635461612), 'tp': 2594, 'tn': 101, 'fp': 6, 'fn': 5965}
test metrics: {'acc': np.float64(0.30721230590624565), 'precision': np.float64(0.998653328203059), 'recall': np.float64(0.3032303288743413), 'f1': np.float64(0.4652058964925813), 'fpr': np.float64(0.06451612903196076), 'tp': 10382, 'tn': 203, 'fp': 14, 'fn': 23856}
window_size = 30


Epoch 1: 100%|██████████| 207/207 [00:03<00:00, 56.86it/s]


ep1: train=2.4210, val=1.6452


Epoch 2: 100%|██████████| 207/207 [00:03<00:00, 57.88it/s]


ep2: train=1.3227, val=1.2387


Epoch 3: 100%|██████████| 207/207 [00:03<00:00, 58.06it/s]


ep3: train=1.0287, val=1.0613


Epoch 4: 100%|██████████| 207/207 [00:03<00:00, 56.69it/s]


ep4: train=0.8680, val=0.9519


Epoch 5: 100%|██████████| 207/207 [00:03<00:00, 58.00it/s]


ep5: train=0.7577, val=0.8976


Epoch 6: 100%|██████████| 207/207 [00:03<00:00, 57.93it/s]


ep6: train=0.6702, val=0.8527


Epoch 7: 100%|██████████| 207/207 [00:03<00:00, 56.74it/s]


ep7: train=0.6002, val=0.8301


Epoch 8: 100%|██████████| 207/207 [00:03<00:00, 55.57it/s]


ep8: train=0.5383, val=0.8155


Epoch 9: 100%|██████████| 207/207 [00:03<00:00, 57.95it/s]


ep9: train=0.4901, val=0.7994


Epoch 10: 100%|██████████| 207/207 [00:03<00:00, 56.72it/s]


ep10: train=0.4461, val=0.7991


Epoch 11: 100%|██████████| 207/207 [00:03<00:00, 57.78it/s]


ep11: train=0.4058, val=0.7967


Epoch 12: 100%|██████████| 207/207 [00:03<00:00, 57.66it/s]


ep12: train=0.3698, val=0.7910


Epoch 13: 100%|██████████| 207/207 [00:03<00:00, 56.33it/s]


ep13: train=0.3405, val=0.7876


Epoch 14: 100%|██████████| 207/207 [00:03<00:00, 57.89it/s]


ep14: train=0.3130, val=0.7979


Epoch 15: 100%|██████████| 207/207 [00:03<00:00, 57.63it/s]


ep15: train=0.2887, val=0.7911


Epoch 16: 100%|██████████| 207/207 [00:03<00:00, 56.59it/s]


ep16: train=0.2674, val=0.7972


Epoch 17: 100%|██████████| 207/207 [00:03<00:00, 55.18it/s]


ep17: train=0.2444, val=0.8074


Epoch 18: 100%|██████████| 207/207 [00:03<00:00, 57.80it/s]


ep18: train=0.2253, val=0.8236
Early stop at epoch 18
best_th on val: 2.683459520339966
val metrics: {'acc': np.float64(0.3021001615508537), 'precision': np.float64(0.9976218787154191), 'recall': np.float64(0.2940764107956193), 'f1': np.float64(0.45425013500287814), 'fpr': np.float64(0.05607476635461612), 'tp': 2517, 'tn': 101, 'fp': 6, 'fn': 6042}
test metrics: {'acc': np.float64(0.2962995211144886), 'precision': np.float64(0.9986027944110779), 'recall': np.float64(0.2922483789940916), 'f1': np.float64(0.4521668395430464), 'fpr': np.float64(0.06451612903196076), 'tp': 10006, 'tn': 203, 'fp': 14, 'fn': 24232}
window_size = 40


Epoch 1: 100%|██████████| 177/177 [00:03<00:00, 45.66it/s]


ep1: train=2.5471, val=1.7327


Epoch 2: 100%|██████████| 177/177 [00:03<00:00, 45.60it/s]


ep2: train=1.3944, val=1.2756


Epoch 3: 100%|██████████| 177/177 [00:03<00:00, 45.03it/s]


ep3: train=1.0814, val=1.0908


Epoch 4: 100%|██████████| 177/177 [00:03<00:00, 45.66it/s]


ep4: train=0.9069, val=0.9732


Epoch 5: 100%|██████████| 177/177 [00:03<00:00, 45.28it/s]


ep5: train=0.7842, val=0.9062


Epoch 6: 100%|██████████| 177/177 [00:03<00:00, 45.26it/s]


ep6: train=0.6881, val=0.8626


Epoch 7: 100%|██████████| 177/177 [00:04<00:00, 43.89it/s]


ep7: train=0.6134, val=0.8334


Epoch 8: 100%|██████████| 177/177 [00:03<00:00, 45.12it/s]


ep8: train=0.5502, val=0.8014


Epoch 9: 100%|██████████| 177/177 [00:03<00:00, 45.43it/s]


ep9: train=0.4971, val=0.7909


Epoch 10: 100%|██████████| 177/177 [00:03<00:00, 45.57it/s]


ep10: train=0.4553, val=0.7768


Epoch 11: 100%|██████████| 177/177 [00:03<00:00, 45.07it/s]


ep11: train=0.4152, val=0.7900


Epoch 12: 100%|██████████| 177/177 [00:04<00:00, 43.52it/s]


ep12: train=0.3808, val=0.7928


Epoch 13: 100%|██████████| 177/177 [00:03<00:00, 45.52it/s]


ep13: train=0.3460, val=0.7819


Epoch 14: 100%|██████████| 177/177 [00:03<00:00, 44.91it/s]


ep14: train=0.3181, val=0.7877


Epoch 15: 100%|██████████| 177/177 [00:03<00:00, 45.28it/s]


ep15: train=0.2964, val=0.7869
Early stop at epoch 15
best_th on val: 2.951939821243286
val metrics: {'acc': np.float64(0.3002538656819409), 'precision': np.float64(0.9988004798076774), 'recall': np.float64(0.29185652529497697), 'f1': np.float64(0.45171790200074136), 'fpr': np.float64(0.02803738317730806), 'tp': 2498, 'tn': 104, 'fp': 3, 'fn': 6061}
test metrics: {'acc': np.float64(0.291278479175728), 'precision': np.float64(0.9988822274158115), 'recall': np.float64(0.2871078918161024), 'f1': np.float64(0.44601737754284987), 'fpr': np.float64(0.05069124423939774), 'tp': 9830, 'tn': 206, 'fp': 11, 'fn': 24408}
