<a href="https://colab.research.google.com/github/mynameislllyt/API_Experiment/blob/main/baseline1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# ============ 1. 读数据 + 拆 benign-only ============
def load_csv_expanded(path):
    df = pd.read_csv(path)
    seq_cols = [c for c in df.columns if c.startswith("t_")]
    seqs = df[seq_cols].values.astype(int)      # shape: [N, 100]
    labels = df["malware"].values.astype(int)   # 1=malware, 0=benign
    return seqs, labels

def split_benign_only(seqs, labels, seed=42):
    benign = seqs[labels == 0]
    malware = seqs[labels == 1]

    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(benign))

    n = len(benign)
    n_train = int(0.7*n)
    n_val   = int(0.1*n)

    train = benign[idx[:n_train]]
    val   = benign[idx[n_train:n_train+n_val]]
    test_benign = benign[idx[n_train+n_val:]]
    test_malware = malware
    return train, val, test_benign, test_malware

# ============ 2. 滑动窗口 ============
def make_windows(seqs, window_size=10):
    X, y = [], []
    for s in seqs:
        s = s.tolist()
        for i in range(len(s) - window_size):
            X.append(s[i:i+window_size])
            y.append(s[i+window_size])
    return np.array(X, dtype=int), np.array(y, dtype=int)

# ============ 3. Dataset ============
class WindowDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.y[i]

# ============ 4. LSTM LM ============
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            emb_dim, hidden_dim, num_layers=num_layers, batch_first=True,
            dropout=dropout if num_layers>1 else 0.0
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        e = self.emb(x)        # [B, W, E]
        o, _ = self.lstm(e)    # [B, W, H]
        last = o[:, -1, :]
        return self.fc(last)   # [B, V]

# ============ 5. 训练/验证 ============
def eval_loss(model, loader, device="cuda"):
    model.eval()
    crit = nn.CrossEntropyLoss(reduction="sum")
    total = 0.0
    with torch.no_grad():
        for Xb, yb in loader:
            Xb, yb = Xb.to(device), yb.to(device)
            total += crit(model(Xb), yb).item()
    return total / len(loader.dataset)

def train_model(model, train_loader, val_loader, epochs=20, lr=1e-3, device="cuda"):
    model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.CrossEntropyLoss()

    best_val, best_state = 1e9, None
    for ep in range(1, epochs+1):
        model.train()
        total = 0.0
        for Xb, yb in tqdm(train_loader, desc=f"Epoch {ep}"):
            Xb, yb = Xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(Xb), yb)
            loss.backward()
            opt.step()
            total += loss.item() * Xb.size(0)

        val_loss = eval_loss(model, val_loader, device)
        print(f"ep{ep}: train={total/len(train_loader.dataset):.4f}, val={val_loss:.4f}")
        if val_loss < best_val:
            best_val = val_loss
            best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}

    model.load_state_dict(best_state)
    return model

# ============ 6. 序列 NLL 异常分数 ============
def sequence_scores_nll(model, seqs, window_size=10, device="cuda"):#score 越大 = 越不符合 benign 模式 = 越可疑
    crit = nn.CrossEntropyLoss(reduction="none")
    model.eval()
    scores = []

    with torch.no_grad():
        for s in seqs:
            X, y = make_windows([s], window_size)
            X = torch.tensor(X, dtype=torch.long).to(device)
            y = torch.tensor(y, dtype=torch.long).to(device)
            nll = crit(model(X), y)
            scores.append(nll.mean().item())
    return np.array(scores)

def search_best_threshold(benign_scores, malware_scores):
    all_scores = np.concatenate([benign_scores, malware_scores])
    cand_th = np.quantile(all_scores, np.linspace(0.7, 0.99, 20))  # 可以调范围

    best_f1, best_th, best_metrics = -1, None, None
    for th in cand_th:
        m = evaluate(th, benign_scores, malware_scores)
        if m["f1"] > best_f1:
            best_f1, best_th, best_metrics = m["f1"], th, m
    return best_th, best_metrics

def pick_threshold(val_scores, q=0.99):#在验证集上，大约 99% 的 benign score 都 低于 这个阈值
    return float(np.quantile(val_scores, q))

def evaluate(th, benign_scores, malware_scores):
    y_true = np.array([0]*len(benign_scores) + [1]*len(malware_scores))
    y_pred = np.array(
        [1 if s>th else 0 for s in benign_scores] +
        [1 if s>th else 0 for s in malware_scores]
    )
    tp = ((y_true==1)&(y_pred==1)).sum()
    tn = ((y_true==0)&(y_pred==0)).sum()
    fp = ((y_true==0)&(y_pred==1)).sum()
    fn = ((y_true==1)&(y_pred==0)).sum()

    precision = tp/(tp+fp+1e-9)
    recall    = tp/(tp+fn+1e-9)
    f1        = 2*precision*recall/(precision+recall+1e-9)
    acc       = (tp+tn)/(tp+tn+fp+fn+1e-9)
    fpr       = fp/(fp+tn+1e-9)
    return dict(acc=acc, precision=precision, recall=recall, f1=f1, fpr=fpr,
                tp=int(tp), tn=int(tn), fp=int(fp), fn=int(fn))

# ============ 7. 主流程 ============
def main():
    path = "./dynamic_api_call_sequence_per_malware_100_0_306.csv"
    seqs, labels = load_csv_expanded(path)
    train, val, test_benign, test_malware = split_benign_only(seqs, labels)

    # vocab_size 需要从 token 最大值推出来（+1，因为从0计数）
    vocab_size = int(seqs.max()) + 1
    print("vocab_size:", vocab_size)

    window_size = 10
    Xtr, ytr = make_windows(train, window_size)
    Xva, yva = make_windows(val, window_size)

    train_loader = DataLoader(WindowDataset(Xtr, ytr), batch_size=256, shuffle=True)
    val_loader   = DataLoader(WindowDataset(Xva, yva), batch_size=256)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = LSTMLM(vocab_size=vocab_size)
    model = train_model(model, train_loader, val_loader, device=device)

    val_scores = sequence_scores_nll(model, val, window_size, device=device)
    # th = pick_threshold(val_scores, q=0.99)
    # print("threshold:", th)

    benign_scores  = sequence_scores_nll(model, test_benign, window_size, device=device)
    malware_scores = sequence_scores_nll(model, test_malware, window_size, device=device)
    best_th, best_metrics = search_best_threshold(benign_scores, malware_scores)
    print("best_th:", best_th)
    print(best_metrics)

if __name__ == "__main__":
    main()


vocab_size: 307


Epoch 1: 100%|██████████| 266/266 [00:01<00:00, 228.02it/s]


ep1: train=1.8520, val=1.2503


Epoch 2: 100%|██████████| 266/266 [00:01<00:00, 254.23it/s]


ep2: train=1.0113, val=1.0165


Epoch 3: 100%|██████████| 266/266 [00:01<00:00, 250.49it/s]


ep3: train=0.8115, val=0.9100


Epoch 4: 100%|██████████| 266/266 [00:01<00:00, 253.11it/s]


ep4: train=0.6887, val=0.8509


Epoch 5: 100%|██████████| 266/266 [00:01<00:00, 184.14it/s]


ep5: train=0.6006, val=0.8179


Epoch 6: 100%|██████████| 266/266 [00:01<00:00, 237.55it/s]


ep6: train=0.5302, val=0.7976


Epoch 7: 100%|██████████| 266/266 [00:01<00:00, 249.25it/s]


ep7: train=0.4732, val=0.7841


Epoch 8: 100%|██████████| 266/266 [00:01<00:00, 219.61it/s]


ep8: train=0.4222, val=0.7668


Epoch 9: 100%|██████████| 266/266 [00:01<00:00, 249.41it/s]


ep9: train=0.3794, val=0.7680


Epoch 10: 100%|██████████| 266/266 [00:01<00:00, 251.05it/s]


ep10: train=0.3415, val=0.7597


Epoch 11: 100%|██████████| 266/266 [00:01<00:00, 251.11it/s]


ep11: train=0.3084, val=0.7681


Epoch 12: 100%|██████████| 266/266 [00:01<00:00, 250.42it/s]


ep12: train=0.2778, val=0.7635


Epoch 13: 100%|██████████| 266/266 [00:01<00:00, 249.36it/s]


ep13: train=0.2550, val=0.7754


Epoch 14: 100%|██████████| 266/266 [00:01<00:00, 215.16it/s]


ep14: train=0.2309, val=0.7812


Epoch 15: 100%|██████████| 266/266 [00:01<00:00, 168.39it/s]


ep15: train=0.2110, val=0.7924


Epoch 16: 100%|██████████| 266/266 [00:01<00:00, 247.59it/s]


ep16: train=0.1954, val=0.8056


Epoch 17: 100%|██████████| 266/266 [00:01<00:00, 247.68it/s]


ep17: train=0.1815, val=0.8170


Epoch 18: 100%|██████████| 266/266 [00:01<00:00, 247.40it/s]


ep18: train=0.1708, val=0.8263


Epoch 19: 100%|██████████| 266/266 [00:01<00:00, 245.47it/s]


ep19: train=0.1602, val=0.8318


Epoch 20: 100%|██████████| 266/266 [00:01<00:00, 248.60it/s]


ep20: train=0.1533, val=0.8320
best_th: 2.4233922958374023
{'acc': np.float64(0.2894871437206424), 'precision': np.float64(0.9991025536427348), 'recall': np.float64(0.2861415519779357), 'f1': np.float64(0.4448723068431505), 'fpr': np.float64(0.05069124423939774), 'tp': 12246, 'tn': 206, 'fp': 11, 'fn': 30551}
