<a href="https://colab.research.google.com/github/mynameislllyt/API_Experiment/blob/main/baseline1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# ============ 1. 读数据 + 拆 benign-only ============
def load_csv_expanded(path):
    """Read the expanded API-call CSV and return ``(seqs, labels)``.

    Every column whose name starts with ``t_`` is treated as one position of
    the token sequence (columns are kept in file order); the ``malware``
    column supplies the label.

    Args:
        path: Location of the CSV file, handed directly to ``pd.read_csv``.

    Returns:
        A pair of integer NumPy arrays. ``seqs`` has one row per CSV row and
        one column per ``t_*`` column; ``labels`` has one entry per CSV row
        (per the original author's note: 1 = malware, 0 = benign).
    """
    frame = pd.read_csv(path)

    # Select the token columns by prefix, preserving their order in the file.
    token_columns = list(filter(lambda name: name.startswith("t_"), frame.columns))

    sequences = frame[token_columns].values.astype(int)
    targets = frame["malware"].values.astype(int)
    return sequences, targets

def split_benign_only(seqs, labels, seed=42):
    """Split the data so that training and validation contain benign rows only.

    Rows labelled 0 are shuffled with a seeded generator and cut 70% / 10% /
    remainder into train / validation / benign-test. Rows labelled 1 are
    returned untouched as the malware test set.

    Args:
        seqs: Array of sequences, indexable by a boolean mask over its rows.
        labels: Array of labels aligned with ``seqs``.
        seed: Seed for ``np.random.default_rng`` controlling the shuffle.

    Returns:
        ``(train, val, test_benign, test_malware)``.
    """
    benign_pool = seqs[labels == 0]
    malware_pool = seqs[labels == 1]

    total = len(benign_pool)
    order = np.random.default_rng(seed).permutation(total)

    # Cut points: first 70% -> train, next 10% -> val, everything left -> test.
    train_end = int(0.7 * total)
    val_end = train_end + int(0.1 * total)
    train_idx, val_idx, test_idx = np.split(order, [train_end, val_end])

    return (
        benign_pool[train_idx],
        benign_pool[val_idx],
        benign_pool[test_idx],
        malware_pool,
    )

# ============ 2. 滑动窗口 ============
def make_windows(seqs, window_size=10):
    """Turn token sequences into (context window, next token) training pairs.

    For every sequence, each run of ``window_size`` consecutive tokens becomes
    one row of ``X`` and the token immediately after it becomes the matching
    entry of ``y``. A sequence of length ``n`` therefore contributes
    ``max(n - window_size, 0)`` pairs.

    Args:
        seqs: Iterable of 1-D token sequences (NumPy rows or plain lists).
        window_size: Number of context tokens per window; must be >= 1.

    Returns:
        ``(X, y)`` integer arrays with shapes ``(num_pairs, window_size)`` and
        ``(num_pairs,)``. When no sequence is longer than ``window_size`` the
        result is still correctly shaped: ``(0, window_size)`` and ``(0,)``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    contexts, targets = [], []
    for seq in seqs:
        # np.asarray lets callers pass plain lists as well as array rows.
        tokens = np.asarray(seq).tolist()
        for start in range(len(tokens) - window_size):
            stop = start + window_size
            contexts.append(tokens[start:stop])
            targets.append(tokens[stop])

    if not contexts:
        # np.array([]) would be 1-D; keep the 2-D contract for downstream code.
        return np.empty((0, window_size), dtype=int), np.empty((0,), dtype=int)
    return np.array(contexts, dtype=int), np.array(targets, dtype=int)

# ============ 3. Dataset ============
class WindowDataset(Dataset):
    """Map-style dataset of (context window, next token) pairs.

    Both inputs are copied into ``torch.long`` tensors at construction time;
    item ``i`` is the pair ``(X[i], y[i])``.
    """

    def __init__(self, X, y):
        # Convert once up front so __getitem__ is a plain tensor index.
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        """Return the number of windows (length of ``X`` along its first axis)."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``i``-th context window together with its target token."""
        window = self.X[i]
        target = self.y[i]
        return window, target

# ============ 4. LSTM LM ============
class LSTMLM(nn.Module):
    """Next-token language model: embedding -> LSTM -> linear over the vocabulary.

    Args:
        vocab_size: Number of distinct token ids (size of the output layer).
        emb_dim: Embedding dimension.
        hidden_dim: LSTM hidden-state dimension.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers; only applied when
            ``num_layers > 1`` (it is forced to 0.0 for a single layer).
        padding_idx: Token id the embedding treats as padding, passed through
            to ``nn.Embedding``. Defaults to 0 for backward compatibility.
            NOTE(review): if id 0 is a real token in the data rather than
            padding, pass ``padding_idx=None`` so that its embedding is
            trained like every other token's.
    """

    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256, num_layers=1, dropout=0.2,
                 padding_idx=0):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        # Only pass a non-zero dropout for stacked LSTMs; a single layer gets 0.0.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits for a batch of context windows.

        Args:
            x: Long tensor of token ids, ``[B, W]``.

        Returns:
            Logits of shape ``[B, V]`` computed from the LSTM output at the
            last time step of each window.
        """
        embedded = self.emb(x)             # [B, W, E]
        outputs, _ = self.lstm(embedded)   # [B, W, H]
        last_step = outputs[:, -1, :]      # [B, H]
        return self.fc(last_step)          # [B, V]

# ============ 5. 训练/验证 ============
def eval_loss(model, loader, device="cuda"):
    """Return the mean cross-entropy per sample over everything in ``loader``.

    The model is switched to eval mode and gradients are disabled. Losses are
    summed batch by batch and divided by ``len(loader.dataset)``, so the
    result does not depend on the batch size.

    Args:
        model: Module mapping a batch of inputs to class logits.
        loader: Iterable of ``(inputs, targets)`` batches exposing ``.dataset``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Average loss per sample as a Python float.
    """
    model.eval()
    summed_ce = nn.CrossEntropyLoss(reduction="sum")

    running = 0.0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            running += summed_ce(logits, targets).item()

    sample_count = len(loader.dataset)
    return running / sample_count

def train_model(model, train_loader, val_loader, epochs=40, lr=1e-3, device="cuda",
                patience=None):
    """Train ``model`` with Adam and return it restored to its best validation epoch.

    After every epoch the validation loss is computed with ``eval_loss``; a
    CPU copy of the weights is kept whenever that loss improves, and those
    weights are loaded back into the model before returning.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Loader of ``(inputs, targets)`` training batches.
        val_loader: Loader of ``(inputs, targets)`` validation batches.
        epochs: Maximum number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device the model and every batch are moved to.
        patience: Optional early-stopping budget. When set, training stops
            after this many consecutive epochs without a validation
            improvement. ``None`` (default) always runs all ``epochs``.

    Returns:
        The same ``model`` object, holding the best weights seen. If no epoch
        produced an improvement (for example ``epochs=0`` or a NaN validation
        loss), the model is returned with its current weights.
    """
    model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.CrossEntropyLoss()

    best_val, best_state = float("inf"), None
    stale_epochs = 0  # consecutive epochs without a validation improvement
    for ep in range(1, epochs + 1):
        model.train()
        total = 0.0
        for Xb, yb in tqdm(train_loader, desc=f"Epoch {ep}"):
            Xb, yb = Xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(Xb), yb)
            loss.backward()
            opt.step()
            # crit averages over the batch; re-weight so the epoch figure is per sample.
            total += loss.item() * Xb.size(0)

        val_loss = eval_loss(model, val_loader, device)
        print(f"ep{ep}: train={total/len(train_loader.dataset):.4f}, val={val_loss:.4f}")
        if val_loss < best_val:
            best_val = val_loss
            # Snapshot on CPU so the copy does not hold accelerator memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            stale_epochs = 0
        else:
            stale_epochs += 1
            if patience is not None and stale_epochs >= patience:
                break

    # best_state stays None when no epoch improved; loading None would raise.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# ============ 6. 序列 NLL 异常分数 ============
def sequence_scores_nll(model, seqs, window_size=10, device="cuda"):
    """Score each sequence by its mean next-token negative log-likelihood.

    Every sequence is cut into sliding windows with ``make_windows``; the
    model predicts the token following each window and the per-window
    cross-entropy is averaged into one number. A larger score means the
    sequence fits the learned benign pattern less well, i.e. it is more
    suspicious.

    Args:
        model: Trained model mapping context windows to next-token logits.
        seqs: Iterable of token sequences to score.
        window_size: Context length used to build the windows.
        device: Device the windows and targets are moved to.

    Returns:
        NumPy array with one score per input sequence, in input order.
    """
    per_window_ce = nn.CrossEntropyLoss(reduction="none")
    model.eval()

    def score_one(sequence):
        # All windows of one sequence go through the model as a single batch.
        contexts, targets = make_windows([sequence], window_size)
        contexts = torch.tensor(contexts, dtype=torch.long).to(device)
        targets = torch.tensor(targets, dtype=torch.long).to(device)
        window_nll = per_window_ce(model(contexts), targets)
        return window_nll.mean().item()

    with torch.no_grad():
        return np.array([score_one(sequence) for sequence in seqs])

def pick_threshold(val_scores, q=0.99):
    """Return the ``q``-quantile of the validation scores as a plain float.

    With the default ``q=0.99``, roughly 99% of the (benign) validation
    scores fall below the returned threshold.

    Args:
        val_scores: Anomaly scores computed on the validation set.
        q: Quantile in ``[0, 1]`` handed to ``np.quantile``.
    """
    cutoff = np.quantile(val_scores, q)
    return float(cutoff)

def evaluate(th, benign_scores, malware_scores):
    """Compute detection metrics for a score threshold.

    A sample is predicted as malware when its score is strictly greater than
    ``th``. Benign samples are the negative class, malware the positive class.

    Args:
        th: Decision threshold on the anomaly score.
        benign_scores: Scores of the benign test samples.
        malware_scores: Scores of the malware test samples.

    Returns:
        Dict with ``acc``, ``precision``, ``recall``, ``f1`` and ``fpr`` as
        plain Python floats, plus the confusion counts ``tp``, ``tn``, ``fp``
        and ``fn`` as ints. Every ratio carries a 1e-9 term in its
        denominator, so empty inputs give 0.0 instead of dividing by zero.
    """
    benign = np.asarray(benign_scores, dtype=float)
    malware = np.asarray(malware_scores, dtype=float)

    # One vectorised comparison per class instead of per-element Python loops.
    fp = int((benign > th).sum())
    tn = int(benign.size) - fp
    tp = int((malware > th).sum())
    fn = int(malware.size) - tp

    eps = 1e-9  # same smoothing constant as before, so reported values match
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    acc = (tp + tn) / (tp + tn + fp + fn + eps)
    fpr = fp / (fp + tn + eps)

    return dict(acc=acc, precision=precision, recall=recall, f1=f1, fpr=fpr,
                tp=tp, tn=tn, fp=fp, fn=fn)

# ============ 7. 主流程 ============
def main(path="./dynamic_api_call_sequence_per_malware_100_0_306.csv",
         window_size=20, batch_size=256, q=0.99, seed=42):
    """Run the benign-only LSTM baseline end to end and print its metrics.

    Steps: load the CSV, split benign rows into train/val/test, train the
    language model on benign windows, pick a threshold from the validation
    scores, then evaluate on benign-test versus malware.

    Args:
        path: CSV file to load.
        window_size: Context length used for training windows and scoring.
        batch_size: Batch size of the training and validation loaders.
        q: Quantile of the validation scores used as the decision threshold.
        seed: Seed applied to Python's, NumPy's and PyTorch's RNGs and to the
            data split, so weight init and batch shuffling are repeatable.

    Returns:
        The metrics dict produced by ``evaluate``.
    """
    # Seed every RNG involved; previously only the data split was seeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    seqs, labels = load_csv_expanded(path)
    train, val, test_benign, test_malware = split_benign_only(seqs, labels, seed=seed)

    # vocab_size is derived from the largest token id (+1 because ids start at 0).
    vocab_size = int(seqs.max()) + 1
    print("vocab_size:", vocab_size)

    Xtr, ytr = make_windows(train, window_size)
    Xva, yva = make_windows(val, window_size)

    train_loader = DataLoader(WindowDataset(Xtr, ytr), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(WindowDataset(Xva, yva), batch_size=batch_size)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = LSTMLM(vocab_size=vocab_size)
    model = train_model(model, train_loader, val_loader, device=device)

    # The threshold comes from benign validation data only; no test data is used.
    val_scores = sequence_scores_nll(model, val, window_size, device=device)
    th = pick_threshold(val_scores, q=q)
    print("threshold:", th)

    benign_scores = sequence_scores_nll(model, test_benign, window_size, device=device)
    malware_scores = sequence_scores_nll(model, test_malware, window_size, device=device)
    metrics = evaluate(th, benign_scores, malware_scores)

    print(metrics)
    return metrics


if __name__ == "__main__":
    main()


vocab_size: 307


Epoch 1: 100%|██████████| 236/236 [00:02<00:00, 111.33it/s]


ep1: train=1.9311, val=1.3203


Epoch 2: 100%|██████████| 236/236 [00:01<00:00, 152.61it/s]


ep2: train=1.0468, val=1.0255


Epoch 3: 100%|██████████| 236/236 [00:01<00:00, 166.16it/s]


ep3: train=0.8269, val=0.9190


Epoch 4: 100%|██████████| 236/236 [00:01<00:00, 147.98it/s]


ep4: train=0.6962, val=0.8446


Epoch 5: 100%|██████████| 236/236 [00:01<00:00, 156.49it/s]


ep5: train=0.6014, val=0.8077


Epoch 6: 100%|██████████| 236/236 [00:01<00:00, 151.51it/s]


ep6: train=0.5276, val=0.7851


Epoch 7: 100%|██████████| 236/236 [00:01<00:00, 166.34it/s]


ep7: train=0.4671, val=0.7624


Epoch 8: 100%|██████████| 236/236 [00:01<00:00, 163.91it/s]


ep8: train=0.4157, val=0.7605


Epoch 9: 100%|██████████| 236/236 [00:01<00:00, 165.87it/s]


ep9: train=0.3713, val=0.7404


Epoch 10: 100%|██████████| 236/236 [00:01<00:00, 151.64it/s]


ep10: train=0.3326, val=0.7462


Epoch 11: 100%|██████████| 236/236 [00:01<00:00, 160.25it/s]


ep11: train=0.2969, val=0.7500


Epoch 12: 100%|██████████| 236/236 [00:01<00:00, 144.47it/s]


ep12: train=0.2675, val=0.7500


Epoch 13: 100%|██████████| 236/236 [00:01<00:00, 165.53it/s]


ep13: train=0.2427, val=0.7518


Epoch 14: 100%|██████████| 236/236 [00:01<00:00, 152.39it/s]


ep14: train=0.2159, val=0.7715


Epoch 15: 100%|██████████| 236/236 [00:01<00:00, 163.76it/s]


ep15: train=0.1950, val=0.7826


Epoch 16: 100%|██████████| 236/236 [00:01<00:00, 165.94it/s]


ep16: train=0.1752, val=0.7952


Epoch 17: 100%|██████████| 236/236 [00:01<00:00, 164.60it/s]


ep17: train=0.1591, val=0.8117


Epoch 18: 100%|██████████| 236/236 [00:01<00:00, 150.97it/s]


ep18: train=0.1461, val=0.8094


Epoch 19: 100%|██████████| 236/236 [00:01<00:00, 150.47it/s]


ep19: train=0.1335, val=0.8273


Epoch 20: 100%|██████████| 236/236 [00:01<00:00, 151.84it/s]


ep20: train=0.1235, val=0.8379


Epoch 21: 100%|██████████| 236/236 [00:01<00:00, 162.63it/s]


ep21: train=0.1157, val=0.8503


Epoch 22: 100%|██████████| 236/236 [00:01<00:00, 149.95it/s]


ep22: train=0.1050, val=0.8720


Epoch 23: 100%|██████████| 236/236 [00:01<00:00, 164.82it/s]


ep23: train=0.0996, val=0.8749


Epoch 24: 100%|██████████| 236/236 [00:01<00:00, 164.08it/s]


ep24: train=0.0974, val=0.8853


Epoch 25: 100%|██████████| 236/236 [00:01<00:00, 164.20it/s]


ep25: train=0.0945, val=0.9009


Epoch 26: 100%|██████████| 236/236 [00:01<00:00, 148.37it/s]


ep26: train=0.0881, val=0.9184


Epoch 27: 100%|██████████| 236/236 [00:01<00:00, 143.46it/s]


ep27: train=0.0844, val=0.9194


Epoch 28: 100%|██████████| 236/236 [00:01<00:00, 162.63it/s]


ep28: train=0.0792, val=0.9318


Epoch 29: 100%|██████████| 236/236 [00:01<00:00, 163.49it/s]


ep29: train=0.0773, val=0.9383


Epoch 30: 100%|██████████| 236/236 [00:01<00:00, 149.58it/s]


ep30: train=0.0771, val=0.9323


Epoch 31: 100%|██████████| 236/236 [00:01<00:00, 162.74it/s]


ep31: train=0.0746, val=0.9437


Epoch 32: 100%|██████████| 236/236 [00:01<00:00, 162.13it/s]


ep32: train=0.0730, val=0.9488


Epoch 33: 100%|██████████| 236/236 [00:01<00:00, 162.82it/s]


ep33: train=0.0703, val=0.9722


Epoch 34: 100%|██████████| 236/236 [00:01<00:00, 142.55it/s]


ep34: train=0.0741, val=0.9660


Epoch 35: 100%|██████████| 236/236 [00:01<00:00, 155.13it/s]


ep35: train=0.0740, val=0.9814


Epoch 36: 100%|██████████| 236/236 [00:01<00:00, 162.66it/s]


ep36: train=0.0710, val=0.9993


Epoch 37: 100%|██████████| 236/236 [00:01<00:00, 159.67it/s]


ep37: train=0.0659, val=0.9994


Epoch 38: 100%|██████████| 236/236 [00:01<00:00, 161.69it/s]


ep38: train=0.0634, val=1.0226


Epoch 39: 100%|██████████| 236/236 [00:01<00:00, 161.29it/s]


ep39: train=0.0649, val=1.0109


Epoch 40: 100%|██████████| 236/236 [00:01<00:00, 160.91it/s]


ep40: train=0.0666, val=1.0121
threshold: 2.968801293373108
{'acc': np.float64(0.1308876179848391), 'precision': np.float64(0.9990780011062144), 'recall': np.float64(0.1265976587143929), 'f1': np.float64(0.224720032981612), 'fpr': np.float64(0.0230414746542717), 'tp': 5418, 'tn': 212, 'fp': 5, 'fn': 37379}
