# CICIDS2017 強化学習ベース分類ノートブック

このノートブックでは、CICIDS2017_improved データセットを用い、各サンプルを逐次的に観測してクラスを選択するエージェントを強化学習（方策勾配）で学習します。各エピソードは `MAX_EPISODE_STEPS` ステップで打ち切られ、学習全体は累計ステップ数が `TOTAL_STEPS` に達した時点で自動終了します。


In [1]:
import os
import sys
import random
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

print(f"Using torch {torch.__version__}")


Using torch 2.9.1+cu128


In [2]:
# Resolve the parent of the current working directory and put it on sys.path
# (once) so that project-local imports can be resolved from this notebook.
PROJECT_ROOT = Path('..').resolve()
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.append(_project_root_str)

# Dataset directory and the train/test CSV paths inside it.
DATA_DIR = PROJECT_ROOT.joinpath('data', 'CICIDS2017_improved')
TRAIN_CSV = DATA_DIR.joinpath('train.csv')
TEST_CSV = DATA_DIR.joinpath('test.csv')

print(f"PROJECT_ROOT: {PROJECT_ROOT}")


PROJECT_ROOT: /home/hawk/Documents/school/test/CVPR22-Fact


In [3]:
from dataloader.cicids2017.cicids2017 import CICIDS2017_improved


In [4]:
# Reinforcement-learning settings
MAX_EPISODE_STEPS = 2048  # maximum number of steps in a single episode
TOTAL_STEPS = 20000       # training stops once this many steps were taken overall
GAMMA = 0.95              # discount factor handed to compute_returns
LR = 3e-4                 # learning rate of the Adam optimizer
HIDDEN_SIZE = 256         # width of the policy network's hidden layers
DEVICE = 'cpu'            # device the policy and the training tensors are moved to
SEED = 42

# Seed the Python, NumPy and PyTorch random number generators with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
# NOTE: CUDA generators are not seeded here; if DEVICE is switched to a GPU,
# enable the following:
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(SEED)

print(f"DATA_DIR: {DATA_DIR}")
print(f"DEVICE: {DEVICE}")


DATA_DIR: /home/hawk/Documents/school/test/CVPR22-Fact/data/CICIDS2017_improved
DEVICE: cpu


In [5]:
def _dataset_to_tensors(dataset):
    """Return ``(features, targets)`` tensors built from ``dataset.data`` / ``dataset.targets``.

    Features are created as float32 and targets as int64 (``torch.long``).
    """
    features = torch.tensor(dataset.data, dtype=torch.float32)
    targets = torch.tensor(dataset.targets, dtype=torch.long)
    return features, targets


# Both splits are loaded through the project's dataset class from <PROJECT_ROOT>/data.
dataset_root = PROJECT_ROOT / 'data'
print("Loading CICIDS2017_improved via official dataset pipeline...")
train_dataset = CICIDS2017_improved(root=str(dataset_root), train=True)
test_dataset = CICIDS2017_improved(root=str(dataset_root), train=False)

train_x, train_y = _dataset_to_tensors(train_dataset)
test_x, test_y = _dataset_to_tensors(test_dataset)
label_encoder = train_dataset.label_encoder

# Problem dimensions: number of encoded classes and number of feature columns.
NUM_CLASSES = len(label_encoder.classes_)
INPUT_DIM = train_x.shape[1]
print(f"INPUT_DIM={INPUT_DIM}, NUM_CLASSES={NUM_CLASSES}")


Loading CICIDS2017_improved via official dataset pipeline...
INPUT_DIM=66, NUM_CLASSES=10


In [6]:
class CICIDSEnv:
    """Environment that presents samples one at a time and rewards correct class guesses.

    Each episode walks through a fresh random permutation of the sample
    indices. An action is a class index; the reward is 1.0 when it equals the
    label of the currently presented sample and 0.0 otherwise. The episode
    ends after ``max_steps`` calls to :meth:`step`.
    """

    def __init__(self, features: torch.Tensor, labels: torch.Tensor, max_steps: int):
        """Store the data and initialise the (not yet started) episode state.

        Args:
            features: Sample tensor indexed along its first dimension.
            labels: Label tensor indexed with the same sample indices.
            max_steps: Number of steps after which an episode is finished.
        """
        self.features = features
        self.labels = labels
        self.max_steps = max_steps
        self.num_samples = features.size(0)
        self.device = features.device
        # Per-episode state; reset() fills these in.
        self._indices = None
        self._cursor = 0
        self._steps = 0
        self._current_label: Optional[int] = None

    def _present_current(self) -> torch.Tensor:
        """Remember the label of the sample at the cursor and return its features."""
        sample_idx = self._indices[self._cursor]
        self._current_label = self.labels[sample_idx].item()
        return self.features[sample_idx]

    def reset(self) -> torch.Tensor:
        """Start a new episode and return the first observation."""
        self._indices = torch.randperm(self.num_samples)
        self._cursor, self._steps = 0, 0
        return self._present_current()

    def step(self, action: int) -> Tuple[torch.Tensor, float, bool, dict]:
        """Score ``action`` against the current label and advance to the next sample.

        Returns:
            ``(next_observation, reward, done, info)`` where ``info`` holds the
            label that was being guessed and the reward. On the final step the
            observation is an all-zero tensor shaped like one sample.

        Raises:
            AssertionError: If no sample is currently presented, i.e. before
                reset() or after the episode has finished.
        """
        assert self._current_label is not None, "Call reset() before step()"
        answer = self._current_label
        reward = 1.0 if action == answer else 0.0
        self._steps += 1
        info = {"label": answer, "reward": reward}

        if self._steps < self.max_steps:
            # Episode continues: move the cursor, wrapping around the permutation.
            self._cursor = (self._cursor + 1) % self.num_samples
            return self._present_current(), reward, False, info

        # Episode finished: clear the label so further step() calls are rejected.
        self._current_label = None
        return torch.zeros_like(self.features[0]), reward, True, info


In [7]:
class PolicyNet(nn.Module):
    """Fully connected policy network producing one logit per class.

    The network is ``Linear -> ReLU -> Linear -> ReLU -> Linear`` with both
    hidden layers having ``hidden`` units.
    """

    def __init__(self, input_dim: int, num_classes: int, hidden: int):
        """Build the layer stack.

        Args:
            input_dim: Size of the input feature vector.
            num_classes: Number of output logits.
            hidden: Number of units in each hidden layer.
        """
        super().__init__()
        stack = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, num_classes),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the unnormalised class logits for ``x``."""
        logits = self.net(x)
        return logits


# Instantiate the policy network on the configured device and attach an Adam
# optimizer to its parameters.
policy = PolicyNet(input_dim=INPUT_DIM, num_classes=NUM_CLASSES, hidden=HIDDEN_SIZE)
policy = policy.to(DEVICE)
optimizer = optim.Adam(params=policy.parameters(), lr=LR)

def select_action(state: torch.Tensor):
    """Sample an action from the module-level ``policy`` for one observation.

    Args:
        state: Observation tensor; it is moved to ``DEVICE`` before the
            forward pass.

    Returns:
        A ``(action, log_prob)`` pair: the sampled class index as a Python
        number, and the log-probability tensor of that sample. The
        log-probability is computed with gradients enabled so it can be used
        in the policy-gradient loss.
    """
    action_dist = torch.distributions.Categorical(logits=policy(state.to(DEVICE)))
    sampled = action_dist.sample()
    return sampled.item(), action_dist.log_prob(sampled)


def compute_returns(rewards, gamma, device=None):
    """Compute normalised discounted returns for one episode.

    For every step ``t`` the return is ``G_t = r_t + gamma * G_{t+1}``. The
    resulting vector is standardised to zero mean and unit (sample) standard
    deviation when it holds at least two entries.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied to future rewards.
        device: Device for the returned tensor. Defaults to the module-level
            ``DEVICE`` when omitted.

    Returns:
        A float32 tensor with one entry per reward. With fewer than two
        rewards the raw (un-normalised) returns are returned: the sample
        standard deviation of a single value is NaN and the mean of an empty
        tensor is NaN, and either would turn the loss and the weights into NaN.

    NOTE(review): in ``CICIDSEnv`` the next sample does not depend on the
    action taken, so discounting credits an action with rewards it did not
    influence; consider ``gamma=0`` if training is too noisy.
    """
    running = 0.0
    discounted = []
    # Accumulate back-to-front with append + reverse (O(n)) instead of
    # repeatedly inserting at the head of the list (O(n^2)).
    for reward in reversed(rewards):
        running = reward + gamma * running
        discounted.append(running)
    discounted.reverse()

    target_device = DEVICE if device is None else device
    returns = torch.tensor(discounted, dtype=torch.float32, device=target_device)
    if returns.numel() < 2:
        return returns
    return (returns - returns.mean()) / (returns.std() + 1e-6)


In [8]:
# REINFORCE training loop: one policy update per finished episode, stopping as
# soon as TOTAL_STEPS environment steps have been taken.
env = CICIDSEnv(train_x.to(DEVICE), train_y.to(DEVICE), MAX_EPISODE_STEPS)

all_rewards = []   # every reward observed during training
rolling_acc = []   # recent rewards; trimmed to the last 500 at each episode end
step_counter = 0
episode = 0

state = env.reset()
log_probs = []
episode_rewards = []

while step_counter < TOTAL_STEPS:
    action, log_prob = select_action(state)
    next_state, reward, done, info = env.step(action)

    log_probs.append(log_prob)
    for history in (episode_rewards, all_rewards, rolling_acc):
        history.append(reward)
    step_counter += 1

    budget_spent = step_counter >= TOTAL_STEPS
    if not (done or budget_spent):
        # Episode still running: just move on to the next observation.
        state = next_state
        continue

    # Episode finished (or step budget exhausted): policy-gradient update on
    # the trajectory collected since the last reset.
    returns = compute_returns(episode_rewards, GAMMA)
    loss = -(torch.stack(log_probs) * returns).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    episode += 1
    rolling_acc = rolling_acc[-500:]
    avg_reward = np.mean(rolling_acc) if rolling_acc else 0.0
    print(f"Episode {episode:3d} | steps={step_counter} | loss={loss.item():.4f} | rolling acc={avg_reward:.3f}")

    if budget_spent:
        break

    # Begin the next episode with empty trajectory buffers.
    state = env.reset()
    log_probs = []
    episode_rewards = []


Episode   1 | steps=2048 | loss=-8.5791 | rolling acc=0.088
Episode   2 | steps=4096 | loss=-3.6395 | rolling acc=0.126
Episode   3 | steps=6144 | loss=-20.9369 | rolling acc=0.122
Episode   4 | steps=8192 | loss=-41.2857 | rolling acc=0.114
Episode   5 | steps=10240 | loss=-53.5144 | rolling acc=0.136
Episode   6 | steps=12288 | loss=-64.9044 | rolling acc=0.156
Episode   7 | steps=14336 | loss=-67.7598 | rolling acc=0.128
Episode   8 | steps=16384 | loss=-108.7695 | rolling acc=0.120
Episode   9 | steps=18432 | loss=-115.3591 | rolling acc=0.158
Episode  10 | steps=20000 | loss=-103.2138 | rolling acc=0.166


In [9]:
@torch.no_grad()
def evaluate(policy: nn.Module, features: torch.Tensor, labels: torch.Tensor) -> float:
    """Return the greedy (argmax) classification accuracy of ``policy``.

    Args:
        policy: Model mapping a ``(N, input_dim)`` batch to ``(N, num_classes)``
            logits (``argmax`` is taken over dimension 1).
        features: Batch of samples; moved to the device of the model's
            parameters (left where it is for a parameter-free model).
        labels: Ground-truth class indices, one per sample, on any device.

    Returns:
        Fraction of samples whose highest-logit class equals the label.

    The model is switched to eval mode for the forward pass and its previous
    train/eval mode is restored afterwards, even if the forward pass raises.
    """
    first_param = next(policy.parameters(), None)
    device = features.device if first_param is None else first_param.device

    was_training = policy.training
    policy.eval()
    try:
        logits = policy(features.to(device))
        preds = torch.argmax(logits, dim=1).cpu()
        # Compare on CPU so labels living on another device do not raise.
        acc = (preds == labels.cpu()).float().mean().item()
    finally:
        # Restore the caller's mode instead of forcing train mode.
        policy.train(was_training)
    return acc

# Score the trained policy on the training split first, then on the test split.
train_acc, test_acc = (
    evaluate(policy, split_x, split_y)
    for split_x, split_y in ((train_x, train_y), (test_x, test_y))
)
print(f"Final Train Accuracy: {train_acc:.4f}")
print(f"Final Test Accuracy : {test_acc:.4f}")


Final Train Accuracy: 0.8099
Final Test Accuracy : 0.8100


## 使い方メモ
- `TOTAL_STEPS` を変更するとエージェント学習の総ステップ数を制御できます（エポックではなくステップで停止）。
- データ読み込みは `CICIDS2017_improved` データセットクラスをそのまま利用しているため、教師あり学習と同じ前処理（列削除、正規化、ラベル統合）になります。高速化やサンプリングを行いたい場合は `dataloader/cicids2017/cicids2017.py` 内のクラスを拡張してください。
- 現在は単純な REINFORCE で、報酬（正解なら 1、不正解なら 0）を用いて方策を学習します。探索を増やしたい場合は、`torch.distributions.Categorical` に渡すロジットを温度で割る温度調整や、損失へのエントロピー正則化項の追加が有効です。
- 学習後は `evaluate` を再実行することで任意の時点の精度を確認できます。
