In [88]:
import os
import sys
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import NNConv
import networkx as nx

parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
utils_path = os.path.join(parent_dir, "project_utils")
sys.path.append(utils_path)

CSV_PATH = '../datasets/data/NF-ToN-IoT.csv'

In [89]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [90]:
df = pd.read_csv(CSV_PATH)
df = df.sample(frac=0.05, random_state=42)  # 30% of rows
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Label"])

In [91]:
df["src_node"] = df["IPV4_SRC_ADDR"].astype(str) + ":" + df["L4_SRC_PORT"].astype(str)
df["dst_node"] = df["IPV4_DST_ADDR"].astype(str) + ":" + df["L4_DST_PORT"].astype(str)

In [92]:
unique_src_ips = df["IPV4_SRC_ADDR"].unique()
new_ips = np.random.permutation(unique_src_ips)
ip_map = dict(zip(unique_src_ips, new_ips))
df["IPV4_SRC_ADDR"] = df["IPV4_SRC_ADDR"].map(ip_map)
df["IPV4_DST_ADDR"] = df["IPV4_DST_ADDR"].map(lambda x: ip_map.get(x, x))
# Recompute node strings after remapping
df["src_node"] = df["IPV4_SRC_ADDR"].astype(str) + ":" + df["L4_SRC_PORT"].astype(str)
df["dst_node"] = df["IPV4_DST_ADDR"].astype(str) + ":" + df["L4_DST_PORT"].astype(str)

In [93]:
all_nodes = pd.Index(df["src_node"].tolist() + df["dst_node"].tolist()).unique()
ip_to_idx = {node: idx for idx, node in enumerate(all_nodes)}
num_nodes = len(all_nodes)

In [94]:
pkt_pair = {}
byte_pair = {}
flow_pair = {}
node_total_flows = {node: 0 for node in all_nodes}
node_attack_flows = {node: 0 for node in all_nodes}

In [95]:
edge_list = []
edge_attr_list = []
for _, row in df.iterrows():
    src = row["src_node"]
    dst = row["dst_node"]
    src_idx = ip_to_idx[src]
    dst_idx = ip_to_idx[dst]
    in_pkts = row["IN_PKTS"]
    out_pkts = row["OUT_PKTS"]
    in_bytes = row["IN_BYTES"]
    out_bytes = row["OUT_BYTES"]
    duration = row["FLOW_DURATION_MILLISECONDS"]
    flags = row["TCP_FLAGS"]
    l7 = row["L7_PROTO"]
    protocol = row["PROTOCOL"]
    total_pkts = in_pkts + out_pkts
    total_bytes = in_bytes + out_bytes
    # Update edge counters
    key = (src_idx, dst_idx)
    pkt_pair[key] = pkt_pair.get(key, 0) + total_pkts
    byte_pair[key] = byte_pair.get(key, 0) + total_bytes
    flow_pair[key] = flow_pair.get(key, 0) + 1
    # Update node flow counts
    node_total_flows[src] += 1
    node_total_flows[dst] += 1
    if row["label"] != 0:  # 0=Benign
        node_attack_flows[src] += 1
        node_attack_flows[dst] += 1
    # Temporarily store all flows to decide edges later
    edge_list.append((src_idx, dst_idx))
    edge_attr_list.append([in_bytes, out_bytes, in_pkts, out_pkts, duration, flags, l7, protocol])

In [96]:
filtered_edges = []
filtered_attrs = []
for i, (src_idx, dst_idx) in enumerate(edge_list):
    if pkt_pair.get((src_idx, dst_idx), 0) >= 10:
        filtered_edges.append([src_idx, dst_idx])
        filtered_attrs.append(edge_attr_list[i])

In [97]:
if filtered_edges:
    edge_index = torch.tensor(filtered_edges, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(filtered_attrs, dtype=torch.float)
else:
    raise ValueError("No edges meet the packet threshold.")

In [98]:
G = nx.Graph()
edges_np = edge_index.t().tolist()
G.add_edges_from(edges_np)

In [99]:
degree_dict = dict(G.degree())
clustering_dict = nx.clustering(G)

In [100]:
degree_arr = np.array([degree_dict.get(i, 0) for i in range(num_nodes)], dtype=float)
total_pkts_arr = np.zeros(num_nodes, dtype=float)
total_bytes_arr = np.zeros(num_nodes, dtype=float)
flow_count_arr = np.zeros(num_nodes, dtype=float)
attack_fraction_arr = np.zeros(num_nodes, dtype=float)
clustering_arr = np.array([clustering_dict.get(i, 0.0) for i in range(num_nodes)], dtype=float)

In [101]:
for (u, v), val in pkt_pair.items():
    if (u, v) in pkt_pair:
        total_pkts_arr[u] += val
        total_pkts_arr[v] += val
for (u, v), val in byte_pair.items():
    if (u, v) in byte_pair:
        total_bytes_arr[u] += val
        total_bytes_arr[v] += val

In [102]:
for i, node in enumerate(all_nodes):
    total = node_total_flows[node]
    attacks = node_attack_flows[node]
    flow_count_arr[i] = total
    if total > 0:
        attack_fraction_arr[i] = attacks / total
    else:
        attack_fraction_arr[i] = 0.0

In [103]:
node_stats = np.vstack([
    degree_arr,
    total_pkts_arr,
    total_bytes_arr,
    flow_count_arr,
    attack_fraction_arr,
    clustering_arr
]).T  

In [104]:
scaler = StandardScaler()
x = scaler.fit_transform(node_stats)

In [105]:
node_labels = np.zeros(num_nodes, dtype=int)
for i, node in enumerate(all_nodes):
    total = node_total_flows[node]
    attacks = node_attack_flows[node]
    if total > 0 and (attacks / total) >= 0.30:
        node_labels[i] = 1
    else:
        node_labels[i] = 0

In [106]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
edge_attr_np = edge_attr.cpu().numpy()
edge_attr_np = scaler.fit_transform(edge_attr_np)
edge_attr = torch.tensor(edge_attr_np, dtype=torch.float)

edge_attr = edge_attr.to(device)

In [107]:
idx = np.arange(data.num_nodes)
train_idx, test_idx = train_test_split(idx, test_size=0.3, stratify=data.y)
train_idx, val_idx = train_test_split(train_idx, test_size=0.15/0.7, stratify=data.y[train_idx])

In [108]:
# Create PyG Data object (ensure tensors are on device)
data = Data(
    x=torch.tensor(x, dtype=torch.float).to(device),
    edge_index=edge_index.to(device),
    edge_attr=edge_attr.to(device),
    y=torch.tensor(node_labels, dtype=torch.long).to(device)
)

train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
val_mask   = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask  = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
val_mask[val_idx]     = True
test_mask[test_idx]   = True

data.train_mask = train_mask.to(device)
data.val_mask   = val_mask.to(device)
data.test_mask  = test_mask.to(device)

In [109]:
class EdgeEnhancedGCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, edge_dim):
        super().__init__()
        # MLP for first NNConv: output should be in_channels * hidden_channels
        self.edge_mlp1 = nn.Sequential(
            nn.Linear(edge_dim, hidden_channels * in_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels * in_channels, in_channels * hidden_channels)
        )
        self.conv1 = NNConv(
            in_channels=in_channels,
            out_channels=hidden_channels,
            nn=self.edge_mlp1,
            aggr="mean"
        )

        # MLP for second NNConv: output should be hidden_channels * out_channels
        self.edge_mlp2 = nn.Sequential(
            nn.Linear(edge_dim, hidden_channels * hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels * hidden_channels, hidden_channels * out_channels)
        )
        self.conv2 = NNConv(
            in_channels=hidden_channels,
            out_channels=out_channels,
            nn=self.edge_mlp2,
            aggr="mean"
        )

        self.dropout = nn.Dropout(0.3)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = self.dropout(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = torch.clamp(x, -1e6, 1e6)
        return x

In [110]:
in_dim = data.x.shape[1]        # 6 node features
hidden_dim = 32  # instead of 64
out_dim = 2                     # benign vs malicious
edge_dim = data.edge_attr.shape[1]  # 8 numeric edge features

In [111]:
model = EdgeEnhancedGCN(in_dim, hidden_dim, out_dim, edge_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor([
    1.0,  # weight for benign
    (train_mask.sum().item() / max(1, (node_labels[train_mask] == 1).sum()))  # inverse freq for malicious
], dtype=torch.float))

In [112]:
from sklearn.metrics import f1_score

best_val_f1 = 0.0
epochs_without_improve = 0
patience = 15

for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        logits = model(data)
        val_preds = logits[data.val_mask].argmax(dim=1).cpu().numpy()
        val_true  = data.y[data.val_mask].cpu().numpy()
        val_f1 = f1_score(val_true, val_preds, average="weighted")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_model.pt")
        epochs_without_improve = 0
    else:
        epochs_without_improve += 1
        if epochs_without_improve >= patience:
            break

    print(f"Epoch {epoch}, Loss: {loss.item():.4f}, Val F1: {val_f1:.4f}")

Epoch 0, Loss: 0.7522, Val F1: 0.0624
Epoch 1, Loss: 1.0311, Val F1: 0.9239
Epoch 2, Loss: 1.2858, Val F1: 0.9240
Epoch 3, Loss: 1.6808, Val F1: 0.9238
Epoch 4, Loss: 0.8290, Val F1: 0.9241
Epoch 5, Loss: 1.7070, Val F1: 0.9241
Epoch 6, Loss: 1.0719, Val F1: 0.9264
Epoch 7, Loss: 1.1284, Val F1: 0.9290
Epoch 8, Loss: 1.2047, Val F1: 0.9290
Epoch 9, Loss: 1.1605, Val F1: 0.9296
Epoch 10, Loss: 1.1358, Val F1: 0.9299
Epoch 11, Loss: 0.8295, Val F1: 0.9305
Epoch 12, Loss: 1.1520, Val F1: 0.9305
Epoch 13, Loss: 0.8570, Val F1: 0.9288
Epoch 14, Loss: 0.9004, Val F1: 0.9289
Epoch 15, Loss: 0.7894, Val F1: 0.9295
Epoch 16, Loss: 0.6861, Val F1: 0.9306
Epoch 17, Loss: 0.7709, Val F1: 0.9308
Epoch 18, Loss: 0.6834, Val F1: 0.9308
Epoch 19, Loss: 0.6032, Val F1: 0.9313
Epoch 20, Loss: 0.6889, Val F1: 0.9482
Epoch 21, Loss: 0.9245, Val F1: 0.9470
Epoch 22, Loss: 1.3807, Val F1: 0.9468
Epoch 23, Loss: 0.5188, Val F1: 0.9468
Epoch 24, Loss: 1.2987, Val F1: 0.9468
Epoch 25, Loss: 0.8913, Val F1: 0.9

In [113]:
model.load_state_dict(torch.load("best_model.pt"))
model.eval()
with torch.no_grad():
    logits = model(data)
    test_preds = logits[test_mask].argmax(dim=1).cpu().numpy()
    test_true  = data.y[test_mask].cpu().numpy()
    test_f1 = f1_score(test_true, test_preds, average="weighted")
    correct = (test_preds == test_true).sum()
    acc = correct / len(test_true)

print(f"Test Accuracy: {acc:.4f}, Test Weighted F1: {test_f1:.4f}")

Test Accuracy: 0.9465, Test Weighted F1: 0.9490
