In [1]:
import os
import sys
import pandas as pd
import networkx as nx
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Locate the shared helper package two directory levels above the notebook's
# working directory and make it importable.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
utils_path = os.path.join(parent_dir, "project_utils")
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Flow-record CSV read by the data-loading cell (relative to the working dir).
CSV_PATH = '../datasets/data/NF-ToN-IoT.csv'

In [2]:
# Read the flow records from disk.
df = pd.read_csv(CSV_PATH)

# Turn the raw "Label" column into consecutive integer class ids, stored in a
# new lowercase "label" column; the fitted encoder is kept so that
# `label_encoder.classes_` is available later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["Label"])
df["label"] = encoded_labels

# Per-flow numeric columns that are aggregated into node features.
features = [
    "IN_BYTES",
    "OUT_BYTES",
    "IN_PKTS",
    "OUT_PKTS",
    "FLOW_DURATION_MILLISECONDS",
    "TCP_FLAGS",
    "L7_PROTO",
]

In [3]:
def _majority_label(labels):
    """Return the most frequent value in `labels` (first mode on ties)."""
    return labels.mode()[0]


# One graph node per distinct source IP: group the flows by their source address.
flows_by_src = df.groupby("IPV4_SRC_ADDR")

# Node features are the per-source mean of each flow feature; the node label
# is the majority flow label for that source.
node_features = flows_by_src[features].mean()
node_labels = flows_by_src["label"].agg(_majority_label)

# Standardise every feature column (fit on all nodes).
scaler = StandardScaler()
x = scaler.fit_transform(node_features)

# Map each source IP to its row position in `node_features` / `x`.
ip_to_idx = dict(zip(node_features.index, range(len(node_features.index))))

In [11]:
# Ensure all necessary libraries are imported
import torch

# Build the edge list with vectorised pandas lookups instead of a Python-level
# `df.iterrows()` loop, which is very slow on a frame with over a million rows.
# `ip_to_idx` (IP -> node index, derived from node_features.index) is the
# mapping built right after the node features are computed, so it is not
# rebuilt here.
src_idx = df["IPV4_SRC_ADDR"].map(ip_to_idx)
dst_idx = df["IPV4_DST_ADDR"].map(ip_to_idx)

# `map` yields NaN for any IP that is not a node; keep only flows whose source
# AND destination are both known nodes (row order is preserved).
both_known = src_idx.notna() & dst_idx.notna()

# `edges` has shape [num_edges, 2] with one (src, dst) index pair per row.
# NOTE: it is now an int64 NumPy array rather than a list of lists.
edges = np.column_stack([
    src_idx[both_known].to_numpy(dtype=np.int64),
    dst_idx[both_known].to_numpy(dtype=np.int64),
])

# Convert to tensor and transpose to shape [2, num_edges]
if edges.shape[0] > 0:
    edge_index = torch.from_numpy(edges).t().contiguous()
else:
    raise ValueError("No valid edges found — check that IPs in df match those in node_features.index")

In [12]:
# Sanity check: report the shape and dtype of the edge_index tensor.
print(f"Edge index shape: {edge_index.shape}")
print(f"Edge index dtype: {edge_index.dtype}")

Edge index shape: torch.Size([2, 1334142])
Edge index dtype: torch.int64


In [13]:
# Convert the scaled node features and the per-node labels to tensors, then
# bundle them with the edge index into a single graph object.
node_x = torch.tensor(x, dtype=torch.float)
node_y = torch.tensor(node_labels.values, dtype=torch.long)

data = Data(x=node_x, edge_index=edge_index, y=node_y)

In [14]:
import torch.nn as nn
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

class GCN(nn.Module):
    """Two stacked GCNConv layers with a ReLU between them.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of the first convolution.
        out_channels: Output size of the second convolution (one value per
            class when used with a classification loss).
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Run both convolutions over `data.x` using `data.edge_index`.

        Returns the output of the second convolution as-is; no softmax or
        other normalisation is applied here.
        """
        hidden = self.conv1(data.x, data.edge_index).relu()
        return self.conv2(hidden, data.edge_index)

In [15]:
# Fix the RNG seeds so that weight initialisation and the train/test split are
# reproducible on a fresh kernel ("Restart & Run All").
SEED = 42
torch.manual_seed(SEED)

# Input width comes from the scaled feature matrix; output width is the number
# of classes seen by the label encoder.
model = GCN(in_channels=x.shape[1], hidden_channels=16, out_channels=len(label_encoder.classes_))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

# Split (e.g., 70/30) over node indices, stratified by node label so both
# partitions keep the same class proportions.
from sklearn.model_selection import train_test_split
idx = np.arange(data.num_nodes)
train_idx, test_idx = train_test_split(
    idx,
    test_size=0.3,
    stratify=data.y.cpu().numpy(),  # hand sklearn a plain NumPy array
    random_state=SEED,
)

# The training labels never change between epochs, so slice them once.
train_labels = data.y[train_idx]

# Full-batch training: every epoch runs the whole graph through the model but
# only the training nodes contribute to the loss.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[train_idx], train_labels)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 0.8764
Epoch 1, Loss: 0.8183
Epoch 2, Loss: 0.7632
Epoch 3, Loss: 0.7116
Epoch 4, Loss: 0.6655
Epoch 5, Loss: 0.6261
Epoch 6, Loss: 0.5958
Epoch 7, Loss: 0.5749
Epoch 8, Loss: 0.5583
Epoch 9, Loss: 0.5434
Epoch 10, Loss: 0.5297
Epoch 11, Loss: 0.5168
Epoch 12, Loss: 0.5048
Epoch 13, Loss: 0.4934
Epoch 14, Loss: 0.4828
Epoch 15, Loss: 0.4728
Epoch 16, Loss: 0.4634
Epoch 17, Loss: 0.4544
Epoch 18, Loss: 0.4459
Epoch 19, Loss: 0.4378
Epoch 20, Loss: 0.4302
Epoch 21, Loss: 0.4229
Epoch 22, Loss: 0.4161
Epoch 23, Loss: 0.4096
Epoch 24, Loss: 0.4035
Epoch 25, Loss: 0.3977
Epoch 26, Loss: 0.3921
Epoch 27, Loss: 0.3869
Epoch 28, Loss: 0.3818
Epoch 29, Loss: 0.3769
Epoch 30, Loss: 0.3723
Epoch 31, Loss: 0.3679
Epoch 32, Loss: 0.3635
Epoch 33, Loss: 0.3595
Epoch 34, Loss: 0.3555
Epoch 35, Loss: 0.3517
Epoch 36, Loss: 0.3480
Epoch 37, Loss: 0.3445
Epoch 38, Loss: 0.3411
Epoch 39, Loss: 0.3379
Epoch 40, Loss: 0.3346
Epoch 41, Loss: 0.3316
Epoch 42, Loss: 0.3286
Epoch 43, Loss: 0.325

In [16]:
# Evaluate on the held-out nodes. Gradients are not needed for inference, so
# the forward pass runs under no_grad to avoid building an autograd graph.
model.eval()
with torch.no_grad():
    # Predicted class = index of the largest output per node. Using argmax
    # also avoids assigning to `_`, which IPython reserves for the last output.
    pred = model(data).argmax(dim=1)

# Fraction of test nodes whose predicted class matches the node label.
correct = int((pred[test_idx] == data.y[test_idx]).sum())
acc = correct / len(test_idx)
print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 0.9434
