In [10]:
!pip install torch torchvision torchaudio torch-geometric pandas scikit-learn pandas numpy scipy networkx

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/74/c8/2ab2b6eadc45554af8768ae99668c5a8a8552e2012c7238ded7e9e4395e1/torch-2.7.0-cp313-cp313-win_amd64.whl.metadata
  Downloading torch-2.7.0-cp313-cp313-win_amd64.whl.metadata (29 kB)
Collecting torchvision
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/58/19/ca7a4f8907a56351dfe6ae0a708f4e6b3569b5c61d282e3e7f61cf42a4ce/torchvision-0.22.0-cp313-cp313-win_amd64.whl.metadata
  Downloading torchvision-0.22.0-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Obtaining dependency information for torchaudio from https://files.pythonhosted.org/packages/88/49/923ebb2603156dd5c5ae6d845bf51a078e05f27432cd26f13ecdcc8713cd/torchaudio-2.7.0-cp313-cp313-win_amd64.whl.metadata
  Downloading torchaudio-2.7.0-cp313-cp313-win_amd64.whl.metadata (6.7 kB)
Collecting torch-geometric
  Obtaining dependency information for torch-geometric 


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
import networkx as nx
import numpy as np
import scipy as sp

df = pd.read_csv("../datasets/train_test_network.csv")
df.head()

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,...,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,label,type
0,192.168.1.37,4444,192.168.1.193,49178,tcp,-,290.371539,101568,2592,OTH,...,0,0,-,-,-,-,-,-,1,backdoor
1,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000102,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor
2,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000148,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor
3,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000113,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor
4,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.00013,0,0,REJ,...,0,0,-,-,-,-,-,-,1,backdoor


## Graph Structure - Heterogeneous Graph 


| Element            | What You Use From Dataset                              |
| ------------------ | ------------------------------------------------------ |
| **Nodes**          | IP addresses from `src_ip` and `dst_ip`                |
| **Edges**          | One edge per connection or flow from `src_ip → dst_ip` |
| **Directionality** | Yes – flows are directed from source to destination    |
| **Edge Weight**    | Optional – could be frequency, total bytes, etc.       |

Recommended Edge Attributes:
- **proto** (TCP/UDP/ICMP)
- **service** (HTTP, DNS, SSL, etc.)
- **duration** (length of the flow in seconds)
- **src_bytes** / dst_bytes (payload size per direction)
- **conn_state** (S0, S1, REJ — Zeek flow state)
- **missed_bytes** (for gap analysis)
- **label** (0 = normal, 1 = attack)
- **type** (attack category, e.g., DDoS, DoS, backdoor)
- **timestamp** (ts) – useful for graph evolution

In [13]:
from sklearn.preprocessing import LabelEncoder

df = df.dropna(subset=["src_ip", "dst_ip", "label"])

ip_encoder = LabelEncoder()
all_ips = pd.concat([df["src_ip"], df["dst_ip"]])
ip_encoder.fit(all_ips)
df["src_id"] = ip_encoder.transform(df["src_ip"])
df["dst_id"] = ip_encoder.transform(df["dst_ip"])

num_nodes = len(ip_encoder.classes_)

In [15]:
from sklearn.preprocessing import StandardScaler

features = np.zeros((num_nodes, 5))  
labels = np.zeros(num_nodes)

for ip_id in range(num_nodes):
    rows = df[(df["src_id"] == ip_id) | (df["dst_id"] == ip_id)]
    features[ip_id, 0] = rows["src_bytes"].sum()
    features[ip_id, 1] = rows["dst_bytes"].sum()
    features[ip_id, 2] = rows["duration"].mean()
    features[ip_id, 3] = rows["proto"].nunique()
    features[ip_id, 4] = rows["service"].nunique()
    
    # Label as attack if it ever participated in an attack
    if (rows["label"] == 1).any():
        labels[ip_id] = 1  # attacker
    else:
        labels[ip_id] = 0  # normal

# Normalize features
features = StandardScaler().fit_transform(features)


In [16]:
import torch
import torch_geometric

edge_index = torch.tensor(df[["src_id", "dst_id"]].values.T, dtype=torch.long)

x = torch.tensor(features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)

In [17]:
from torch_geometric.data import Data

data = Data(x=x, edge_index=edge_index, y=y)

In [18]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

In [24]:
from torch_geometric.utils import train_test_split_edges

train_mask = torch.rand(len(y)) < 0.8
test_mask = ~train_mask

model = GCN(input_dim=x.shape[1], hidden_dim=16, output_dim=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
loss_fn = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")


Epoch 0, Loss: 1.0239
Epoch 10, Loss: 0.5269
Epoch 20, Loss: 0.4503
Epoch 30, Loss: 0.3589
Epoch 40, Loss: 0.2727
Epoch 50, Loss: 0.2091
Epoch 60, Loss: 0.1708
Epoch 70, Loss: 0.1454
Epoch 80, Loss: 0.1283
Epoch 90, Loss: 0.1140
Epoch 100, Loss: 0.1000
Epoch 110, Loss: 0.0901
Epoch 120, Loss: 0.0828
Epoch 130, Loss: 0.0757
Epoch 140, Loss: 0.0700
Epoch 150, Loss: 0.0649
Epoch 160, Loss: 0.0607
Epoch 170, Loss: 0.0570
Epoch 180, Loss: 0.0538
Epoch 190, Loss: 0.0510


In [25]:
model.eval()
pred = model(data).argmax(dim=1)

from sklearn.metrics import classification_report
print(classification_report(y[test_mask], pred[test_mask], target_names=["Normal", "Attack"]))

              precision    recall  f1-score   support

      Normal       0.98      0.99      0.99       123
      Attack       0.98      0.96      0.97        45

    accuracy                           0.98       168
   macro avg       0.98      0.97      0.98       168
weighted avg       0.98      0.98      0.98       168

