With a GNN we can learn patterns of the relationship between nodes and the connection's characteristics to identify anomalies in the behaviour.
We should include this metrics: Accuracy, False Positive Rate , Precision, Recall and F1-score. https://ieeexplore.ieee.org/document/9776097/figures#figures

In [1]:
import pandas as pd
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [2]:
# 1. Load data
df = pd.read_csv("df_ben_ddos.csv")  
features = df.drop(columns=["Category"]).values
labels = df["Category"].values

In [3]:
# 2. Characteristic's Normalization 
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

In [4]:
# 3. tensor converter
x = torch.tensor(features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)

In [5]:
# 4. Graphs
def create_graph(x, y, k_neighbors=5):
    num_nodes = x.shape[0]
    edge_index = []
    for i in range(num_nodes):
        for j in range(max(0, i-k_neighbors), min(num_nodes, i+k_neighbors+1)):
            if i != j:  
                edge_index.append([i, j])
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    return Data(x=x, edge_index=edge_index, y=y)

graph_data = create_graph(x, y)

In [6]:
# 5. Train and Test
train_mask, test_mask = train_test_split(range(graph_data.num_nodes), test_size=0.2, random_state=42)
graph_data.train_mask = torch.tensor(train_mask, dtype=torch.long)
graph_data.test_mask = torch.tensor(test_mask, dtype=torch.long)

In [7]:
# 6. Creating the GNN model
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x = global_mean_pool(x, data.batch)  # Global pooling
        x = self.fc(x)
        return x

In [8]:
# 7. Model configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GNN(input_dim=x.size(1), hidden_dim=64, output_dim=2).to(device)
data = graph_data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [9]:
# 8. Training
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# def test():
#     model.eval()
#     with torch.no_grad():
#         logits = model(data)
#         preds = logits.argmax(dim=1)
#         correct = preds[data.test_mask] == data.y[data.test_mask]
#         accuracy = int(correct.sum()) / int(data.test_mask.sum())
#     return accuracy

def test():
    model.eval()
    with torch.no_grad():
        logits = model(graph_data)
        preds = logits.argmax(dim=1).cpu().numpy()  # Predicted labels
        true_labels = graph_data.y.cpu().numpy()  # True labels

        # Filter only test data
        test_indices = graph_data.test_mask.cpu().numpy()
        preds = preds[test_indices]
        true_labels = true_labels[test_indices]

        # Calculate confusion matrix
        tn, fp, fn, tp = confusion_matrix(true_labels, preds, labels=[0, 1]).ravel()

        # Calculate metrics
        accuracy = (tp + tn) / (tn + fp + fn + tp)
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # Avoid division by zero
        precision = precision_score(true_labels, preds, pos_label=1)
        recall = recall_score(true_labels, preds, pos_label=1)
        f1 = f1_score(true_labels, preds, pos_label=1)

    return accuracy, fpr, precision, recall, f1

In [10]:
for epoch in range(1, 201):
    loss = train()
    print(epoch)
    if epoch % 8 == 0:
        accuracy, fpr, precision, recall, f1 = test()
        print(f"Epoch {epoch:03d}, Loss: {loss:.4f}, "
              f"Accuracy: {accuracy:.4f}, FPR: {fpr:.4f}, "
              f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

hola
adios
deu


IndexError: index 1548751 is out of bounds for dimension 0 with size 1

In [17]:
# 8. Entrenamiento
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    print(f"out.shape: {out.shape}")
    print(f"data.y.shape: {data.y.shape}")
    print(f"data.train_mask.shape: {data.train_mask.shape}")
    print(f"data.train_mask.sum(): {data.train_mask.sum()}")  # Número de muestras en el train_mask

    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    model.eval()
    with torch.no_grad():
        logits = model(data)
        preds = logits.argmax(dim=1)
        correct = preds[data.test_mask] == data.y[data.test_mask]
        accuracy = int(correct.sum()) / int(data.test_mask.sum())
    return accuracy

for epoch in range(1, 201):
    loss = train()
    if epoch % 10 == 0:
        test_acc = test()
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Test Acc: {test_acc:.4f}')


out.shape: torch.Size([1, 2])
data.y.shape: torch.Size([2314095])
data.train_mask.shape: torch.Size([1851276])
data.train_mask.sum(): 2142024140205


IndexError: index 1548751 is out of bounds for dimension 0 with size 1