# First approach, simple neural networks

In [1]:
import pickle
import warnings

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from dataset import LinkPredictionDataset
from evaluation import load_model, evaluate_all_metrics
from graph import *
from models import MyPredictionModel, EdgePredictionModel, GraphPredictionModel
from node import *

warnings.simplefilter("ignore", category=FutureWarning)


In [2]:
# Load the pickled graph collection.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# './data/graphs.dat' must only ever come from a trusted source.
with open('./data/graphs.dat', 'rb') as file:
    all_graphs: List[Graph] = pickle.load(file)

# Split after the file handle has been closed. The complete dataset keeps its
# own name so that `train_graphs_list` only ever refers to the training split;
# the fixed random_state keeps the partition identical across runs.
train_graphs_list, test_graphs = train_test_split(all_graphs, test_size=0.2, random_state=42)

# Collect every part / family ID that occurs in the training split.
all_part_ids = []
all_family_ids = []
for graph in train_graphs_list:
    for n in graph.get_nodes():
        part = n.get_part()
        all_part_ids.append(int(part.get_part_id()))
        all_family_ids.append(int(part.get_family_id()))

if not all_part_ids:
    raise ValueError("No nodes found in the training graphs; cannot derive vocabulary sizes.")

# max + 1 so that the largest observed ID is still below the vocabulary size.
# NOTE(review): the sizes are derived from the training split only; a test
# graph containing a larger ID would fall outside this range -- confirm that
# this cannot happen or that the models handle it.
part_vocab_size = max(all_part_ids) + 1
family_vocab_size = max(all_family_ids) + 1
print(f"Part Vocab Size: {part_vocab_size}")
print(f"Family Vocab Size: {family_vocab_size}")

Part Vocab Size: 2271
Family Vocab Size: 96


In [3]:
def train_edge_predictor(model, optimizer, criterion, epochs=100, loader=None):
    """Train a pairwise edge classifier with mini-batch gradient steps.

    Args:
        model: Module called as ``model(part_i, fam_i, part_j, fam_j)``; its
            output is handed to ``criterion`` together with the batch labels.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss callable invoked as ``criterion(logits, label)``.
        epochs: Number of full passes over ``loader``.
        loader: Iterable of batches that also supports ``len()``. Each batch
            must unpack into ``(part_i, fam_i, part_j, fam_j, label)``.
            When omitted, the notebook-level ``dataloader`` variable is used,
            which keeps existing calls working.

    Returns:
        list[float]: The mean batch loss of every epoch, in order.
    """
    if loader is None:
        # Backward-compatible fallback to the notebook global; pass `loader`
        # explicitly to avoid depending on hidden notebook state.
        loader = dataloader

    # Batches are moved to wherever the model lives so that the function also
    # works when the model has been placed on an accelerator.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    num_batches = len(loader)
    epoch_losses = []

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for part_i, fam_i, part_j, fam_j, label in loader:
            # Integer IDs for the model inputs, float labels for the loss.
            part_i = part_i.to(device=device, dtype=torch.long)
            fam_i = fam_i.to(device=device, dtype=torch.long)
            part_j = part_j.to(device=device, dtype=torch.long)
            fam_j = fam_j.to(device=device, dtype=torch.long)
            label = label.to(device=device, dtype=torch.float)

            optimizer.zero_grad()
            logits = model(part_i, fam_i, part_j, fam_j)
            loss = criterion(logits, label)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # An empty loader yields a loss of 0.0 instead of a ZeroDivisionError.
        avg_loss = total_loss / num_batches if num_batches else 0.0
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}")

    return epoch_losses

### Now train it and write it to disk

In [4]:
# Seed PyTorch before anything stochastic happens (weight initialisation and
# DataLoader shuffling); 42 matches the random_state of the train/test split.
torch.manual_seed(42)

# Create the dataset and dataloader
dataset = LinkPredictionDataset(train_graphs_list)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Create the model, criterion, and optimizer
model_EdgePredictor = EdgePredictionModel(part_vocab_size, family_vocab_size, embed_dim=16, hidden_dim=32)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_EdgePredictor.parameters(), lr=0.001)

# Train the model, then persist its weights for the evaluation cells
train_edge_predictor(model_EdgePredictor, optimizer, criterion, epochs=50)
torch.save(model_EdgePredictor.state_dict(), "model_EdgePredictor.pth")


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 1/50 - Loss: 0.1548


KeyboardInterrupt: 

### Now evaluate it
We implemented the following evaluation metrics ourselves:

Evaluation Metrics:

1. Precision, Recall, F1-score:
   - Precision: Measures the proportion of correctly predicted edges out of all predicted edges.
   - Recall: Measures the proportion of correctly predicted edges out of all actual edges.
   - F1-score: Harmonic mean of precision and recall, balancing both metrics.

2. Hamming Distance:
   - Counts the number of differing edges between the predicted and target adjacency matrices.

3. Jaccard Similarity:
   - Measures the overlap between predicted and actual edge sets, calculated as the intersection over the union.

4. Graph Edit Distance:
   - Computes the minimum number of edge insertions, deletions, or substitutions to transform the predicted graph into the target graph.



Each metric is reported as the mean over the evaluated test graphs (the first 500 graphs of the test split).



In [3]:
# Checkpoint file written by the edge-predictor training cell.
model_file_path = "model_EdgePredictor.pth"
# Load the prediction model from that checkpoint.
prediction_model: MyPredictionModel = load_model(model_file_path)

In [4]:
# One (parts, graph) pair for each of the first 500 graphs of the test split.
instances = [
    (held_out.get_parts(), held_out)
    for held_out in test_graphs[:500]
]
evaluate_all_metrics(prediction_model, instances)

Processing graphs: 100%|██████████| 500/500 [02:30<00:00,  3.31graph/s, failed=5, P=0.8793, R=1.0000, F1=0.9296, Hamming=1.4263, Jaccard=0.8793, Edit Dist=1.4263, Acc=96.44%]


Evaluation Results:
  Number of invalid graphs due to mismatch in number of nodes: 5
  Precision: 0.8793
  Recall: 1.0000
  F1-score: 0.9296
  Hamming Distance: 1.4263
  Jaccard Similarity: 0.8793
  Graph Edit Distance: 1.4263
  Edge Accuracy: 96.4358%





# Second approach: graph neural network (GNN)

In [28]:
def train_graph_predictor(model, train_graphs_list, optimizer, criterion, epochs=100):
    """Train a whole-graph adjacency predictor, one graph per optimisation step.

    For every graph the nodes are sorted by ``(part_id, family_id)``, the model
    is called with the resulting part / family ID tensors, and its flattened
    output is compared with the flattened adjacency matrix of the graph, which
    is requested in the same part order.

    Args:
        model: Module called as ``model(part_ids, family_ids)``. Its flattened
            output must be compatible with the flattened adjacency matrix
            under ``criterion``.
        train_graphs_list: Sized collection of graphs exposing ``get_nodes()``
            and ``get_adjacency_matrix(part_order)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss callable invoked as ``criterion(logits, target)``.
        epochs: Number of full passes over ``train_graphs_list``.

    Returns:
        list[float]: The mean per-graph loss of every epoch, in order.
    """
    # Build all tensors on the device the model lives on, so the function also
    # works when the model has been moved to an accelerator.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    num_graphs = len(train_graphs_list)
    epoch_losses = []

    model.train()
    for epoch in tqdm(range(epochs), desc="Training", unit="epoch", total=epochs):
        total_loss = 0.0

        for graph in train_graphs_list:
            optimizer.zero_grad()

            # Fix a node order for this graph.
            # NOTE(review): the key uses the raw IDs, not the int() casts used
            # for the tensors below; if the IDs are strings the order is
            # lexicographic. Left unchanged so the order stays identical to
            # earlier runs -- confirm it matches the order used at inference.
            nodes = sorted(
                graph.get_nodes(),
                key=lambda node: (node.get_part().get_part_id(), node.get_part().get_family_id())
            )
            part_order = tuple(node.get_part() for node in nodes)

            # Integer ID tensors, aligned with part_order.
            part_ids = torch.tensor(
                [int(part.get_part_id()) for part in part_order],
                dtype=torch.long,
                device=device
            )
            family_ids = torch.tensor(
                [int(part.get_family_id()) for part in part_order],
                dtype=torch.long,
                device=device
            )

            # Target adjacency, rows/columns in the same part order.
            adjacency_matrix = torch.tensor(
                graph.get_adjacency_matrix(part_order),
                dtype=torch.float32,
                device=device
            )

            # Forward pass
            logits = model(part_ids, family_ids)

            # Flatten prediction and target so the loss sees one entry per node pair.
            target = adjacency_matrix.flatten()
            loss = criterion(logits.flatten(), target)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # An empty graph list yields a loss of 0.0 instead of a ZeroDivisionError.
        avg_loss = total_loss / num_graphs if num_graphs else 0.0
        epoch_losses.append(avg_loss)
        # tqdm.write prints without corrupting the active progress bar.
        tqdm.write(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}")

    return epoch_losses


### Now train it and write it to disk

In [19]:
# Seed PyTorch so that the weight initialisation is repeatable across runs;
# 42 matches the random_state of the train/test split.
torch.manual_seed(42)

# Build the graph-level model together with its optimizer and loss.
model = GraphPredictionModel(part_vocab_size, family_vocab_size, embed_dim=1, gnn_hidden_dim=32)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# Train the model, then persist its weights for the evaluation cells.
train_graph_predictor(model, train_graphs_list, optimizer, criterion, epochs=100)
torch.save(model.state_dict(), "graph_predictor_model.pth")



Training:   1%|          | 1/100 [00:12<20:49, 12.63s/epoch]

Epoch 1/100 - Loss: 0.2705


Training:   2%|▏         | 2/100 [00:25<20:33, 12.59s/epoch]

Epoch 2/100 - Loss: 0.1640


Training:   2%|▏         | 2/100 [00:36<29:57, 18.34s/epoch]


KeyboardInterrupt: 

### Now evaluate it

In [4]:
# Checkpoint file written by the graph-predictor training cell.
model_file_path = "graph_predictor_model.pth"
# Load the prediction model from that checkpoint.
prediction_model: MyPredictionModel = load_model(model_file_path)

Again, we evaluate the model with the same metrics as above.

In [5]:
# One (parts, graph) pair for each of the first 500 graphs of the test split.
instances = [
    (held_out.get_parts(), held_out)
    for held_out in test_graphs[:500]
]
evaluate_all_metrics(prediction_model, instances)

Processing graphs: 100%|██████████| 500/500 [05:26<00:00,  1.53graph/s, failed=2, P=0.8575, R=0.9994, F1=0.9156, Hamming=1.7410, Jaccard=0.8573, Edit Dist=1.7289, Acc=95.58%]


Evaluation Results:
  Number of invalid graphs due to mismatch in number of nodes: 2
  Precision: 0.8575
  Recall: 0.9994
  F1-score: 0.9156
  Hamming Distance: 1.7410
  Jaccard Similarity: 0.8573
  Graph Edit Distance: 1.7289
  Edge Accuracy: 95.5845%



