# Graph Classification

## Graph Classes

we use the following graph classes:
- Chemical Graphs (Molecules)
- Random
- Small World
- Scale Free

## Setup

We use pyg (pytorch-geometric) to generate the model to train.
The model is a GCN with 2 layers and 32 hidden units.

In [1]:
import torch_geometric
import torch

print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.mps.is_available())
print(torch_geometric.__version__)

1.13.1
11.7
False
2.2.0


In [2]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx

# we need to read the dataset from the pickle file
import pickle

with open('dataset.pickle', 'rb') as f:
    dataset = pickle.load(f)

for key in dataset.keys():
    print(f"datasetkey: {key}, shape: {len(dataset[key])}")

# we need to convert the networkx graphs to pytorch geometric graphs

class_key = dict(zip(dataset.keys(), range(len(dataset.keys()))))
print(class_key)
pyg_dataset = []
for key in dataset.keys():
    for i in range(len(dataset[key])):
        graph = dataset[key][i]
        graph_tensor = from_networkx(graph, group_node_attrs=["label", "betweenness", "degree"])
        graph_tensor.y = torch.tensor([class_key[key]])
        pyg_dataset.append(graph_tensor)

datasetkey: random, shape: 225
datasetkey: smallworld, shape: 225
datasetkey: scalefree, shape: 98
datasetkey: complete, shape: 98
datasetkey: line, shape: 98
datasetkey: tree, shape: 298
datasetkey: star, shape: 98
{'random': 0, 'smallworld': 1, 'scalefree': 2, 'complete': 3, 'line': 4, 'tree': 5, 'star': 6}


In [3]:
import torch
from torch_geometric.data import InMemoryDataset
from torch_geometric.utils import from_networkx
import pickle


class MyOwnDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['some_file_1', 'some_file_2', ...]

    @property
    def processed_file_names(self):
        return ['data.pt']


    def process(self):
        # Read data into huge `Data` list.
        with open('dataset.pickle', 'rb') as f:
            dataset = pickle.load(f)

        class_key = dict(zip(dataset.keys(), range(len(dataset.keys()))))
        print(class_key)
        data_list = []
        for key in dataset.keys():
            for i in range(len(dataset[key])):
                graph = dataset[key][i]
                graph_tensor = from_networkx(graph, group_node_attrs=["label", "betweenness", "degree", "density"])
                graph_tensor.y = torch.tensor([class_key[key]])
                data_list.append(graph_tensor)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [4]:
import torch
from torch_geometric.datasets import TUDataset

pyg_dataset = MyOwnDataset(root='data/CustomData')

print()
print(f'Dataset: {pyg_dataset}:')
print('====================')
print(f'Number of graphs: {len(pyg_dataset)}')
print(f'Number of features: {pyg_dataset.num_features}')
print(f'Number of classes: {pyg_dataset.num_classes}')

data = pyg_dataset[0]  # Get the first graph object.

print()
print(data)
print(data.x)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Processing...


{'random': 0, 'smallworld': 1, 'scalefree': 2, 'complete': 3, 'line': 4, 'tree': 5, 'star': 6}

Dataset: MyOwnDataset(1140):
Number of graphs: 1140
Number of features: 4
Number of classes: 7

Data(edge_index=[2, 16], x=[5, 4], y=[1])
tensor([[0.0000, 0.0556, 0.7500, 0.8000],
        [1.0000, 0.0556, 0.7500, 0.8000],
        [2.0000, 0.1111, 1.0000, 0.8000],
        [3.0000, 0.0556, 0.7500, 0.8000],
        [4.0000, 0.0556, 0.7500, 0.8000]])
Number of nodes: 5
Number of edges: 16
Average node degree: 3.20
Has isolated nodes: False
Has self-loops: False
Is undirected: True


Done!


In [5]:
torch.manual_seed(12345)
pyg_dataset = pyg_dataset.shuffle()

train_dataset = pyg_dataset[:600]
test_dataset = pyg_dataset[600:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 600
Number of test graphs: 540


In [6]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 50170], x=[2201, 4], y=[64], batch=[2201], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 22396], x=[2365, 4], y=[64], batch=[2365], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 56512], x=[2406, 4], y=[64], batch=[2406], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 30874], x=[2007, 4], y=[64], batch=[2007], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 33752], x=[2037, 4], y=[64], batch=[2037], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 18918], x=[1755, 4], y=[64], batch=[1755], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 42320], x=[2363, 4], y=[64], batch=[2363], ptr=[65])

Step 8:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 24564], x=[1880, 4], y

In [7]:
from IPython.display import Javascript
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))
from model.gcn import GCN

model = GCN(hidden_channels=64, num_classes=pyg_dataset.num_classes,
            num_node_features=pyg_dataset.num_node_features)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

def test(loader):
    model.eval()

    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        out = model(data.x, data.edge_index, data.batch)  
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


confusion_matrix = torch.zeros(3, 3)
for epoch in range(1, 171):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Train Acc: 0.2617, Test Acc: 0.2630
Epoch: 002, Train Acc: 0.2617, Test Acc: 0.2611
Epoch: 003, Train Acc: 0.3667, Test Acc: 0.3759
Epoch: 004, Train Acc: 0.4367, Test Acc: 0.4278
Epoch: 005, Train Acc: 0.5800, Test Acc: 0.5574
Epoch: 006, Train Acc: 0.5267, Test Acc: 0.5519
Epoch: 007, Train Acc: 0.7567, Test Acc: 0.7296
Epoch: 008, Train Acc: 0.5617, Test Acc: 0.5611
Epoch: 009, Train Acc: 0.7033, Test Acc: 0.6981
Epoch: 010, Train Acc: 0.7550, Test Acc: 0.7241
Epoch: 011, Train Acc: 0.8200, Test Acc: 0.7796
Epoch: 012, Train Acc: 0.7833, Test Acc: 0.7648
Epoch: 013, Train Acc: 0.7700, Test Acc: 0.7722
Epoch: 014, Train Acc: 0.7117, Test Acc: 0.6704
Epoch: 015, Train Acc: 0.7700, Test Acc: 0.7296
Epoch: 016, Train Acc: 0.8217, Test Acc: 0.8130
Epoch: 017, Train Acc: 0.7200, Test Acc: 0.7352
Epoch: 018, Train Acc: 0.7533, Test Acc: 0.7741
Epoch: 019, Train Acc: 0.8333, Test Acc: 0.8296
Epoch: 020, Train Acc: 0.8400, Test Acc: 0.8130
Epoch: 021, Train Acc: 0.7367, Test Acc:

In [8]:
print(model)

GCN(
  (conv1): GCNConv(4, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=7, bias=True)
)


In [9]:
torch.save(model.state_dict(), 'gnn_model_weights.pth')
torch.save(model, 'gnn_model.pth')

In [10]:
import visualkeras

visualkeras.layered_view(model).show() # display using your system viewer

NameError: name 'Layer' is not defined

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import random
from torch_geometric.utils import to_networkx
from model.gcn import GCN


class_key = dict(zip(dataset.keys(), range(len(dataset.keys()))))
class_key = {v: k for k, v in class_key.items()}

def index_to_class(index):
    return class_key[index]

fig, ax = plt.subplots(3, 3, figsize=(11, 11))
fig.suptitle('GCN - Graph classification')

shuffled_dataset = pyg_dataset.shuffle()

for i, data in enumerate(shuffled_dataset[:9]):
    # Calculate color (green if correct, red otherwise)
    out = model(data.x, data.edge_index, batch=data.batch)
    pred = out.argmax(dim=1)
    color = "green" if out.argmax(dim=1) == data.y else "red"

    # Plot graph
    ix = np.unravel_index(i, ax.shape)
    ax[ix].axis('off')
    ax[ix].set_title('Predicted: ' + index_to_class(pred.item()) + '\nActual: ' + index_to_class(data.y.item()))
    G = to_networkx(data, to_undirected=True)
    nx.draw_networkx(G,
                     pos=nx.spring_layout(G, seed=0),
                     with_labels=True,
                     node_size=150,
                     node_color=color,
                     width=0.8,
                     ax=ax[ix]
                     )


In [17]:
from torch_geometric.explain import Explainer, GNNExplainer

explainer = Explainer(
    model=model,
    algorithm=GNNExplainer(epochs=200),
    explainer_config=dict(
        explanation_type='model',
        node_mask_type='attributes',
        edge_mask_type='object',
    ),
    model_config=dict(
        mode='classification',
        task_level='node',
        return_type='log_probs',
    ),
)

shuffled_dataset = pyg_dataset.shuffle()

for i, data in enumerate(shuffled_dataset[:9]):
    # Calculate color (green if correct, red otherwise)
    node_index = 1
    explanation = explainer(data.x, data.edge_index, index=node_index, batch=data.batch)
    print(f'Generated explanations in {explanation.available_explanations}')

    path = 'feature_importance.png'
    explanation.visualize_feature_importance(path, top_k=10)
    print(f"Feature importance plot has been saved to '{path}'")

    path = 'subgraph.pdf'
    explanation.visualize_graph(path)
    print(f"Subgraph visualization plot has been saved to '{path}'")


AssertionError: 