In [1]:
import os
import torch
import random
from typing import Callable, List, Optional
from torch_geometric.data import Data, InMemoryDataset, download_url
import os.path as osp
import json
import pandas as pd

## PyG Dataset

Create a dataset class.

In [2]:
class GPClassification(InMemoryDataset):
  def __init__(self, root: str, dataset_name: str = "GPClassificationTest_01",
              transform: Optional[Callable] = None,
              pre_transform: Optional[Callable] = None):

    self.dataset_name = dataset_name
    super().__init__(root, transform, pre_transform)

    self.data, self.slices = torch.load(self.processed_paths[0])
    node_dict = torch.load(self.processed_paths[1])
    rel_dict = torch.load(self.processed_paths[2])

    self.num_relations=len(rel_dict.keys())


  @property
  def raw_file_names(self) -> List[str]:
      return ['graphs.json', 'targets.csv']

  @property
  def processed_file_names(self) -> List[str]:
      return ['graphs.pt', 'node_dict.pt', 'rel_dict.pt']


  def download(self):
    print('Please place the required files in the raw directory')

  def process(self):
    data_list, node_dict, rel_dict = [], {}, {}

    graph_dict = json.load(open(osp.join(self.raw_dir, 'graphs.json'), 'r'))

    targets = pd.read_csv(osp.join(self.raw_dir, 'targets.csv'), header=None,  names=['id', 'label'])

    self.num_nodes = max(targets['id'])
    x = torch.range(0, self.num_nodes + 1)


    for graph_id, graph in graph_dict.items():
      edge_index = []
      edge_type = []
      for head, relation, tail in graph:
        if head not in node_dict:
          node_dict[head] = len(node_dict.keys())
        if tail not in node_dict:
          node_dict[tail] = len(node_dict.keys())
        if relation not in rel_dict:
          rel_dict[relation] = len(rel_dict.keys())

        edge_index.append([node_dict[head], node_dict[tail]])
        edge_type.append(rel_dict[relation])

      data_list.append(Data(x=x,
                            edge_index=torch.tensor(edge_index).T,
                            edge_type=torch.tensor(edge_type),
                            y=int(targets[targets['id']==int(graph_id)]['label'])))

    torch.save(self.collate(data_list), self.processed_paths[0])

    torch.save(node_dict, self.processed_paths[1])
    torch.save(rel_dict, self.processed_paths[2])


def add_edges(dataset, reverse_edges=False, self_loops=False):

  num_relations = dataset.num_relations

  if reverse_edges:
    dataset_new = []
    for g in dataset:
      row, col = g.edge_index
      row, col = torch.cat([row, col], dim=0), torch.cat([col, row], dim=0)

      g_new = Data(x=g.x,
                    edge_index=torch.stack([row, col], dim=0),
                    edge_type=torch.cat([g.edge_type , g.edge_type  + num_relations]),
                    y=g.y)
      dataset_new.append(g_new)
    num_relations *= 2
    dataset = dataset_new


  if self_loops:
    dataset_new = []
    for g in dataset:
      g_nodes = torch.unique(g.edge_index.flatten())

      g_new = Data(x=g.x,
                    edge_index=torch.cat((g.edge_index, torch.stack([g_nodes, g_nodes], dim=0)), dim=1),
                    edge_type=torch.cat([g.edge_type , torch.full_like(g_nodes, num_relations)]),
                    y=g.y)
      dataset_new.append(g_new)
    num_relations += 1

  return dataset_new, num_relations

Load the dataset.

In [3]:
dataset = GPClassification('./data/GPClassificationTest_01/', dataset_name='GPClassificationTest_01')
num_classes = dataset.num_classes
x = dataset[0].x
print(dataset.num_relations)
dataset, num_relations = add_edges(dataset, reverse_edges=True, self_loops=True)
torch.manual_seed(12345)
random.shuffle(dataset)

Please place the required files in the raw directory


Processing...


FileNotFoundError: [Errno 2] No such file or directory: 'data/GPClassificationTest_01/raw/graphs.json'

In [4]:
# todo implement balancing

train, val, test = torch.utils.data.random_split(dataset, [0.7, 0.15, 0.15])

In [5]:
from torch_geometric.loader import DataLoader, DataListLoader


train_loader = DataLoader(train, batch_size=64, shuffle=True)  # , sampler=sampler)
val_loader = DataLoader(val, batch_size=64, shuffle=False)
test_loader = DataLoader(test, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)


Step 1:
Number of graphs in the current batch: 64
DataBatch(x=[7808], edge_index=[2, 244], y=[64], edge_type=[244], batch=[7808], ptr=[65])
Step 2:
Number of graphs in the current batch: 20
DataBatch(x=[2440], edge_index=[2, 77], y=[20], edge_type=[77], batch=[2440], ptr=[21])


In [6]:
x.size()

torch.Size([122])

In [7]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, RGCNConv, GINEConv, MLP
from torch_geometric.nn import global_mean_pool



class GCN(torch.nn.Module):
    def __init__(self, layer_type, hidden_channels, num_nodes, trainable_embeddings=True):
        super(GCN, self).__init__()
        torch.manual_seed(12345)

        self.layer_type = layer_type
        print(num_nodes, 50)

        self.emb = torch.nn.Embedding(num_nodes, 50, _freeze=trainable_embeddings)
        self.lin = Linear(hidden_channels, num_classes)

        if layer_type == 'GCN':
            self.conv1 = GCNConv(50, hidden_channels)
            self.conv2 = GCNConv(hidden_channels, hidden_channels)
            self.conv3 = GCNConv(hidden_channels, hidden_channels)

        elif layer_type == 'RGCN':
            self.conv1 = RGCNConv(50, hidden_channels, num_relations=num_relations)
            self.conv2 = RGCNConv(hidden_channels, hidden_channels, num_relations=num_relations)
            self.conv3 = RGCNConv(hidden_channels, hidden_channels, num_relations=num_relations)

        elif layer_type == 'GINE':
            self.conv1 = GINEConv(MLP([50, hidden_channels]), edge_dim=num_relations)
            self.conv2 = GINEConv(MLP([hidden_channels, hidden_channels]), edge_dim=num_relations)
            self.conv3 = GINEConv(MLP([hidden_channels, hidden_channels]), edge_dim=num_relations)

    def forward(self, x, edge_index, edge_type, batch):

        edge_features = None
        if self.layer_type == 'RGCN':
            edge_features = edge_type
        elif self.layer_type == 'GINE':
            edge_features = torch.nn.functional.one_hot(edge_type, num_classes=num_relations).to(torch.float)

        # 1. Obtain node embeddings
        x = self.conv1(self.emb.weight[x.long()], edge_index, edge_features)
        x = x.relu()
        x = self.conv2(x, edge_index, edge_features)
        x = x.relu()
        x = self.conv3(x, edge_index, edge_features)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x


In [8]:
model = GCN(layer_type='RGCN', hidden_channels=64, num_nodes=x.size(0), trainable_embeddings=False)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
        out = model(data.x.long(), data.edge_index, data.edge_type, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.edge_type, data.batch)
         pred = out.argmax(dim=1)  # Use the class with the highest probability.

         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 171):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

122 50
Epoch: 001, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 002, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 003, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 004, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 005, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 006, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 007, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 008, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 009, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 010, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 011, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 012, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 013, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 014, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 015, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 016, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 017, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 018, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 019, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 020, Train Acc: 0.8214, Test Acc: 0.8889
Epoch: 021, Train Acc: 0.8214, Te