In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import dgl
from dgl.data import CoraGraphDataset
from dgl.nn.pytorch import GraphConv

import networkx as nx
import networkx.algorithms.community as nx_comm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import networkx as nx

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

import csv

In [3]:
import data.generate as gen
import data.visualise as vis

# Loading the dataset

In [4]:
graphs = gen.read_all_graphs()

In [5]:
# information abaout number of nodes
min_nodes = 10000
max_nodes = 0
for g in graphs:
    if g.number_of_nodes() < min_nodes:
        min_nodes = g.number_of_nodes()
    if g.number_of_nodes() > max_nodes:
        max_nodes = g.number_of_nodes()

print(min_nodes)
print(max_nodes)

50
9312


In [6]:
# create position matrix for all nodes
all_positions = []
for g in graphs:
    pos_list = []
    for node in g.nodes:
        pos_list.append(g.nodes[node]['pos'])
        g.nodes[node]['pos'] = torch.tensor(g.nodes[node]['pos']).clone().detach()
    all_positions.append(torch.tensor(np.array(pos_list)))

In [7]:
g = graphs[134]
g.graph

{'id': 174, 'type': 'random', 'algorithm': 'spring'}

In [9]:
def min_max_scale(positions):
    min_val = np.min(positions)
    max_val = np.max(positions)
    scaled_positions = (positions - min_val) / (max_val - min_val)
    return scaled_positions, min_val, max_val

def reverse_min_max_scale(scaled_positions, min_val, max_val):
    positions = scaled_positions * (max_val - min_val) + min_val
    return positions

In [27]:
# data loader
class GraphDataset(Dataset):
    def __init__(self, graphs):
        self.graphs = []
        self.positions = []
        self.min_max_for_rescaling = []
        self.original_positions = []
        for g in graphs:
            features = []
            for node in g.nodes:
                features.append(g.nodes[node]['pos'].detach().numpy())

            #scale the position data to [0, 1]
            scaled_features, min_value, max_value = min_max_scale(np.array(features))
            
            scaled_features = torch.tensor(scaled_features)
            dgl_g = dgl.from_networkx(g)
            min_max_values = (min_value,max_value)
            #dgl_g.ndata['pos'] = features

            self.graphs.append(dgl_g)
            self.positions.append(scaled_features.float())
            self.original_positions.append(torch.tensor(np.array(features)).float())
            self.min_max_for_rescaling.append(min_max_values)

            # sanity check
            reversed = reverse_min_max_scale(scaled_features.numpy(), min_value, max_value)
            try:
                assert np.array_equal(np.round(reversed, decimals=4), np.round(np.array(features), decimals=4))
            except AssertionError as e:
                errors = [(x,y) for x,y in zip(np.round(reversed,decimals=4), np.round(np.array(features), decimals=4)) if not np.array_equal(x,y)]
                for x, y in errors:
                    print(f"Inaccurate scaling on graph {g.graph['id']}: rescale: {x}, original: {y}")               

    def __len__(self):
        # Return the total number of samples in the dataset
        return len(self.graphs)

    def __getitem__(self, idx):
        # Return a sample and its corresponding label based on the given index
        return self.graphs[idx], self.positions[idx], self.original_positions[idx], self.min_max_for_rescaling[idx]

def custom_collate(batch):
    graphs, features, unscaled_features, min_max = zip(*batch)
    batched_graphs = dgl.batch(graphs)
    batched_features = torch.cat(features, dim=0)
    batched_unscaled_features = torch.cat(unscaled_features, dim=0)
    return batched_graphs, batched_features, batched_unscaled_features, min_max

In [28]:
def train_val_test_split(indices):
    train_indices, val_indices_= train_test_split(indices, test_size=0.3)
    val_indices, test_indices = train_test_split(val_indices_, test_size=0.5)

    return train_indices, val_indices, test_indices

In [29]:
# train test split
random_graphs = [g for g in graphs if g.graph['type'] == 'random']
star_graphs = [g for g in graphs if g.graph['type'] == 'star']
grid_graphs = [g for g in graphs if g.graph['type'] == 'grid']

train_graphs = []
valid_graphs = []
test_graphs = []

# random graphs
train_indices, val_indices, test_indices = train_val_test_split(np.arange(len(random_graphs)))

train_graphs_ = [random_graphs[i] for i in train_indices]
valid_graphs_ = [random_graphs[i] for i in val_indices]
test_graphs_ = [random_graphs[i] for i in test_indices]

train_graphs = train_graphs + train_graphs_
valid_graphs = valid_graphs + valid_graphs_
test_graphs = test_graphs + test_graphs_

# star graphs
train_indices, val_indices, test_indices = train_val_test_split(np.arange(len(star_graphs)))

train_graphs_ = [star_graphs[i] for i in train_indices]
valid_graphs_ = [star_graphs[i] for i in val_indices]
test_graphs_ = [star_graphs[i] for i in test_indices]

train_graphs = train_graphs + train_graphs_
valid_graphs = valid_graphs + valid_graphs_
test_graphs = test_graphs + test_graphs_

# grid graphs
train_indices, val_indices, test_indices = train_val_test_split(np.arange(len(grid_graphs)))

train_graphs_ = [grid_graphs[i] for i in train_indices]
valid_graphs_ = [grid_graphs[i] for i in val_indices]
test_graphs_ = [grid_graphs[i] for i in test_indices]

train_graphs = train_graphs + train_graphs_
valid_graphs = valid_graphs + valid_graphs_
test_graphs = test_graphs + test_graphs_

In [30]:
# create dataloaders
batch_size = 32
train_ds = GraphDataset(train_graphs)
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True, collate_fn=custom_collate, pin_memory=True)
val_ds = GraphDataset(valid_graphs)
val_dl = DataLoader(val_ds, batch_size = batch_size, collate_fn=custom_collate, pin_memory=True)
test_ds = GraphDataset(test_graphs)
test_dl = DataLoader(test_ds, batch_size = batch_size, collate_fn=custom_collate, pin_memory=True)

Inaccurate scaling on graph 283: rescale: [0.599  0.5676], original: [0.599  0.5675]
Inaccurate scaling on graph 283: rescale: [0.0995 0.4944], original: [0.0995 0.4945]
Inaccurate scaling on graph 283: rescale: [5.09224e+01 4.94000e-02], original: [5.09224e+01 4.93000e-02]
Inaccurate scaling on graph 283: rescale: [0.3632 0.95  ], original: [0.3632 0.9501]
Inaccurate scaling on graph 283: rescale: [0.2985 0.2148], original: [0.2984 0.2148]
Inaccurate scaling on graph 913: rescale: [0.3776 0.9434], original: [0.3776 0.9435]


# Model

In [31]:
class GCN(nn.Module):
  def __init__(self, in_feats, n_hidden, n_classes, n_layers, activation = F.relu, dropout = 0):
    super(GCN, self).__init__()
    self.layers = nn.ModuleList()
    # input layer
    self.layers.append(GraphConv(in_feats, n_hidden, activation=activation))
    # hidden layers
    for i in range(n_layers - 1):
      self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation))
    # output layer
    self.layers.append(GraphConv(n_hidden, n_classes))
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, g, features):
    # the H matrix is the feature matrix
    h = features
    # the matrix passes through each layer
    for i, layer in enumerate(self.layers):
      if i != 0:
        h = self.dropout(h)       #dropout
      h = layer(g, h)
    return h

In [78]:
#initialising the model
model = GCN(in_feats = 2, n_hidden = 48, n_classes = 2, n_layers = 5, dropout = 0)
model

GCN(
  (layers): ModuleList(
    (0): GraphConv(in=2, out=48, normalization=both, activation=<function relu at 0x00000171654C96C0>)
    (1-4): 4 x GraphConv(in=48, out=48, normalization=both, activation=<function relu at 0x00000171654C96C0>)
    (5): GraphConv(in=48, out=2, normalization=both, activation=None)
  )
  (dropout): Dropout(p=0, inplace=False)
)

In [79]:
for g in graphs[:10]:
    features = []
    for node in g.nodes:
        features.append(g.nodes[node]['pos'].detach().numpy())
    
    h = torch.tensor(np.array(features))
    dgl_g = dgl.from_networkx(g)

    h = model(dgl_g, h)
    print(h.shape)

torch.Size([448, 2])
torch.Size([4745, 2])
torch.Size([1392, 2])
torch.Size([4074, 2])
torch.Size([768, 2])
torch.Size([392, 2])
torch.Size([2304, 2])
torch.Size([3496, 2])
torch.Size([1296, 2])
torch.Size([2002, 2])


# Utility functions

In [34]:
#class for early stopping
class EarlyStopping:
    def __init__(self, patience=1):
        self.patience = patience
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > self.min_validation_loss:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [67]:
# defining a train function
def train(model, num_epochs, optimizer, loss, train_dl, valid_dl, patience = -1, verbose = True):
    all_logits = []
    all_train_loss = []
    all_val_loss = []

    if(patience != -1):
        early_stopper = EarlyStopping(patience)

    # if available using cuda
    #model.to(device)

    for epoch in range(num_epochs):
        train_losses = []
        for batch in train_dl:
            batched_graph, features,_, _ = batch			
            logits = model(dgl.add_self_loop(batched_graph), torch.tensor(np.zeros_like(features)).float())
            assert features.shape == logits.shape
            #compute loss
            train_loss = loss(logits, features.float())
                
            #training the model
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            all_logits.append(logits.detach())
            train_losses.append(train_loss.item())	
        train_losses = np.array(train_losses)
        all_train_loss.append(train_losses)
        
        # check validation
        val_losses = []
        with torch.no_grad():
            for batch in valid_dl:
                batched_graph, features, _, _ = batch
                logits = model(dgl.add_self_loop(batched_graph), torch.tensor(np.zeros_like(features)).float())
                val_loss = loss(logits, features.float())
                val_losses.append(val_loss.item())
        val_losses = np.array(val_losses)
        all_val_loss.append(val_losses)
        
        #early stopping
        if patience != -1 and early_stopper.early_stop(val_loss):
            if verbose:
                print("Early stopping has occured!")
            break
            
        if verbose:
            print('Epoch %d | Avg train Loss: %.4f | Min train Loss: %.4f | Max train Loss: %.4f | Avg valid Loss: %.4f | Min valid Loss: %.4f | Max valid Loss: %.4f' % 
                        (epoch, np.mean(train_losses), np.min(train_losses), np.max(train_losses), np.mean(val_losses), np.min(val_losses), np.max(val_losses)))
                        
    return all_logits, np.array(all_train_loss), np.array(all_val_loss)

In [81]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.1)
loss = nn.MSELoss()
all_logits, all_train_loss, all_val_loss = train(model, 10, optimizer, loss, train_dl, val_dl)


Epoch 0 | Avg train Loss: 0.0676 | Min train Loss: 0.0507 | Max train Loss: 0.0932 | Avg valid Loss: 0.0581 | Min valid Loss: 0.0511 | Max valid Loss: 0.0815
Epoch 1 | Avg train Loss: 0.0664 | Min train Loss: 0.0528 | Max train Loss: 0.0823 | Avg valid Loss: 0.0611 | Min valid Loss: 0.0554 | Max valid Loss: 0.0756
Epoch 2 | Avg train Loss: 0.0687 | Min train Loss: 0.0507 | Max train Loss: 0.0902 | Avg valid Loss: 0.0583 | Min valid Loss: 0.0501 | Max valid Loss: 0.0867
Epoch 3 | Avg train Loss: 0.0682 | Min train Loss: 0.0524 | Max train Loss: 0.0927 | Avg valid Loss: 0.0579 | Min valid Loss: 0.0504 | Max valid Loss: 0.0837
Epoch 4 | Avg train Loss: 0.0665 | Min train Loss: 0.0545 | Max train Loss: 0.0987 | Avg valid Loss: 0.0602 | Min valid Loss: 0.0550 | Max valid Loss: 0.0756
Epoch 5 | Avg train Loss: 0.0670 | Min train Loss: 0.0540 | Max train Loss: 0.0863 | Avg valid Loss: 0.0567 | Min valid Loss: 0.0483 | Max valid Loss: 0.0840
Epoch 6 | Avg train Loss: 0.0662 | Min train Loss: 0

In [82]:
all_train_loss

array([[0.06378424, 0.06338058, 0.05065175, 0.08257305, 0.06733917,
        0.0587659 , 0.05929109, 0.06851999, 0.0736275 , 0.07758753,
        0.06939198, 0.05800937, 0.08596684, 0.06039057, 0.07102326,
        0.06540596, 0.06403052, 0.05954872, 0.05647984, 0.06970081,
        0.09317273, 0.06949528],
       [0.05825505, 0.05430376, 0.07544778, 0.06165404, 0.07528561,
        0.07537641, 0.05833545, 0.05889522, 0.07133072, 0.05929767,
        0.05488056, 0.07834159, 0.07414039, 0.05281418, 0.05599844,
        0.0823352 , 0.06708096, 0.06925076, 0.06405424, 0.07715543,
        0.06232596, 0.07486474],
       [0.05899919, 0.06565566, 0.05820832, 0.05800349, 0.07515676,
        0.05500498, 0.0823854 , 0.09023158, 0.08900127, 0.05719982,
        0.06351675, 0.07476663, 0.07236217, 0.06467618, 0.07024312,
        0.06443466, 0.06174158, 0.075865  , 0.08478507, 0.0506903 ,
        0.08024418, 0.05845245],
       [0.08546553, 0.06421471, 0.05556084, 0.05637836, 0.05399564,
        0.0707759

In [83]:
all_logits[-2]

tensor([[0.4190, 0.6529],
        [0.4190, 0.6529],
        [0.4190, 0.6529],
        ...,
        [0.4190, 0.6529],
        [0.4190, 0.6529],
        [0.4190, 0.6529]])