In [1]:
import os
import torch
from torch.utils.data import Dataset
import numpy as np
import collections
import csv
import random
import pickle
from torch.utils.data import DataLoader
import dgl
import pandas as pd

from scipy.special import comb
from itertools import combinations 
import networkx.algorithms.isomorphism as iso
from tqdm import tqdm
import torch.nn as nn
import networkx as nx

import copy

class Subgraphs(Dataset):


    def __init__(self, root, mode, subgraph_list, subgraph2label, subgraph2center_node):   

        self.subgraph2label = subgraph2label
        self.subgraph_list = subgraph_list
        self.subgraph2center_node = subgraph2center_node
        
        self.data = pd.read_csv(os.path.join(root, mode + '.csv'))  # csv path
    
    def __getitem__(self, index):
        
        return self.subgraph_list[self.data.iloc[index]['name']], self.subgraph2label[self.data.iloc[index]['name']], self.subgraph2center_node[self.data.iloc[index]['name']] 

    def __len__(self):
        # as we have built up to batchsz of sets, you can sample some small batch size of sets.
        return len(self.data)
    
def collate(samples):
    # The input `samples` is a list of pairs
    #  (graph, label).
    graphs, labels, center_nodes = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph, torch.LongTensor(labels), torch.LongTensor(center_nodes)


In [2]:
def generate_graphlet(n):
    non_iso_graph = []
    non_iso_graph_adj = []
    dgl_graph = []
    for i in tqdm(range(n-1, int(comb(n, 2))+1)):
    # for each of these possible # of edges
        arr = np.array(range(int((n**2-n)/2)))
        all_comb = list(combinations(arr, i)) 
        # all possible combination of edge positions 
        indices = np.triu_indices(n, 1)
        for m in range(len(all_comb)):
            # iterate over all these graphs
            adj = np.zeros((n,n))
            adj[indices[0][np.array(all_comb[m])], indices[1][np.array(all_comb[m])]] = 1
            adj_temp = adj
            adj = adj + adj.T
            #print(adj)
            if sum(np.sum(adj_temp, axis = 0) == 0) == 1:
                #the graph has to be connected
                new_graph = nx.from_numpy_matrix(adj)
                if len(non_iso_graph) == 0:
                    non_iso_graph.append(new_graph)
                    non_iso_graph_adj.append(adj)
                    S = dgl.DGLGraph()
                    S.from_networkx(new_graph)
                    dgl_graph.append(S)
                else:
                    is_iso = False
                    for g in non_iso_graph:
                        if iso.is_isomorphic(g, new_graph):
                            #print('yes')
                            is_iso = True
                            break
                    if not is_iso:
                        # not isomorphic to any of the current graphs
                        non_iso_graph.append(new_graph)
                        non_iso_graph_adj.append(adj)
                        
                        S = dgl.DGLGraph()
                        S.from_networkx(new_graph)
                        dgl_graph.append(S)
                        
    
    print('There are {} non-isomorphic graphs'.format(len(non_iso_graph)))
    return dgl_graph

In [3]:
import dgl.function as fn
import torch
import torch.nn as nn


# Sends a message of node feature h.
msg = fn.copy_src(src='h', out='m')

def reduce(nodes):
    """Take an average over all neighbor node features hu and use it to
    overwrite the original node feature."""
    accum = torch.mean(nodes.mailbox['m'], 1)
    return {'h': accum}

class NodeApplyModule(nn.Module):
    """Update the node feature hv with ReLU(Whv+b)."""
    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        h = self.linear(node.data['h'])
        h = self.activation(h)
        return {'h' : h}

class GCN(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(GCN, self).__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        # Initialize the node features with h.
        g.ndata['h'] = feature
        g.update_all(msg, reduce)
        g.apply_nodes(func=self.apply_mod)
        return g.ndata.pop('h')

In [4]:
import torch.nn.functional as F


class Classifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, attention_dim):
        super(Classifier, self).__init__()

        self.layers = nn.ModuleList([
            GCN(in_dim, hidden_dim, F.relu),
            GCN(hidden_dim, hidden_dim, F.relu)])
        self.classify = nn.Linear(hidden_dim + attention_dim, n_classes)
        
        self.query = nn.Linear(hidden_dim, attention_dim)
        self.key = nn.Linear(hidden_dim, attention_dim)
        self.value = nn.Linear(30, attention_dim)
            
    def forward(self, g, to_fetch, features, graphlets):
        h = g.in_degrees().view(-1, 1).float().to(device)
        h_graphlets = graphlets.in_degrees().view(-1, 1).float().to(device)

        #h = torch.tensor([1.]*g.number_of_nodes()).view(-1, 1).float()

        for conv in self.layers:
            h = conv(g, h)
            h_graphlets = conv(graphlets, h_graphlets)

        g.ndata['h'] = h
        graphlets.ndata['h'] = h_graphlets

        num_nodes_ = g.batch_num_nodes
        num_nodes_.insert(0, 0)
        offset = torch.cumsum(torch.LongTensor(num_nodes_), dim = 0)[:-1].to(device)
        hg = h[to_fetch + offset]
        
        h_graphlets = dgl.mean_nodes(graphlets, 'h')
        
        Q = self.query(hg)
        K = self.key(h_graphlets)
        attention_scores = torch.matmul(Q, K.T)
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        context = self.value(attention_probs)
        
        h = torch.cat((context, hg), 1)
        
        return self.classify(h)

In [5]:
# create graphlets
graphs = []
for i in range(1, 5):
        graphs = graphs + generate_graphlet(i+1)

print('There are {} number of graphlets'.format(len(graphs)))
graphlets = dgl.batch(graphs)

100%|██████████| 1/1 [00:00<00:00, 923.45it/s]
100%|██████████| 2/2 [00:00<00:00, 934.04it/s]
100%|██████████| 4/4 [00:00<00:00, 454.33it/s]
100%|██████████| 7/7 [00:00<00:00, 37.90it/s]

There are 1 non-isomorphic graphs
There are 2 non-isomorphic graphs
There are 6 non-isomorphic graphs
There are 21 non-isomorphic graphs
There are 30 number of graphlets





In [6]:
from torch.utils.data import DataLoader
import torch.optim as optim
path = '../../../data/single_graph/cycle/'
with open(path + 'list_subgraph.pkl', 'rb') as f:
    total_subgraph = pickle.load(f)
    
with open(path + 'label.pkl', 'rb') as f:
    info = pickle.load(f)

with open(path + 'center.pkl', 'rb') as f:
    center_node = pickle.load(f)
    
path = path + 'fold1/'
trainset = Subgraphs(path, 'train', total_subgraph, info, center_node)
valset = Subgraphs(path, 'val', total_subgraph, info, center_node)
testset = Subgraphs(path, 'test', total_subgraph, info, center_node)
    
data_loader = DataLoader(trainset, batch_size=64, shuffle=True,
                         collate_fn=collate)
data_loader_val = DataLoader(valset, batch_size=64, shuffle=True,
                         collate_fn=collate)
data_loader_test = DataLoader(testset, batch_size=64, shuffle=True,
                         collate_fn=collate)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
def test(data_loader, model):
    model.eval()
    y_pred = []
    y_label = []
    epoch_loss = 0
    for iter, (bg, label, to_fetch) in enumerate(data_loader):
        bg = bg.to(device)
        label = label.to(device)
        features = bg.ndata['h']
        prediction = model(bg, to_fetch, features, graphlets)
        loss_func = nn.CrossEntropyLoss()
        loss = loss_func(prediction, label)
        
        probs_Y = torch.softmax(prediction, 1)
        argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
        
        y_pred = y_pred + argmax_Y.detach().numpy().flatten().tolist()
        y_label = y_label + label.numpy().flatten().tolist()
        epoch_loss += loss.detach().item()
    epoch_loss /= (iter + 1)    
    acc = (np.array(y_label) == np.array(y_pred)).sum().item() / len(test_Y) * 100
    return acc, epoch_loss

def test2(valset, model):
    
    model.eval()
    # Convert a list of tuples to two lists
    test_X, test_Y, center_nodes = map(list, zip(*valset))
    test_bg = dgl.batch(test_X)
    test_Y = torch.tensor(test_Y).float().view(-1, 1)
    test_bg.to(device)
    test_Y.to(device)
    center_nodes = torch.LongTensor(center_nodes).to(device)

    probs_Y = torch.softmax(model(test_bg, center_nodes, test_bg.ndata['h'], graphlets), 1)
    argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
    res = (test_Y == argmax_Y.detach().cpu().float()).sum().item() / len(test_Y) * 100
    #print('Accuracy of argmax predictions on the validation set: {:4f}%'.format(res)
    return res
    
    
# Create model
model = Classifier(1, 128, max(info.values()) + 1, 32)
model.to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()

max_res = 0
model_max = copy.deepcopy(model)

epoch_losses = []
for epoch in range(500):
    epoch_loss = 0
    for iter, (bg, label, to_fetch) in enumerate(data_loader):
        bg = bg.to(device)
        label = label.to(device)
        features = bg.ndata['h']
        prediction = model(bg, to_fetch, features, graphlets)
        #print(prediction.shape)
        loss = loss_func(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
    epoch_loss /= (iter + 1)
    if epoch % 50 == 0:
        res = test2(valset, model)
        if res > max_res:
            model_max = copy.deepcopy(model)
        print('Epoch {}, training loss {:.4f} and validation accuracy {:.4f}%'.format(epoch, epoch_loss, res))
        #print('Accuracy of argmax predictions on the val set: {:4f}%'.format(res))
    epoch_losses.append(epoch_loss)

res = test2(testset, model_max)
print('Accuracy of argmax predictions on the test set: {:4f}%'.format(res))
    

Epoch 0, training loss 2.7893 and validation accuracy 3.9216%
Epoch 50, training loss 1.8460 and validation accuracy 33.3333%
Epoch 100, training loss 1.5739 and validation accuracy 35.2941%
Epoch 150, training loss 1.4244 and validation accuracy 39.2157%
Epoch 200, training loss 1.3013 and validation accuracy 43.1373%
Epoch 250, training loss 1.1998 and validation accuracy 49.0196%
Epoch 300, training loss 1.0994 and validation accuracy 47.0588%
Epoch 350, training loss 1.0060 and validation accuracy 49.0196%
Epoch 400, training loss 0.9191 and validation accuracy 50.9804%
Epoch 450, training loss 0.8389 and validation accuracy 45.0980%
Accuracy of argmax predictions on the test set: 54.411765%


In [8]:
model = model_max
valset = Subgraphs(path, 'test', total_subgraph, info, center_node)
#testset = Subgraphs(path, mode='test', path_s = 'list_subgraph.pkl', path_l = 'label.pkl')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
# Convert a list of tuples to two lists
test_X, test_Y, center_nodes = map(list, zip(*valset))
test_bg = dgl.batch(test_X)
test_Y = torch.tensor(test_Y).float().view(-1, 1)
test_bg.to(device)
test_Y.to(device)
center_nodes = torch.LongTensor(center_nodes).to(device)

probs_Y = torch.softmax(model(test_bg, center_nodes, test_bg.ndata['h'], graphlets), 1)
argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
print('Accuracy of argmax predictions on the test set: {:4f}%'.format(
    (test_Y == argmax_Y.detach().cpu().float()).sum().item() / len(test_Y) * 100))

Accuracy of argmax predictions on the test set: 54.411765%


In [9]:
#torch.save(model.state_dict(), 'model.pt')

In [10]:
#model = Classifier(1, 256, 10, 16)
#model.load_state_dict(torch.load('model.pt'))