In [2]:
import dgl
%matplotlib inline
import networkx as nx 
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import os
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph

adj = np.load('../../../data/single_graph/BA/graph_adj.npy')
rows, cols = np.where(adj == 1)
edges = zip(rows.tolist(), cols.tolist())
G = nx.Graph()
G.add_edges_from(edges)
labels = pd.read_csv('../../../data/single_graph/BA/data.csv').label.values


In [3]:
G.number_of_nodes()

200

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GINConv
from dgl.nn.pytorch.glob import SumPooling, AvgPooling, MaxPooling


class ApplyNodeFunc(nn.Module):
    """Update the node feature hv with MLP, BN and ReLU."""
    def __init__(self, mlp):
        super(ApplyNodeFunc, self).__init__()
        self.mlp = mlp
        self.bn = nn.BatchNorm1d(self.mlp.output_dim)

    def forward(self, h):
        h = self.mlp(h)
        h = self.bn(h)
        h = F.relu(h)
        return h


class MLP(nn.Module):
    """MLP with linear output"""
    def __init__(self, num_layers, input_dim, hidden_dim, output_dim):
        """MLP layers construction
        Paramters
        ---------
        num_layers: int
            The number of linear layers
        input_dim: int
            The dimensionality of input features
        hidden_dim: int
            The dimensionality of hidden units at ALL layers
        output_dim: int
            The number of classes for prediction
        """
        super(MLP, self).__init__()
        self.linear_or_not = True  # default is linear model
        self.num_layers = num_layers
        self.output_dim = output_dim

        if num_layers < 1:
            raise ValueError("number of layers should be positive!")
        elif num_layers == 1:
            # Linear model
            self.linear = nn.Linear(input_dim, output_dim)
        else:
            # Multi-layer model
            self.linear_or_not = False
            self.linears = torch.nn.ModuleList()
            self.batch_norms = torch.nn.ModuleList()

            self.linears.append(nn.Linear(input_dim, hidden_dim))
            for layer in range(num_layers - 2):
                self.linears.append(nn.Linear(hidden_dim, hidden_dim))
            self.linears.append(nn.Linear(hidden_dim, output_dim))

            for layer in range(num_layers - 1):
                self.batch_norms.append(nn.BatchNorm1d((hidden_dim)))

    def forward(self, x):
        if self.linear_or_not:
            # If linear model
            return self.linear(x)
        else:
            # If MLP
            h = x
            for i in range(self.num_layers - 1):
                h = F.relu(self.batch_norms[i](self.linears[i](h)))
            return self.linears[-1](h)


class GIN(nn.Module):
    """GIN model"""
    def __init__(self, num_layers, num_mlp_layers, input_dim, hidden_dim,
                 output_dim, final_dropout, learn_eps, graph_pooling_type,
                 neighbor_pooling_type):
        """model parameters setting
        Paramters
        ---------
        num_layers: int
            The number of linear layers in the neural network
        num_mlp_layers: int
            The number of linear layers in mlps
        input_dim: int
            The dimensionality of input features
        hidden_dim: int
            The dimensionality of hidden units at ALL layers
        output_dim: int
            The number of classes for prediction
        final_dropout: float
            dropout ratio on the final linear layer
        learn_eps: boolean
            If True, learn epsilon to distinguish center nodes from neighbors
            If False, aggregate neighbors and center nodes altogether.
        neighbor_pooling_type: str
            how to aggregate neighbors (sum, mean, or max)
        graph_pooling_type: str
            how to aggregate entire nodes in a graph (sum, mean or max)
        """
        super(GIN, self).__init__()
        self.num_layers = num_layers
        self.learn_eps = learn_eps

        # List of MLPs
        self.ginlayers = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        for layer in range(self.num_layers - 1):
            if layer == 0:
                mlp = MLP(num_mlp_layers, input_dim, hidden_dim, hidden_dim)
            else:
                mlp = MLP(num_mlp_layers, hidden_dim, hidden_dim, hidden_dim)

            self.ginlayers.append(
                GINConv(ApplyNodeFunc(mlp), neighbor_pooling_type, 0, self.learn_eps))
            self.batch_norms.append(nn.BatchNorm1d(hidden_dim))

        # Linear function for graph poolings of output of each layer
        # which maps the output of different layers into a prediction score
        self.linears_prediction = torch.nn.ModuleList()

        for layer in range(num_layers):
            if layer == 0:
                self.linears_prediction.append(
                    nn.Linear(input_dim, output_dim))
            else:
                self.linears_prediction.append(
                    nn.Linear(hidden_dim, output_dim))

        self.drop = nn.Dropout(final_dropout)

        if graph_pooling_type == 'sum':
            self.pool = SumPooling()
        elif graph_pooling_type == 'mean':
            self.pool = AvgPooling()
        elif graph_pooling_type == 'max':
            self.pool = MaxPooling()
        else:
            raise NotImplementedError

    def forward(self, g, h):
        # list of hidden representation at each layer (including input)
        #hidden_rep = [h]
        h = g.in_degrees().view(-1, 1).float()
        #h = torch.tensor([1.]*g.number_of_nodes()).view(-1, 1).float()
        
        for i in range(self.num_layers - 1):
            h = self.ginlayers[i](g, h)
            h = self.batch_norms[i](h)
            h = F.relu(h)
            #hidden_rep.append(h)

        #score_over_layer = 0

        # perform pooling over all nodes in each graph in every layer
        #for i, h in enumerate(hidden_rep):
        #    pooled_h = self.pool(g, h)
        #    score_over_layer += self.drop(self.linears_prediction[i](pooled_h))

        return h

In [5]:
import torch
labels = torch.tensor(labels)
inputs = torch.eye(G.number_of_nodes())

In [6]:
S = dgl.DGLGraph()
S.from_networkx(G)
S.ndata['h'] = inputs

In [7]:
inputs.shape

torch.Size([200, 200])

In [8]:
np.random.seed(1)
labeled_nodes = np.random.choice(list(range(G.number_of_nodes())), int(G.number_of_nodes() * 0.15), replace = False)
labels_train = labels[labeled_nodes]

unlabelled_nodes = [i for i in list(range(G.number_of_nodes())) if i not in labeled_nodes]
val_nodes = np.random.choice(unlabelled_nodes, int(len(unlabelled_nodes)*0.2), replace = False)
test_nodes = [i for i in unlabelled_nodes if i not in val_nodes]

val_label = labels[val_nodes]
test_label = labels[test_nodes]

In [9]:
labeled_nodes

array([ 58,  40,  34, 102, 184, 198,  95,   4,  29, 168, 171,  18,  11,
        89, 110, 118, 159,  35, 136,  59,  51,  16,  44,  94,  31, 162,
        38,  28, 193,  27])

In [10]:
net = GIN(3, 2,
        1, 32, 14,
        0.5, True,
        "max", "sum")

optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
all_logits = []
for epoch in range(2000):
    logits = net(S, inputs)
    # we save the logits for visualization later
    logp = F.log_softmax(logits, 1)
    # we only compute loss for labeled nodes
    loss = F.nll_loss(logp[labeled_nodes], labels_train)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch%100 == 0:
        print('Epoch %d | Loss: %.4f' % (epoch, loss.item()))


Epoch 0 | Loss: 3.6014
Epoch 100 | Loss: 0.2684
Epoch 200 | Loss: 0.2099
Epoch 300 | Loss: 0.1861
Epoch 400 | Loss: 0.1755
Epoch 500 | Loss: 0.1648
Epoch 600 | Loss: 0.1624
Epoch 700 | Loss: 0.1615
Epoch 800 | Loss: 0.1612
Epoch 900 | Loss: 0.1611
Epoch 1000 | Loss: 0.1610
Epoch 1100 | Loss: 0.1609
Epoch 1200 | Loss: 0.1609
Epoch 1300 | Loss: 0.1609
Epoch 1400 | Loss: 0.1608
Epoch 1500 | Loss: 0.1608
Epoch 1600 | Loss: 0.1608
Epoch 1700 | Loss: 0.1608
Epoch 1800 | Loss: 0.1608
Epoch 1900 | Loss: 0.1608


In [11]:
net.eval()
logits = net(S, inputs)
# we save the logits for visualization later
logp = F.log_softmax(logits, 1)
# we only compute loss for labeled nodes
#loss = F.nll_loss(logp[labeled_nodes], labels_train)

In [12]:
argmax_Y = torch.max(logp[test_nodes], 1)[1]

In [13]:
print('Accuracy of argmax predictions on the test set: {:4f}%'.format(
    (test_label == argmax_Y.float()).sum().item() / len(test_label) * 100))

Accuracy of argmax predictions on the test set: 27.941176%
