# Problem - Solution Description
## Problem: testing the discrimination power of a GIN-based GNN model on an SBM dataset for different values of the inter-class similarity parameter (mult_factor = 1 + np.linspace(0.2, 2, 18)).
## Solution: the model is specified in the code in the `Net` class, with:
## batch size = 128, number of epochs = 300

In [None]:
# Install PyTorch Geometric and its companion packages (CUDA 10.1 / cu101 wheels for torch 1.4.0)
!pip install torch-geometric \
  torch-sparse==latest+cu101 \
  torch-scatter==latest+cu101 \
  torch-cluster==latest+cu101 \
  -f https://pytorch-geometric.com/whl/torch-1.4.0.html

In [None]:
import networkx as nx
from sklearn.model_selection import train_test_split
import numpy as np 
import torch
from matplotlib import pyplot as plt 
#from torch_geometric.datasets import Entities 
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader
from torch_geometric.nn import GINConv, global_add_pool, GCNConv, global_mean_pool
from torch_geometric.utils import convert

import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.data import DataLoader

In [None]:
class dataset_loading:
    """Generator of synthetic two-class graph datasets."""

    def __init__(self):
        pass

    def generate_SBM(self, Graphs_num=600, nodes_per_graph=60, block_size=10,
                     fraction=0.3, mult_factor=1.5, avg_deg=10, test_size=0.2):
        """Sample stochastic-block-model graphs and split them into train/test.

        The first half of the graphs gets label 0 and an intra-block edge
        probability of ``fraction``; the second half gets label 1 and an
        intra-block probability of ``fraction * mult_factor``.  The
        inter-block probability ``p_out`` is chosen so that
        ``(block_size - 1) * p_in + (nodes_per_graph - block_size) * p_out``
        equals ``avg_deg``.

        Returns:
            ``((G_train, y_train), (G_test, y_test))`` where the ``G_*`` are
            lists of NetworkX graphs and the ``y_*`` are lists of 0/1 labels,
            as produced by ``train_test_split`` with the given ``test_size``.
        """
        n_blocks = int(nodes_per_graph / block_size)
        block_sizes = [block_size] * n_blocks
        half = Graphs_num / 2

        graphs, labels = [], []
        for idx in range(Graphs_num):
            label = 0 if idx < half else 1
            p_in = fraction * mult_factor if label else fraction
            p_out = (avg_deg - (block_size - 1) * p_in) / (nodes_per_graph - block_size)
            # p_in on the diagonal (same block), p_out everywhere else.
            edge_probs = (p_out * np.ones((n_blocks, n_blocks))
                          + (p_in - p_out) * np.eye(n_blocks))
            graphs.append(nx.stochastic_block_model(block_sizes, edge_probs))
            labels.append(label)

        G_train, G_test, y_train, y_test = train_test_split(
            graphs, labels, test_size=test_size)
        return (G_train, y_train), (G_test, y_test)

# Structure of the GIN-based GNN model
class Net(torch.nn.Module):
    """GIN-based graph classifier.

    Five ``GINConv`` layers (each wrapping a two-layer MLP and followed by
    batch normalization), sum pooling over the nodes of each graph, and a
    two-layer fully connected head producing log-probabilities over 2 classes.
    """

    def __init__(self):
        super().__init__()
        in_features = 1    # one feature per node (the script feeds the node degree)
        hidden = 32        # width of every hidden layer
        out_classes = 2

        # Register conv1..conv5 and bn1..bn5 in order; only the first MLP
        # maps from the raw feature size, the others are hidden -> hidden.
        mlp_input_widths = [in_features] + [hidden] * 4
        for idx, width_in in enumerate(mlp_input_widths, start=1):
            mlp = Sequential(Linear(width_in, hidden), ReLU(), Linear(hidden, hidden))
            setattr(self, 'conv{}'.format(idx), GINConv(mlp))
            setattr(self, 'bn{}'.format(idx), torch.nn.BatchNorm1d(hidden))

        self.fc1 = Linear(hidden, hidden)
        self.fc2 = Linear(hidden, out_classes)

    def forward(self, x, edge_index, batch):
        """Return per-graph log-softmax scores.

        ``x`` and ``edge_index`` are passed to every GINConv layer; ``batch``
        is passed to ``global_add_pool`` to sum node embeddings per graph.
        """
        for idx in range(1, 6):
            conv = getattr(self, 'conv{}'.format(idx))
            norm = getattr(self, 'bn{}'.format(idx))
            # ReLU is applied before batch normalization in every layer.
            x = norm(F.relu(conv(x, edge_index)))

        pooled = global_add_pool(x, batch)
        features = F.relu(self.fc1(pooled))
        features = F.dropout(features, p=0.5, training=self.training)
        logits = self.fc2(features)
        return F.log_softmax(logits, dim=-1)



def train(epoch):
    """Train the global ``model`` for one epoch and return the mean loss per graph.

    Uses the module-level ``model``, ``optimizer``, ``train_loader``,
    ``train_dataset`` and ``device``.  Whenever ``epoch`` is a multiple of
    100, the learning rate of every parameter group is multiplied by 0.1
    before the epoch starts.
    """
    model.train()

    # Step decay of the learning rate.
    if epoch % 100 == 0:
        for group in optimizer.param_groups:
            group['lr'] *= 0.1

    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        log_probs = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        # Weight the batch-mean loss by the number of graphs in the batch.
        total_loss += batch_loss.item() * batch.num_graphs
        optimizer.step()
    return total_loss / len(train_dataset)


def test(loader):
    """Return the classification accuracy of the global ``model`` on ``loader``.

    Uses the module-level ``model`` and ``device``.  The model is switched to
    evaluation mode and the forward passes run under ``torch.no_grad()`` so
    that no autograd graph is built during evaluation.

    Args:
        loader: a data loader yielding batches with ``x``, ``edge_index``,
            ``batch`` and ``y`` attributes; must expose ``loader.dataset``.

    Returns:
        Fraction of graphs in ``loader.dataset`` whose predicted class
        matches ``y``.
    """
    model.eval()
    correct = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.batch)
            # Index of the largest log-probability is the predicted class.
            pred = output.max(dim=1)[1]
            correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)


def _to_pyg_graphs(graphs, labels):
    """Turn NetworkX graphs into torch_geometric data objects with degree features.

    Each node receives an attribute ``'x'`` holding a one-element list with
    its degree; each converted graph receives its label as a length-1 long
    tensor in ``y``.  The NetworkX graphs are modified in place.
    """
    for g in graphs:
        degree_feature = {node: [deg] for node, deg in g.degree()}
        nx.set_node_attributes(g, degree_feature, 'x')
    # Go from NetworkX to the torch_geometric representation.
    data_list = [convert.from_networkx(g) for g in graphs]
    for data, label in zip(data_list, labels):
        data.y = torch.tensor([label], dtype=torch.long)
    return data_list


NUM_EPOCHS = 300    # the notebook header states 300 epochs
BATCH_SIZE = 128

mult_factor = 1 + np.linspace(0.2, 2, 18)    # inter-class similarity parameter
# One row per mult_factor value; each holds the metric of the last epoch run.
train_loss = np.zeros([len(mult_factor), 1])
train_acc = np.zeros([len(mult_factor), 1])
test_acc = np.zeros([len(mult_factor), 1])

# Use the GPU when available (loop-invariant, so set up once).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for ind, factor in enumerate(mult_factor):
    # Generate the SBM dataset as NetworkX graphs, then convert both splits.
    (Gtr, ytr), (Gts, yts) = dataset_loading().generate_SBM(mult_factor=factor)
    train_dataset = _to_pyg_graphs(Gtr, ytr)
    test_dataset = _to_pyg_graphs(Gts, yts)

    # Fresh model and optimizer for every similarity value.
    model = Net().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

    # Start the learning and evaluation processes.
    print("\nmult factor value is {}\n".format(factor))
    for epoch in range(1, NUM_EPOCHS + 1):
        train_loss[ind, 0] = train(epoch)
        train_acc[ind, 0] = test(train_loader)
        test_acc[ind, 0] = test(test_loader)
        print('Epoch: {:03d}, Train Loss: {:.7f}, '
              'Train Acc: {:.1f}%, Test Acc: {:.0f}%'.format(
                  epoch, train_loss[ind, 0],
                  100 * train_acc[ind, 0], 100 * test_acc[ind, 0]))

In [None]:
# This cell and the two plotting cells below save and visualize the model's loss and accuracy.
# Columns of `results`: training loss, training accuracy, test accuracy.
results = np.hstack((train_loss, train_acc, test_acc))
np.savetxt('GNN_model_SBM_variant_similarity.csv', results, delimiter=',')

In [None]:
# Training loss (first column of `results`) against the similarity parameter.
recorded_train_loss = results[:, 0].flatten()
plt.plot(mult_factor, recorded_train_loss)
plt.title("GIN-based model performance with different classes similarity")
plt.xlabel("Inter-classes similarity parameter ")
plt.ylabel("Training loss (Cross entropy)")
plt.grid()
plt.savefig("Model's training loss.png")
plt.show()

In [None]:
# Training and test accuracy (columns 1 and 2 of `results`) against the
# similarity parameter.  The keyword must be lowercase `linewidth`:
# case-insensitive artist properties are no longer accepted by Matplotlib.
for column, curve_label in ((1, "Training accuracy"), (2, "Test accuracy")):
    plt.plot(mult_factor, results[:, column].flatten(), label=curve_label, linewidth=2)
plt.xlabel("Inter-classes similarity parameter ")
plt.ylabel("Accuracy")
plt.title("GIN-based model performance with different classes similarity")
plt.grid()
plt.legend()
plt.savefig("Model's Accuracy.png")
plt.show()

In [None]:
# Scratch cell: exploratory experiments on the MUTAG dataset. It is not part
# of the SBM experiment above.

# Load MUTAG, shuffle it, and inspect a couple of samples.
a=TUDataset(root=".", name="MUTAG").shuffle()
print(len(a))
print(a[0].keys)
print(a[0].y)
print(a[100].y)
# Convert the first graph to NetworkX, keeping the node attribute 'x'.
a=convert.to_networkx(a[0],node_attrs='x')
print()
color = nx.get_node_attributes(a, 'x')
print(a.nodes[0]['x'])
#print(a.nodes(data=True))
# Convert back to the torch_geometric representation and inspect / overwrite `y`.
a=convert.from_networkx(a)
print(type(a))
print(a.y)
a.y=1
print(a.y)
#nx.draw(a)
#plt. show()
# Reload MUTAG and build loaders. NOTE: this rebinds the module-level names
# test_dataset, train_dataset, test_loader and train_loader, which train()
# also reads.
dataset = TUDataset('.', name='MUTAG').shuffle()
test_dataset = dataset[:1]
train_dataset = dataset[len(dataset) // 10:]
test_loader = DataLoader(test_dataset, batch_size=1)
train_loader = DataLoader(train_dataset, batch_size=128)

print(type(test_dataset))
# Presumably a sanity check of the `epoch % 100 == 0` condition used in train().
print(100%100==0)
print(101%100==0)
print(dataset[0].y)
print(type(dataset[0]))
# NOTE: `device` is not defined in this cell; it must already exist
# (the training cell assigns it).
for data in test_loader:
    data = data.to(device)
    print(data.x.dtype)
print(dataset.num_features)
