In [1]:
import torch
print(torch.__version__)

1.10.0


In [2]:
#First install dependences to pytorch geometric
"""
pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric-f https://data.pyg.org/whl/torch-1.10.0+cu113.html

https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html
"""

'\npip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric-f https://data.pyg.org/whl/torch-1.10.0+cu113.html\n\nhttps://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html\n'

In [3]:
# pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0.html
# pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0.html
# pip install torch-geometric

In [4]:
#Importing the librarys
import torch
from torch_geometric.data import Data

https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html

![graph.svg](attachment:graph.svg)

In [5]:
"""
Represents the edges at the graph above
where first row represent one way to conecte(like 0 to 1)
and second row the 'back' with (1 to 0).
"""
edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)

edge_index

tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])

In [6]:
"""
We can write the edge index like a tensor off pairs
using edge_index.contiguous()
"""
edge_index.t().contiguous()

tensor([[0, 1],
        [1, 0],
        [1, 2],
        [2, 1]])

In [7]:
"""
X represetns the value of some metric to each node
following the numeric order of node (0,1,2 like in the example)
"""
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)
x

tensor([[-1.],
        [ 0.],
        [ 1.]])

In [8]:
data = Data(x=x, edge_index=edge_index.t().contiguous())
data

Data(x=[3, 1], edge_index=[4, 2])

In [9]:
"""
Some tips of commands that help us to verify if the graph is correctly contructed
and how send this data to GPU for training (in future)
"""
print(f"Number of nodes at the graph: {data.num_nodes}")

print(f"Number of edges at the graph: {data.num_edges}")

print(f"Number of features (metrics) at the graph {data.num_node_features}")

# Transfer data object to GPU.
device = torch.device('cuda')
#Come back data to CPU
data = data.to('cpu')

Number of nodes at the graph: 3
Number of edges at the graph: 2
Number of features (metrics) at the graph 1


### Example with Protein dataset

In [10]:
from torch_geometric.datasets import TUDataset
# https://chrsmrrs.github.io/datasets/

In [11]:
#Load the enzine dataset that are compost with 600 graphs with 6 classes
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')

In [12]:
print(f"Size of dataset: {len(dataset)}")
print(f"Number of classes: {dataset.num_classes}")
print(f"Features per node: {dataset.num_node_features}")

Size of dataset: 600
Number of classes: 6
Features per node: 3


In [13]:
"""
Acessing the first graph at the dataset we can see that have
84 undirect edges (168/2), with 33 nodes where each node gave
3 feature value and the classe of this graph is 1 (y value)
"""
dataset[0]

Data(edge_index=[2, 168], x=[37, 3], y=[1])

### Separing data in training and test

In [14]:
"""
Setting the training set with 90% of data and test with 10%
"""
#How much samples will have in traing
numb_traing_samples = int(len(dataset) * 0.9)

#Shuffling the dataset
dataset = dataset.shuffle()

#Defining how are the traing
train_dataset = dataset[:numb_traing_samples]

#And choosen the test
test_dataset = dataset[numb_traing_samples:]

## Creating DataLoader

In [15]:
from torch_geometric.loader import DataLoader

In [16]:
"""
Dataloader is the structure responsable to generate the batchs for network,
when called return a set of graphs (the batch size)
"""
loader = DataLoader(train_dataset, batch_size=12, shuffle=True)

In [17]:
#Way to see what return at each interaction in dataloader
#The variable batch shows the index of each node at the batch
next(iter(loader))

DataBatch(edge_index=[2, 1240], x=[312, 3], y=[12], batch=[312], ptr=[13])

In [18]:
print(f"Each batch have: {int(3912/2)} edges, 1002 nodes with 3 features each")

Each batch have: 1956 edges, 1002 nodes with 3 features each


## Network (GCN)

In [19]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
import numpy as np

#https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8

In [20]:
"""
Define the GCN convolutional network, with 2 convolutional layers
with activation ReLU and softmax activation for total of classes.
"""
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(train_dataset.num_node_features, 256)
        self.conv2 = GCNConv(256, 128)
        self.conv3 = GCNConv(128, train_dataset.num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.conv3(x, edge_index)
        x = torch.cat([gmp(x, batch), gap(x, batch)],dim=1)

        return F.log_softmax(x, dim=1)

In [21]:
#If you have a GPU and cuda installed this part set to use GPU in training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = loader

#Select how optimizer will be used to adjust the network weights
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [22]:
plot_factor = 40
model.train()

for epoch in range(20):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        #Inforns that optimizer need a gradient values adjust
        optimizer.zero_grad()
        #Get the result of network (softmax output)
        out = model(data.to(device))
        #Calculate the loss
        loss = F.nll_loss(out, data.y)
        #Do the backpropragation (adjusting the weight of layers)
        loss.backward()
        #Set the new weights and got to next step
        optimizer.step()
        
        running_loss += loss.item()
        if i % plot_factor == 0:    # print every mini-batches
            print(f'[{epoch}, {i + 1:5d}] loss: {running_loss / plot_factor:.3f}')
            running_loss = 0.0

[0,     1] loss: 0.062
[0,    41] loss: 2.351
[1,     1] loss: 0.056
[1,    41] loss: 1.982
[2,     1] loss: 0.045
[2,    41] loss: 1.872
[3,     1] loss: 0.043
[3,    41] loss: 1.846
[4,     1] loss: 0.049
[4,    41] loss: 1.831
[5,     1] loss: 0.043
[5,    41] loss: 1.824
[6,     1] loss: 0.043
[6,    41] loss: 1.805
[7,     1] loss: 0.045
[7,    41] loss: 1.809
[8,     1] loss: 0.042
[8,    41] loss: 1.803
[9,     1] loss: 0.039
[9,    41] loss: 1.783
[10,     1] loss: 0.046
[10,    41] loss: 1.787
[11,     1] loss: 0.045
[11,    41] loss: 1.801
[12,     1] loss: 0.043
[12,    41] loss: 1.804
[13,     1] loss: 0.045
[13,    41] loss: 1.747
[14,     1] loss: 0.042
[14,    41] loss: 1.817
[15,     1] loss: 0.047
[15,    41] loss: 1.798
[16,     1] loss: 0.044
[16,    41] loss: 1.778
[17,     1] loss: 0.045
[17,    41] loss: 1.776
[18,     1] loss: 0.047
[18,    41] loss: 1.792
[19,     1] loss: 0.041
[19,    41] loss: 1.773


## Testing the model

In [23]:
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=True)

In [24]:
#Set the model in test mode (dont change thw weights)
model.eval()

GCN(
  (conv1): GCNConv(3, 256)
  (conv2): GCNConv(256, 128)
  (conv3): GCNConv(128, 6)
)

In [25]:
#Code to compute de acc in the test data
def accuracy(data):
    
    prediction = model(data)
    y = data.y.squeeze().to(device)
    is_correct = (torch.max(prediction, 1)[1] == y)
    
    return(is_correct.cpu().numpy().tolist())

In [26]:
Acc = []
for count, data in enumerate(test_loader, 0):
    Acc.extend(accuracy(data.to(device)))

In [27]:
np.mean(Acc)

0.25