<a href="https://colab.research.google.com/github/lnsayer/personal_repo/blob/main/amazon_node_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Dataset (from PyTorch Geometric) Node Classification Project

The Amazon Computers and Amazon Photo networks from the “Pitfalls of Graph Neural Network Evaluation” paper. Nodes represent goods and edges represent that two goods are frequently bought together. Given product reviews as bag-of-words node features, the task is to map goods to their respective product category.

In [1]:
!pip install torch_geometric



In [2]:
from torch_geometric.datasets import Amazon
import torch

In [3]:
dataset = Amazon(root="data/Amazon", name="Computers")

data = dataset[0]

print(f"Number of nodes in graph is: {data.x.shape[0]}")
print(f"Number of features for each node is {dataset.num_features}")
print(f"Number of edges is {data.edge_index.shape[1]}")
print(f"We want to predict the classes of the 13752 Computers\n")
print(data, "\n")

print(f"The number of class members for each class: {dataset.num_classes}\n")
print(f"Class labels: \n{data.y.unique(return_counts=True)}")
#dir(dataset)

Number of nodes in graph is: 13752
Number of features for each node is 767
Number of edges is 491722
We want to predict the classes of the 13752 Computers

Data(x=[13752, 767], edge_index=[2, 491722], y=[13752]) 

The number of class members for each class: 10

Class labels: 
(tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), tensor([ 436, 2142, 1414,  542, 5158,  308,  487,  818, 2156,  291]))


In [62]:
# We will split the data using a transductive method in which the whole input graph is put through the forward method however
# we split the labels (classes) for the loss function. We will use a split of train:80%, val:10% and test:10%.
torch.manual_seed(42)
# indices of the nodes randomly shuffled
indices = torch.randperm(data.x.shape[0])

# train:80%, val:10% and test:10%
train_indices = indices[:int(0.8*len(indices))]
val_indices = indices[int(0.8*len(indices)):int(0.9*len(indices))]
test_indices = indices[int(0.9*len(indices)):]

# train labels to pass
train_labels = data.y[train_indices]
val_labels = data.y[val_indices]
test_labels = data.y[test_indices]

print(len(val_labels), len(test_labels))
print(torch.concat((val_indices, test_indices)))

print(len(indices))
print(len(train_indices))
print(len(val_indices))
print(len(test_indices))

print(test_indices)

1000 1000
tensor([  794,  4772,  8150,  ...,  1300, 12066,  3247])
13752
1000
1000
1000
tensor([ 8461,  5630,  8059,  1094,    45,  4962,  3510,  2548,  8251,  2735,
        12073, 12921,  5425,  7690,  9579,  8783,   468,  8391,  7467, 12950,
         9889,  7624,  6818,  4982, 13219,  8313,   159,  7823,  3233,  7636,
         1059,  6132,  7355, 12230,  3646, 12507,  4793,  9586,  3596,  3545,
          708, 10637, 10571,  2364, 12553,  3423,  6406,  6781,    50, 11683,
         8594, 12739,  1681,  1037,  3394,  5460, 12677,  5709,  1391,   609,
        12314,  6983,  4344,  2405,  6119,  3288,  9898,  1450,  4787,  9383,
         4511,  6043, 10462,  5456,  2263,  8778,  2783,  2382,  9099, 11221,
         8662, 11316,  7293,  9531,   653,  8405, 12335,  2588,   615,  8975,
         1421, 10101, 12325,   946,  3710,  8048,  7367,  4064,  3934, 11613,
          538,  6997,  6683,  7627,  7965,  4099,  5085,  5734,  9673,  4437,
         9029, 12068,   734,  6853, 13191, 10714, 1081

In [5]:
data.is_undirected()

True

In [56]:
# Create a model
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

nb_hidden_channels = 32

class GCN(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = GCNConv(in_channels = dataset.num_features,
                         out_channels = nb_hidden_channels)
    self.conv2 = GCNConv(in_channels = nb_hidden_channels,
                         out_channels = dataset.num_classes)

  def forward(self, data):
    x, edge_index = data.x, data.edge_index

    x = self.conv1(x, edge_index)
    x = F.relu(x)
    x = self.conv2(x, edge_index)
    x = F.dropout(x, training=self.training)
    # could try log softmax: improved numerical performance and gradient optimisation
    x = F.log_softmax(x, dim = 1)

    return x

# defined it later too
gcn_model = GCN()
gcn_model


GCN(
  (conv1): GCNConv(767, 32)
  (conv2): GCNConv(32, 10)
)

In [57]:
optimiser = torch.optim.Adam(gcn_model.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()

In [58]:
# Run training

# set device as GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# instantiate the gcn_model
gcn_model = GCN()
# move the model to the gpu (or cpu is not available)
gcn_model = gcn_model.to(device)

# define the data
data = dataset[0]
data = data.to(device)
torch.manual_seed(42)

nb_epochs = 500

for epoch in range(nb_epochs):
  # forward pass on the network
  out = gcn_model(data)

  # calculate the loss
  loss = loss_fn(out[train_indices], data.y[train_indices])

  # calculate the label probabilities
  label_probs = out[torch.concat((val_indices, test_indices))]
  # calculate the label predictions
  label_preds = torch.argmax(label_probs, dim=1)
  # calculate accuracy
  accuracy = (label_preds == data.y[torch.concat((val_indices, test_indices))]).sum()/len(label_preds)

  # zero the gradients
  optimiser.zero_grad()
  # backpropagate the loss
  loss.backward()
  # update the optimizer
  optimiser.step()

  if epoch % 10 == 0:
    print(f"Epoch: {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch: 0, Loss: 2.6151, Accuracy: 0.1178
Epoch: 10, Loss: 2.6223, Accuracy: 0.1105
Epoch: 20, Loss: 2.6234, Accuracy: 0.1203
Epoch: 30, Loss: 2.6243, Accuracy: 0.1203
Epoch: 40, Loss: 2.6141, Accuracy: 0.1087
Epoch: 50, Loss: 2.6304, Accuracy: 0.1116
Epoch: 60, Loss: 2.6267, Accuracy: 0.1170
Epoch: 70, Loss: 2.6329, Accuracy: 0.1123
Epoch: 80, Loss: 2.6239, Accuracy: 0.1160
Epoch: 90, Loss: 2.6299, Accuracy: 0.1134
Epoch: 100, Loss: 2.6117, Accuracy: 0.1163
Epoch: 110, Loss: 2.6253, Accuracy: 0.1250
Epoch: 120, Loss: 2.6206, Accuracy: 0.1160
Epoch: 130, Loss: 2.6196, Accuracy: 0.1221
Epoch: 140, Loss: 2.6209, Accuracy: 0.1203
Epoch: 150, Loss: 2.6202, Accuracy: 0.1174
Epoch: 160, Loss: 2.6239, Accuracy: 0.1160
Epoch: 170, Loss: 2.6189, Accuracy: 0.1098
Epoch: 180, Loss: 2.6200, Accuracy: 0.1174
Epoch: 190, Loss: 2.6163, Accuracy: 0.1170
Epoch: 200, Loss: 2.6292, Accuracy: 0.1185
Epoch: 210, Loss: 2.6217, Accuracy: 0.1221
Epoch: 220, Loss: 2.6225, Accuracy: 0.1156
Epoch: 230, Loss: 2.63