<a href="https://colab.research.google.com/github/lnsayer/personal_repo/blob/main/amazon_node_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Dataset (from PyTorch Geometric) Node Classification Project

The Amazon Computers and Amazon Photo networks from the “Pitfalls of Graph Neural Network Evaluation” paper. Nodes represent goods and edges represent that two goods are frequently bought together. Given product reviews as bag-of-words node features, the task is to map goods to their respective product category.

In [1]:
!pip install torch_geometric



In [2]:
from torch_geometric.datasets import Amazon
import torch

In [3]:
dataset = Amazon(root="data/Amazon", name="Computers")

data = dataset[0]

print(f"Number of nodes in graph is: {data.x.shape[0]}")
print(f"Number of features for each node is {dataset.num_features}")
print(f"Number of edges is {data.edge_index.shape[1]}")
print(f"We want to predict the classes of the 13752 Computers\n")
print(data, "\n")

print(f"The number of class members for each class: {dataset.num_classes}\n")
print(f"Class labels: \n{data.y.unique(return_counts=True)}")
#dir(dataset)

Number of nodes in graph is: 13752
Number of features for each node is 767
Number of edges is 491722
We want to predict the classes of the 13752 Computers

Data(x=[13752, 767], edge_index=[2, 491722], y=[13752]) 

The number of class members for each class: 10

Class labels: 
(tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), tensor([ 436, 2142, 1414,  542, 5158,  308,  487,  818, 2156,  291]))


In [4]:
# We will split the data using a transductive method in which the whole input graph is put through the forward method however
# we split the labels (classes) for the loss function. We will use a split of train:80%, val:10% and test:10%.

# indices of the nodes randomly shuffled
indices = torch.randperm(data.x.shape[0])

# train:80%, val:10% and test:10%
train_indices = indices[:int(0.8*len(indices))]
val_indices = indices[int(0.8*len(indices)):int(0.9*len(indices))]
test_indices = indices[int(0.9*len(indices)):]

# train labels to pass
train_labels = data.y[train_indices]
val_labels = data.y[val_indices]
test_labels = data.y[test_indices]

print(len(val_labels), len(test_labels))
len(torch.concat((data.y[val_indices], data.y[test_indices])))

1375 1376


2751

In [5]:
data.is_undirected()

True

In [6]:
# Create a model
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

nb_hidden_channels = 32

class GCN(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = GCNConv(in_channels = dataset.num_features,
                         out_channels = nb_hidden_channels)
    self.conv2 = GCNConv(in_channels = nb_hidden_channels,
                         out_channels = dataset.num_classes)

  def forward(self, data):
    x, edge_index = data.x, data.edge_index

    x = self.conv1(x, edge_index)
    x = F.relu(x)
    x = self.conv2(x, edge_index)
    # could try log softmax: improved numerical performance and gradient optimisation
    x = F.softmax(x, dim = 1)

    return x

# defined it later too
gcn_model = GCN()
gcn_model


GCN(
  (conv1): GCNConv(767, 32)
  (conv2): GCNConv(32, 10)
)

In [7]:
optimiser = torch.optim.Adam(gcn_model.parameters(), lr = 0.001)
loss_fn = torch.nn.CrossEntropyLoss()

In [13]:
# Run training
device = "cuda" if torch.cuda.is_available() else "cpu"
gcn_model = GCN()
gcn_model = gcn_model.to(device)
data = data.to(device)
torch.manual_seed(42)

nb_epochs = 100

for epoch in range(nb_epochs):
  # forward pass on the network
  out = gcn_model(data)
  # calculate the loss
  loss = loss_fn(out[train_indices], data.y[train_indices])

  label_probs = out[torch.concat((data.y[val_indices], data.y[test_indices]))]
  label_preds = torch.argmax(label_probs, dim=1)
  accuracy = (label_preds == data.y[torch.concat((data.y[val_indices], data.y[test_indices]))]).sum()/len(label_preds)



  # zero the gradients
  optimiser.zero_grad()
  # backpropagate the loss
  loss.backward()
  # update the optimizer
  optimiser.step()

  if epoch % 10 == 0:
    print(label_preds.shape)
    print(f"Epoch: {epoch}, Loss: {loss}, Accuracy: {accuracy}")


torch.Size([2751])
Epoch: 0, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 10, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 20, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 30, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 40, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 50, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 60, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 70, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 80, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
torch.Size([2751])
Epoch: 90, Loss: 2.303694248199463, Accuracy: 0.12250091135501862
