<a href="https://colab.research.google.com/github/lnsayer/personal_repo/blob/main/amazon_node_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Dataset (from PyTorch Geometric) Node Classification Project

The Amazon Computers and Amazon Photo networks from the “Pitfalls of Graph Neural Network Evaluation” paper. Nodes represent goods and edges represent that two goods are frequently bought together. Given product reviews as bag-of-words node features, the task is to map goods to their respective product category.

In [3]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3


In [4]:
from torch_geometric.datasets import Amazon
import torch

In [5]:
dataset = Amazon(root="data/Amazon", name="Computers")

data = dataset[0]

print(f"Number of nodes in graph is: {data.x.shape[0]}")
print(f"Number of features for each node is {dataset.num_features}")
print(f"Number of edges is {data.edge_index.shape[1]}")
print(f"We want to predict the classes of the 13752 Computers\n")
print(data, "\n")

print(f"The number of class members for each class: {dataset.num_classes}\n")
print(f"Class labels: \n{data.y.unique(return_counts=True)}")
#dir(dataset)

Downloading https://github.com/shchur/gnn-benchmark/raw/master/data/npz/amazon_electronics_computers.npz
Processing...


Number of nodes in graph is: 13752
Number of features for each node is 767
Number of edges is 491722
We want to predict the classes of the 13752 Computers

Data(x=[13752, 767], edge_index=[2, 491722], y=[13752]) 

The number of class members for each class: 10

Class labels: 
(tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), tensor([ 436, 2142, 1414,  542, 5158,  308,  487,  818, 2156,  291]))


Done!


In [6]:
# We will split the data using a transductive method in which the whole input graph is put through the forward method however
# we split the labels (classes) for the loss function. We will use a split of train: 80%, val: 10% and test: 10%.
torch.manual_seed(42)
# indices of the nodes randomly shuffled
indices = torch.randperm(data.x.shape[0])
print(f"Shuffled indices are {indices} of length {len(indices)}\n")

# train:80%, val:10% and test:10%
train_indices = indices[:int(0.8*len(indices))]
val_indices = indices[int(0.8*len(indices)):int(0.9*len(indices))]
test_indices = indices[int(0.9*len(indices)):]
print(f"Train shuffled indices are {train_indices} of length {len(train_indices)}\n")
print(f"Test shuffled indices are {test_indices} of length {len(test_indices)}\n")
print(f"Validation shuffled indices are {val_indices} of length {len(val_indices)}\n")

# train labels to pass
train_labels = data.y[train_indices]
val_labels = data.y[val_indices]
test_labels = data.y[test_indices]


Shuffled indices are tensor([11094,  1027,  4378,  ...,  8275,  9941,  4338]) of length 13752

Train shuffled indices are tensor([11094,  1027,  4378,  ...,  8049, 12835,    92]) of length 11001

Test shuffled indices are tensor([8461, 5630, 8059,  ..., 8275, 9941, 4338]) of length 1376

Validation shuffled indices are tensor([  794,  4772,  8150,  ..., 12435,   687,  5437]) of length 1375



In [7]:
data.is_undirected()

True

In [8]:
# Create a model
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

nb_hidden_channels = 32

class GCN(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = GCNConv(in_channels = dataset.num_features,
                         out_channels = nb_hidden_channels)
    self.conv2 = GCNConv(in_channels = nb_hidden_channels,
                         out_channels = dataset.num_classes)

  def forward(self, data):
    x, edge_index = data.x, data.edge_index

    x = self.conv1(x, edge_index)
    x = F.relu(x)
    x = self.conv2(x, edge_index)
    #x = F.dropout(x, training=self.training)
    # could try log softmax: improved numerical performance and gradient optimisation
    x = F.log_softmax(x, dim=1)

    return x

# defined it later too
gcn_model = GCN()
gcn_model


GCN(
  (conv1): GCNConv(767, 32)
  (conv2): GCNConv(32, 10)
)

In [13]:
# Run training
torch.manual_seed(42)

# set device as GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# instantiate the gcn_model
gcn_model = GCN()
# move the model to the gpu (or cpu is not available)
gcn_model = gcn_model.to(device)

optimiser = torch.optim.Adam(gcn_model.parameters(), lr=0.0001)
loss_fn = torch.nn.NLLLoss()

# define the data
data = dataset[0]
data = data.to(device)

nb_epochs = 401

for epoch in range(nb_epochs):
  # Train
  # forward pass on the network
  gcn_model.train()
  out = gcn_model(data)

  # calculate the loss
  loss = loss_fn(out[train_indices], data.y[train_indices])

  # calculate the label probabilities
  label_probs = out[train_indices]
  # calculate the label predictions
  label_preds = torch.argmax(label_probs, dim=1)
  # calculate accuracy
  accuracy = (label_preds == data.y[train_indices]).sum()/len(label_preds)

  # zero the gradients
  optimiser.zero_grad()
  # backpropagate the loss
  loss.backward()
  # update the optimizer
  optimiser.step()

  # Test
  # Put the model in eval mode
  gcn_model.eval()
  with torch.inference_mode():
    # Calculate the loss for the test set
    test_loss = loss_fn(out[test_indices], data.y[test_indices])

    # Calculate the predictions for the test set and then calculate the accuracy
    test_label_probs = out[test_indices]
    test_label_preds = torch.argmax(test_label_probs, dim=1)
    test_accuracy = (test_label_preds == data.y[test_indices]).sum()/len(test_label_preds)


  if epoch % 10 == 0:
    print(f"Epoch: {epoch}, Train Loss: {loss:.4f}, Train Accuracy: {accuracy:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Epoch: 0, Train Loss: 2.4572, Train Accuracy: 0.1576, Test Loss: 2.4824, Test Accuracy: 0.1468
Epoch: 10, Train Loss: 2.2600, Train Accuracy: 0.1591, Test Loss: 2.2749, Test Accuracy: 0.1475
Epoch: 20, Train Loss: 2.1322, Train Accuracy: 0.3367, Test Loss: 2.1402, Test Accuracy: 0.3350
Epoch: 30, Train Loss: 2.0551, Train Accuracy: 0.3744, Test Loss: 2.0607, Test Accuracy: 0.3576
Epoch: 40, Train Loss: 1.9974, Train Accuracy: 0.3760, Test Loss: 2.0032, Test Accuracy: 0.3597
Epoch: 50, Train Loss: 1.9516, Train Accuracy: 0.3841, Test Loss: 1.9578, Test Accuracy: 0.3692
Epoch: 60, Train Loss: 1.9119, Train Accuracy: 0.3882, Test Loss: 1.9183, Test Accuracy: 0.3750
Epoch: 70, Train Loss: 1.8754, Train Accuracy: 0.3951, Test Loss: 1.8817, Test Accuracy: 0.3837
Epoch: 80, Train Loss: 1.8413, Train Accuracy: 0.4087, Test Loss: 1.8476, Test Accuracy: 0.3975
Epoch: 90, Train Loss: 1.8091, Train Accuracy: 0.4311, Test Loss: 1.8159, Test Accuracy: 0.4172
Epoch: 100, Train Loss: 1.7784, Train Acc

## Try out current method on Cora dataset

In [29]:
from torch_geometric.datasets import Planetoid

In [97]:
dataset = Planetoid(root="data/Planetoid", name="Cora")

data = dataset[0]

print(f"Number of nodes in graph is: {data.x.shape[0]}")
print(f"Number of features for each node is {dataset.num_features}")
print(f"Number of edges is {data.edge_index.shape[1]}")
print(f"We want to predict the classes of the 13752 papers\n")
print(data, "\n")

print(f"The number of class members for each class: {dataset.num_classes}\n")
print(f"Class labels: \n{data.y.unique(return_counts=True)}")
print(data)

Number of nodes in graph is: 2708
Number of features for each node is 1433
Number of edges is 10556
We want to predict the classes of the 13752 papers

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708]) 

The number of class members for each class: 7

Class labels: 
(tensor([0, 1, 2, 3, 4, 5, 6]), tensor([351, 217, 418, 818, 426, 298, 180]))
Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [34]:
# We will split the data using a transductive method in which the whole input graph is put through the forward method however
# we split the labels (classes) for the loss function. We will use a split of train: 80%, val: 10% and test: 10%.
torch.manual_seed(42)
# indices of the nodes randomly shuffled
indices = torch.randperm(data.x.shape[0])
print(f"Shuffled indices are {indices} of length {len(indices)}\n")

# train:80%, val:10% and test:10%
train_indices = indices[:int(0.8*len(indices))]
val_indices = indices[int(0.8*len(indices)):int(0.9*len(indices))]
test_indices = indices[int(0.9*len(indices)):]
print(f"Train shuffled indices are {train_indices} of length {len(train_indices)}\n")
print(f"Test shuffled indices are {test_indices} of length {len(test_indices)}\n")
print(f"Validation shuffled indices are {val_indices} of length {len(val_indices)}\n")

# train labels to pass
train_labels = data.y[train_indices]
val_labels = data.y[val_indices]
test_labels = data.y[test_indices]


Shuffled indices are tensor([1594,  519,  528,  ...,  547,   72, 1362]) of length 2708

Train shuffled indices are tensor([1594,  519,  528,  ..., 2707, 1649, 1425]) of length 2166

Test shuffled indices are tensor([1666, 2032, 2015,  977, 2299,  957, 2238,   21, 2256, 1506, 2241,  845,
         763, 1188, 1961,  210,  593,  354, 2360,   67,  251, 2618,  993, 1090,
           6, 2424, 1713, 1039, 1511,  849, 2596, 1939, 1917,  738,   41,   54,
         290,  460,  243,  542,  240, 1113, 2270, 1282,  891,  351, 1815, 1189,
         698, 1237, 1261, 1433, 2037, 1451, 1081,  684, 2161,  869,  180, 1861,
         921,  249,  497, 2072, 1339,  604, 1185,   30, 2213,  166, 1577, 1022,
        2487,  259, 2221, 1177,  434,  176, 2458, 1737,  403, 1762,    2,  373,
        1557,  406, 1518,   37, 1636, 2373,  530, 1746,  967, 1120, 1629, 1694,
         348, 1760, 2512, 1920, 1790,  376, 2412, 1748,  153, 1286, 1234, 2132,
        1187,  561, 1822, 1463, 1517, 2008, 1608, 1681,  419, 2296, 1335

In [36]:
data.is_undirected()

True

In [112]:
# Create a model
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

nb_hidden_channels = 32

class GCN(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = GCNConv(in_channels = dataset.num_features,
                         out_channels = nb_hidden_channels)
    self.conv2 = GCNConv(in_channels = nb_hidden_channels,
                         out_channels = dataset.num_classes)

  def forward(self, data):
    x, edge_index = data.x, data.edge_index

    x = self.conv1(x, edge_index)
    x = F.relu(x)
    x = self.conv2(x, edge_index)
    #x = F.dropout(x, training=self.training)
    # could try log softmax: improved numerical performance and gradient optimisation
    x = torch.softmax(x, dim=1)

    return x

# defined it later too
gcn_model = GCN()
gcn_model

GCN(
  (conv1): GCNConv(1433, 32)
  (conv2): GCNConv(32, 7)
)

In [113]:
# Run training
torch.manual_seed(42)

# set device as GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# instantiate the gcn_model
gcn_model = GCN()
# move the model to the gpu (or cpu is not available)
gcn_model = gcn_model.to(device)

optimiser = torch.optim.Adam(gcn_model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

# define the data
data = dataset[0]
data = data.to(device)

nb_epochs = 200

for epoch in range(nb_epochs):
  gcn_model.train()
  # forward pass on the network
  out = gcn_model(data)

  # calculate the loss
  loss = loss_fn(out[train_indices], data.y[train_indices])

  # calculate the label probabilities
  label_probs = out[torch.concat((val_indices, test_indices))]
  # calculate the label predictions
  label_preds = torch.argmax(label_probs, dim=1)
  # calculate accuracy
  accuracy = (label_preds == data.y[torch.concat((val_indices, test_indices))]).sum()/len(label_preds)

  # zero the gradients
  optimiser.zero_grad()
  # backpropagate the loss
  loss.backward()
  # update the optimizer
  optimiser.step()

  if epoch % 10 == 0:
    print(f"Epoch: {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch: 0, Loss: 1.9462, Accuracy: 0.1052
Epoch: 10, Loss: 1.5051, Accuracy: 0.7472
Epoch: 20, Loss: 1.3360, Accuracy: 0.8155
Epoch: 30, Loss: 1.2846, Accuracy: 0.8395
Epoch: 40, Loss: 1.2463, Accuracy: 0.8801
Epoch: 50, Loss: 1.2295, Accuracy: 0.8801
Epoch: 60, Loss: 1.2216, Accuracy: 0.8782
Epoch: 70, Loss: 1.2161, Accuracy: 0.8801
Epoch: 80, Loss: 1.2127, Accuracy: 0.8727
Epoch: 90, Loss: 1.2101, Accuracy: 0.8745
Epoch: 100, Loss: 1.2072, Accuracy: 0.8764
Epoch: 110, Loss: 1.2045, Accuracy: 0.8727
Epoch: 120, Loss: 1.2024, Accuracy: 0.8745
Epoch: 130, Loss: 1.2005, Accuracy: 0.8819
Epoch: 140, Loss: 1.1992, Accuracy: 0.8801
Epoch: 150, Loss: 1.1982, Accuracy: 0.8782
Epoch: 160, Loss: 1.1975, Accuracy: 0.8782
Epoch: 170, Loss: 1.1969, Accuracy: 0.8782
Epoch: 180, Loss: 1.1965, Accuracy: 0.8801
Epoch: 190, Loss: 1.1960, Accuracy: 0.8764


In [89]:
t = torch.tensor(([1,2,3, 4, 5, 6], [7,8,9, 10, 11, 12]), dtype=torch.float)
print(t,t.shape)

print(torch.softmax(t, dim=0))
print(torch.softmax(t, dim=1))

tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [ 7.,  8.,  9., 10., 11., 12.]]) torch.Size([2, 6])
tensor([[0.0025, 0.0025, 0.0025, 0.0025, 0.0025, 0.0025],
        [0.9975, 0.9975, 0.9975, 0.9975, 0.9975, 0.9975]])
tensor([[0.0043, 0.0116, 0.0315, 0.0858, 0.2331, 0.6337],
        [0.0043, 0.0116, 0.0315, 0.0858, 0.2331, 0.6337]])
