# Data preprocessing
---

## Imports

In [1]:
import torch
from torch_geometric.datasets import Reddit, Amazon
from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
import random
from torch.functional import F
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.loader import NeighborSampler

# DATA PREPARATION

In [2]:
# Load the Amazon "Computers" dataset (stored under data/Amazon) and keep its
# first graph as `data`, which the exploration cell below inspects.
amazon_computers_dataset = Amazon(root='data/Amazon', name='Computers')
data = amazon_computers_dataset[0]

# The train/val/test split is not performed here; `train_test_split_graph` does it.


# NOTE(review): `loader` is not referenced by any other cell visible in this
# notebook (training uses NeighborSampler loaders instead) - confirm and remove.
loader = DataLoader(amazon_computers_dataset, batch_size=32, shuffle=True)

In [3]:
# Summarise the loaded graph: one "<label> <value>" line per property.
# A single print over a generator keeps the label/value pairs in one table and
# does not leave loop variables behind in the notebook namespace.
print(
    *(
        f"{label} {value!s}"
        for label, value in (
            ("data", data),
            ("num nodes", data.num_nodes),
            ("Num edges", data.num_edges),
            ("num features", data.num_features),
            ("is undirected", data.is_undirected()),
            ("is directed", data.is_directed()),
        )
    ),
    sep="\n",
)

data Data(x=[13752, 767], edge_index=[2, 491722], y=[13752])
num nodes 13752
Num edges 491722
num features 767
is undirected True
is directed False


In [4]:
# plots for data visualization and exploration
# G = to_networkx(data, to_undirected=True)
# pos = nx.spring_layout(G)
# plt.figure(figsize=(8, 8))
# nx.draw(G, pos, node_size=10, width=0.5)
# plt.show()

In [5]:
def create_masks(data, num_nodes, train_ratio, val_ratio, seed=None):
    """Attach random, mutually exclusive train/val/test node masks to ``data``.

    Node indices ``0 .. num_nodes - 1`` are shuffled and cut into three
    consecutive slices: the first ``int(train_ratio * num_nodes)`` indices
    become training nodes, the next ``int(val_ratio * num_nodes)`` validation
    nodes, and every remaining index a test node.

    Args:
        data: Object that receives the boolean ``train_mask``, ``val_mask``
            and ``test_mask`` attributes (it is modified in place).
        num_nodes: Number of nodes to split; also the length of each mask.
        train_ratio: Fraction of nodes used for training, in ``[0, 1]``.
        val_ratio: Fraction of nodes used for validation, in ``[0, 1]``.
        seed: Optional seed for a private shuffle RNG, which makes the split
            reproducible. With ``None`` (the default) the module-level
            ``random`` generator is used, as before.

    Raises:
        ValueError: If a ratio lies outside ``[0, 1]`` or the two ratios sum
            to more than 1 (which would leave the later splits short or empty
            without any warning).
    """
    if not (0 <= train_ratio <= 1 and 0 <= val_ratio <= 1):
        raise ValueError(
            f"train_ratio and val_ratio must be in [0, 1], got {train_ratio} and {val_ratio}"
        )
    # Small tolerance so that float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"train_ratio + val_ratio must not exceed 1, got {train_ratio + val_ratio}"
        )

    # A dedicated Random instance keeps a seeded split from disturbing (or being
    # disturbed by) other users of the global `random` state.
    rng = random if seed is None else random.Random(seed)
    indices = list(range(num_nodes))
    rng.shuffle(indices)

    train_end = int(train_ratio * num_nodes)
    val_end = train_end + int(val_ratio * num_nodes)

    split_members = {
        "train_mask": indices[:train_end],
        "val_mask": indices[train_end:val_end],
        "test_mask": indices[val_end:],
    }
    for attr_name, members in split_members.items():
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        # Explicit long tensor so that an empty split is still a valid index.
        mask[torch.tensor(members, dtype=torch.long)] = True
        setattr(data, attr_name, mask)
    
def train_test_split_graph(data, train_ratio, val_ratio, sizes=(15, 10), batch_size=64, num_workers=4):
    """Split the graph's nodes and build one NeighborSampler per split.

    ``create_masks`` first attaches random ``train_mask`` / ``val_mask`` /
    ``test_mask`` attributes to ``data``; each mask is then turned into node
    indices that seed a ``NeighborSampler`` over ``data.edge_index``.

    Args:
        data: Graph object exposing ``num_nodes`` and ``edge_index``; it is
            modified in place (the three masks are attached to it).
        train_ratio: Fraction of nodes assigned to the training split.
        val_ratio: Fraction of nodes assigned to the validation split; the
            remaining nodes form the test split.
        sizes: Per-hop neighbour sample sizes forwarded to ``NeighborSampler``.
        batch_size: Number of target nodes per mini-batch.
        num_workers: Number of worker processes used by each loader.

    Returns:
        tuple: ``(train_loader, val_loader, test_loader)``. Only the training
        loader shuffles its target nodes.
    """
    create_masks(data, data.num_nodes, train_ratio, val_ratio)

    def _make_loader(mask, shuffle):
        # Build a sampler whose target nodes are the True positions of `mask`.
        node_idx = mask.nonzero(as_tuple=False).view(-1)
        return NeighborSampler(
            data.edge_index,
            node_idx=node_idx,
            sizes=list(sizes),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

    train_loader = _make_loader(data.train_mask, shuffle=True)
    val_loader = _make_loader(data.val_mask, shuffle=False)
    test_loader = _make_loader(data.test_mask, shuffle=False)

    return train_loader, val_loader, test_loader
  
  

# TRAINING

In [6]:
def train_epoch(model, optimizer, data_loader, device, data):
    """Train the model for one epoch using NeighborSampler mini-batches.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns
            log-probabilities (the loss is ``F.nll_loss``).
        optimizer: Optimizer that updates ``model``'s parameters.
        data_loader: Iterable yielding ``(batch_size, n_id, adjs)`` triples.
        device: Device on which the forward and backward passes run.
        data: Graph object providing node features ``x`` and labels ``y``.

    Returns:
        float: Mean loss per target node over the epoch, or ``0.0`` if the
        loader yields no batches.
    """
    model.train()
    loss_sum = 0.0
    num_targets = 0

    for batch_size, n_id, adjs in data_loader:
        # `n_id` holds the batch's target nodes first, followed by the sampled
        # neighbours; `batch_size` is the number of target nodes.

        # With a single sampled hop the sampler yields one adjacency object
        # rather than a list, so accept both shapes.
        first_adj = adjs if hasattr(adjs, "edge_index") else adjs[0]
        # Only this edge_index is consumed by the model, so only it is moved.
        edge_index = first_adj.edge_index.to(device)

        x_input = data.x[n_id].to(device)
        targets = data.y[n_id[:batch_size]].to(device)

        out = model(x_input, edge_index)

        # The loss is computed on the target nodes only.
        loss = F.nll_loss(out[:batch_size], targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so that a smaller final batch
        # does not skew the epoch average.
        loss_sum += loss.item() * batch_size
        num_targets += batch_size

    return loss_sum / num_targets if num_targets else 0.0


@torch.no_grad()
def evaluate(model, data_loader, device, data):
    """Compute the model's accuracy over the target nodes of a loader.

    Args:
        model: Module called as ``model(x, edge_index)`` returning one score
            row per input node.
        data_loader: Iterable yielding ``(batch_size, n_id, adjs)`` triples.
        device: Device on which the forward pass runs.
        data: Graph object providing node features ``x`` and labels ``y``.

    Returns:
        float: Fraction of target nodes whose predicted class equals the
        label, or ``0.0`` if the loader yields no batches.
    """
    model.eval()
    num_correct = 0
    num_targets = 0

    for batch_size, n_id, adjs in data_loader:
        # With a single sampled hop the sampler yields one adjacency object
        # rather than a list, so accept both shapes.
        first_adj = adjs if hasattr(adjs, "edge_index") else adjs[0]
        edge_index = first_adj.edge_index.to(device)

        out = model(data.x[n_id].to(device), edge_index)

        # The first `batch_size` rows belong to the target nodes.
        pred = out[:batch_size].argmax(dim=1)
        labels = data.y[n_id[:batch_size]].to(device)

        num_correct += (pred == labels).sum().item()
        num_targets += batch_size

    # Count the targets actually seen instead of relying on `loader.dataset`,
    # so that any iterable of batches can be evaluated.
    return num_correct / num_targets if num_targets else 0.0


# MODEL

In [7]:
# Step 3: Define a simple GCN model
class GCN(torch.nn.Module):
    """Two-layer GCN that returns log-probabilities over the output channels."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Run both convolutions over the same ``edge_index`` with a ReLU between them."""
        hidden = F.relu(self.conv1(x, edge_index))
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

In [17]:
class GAT(torch.nn.Module):
    """Two-layer GAT that returns log-probabilities over the output channels."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels)
        self.conv2 = GATConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Run both attention layers over the same ``edge_index`` with a ReLU between them."""
        hidden = F.relu(self.conv1(x, edge_index))
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

# MAIN

In [15]:
def main(gnn: "type[torch.nn.Module]", num_epochs=9, hidden_channels=64, lr=0.01,
         weight_decay=5e-4, train_ratio=0.8, val_ratio=0.1, seed=None):
    """Train and evaluate a GNN on the Amazon Computers graph.

    Loads the dataset from ``data/Amazon``, splits its nodes into
    train/validation/test sets, trains the model with Adam, and prints the
    loss and train/validation accuracy after every epoch followed by the final
    test accuracy.

    Args:
        gnn: Model *class* (not an instance); it is instantiated as
            ``gnn(in_channels=..., hidden_channels=..., out_channels=...)``.
        num_epochs: Number of training epochs.
        hidden_channels: Hidden width passed to the model constructor.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        train_ratio: Fraction of nodes used for training.
        val_ratio: Fraction of nodes used for validation; the remaining nodes
            are used for testing.
        seed: If given, seeds Python's ``random`` module and PyTorch's global
            RNG before anything else so that repeated runs are comparable.
    """
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)

    amazon_computers_dataset = Amazon(root='data/Amazon', name='Computers')
    data = amazon_computers_dataset[0]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data = data.to(device)

    train_loader, val_loader, test_loader = train_test_split_graph(data, train_ratio, val_ratio)

    # Initialize the model and optimizer
    model = gnn(
        in_channels=data.num_node_features,
        hidden_channels=hidden_channels,
        out_channels=amazon_computers_dataset.num_classes,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Training loop (epochs are numbered from 1 in the log output)
    for epoch in range(1, num_epochs + 1):
        loss = train_epoch(model, optimizer, train_loader, device, data)
        train_acc = evaluate(model, train_loader, device, data)
        val_acc = evaluate(model, val_loader, device, data)
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

    # Final test accuracy
    test_acc = evaluate(model, test_loader, device, data)
    print(f'Final Test Accuracy: {test_acc:.4f}')


# Run the full pipeline with the GCN model. `main` takes the model class and
# prints the per-epoch loss/accuracy followed by the final test accuracy.
# (No `if __name__ == "__main__"` guard is used for this notebook cell.)

main(gnn=GCN)

Epoch: 01, Loss: 1.4793, Train Acc: 0.7605, Val Acc: 0.7571
Epoch: 02, Loss: 0.5610, Train Acc: 0.8431, Val Acc: 0.8284
Epoch: 03, Loss: 0.4964, Train Acc: 0.8526, Val Acc: 0.8465
Epoch: 04, Loss: 0.4836, Train Acc: 0.8413, Val Acc: 0.8371
Epoch: 05, Loss: 0.4643, Train Acc: 0.8554, Val Acc: 0.8378
Epoch: 06, Loss: 0.5066, Train Acc: 0.8646, Val Acc: 0.8436
Epoch: 07, Loss: 0.4457, Train Acc: 0.8626, Val Acc: 0.8400
Epoch: 08, Loss: 0.4503, Train Acc: 0.8702, Val Acc: 0.8516
Epoch: 09, Loss: 0.4359, Train Acc: 0.8704, Val Acc: 0.8618
Final Test Accuracy: 0.8561


In [19]:
# Run the same pipeline with the GAT model for comparison with the GCN run.
main(gnn=GAT)

Epoch: 01, Loss: 0.8295, Train Acc: 0.8684, Val Acc: 0.8480
Epoch: 02, Loss: 0.4084, Train Acc: 0.8856, Val Acc: 0.8545
Epoch: 03, Loss: 0.3587, Train Acc: 0.8956, Val Acc: 0.8625
Epoch: 04, Loss: 0.3492, Train Acc: 0.8824, Val Acc: 0.8531
Epoch: 05, Loss: 0.3253, Train Acc: 0.9000, Val Acc: 0.8756
Epoch: 06, Loss: 0.3618, Train Acc: 0.8919, Val Acc: 0.8807
Epoch: 07, Loss: 0.3210, Train Acc: 0.9049, Val Acc: 0.8778
Epoch: 08, Loss: 0.3273, Train Acc: 0.9026, Val Acc: 0.8800
Epoch: 09, Loss: 0.3069, Train Acc: 0.9156, Val Acc: 0.8836
Final Test Accuracy: 0.8765
