# GNN
---

## Import

In [1]:
import torch
from torch_geometric.datasets import Reddit, Amazon
from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
from torch.functional import F
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.loader import NeighborSampler, NeighborLoader
from torch_geometric.data import Data
import torch.optim as optim
import seaborn as sns
from models import *
import tqdm
from sklearn.manifold import TSNE
# from umap import UMAP
import logging
import concurrent

# Seed NumPy's global RNG so that np.random-based steps in this notebook
# (e.g. the node shuffle inside create_masks) are reproducible.
# Note that torch's own RNG is not seeded here.
np.random.seed(0)

## Read, understand and prepare the data

In [2]:
# node_data = pd.read_parquet('data/amazon_product_data_word2vec.parquet')

# The file holds a pickled torch_geometric `Data` object (see the printed
# summary below), not a plain tensor or state-dict, so it needs full
# unpickling. Passing `weights_only=False` explicitly avoids relying on the
# implicit default, which newer PyTorch releases switch to `True`.
# NOTE: full unpickling can execute arbitrary code -- only load trusted files.
data = torch.load('data/amazon_product_data_concat.pt', weights_only=False)

# Number of distinct label values present in `y`, stored on the object so the
# model constructors can read it later.
data.num_classes = data.y.unique().numel()

  data = torch.load('data/amazon_product_data_concat.pt')


### Main info

In [3]:
# Headline statistics of the graph, printed as one "label value" pair per line.
graph_summary = [
    ("data", data),
    ("num nodes", data.num_nodes),
    ("Num edges", data.num_edges),
    ("num node features", data.num_node_features),
    ("is undirected", data.is_undirected()),
    ("is directed", data.is_directed()),
    ("num edge features", data.num_edge_features),
    ('num classes', data.num_classes),
]
for summary_label, summary_value in graph_summary:
    print(summary_label, summary_value)

data Data(x=[729819, 1200], edge_index=[2, 680548], y=[729819], num_classes=10)
num nodes 729819
Num edges 680548
num node features 1200
is undirected False
is directed True
num edge features 0
num classes 10


In [4]:
# value_counts = node_data['main_category'].value_counts()

# # plot a bar chart of the main categories
# plt.figure(figsize=(15, 6))
# plt.bar(value_counts.index, value_counts.values)
# plt.xticks(rotation=90)
# plt.title('Main Category Distribution')
# plt.show()

### Visualization

In [5]:
def visualize(h, color):
    """
    Project embeddings to 2-D with t-SNE and show them as a scatter plot.

    :param h: Tensor of embeddings, one row per point; it is detached and
        copied to the CPU before being handed to t-SNE
    :param color: Per-point values used to colour the markers (e.g. class
        labels); either a torch tensor on any device or anything matplotlib
        accepts for its `c` argument
    """
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())

    # matplotlib cannot consume CUDA tensors, so bring tensor colours to host
    # memory first (labels live on the GPU once the data has been moved there).
    if torch.is_tensor(color):
        color = color.detach().cpu().numpy()

    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    plt.show()

### Preparation for model training

In [6]:
def create_masks(data, train_ratio, val_ratio):
    """
    Randomly split the nodes into train/validation/test sets and attach the
    resulting boolean masks to `data` (the object is modified in place).

    The shuffle uses NumPy's global RNG, so call `np.random.seed` beforehand
    for a reproducible split. Every node left over after the train and
    validation shares ends up in the test set.

    :param data: Object exposing `num_nodes`; receives the attributes
        `train_mask`, `val_mask` and `test_mask`
    :param train_ratio: Fraction of nodes used for training
    :param val_ratio: Fraction of nodes used for validation

    :raises ValueError: If a ratio is negative or both ratios sum to more than 1
    """
    # Small tolerance so that float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    num_nodes = data.num_nodes

    # Shuffle an index array rather than a Python list: indexing a tensor with
    # a tensor avoids converting a huge list on every mask assignment.
    order = np.arange(num_nodes)
    np.random.shuffle(order)
    order = torch.from_numpy(order).long()

    # Boundaries of the three consecutive slices of the shuffled order
    train_end = int(train_ratio * num_nodes)
    val_end = min(train_end + int(val_ratio * num_nodes), num_nodes)

    splits = (
        ('train_mask', order[:train_end]),
        ('val_mask', order[train_end:val_end]),
        ('test_mask', order[val_end:]),
    )

    # Build one boolean mask per split and assign it to the dataset
    for attr_name, node_ids in splits:
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[node_ids] = True
        setattr(data, attr_name, mask)


def train_test_split_graph(data: Data, train_ratio: float, val_ratio: float, batch_size: int,
                           num_neighbors=None):
    """
    Split the graph data into train, validation, and test sets
    :param data: The graph data; `create_masks` attaches the split masks to it
    :param train_ratio: The ratio of the training set
    :param val_ratio: The ratio of the validation set
    :param batch_size: The number of seed nodes per mini-batch
    :param num_neighbors: Number of neighbours to sample per hop, one entry
        per hop; defaults to `[30, 30]`

    :return: The train, validation, and test data loaders
    """
    if num_neighbors is None:
        num_neighbors = [30] * 2

    # Call the function to create masks
    create_masks(data, train_ratio, val_ratio)

    def create_neighbor_loader(mask, shuffle=False):
        # Seed nodes of the loader are the node indices selected by the mask
        indices = mask.nonzero(as_tuple=False).view(-1)
        return NeighborLoader(
            data,
            num_neighbors=num_neighbors,
            batch_size=batch_size,
            input_nodes=indices,
            shuffle=shuffle,
        )

    # Only the training loader is shuffled: the mask indices come out sorted
    # by node id, and feeding them in the same fixed order every epoch is
    # undesirable for SGD. Evaluation order does not matter.
    train_loader = create_neighbor_loader(data.train_mask, shuffle=True)
    val_loader = create_neighbor_loader(data.val_mask)
    test_loader = create_neighbor_loader(data.test_mask)

    return train_loader, val_loader, test_loader

## Model training

### Training functions

In [7]:
from sklearn.metrics import f1_score, balanced_accuracy_score


def accuracy(predictions, labels):
    """
    Fraction of rows whose highest-scoring class equals the label
    :param predictions: Per-class scores, one row per sample
    :param labels: Target class indices

    :return: Number of correct rows divided by the number of labels
    """
    predicted_classes = predictions.argmax(dim=1)
    hits = predicted_classes.eq(labels).sum().item()
    return hits / labels.shape[0]


def f1(predictions, labels):
    """
    Macro-averaged F1 score of the arg-max predictions
    :param predictions: Per-class scores, one row per sample
    :param labels: Target class indices

    :return: The value returned by `f1_score(..., average='macro')`
    """
    # scikit-learn works on host arrays, so copy both sides to the CPU
    y_true = labels.cpu().numpy()
    y_pred = predictions.argmax(dim=1).cpu().numpy()
    return f1_score(y_true, y_pred, average='macro')
  
def balanced_accuracy(predictions, labels):
    """
    Balanced accuracy of the arg-max predictions
    :param predictions: Per-class scores, one row per sample
    :param labels: Target class indices

    :return: The value returned by `balanced_accuracy_score`
    """
    # scikit-learn works on host arrays, so copy both sides to the CPU
    y_true = labels.cpu().numpy()
    y_pred = predictions.argmax(dim=1).cpu().numpy()
    return balanced_accuracy_score(y_true, y_pred)

In [8]:
def train_epoch(model, optimizer, loss_fn, train_loader, device, metrics):
    """
    Run one optimisation pass over `train_loader`
    :param model: Module called as `model(x, edge_index)` and returning a pair
        whose first element holds the per-node outputs fed to `loss_fn`
    :param optimizer: Optimizer stepped once per batch
    :param loss_fn: Callable `loss_fn(outputs, targets)` returning a scalar tensor
    :param train_loader: Iterable of batches exposing `x`, `edge_index`, `y`
        and `to(device)`
    :param device: Device every batch is moved to
    :param metrics: Dict mapping a metric name to `fn(outputs, targets)`

    :return: `(avg_loss, avg_metrics)`, both averaged per scored node

    Batches from PyG's `NeighborLoader` hold the seed nodes first, followed by
    their sampled neighbours, and expose the number of seeds as `batch_size`
    (per the NeighborLoader documentation). Only the seed nodes are scored:
    the neighbours may belong to the validation or test split, so including
    them would leak their labels into training. Batches without a
    `batch_size` attribute are scored on all of their nodes.
    """
    model.train()
    total_loss = 0
    total_samples = 0
    metric_sums = {metric_name: 0 for metric_name in metrics}

    for batch in train_loader:
        optimizer.zero_grad()
        batch = batch.to(device)

        out, _ = model(batch.x, batch.edge_index)

        # Keep only the seed nodes of this mini-batch (see docstring)
        num_seeds = getattr(batch, 'batch_size', None)
        if num_seeds is None:
            num_seeds = batch.y.size(0)
        out = out[:num_seeds]
        target = batch.y[:num_seeds]

        loss = loss_fn(out, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * num_seeds
        total_samples += num_seeds

        # Compute metrics for this batch; detached because metrics need no graph
        scores = out.detach()
        for metric_name, metric_fn in metrics.items():
            metric_value = metric_fn(scores, target)
            metric_sums[metric_name] += metric_value * num_seeds  # Weighted sum

    # Calculate average loss and metrics over all batches
    avg_loss = total_loss / total_samples
    avg_metrics = {
        metric_name: metric_sums[metric_name] / total_samples for metric_name in metrics}

    return avg_loss, avg_metrics
  

@torch.no_grad()
def validate(model, loss_fn, val_loader, device, metrics):
    """
    Evaluate `model` on `val_loader` without tracking gradients
    :param model: Module called as `model(x, edge_index)` and returning a pair
        `(outputs, embeddings)`
    :param loss_fn: Callable `loss_fn(outputs, targets)` returning a scalar tensor
    :param val_loader: Iterable of batches exposing `x`, `edge_index`, `y`
        and `to(device)`
    :param device: Device every batch is moved to
    :param metrics: Dict mapping a metric name to `fn(outputs, targets)`

    :return: `(avg_loss, avg_metrics)`, both averaged per scored node

    Batches from PyG's `NeighborLoader` hold the seed nodes first, followed by
    their sampled neighbours, and expose the number of seeds as `batch_size`
    (per the NeighborLoader documentation). Only the seed nodes are scored,
    otherwise neighbours from other splits would be counted as validation
    nodes. Batches without a `batch_size` attribute are scored on all of
    their nodes.
    """
    model.eval()
    total_loss = 0
    total_samples = 0
    metric_sums = {metric_name: 0 for metric_name in metrics}

    for batch in val_loader:
        batch = batch.to(device)
        out, h = model(batch.x, batch.edge_index)

        # Keep only the seed nodes of this mini-batch (see docstring)
        num_seeds = getattr(batch, 'batch_size', None)
        if num_seeds is None:
            num_seeds = batch.y.size(0)
        out = out[:num_seeds]
        target = batch.y[:num_seeds]

        loss = loss_fn(out, target)

        total_loss += loss.item() * num_seeds
        total_samples += num_seeds

        # Compute metrics for this batch
        for metric_name, metric_fn in metrics.items():
            metric_value = metric_fn(out, target)
            metric_sums[metric_name] += metric_value * num_seeds  # Weighted sum

        # visualize the embeddings for the first batch
        # if total_samples == num_seeds:
        #     visualize(h[:num_seeds], color=target)

    # Calculate average loss and metrics over all batches
    avg_loss = total_loss / total_samples
    avg_metrics = {
        metric_name: metric_sums[metric_name] / total_samples for metric_name in metrics
    }

    return avg_loss, avg_metrics
  
  
def training_loop(model, optimizer, loss_fn, train_loader, val_loader, num_epochs, device, metrics):
    """
    Train for `num_epochs` epochs, validating and printing a summary after each
    :param model: Model handed to `train_epoch` and `validate`
    :param optimizer: Optimizer handed to `train_epoch`
    :param loss_fn: Loss used for both training and validation
    :param train_loader: Loader for the training pass
    :param val_loader: Loader for the validation pass
    :param num_epochs: Number of epochs to run
    :param device: Device forwarded to `train_epoch` and `validate`
    :param metrics: Dict mapping a metric name to its function

    :return: The model, the per-epoch train and validation losses, and the
        per-epoch train and validation metric histories (dicts of lists)
    """
    print("Starting training")

    train_losses = []
    val_losses = []
    train_metrics_history = {name: [] for name in metrics}
    val_metrics_history = {name: [] for name in metrics}

    for epoch_idx in range(num_epochs):
        # One optimisation pass followed by one evaluation pass
        train_loss, train_metrics = train_epoch(
            model, optimizer, loss_fn, train_loader, device, metrics)
        val_loss, val_metrics = validate(
            model, loss_fn, val_loader, device, metrics)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Record every metric and build its piece of the log line in one sweep
        pieces = []
        for name in metrics:
            train_value = train_metrics[name]
            val_value = val_metrics[name]
            train_metrics_history[name].append(train_value)
            val_metrics_history[name].append(val_value)
            pieces.append(f'{name}: {train_value:.3f} (train), {val_value:.3f} (val)')
        metrics_str = ', '.join(pieces)

        # Epochs are reported 1-based
        print(
            f"Epoch {epoch_idx + 1}/{num_epochs}: "
            f"Loss: {train_loss:.3f} (train), {val_loss:.3f} (val), "
            f"{metrics_str}"
        )

    return model, train_losses, val_losses, train_metrics_history, val_metrics_history

### Actual training

## Experiment

In [9]:
# Split proportions; create_masks assigns whatever is left to the test set
train_ratio, val_ratio = 0.8, 0.1

# Optimisation hyper-parameters read by gnn_experiment
learning_rate, weight_decay = 0.01, 5e-4
batch_size = 64

# Prefer the GPU when one is available and move the graph there
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
data = data.to(device)

# NLLLoss works on log-probabilities -- presumably the models end in
# log_softmax; confirm against models.py
loss_fn = torch.nn.NLLLoss()

train_loader, val_loader, test_loader = train_test_split_graph(
    data, train_ratio, val_ratio, batch_size)



In [10]:
def gnn_experiment(model: GNN, train_loader, val_loader, num_epochs: int = 10):
    """
    Train the given model and collect its training history
    :param model: The model to train; it is moved to the global `device`
    :param train_loader: Loader for the training pass
    :param val_loader: Loader for the validation pass
    :param num_epochs: Number of epochs to train for

    :return: Dict with the trained model ('model'), the per-epoch losses
        ('train_losses', 'val_losses') and the per-epoch metric histories
        ('train_metrics_history', 'val_metrics_history')

    Reads the module-level `device`, `learning_rate` and `weight_decay`.
    """
    import warnings

    logging.info("Training %s", model.__class__.__name__)

    # Train the model that was passed in. It must not be replaced by a fresh
    # GCN here, otherwise every experiment would silently train the same
    # architecture regardless of the argument.
    model = model.to(device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    loss_fn = torch.nn.NLLLoss()

    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'balanced_accuracy': balanced_accuracy,
    }

    # Silence warnings during training. Note: this installs a process-wide
    # filter that stays active after the function returns.
    warnings.filterwarnings("ignore")
    model, train_losses, val_losses, train_metrics_history, val_metrics_history = training_loop(
        model, optimizer, loss_fn, train_loader, val_loader,
        num_epochs=num_epochs, device=device, metrics=metrics
    )

    return {
        'model': model,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_metrics_history': train_metrics_history,
        'val_metrics_history': val_metrics_history
    }

In [11]:
# Every architecture gets the same input, hidden and output widths
_layer_dims = dict(
    in_channels=data.num_node_features,
    hidden_channels=64,
    out_channels=data.num_classes,
)

gcn = GCN(**_layer_dims)
gat = GAT(num_heads=1, **_layer_dims)
gin = GIN(**_layer_dims)
gsage = GraphSAGE(**_layer_dims)

In [12]:
# Run the experiments in parallel, collecting the results in a dictionary (currently disabled):
# with concurrent.futures.ThreadPoolExecutor() as executor:
#   futures = {
#     executor.submit(gnn_experiment, model, train_loader, val_loader): model
#     for model in [gcn, gat, gin, gsage]
#   }

import os

# torch.save does not create missing directories; make sure the target exists
# up front so a finished training run is not lost at save time.
os.makedirs('output', exist_ok=True)

# Train the architectures one after another and store each result dict under
# the class name of the model it belongs to.
for model in [gcn, gat, gin, gsage]:
  results = gnn_experiment(model, train_loader, val_loader)
  model_name = model.__class__.__name__
  torch.save(results, f'output/{model_name}_results.pt')
  print(f"Saved results for {model_name}")

Starting training


ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'

## Evaluation