In [None]:
pip install dgl torch torch-geometric

Collecting dgl
  Using cached dgl-2.1.0-cp311-cp311-manylinux1_x86_64.whl.metadata (553 bytes)
Using cached dgl-2.1.0-cp311-cp311-manylinux1_x86_64.whl (8.6 MB)
Installing collected packages: dgl
Successfully installed dgl-2.1.0


In [None]:
!pip uninstall -y torchdata
#!pip uninstall -y dgl
!pip uninstall -y torch-geometric

# Reinstall with correct versions
!pip install torch
!pip install torchdata==0.6.1
#!pip install dgl==2.1.0
!pip install torch-geometric

Found existing installation: torchdata 0.6.1
Uninstalling torchdata-0.6.1:
  Successfully uninstalled torchdata-0.6.1
Found existing installation: torch-geometric 2.6.1
Uninstalling torch-geometric-2.6.1:
  Successfully uninstalled torch-geometric-2.6.1
Collecting torchdata==0.6.1
  Using cached torchdata-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Using cached torchdata-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
Installing collected packages: torchdata
Successfully installed torchdata-0.6.1
Collecting torch-geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Using cached torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
# Force NumPy 1.24.3 by removing whatever is installed and reinstalling the pin.
# NOTE(review): the recorded output of this cell shows pip reporting dependency
# conflicts between numpy 1.24.3 and other preinstalled packages - confirm that
# none of those packages are needed by this notebook.
!pip uninstall -y numpy
!pip install numpy==1.24.3  # Stable version for compatibility with PyTorch and DGL
# Replace the generic DGL wheel with the `dgl-cu117` package from the DGL wheel index.
# NOTE(review): `dgl-cu117` is not version-pinned; consider pinning it.
!pip uninstall -y dgl
#!pip install dgl==2.1.0
!pip install dgl-cu117 -f https://data.dgl.ai/wheels/repo.html
# Compiled PyG extension packages, resolved against the torch-2.0.1+cu117 wheel index.
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib  \
  -f https://data.pyg.org/whl/torch-2.0.1+cu117.html


Found existing installation: numpy 1.24.3
Uninstalling numpy-1.24.3:
  Successfully uninstalled numpy-1.24.3
Collecting numpy==1.24.3
  Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Using cached numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pymc 5.21.2 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.3 which is incompatible.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.0.1 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.
albucore 0.0.23 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incomp

Found existing installation: dgl 2.1.0
Uninstalling dgl-2.1.0:
  Successfully uninstalled dgl-2.1.0
Looking in links: https://data.dgl.ai/wheels/repo.html
Looking in links: https://data.pyg.org/whl/torch-2.0.1+cu117.html


In [None]:
import os
os.environ["DGLBACKEND"] = "pytorch"

import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
import dgl
import numpy as np
import time


In [None]:

# Show the installed PyTorch build and the CUDA version it reports, one per line.
print(torch.__version__, torch.version.cuda, sep="\n")


2.0.1+cu117
11.7


In [None]:
import pyg_lib  # should import without error


In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


# **Cora**

In [None]:
# Load the Cora dataset from the Planetoid collection (stored under data/Cora).
cora_dataset = Planetoid(name='Cora', root='data/Cora')
# The dataset is assumed to contain a single graph; move it to the selected device.
cora_data = cora_dataset[0].to(device)

Using device: cuda


In [None]:
# One torch.device handle for every CUDA device PyTorch reports.
device_list = list(map(lambda gpu_index: torch.device(f'cuda:{gpu_index}'), range(torch.cuda.device_count())))
print(f"Using {len(device_list)} GPUs:", device_list)


Using 1 GPUs: [device(type='cuda', index=0)]


In [None]:
from torch_geometric.loader import NeighborLoader


## **GCN implementation**

In [None]:
# Define GCN model
class GCN(nn.Module):
    """Two-layer graph convolutional network: GCNConv -> ReLU -> dropout -> GCNConv.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Width of the hidden representation.
        output_dim: Size of each output vector (the notebook passes the class count).
        dropout_rate: Dropout probability applied after the first layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate=0.5):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        # NOTE(review): this batch-norm layer is registered as a submodule but
        # forward() never calls it, so it has no effect on the output.
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout_rate = dropout_rate
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Return the second layer's raw outputs for the nodes in ``x``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = torch.dropout(hidden, p=self.dropout_rate, train=self.training)
        return self.conv2(hidden, edge_index)

## **GAT implementation**

In [None]:
from torch_geometric.nn import GATConv, SAGEConv
class GAT(nn.Module):
    """Two-layer graph attention network: GATConv -> ReLU -> dropout -> GATConv.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Per-head width of the first attention layer.
        output_dim: Size of each output vector (the notebook passes the class count).
        heads: Number of attention heads in the first layer; their outputs are
            fed to the second layer as ``hidden_dim * heads`` features.
        dropout: Dropout probability, used both inside the attention layers and
            on the hidden activations between them.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, heads=8, dropout=0.6):
        super(GAT, self).__init__()
        # Keep the configured rate so forward() honours it instead of a
        # hard-coded 0.6 (which silently ignored any non-default `dropout`).
        self.dropout = dropout
        self.gat1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.gat2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=dropout)

    def forward(self, x, edge_index):
        """Return the second layer's raw outputs for the nodes in ``x``."""
        x = self.gat1(x, edge_index)
        x = torch.relu(x)
        # Dropout is only active while the module is in training mode.
        x = torch.dropout(x, p=self.dropout, train=self.training)
        x = self.gat2(x, edge_index)
        return x

## **GraphSAGE implementation**

In [None]:

class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE network: SAGEConv -> ReLU -> dropout -> SAGEConv.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Width of the hidden representation.
        output_dim: Size of each output vector (the notebook passes the class count).
        dropout: Dropout probability applied after the first layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.6):
        super().__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, output_dim)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return the second layer's raw outputs for the nodes in ``x``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = torch.dropout(hidden, self.dropout, train=self.training)
        return self.conv2(hidden, edge_index)


In [None]:

# Convert PyG graph to DGL graph
def pyg_to_dgl(data):
    """Build a CPU DGL graph from a PyG data object.

    Args:
        data: PyG data object providing ``edge_index``, ``x`` and ``num_nodes``.

    Returns:
        A DGL graph with the same edges and the node features stored under
        ``ndata['feat']``.
    """
    src, dst = data.edge_index.cpu()
    # Pass the node count explicitly: without it DGL sizes the graph from the
    # largest node id that appears in an edge, so isolated nodes at the end of
    # the id range would be dropped and the feature assignment below would fail
    # with a length mismatch.
    dgl_graph = dgl.graph((src, dst), num_nodes=data.num_nodes)
    dgl_graph.ndata['feat'] = data.x.cpu()  # Add node features
    return dgl_graph


In [None]:
# Adaptive graph partitioning based on partition size and node degree
def adaptive_partitioning(dgl_graph, num_partitions=4, imbalance_threshold=1.5):
    """Assign every node to a METIS partition and attach the result to the graph.

    Args:
        dgl_graph: DGL graph to partition.
        num_partitions: Number of partitions requested from METIS.
        imbalance_threshold: Largest/smallest partition-size ratio tolerated
            before METIS is run a second time.

    Returns:
        The graph moved to the notebook-level ``device``, with the partition id
        of each node stored under ``ndata['part']``.
    """
    # METIS partitioning runs on the CPU copy of the graph.
    dgl_graph_cpu = dgl_graph.to('cpu')

    partition = dgl.metis_partition_assignment(dgl_graph_cpu, num_partitions)

    # `minlength` makes partitions that received no nodes show up as zeros;
    # otherwise trailing empty partitions would be invisible to the check.
    partition_sizes = torch.bincount(partition, minlength=num_partitions)
    smallest = partition_sizes.min().item()
    largest = partition_sizes.max().item()
    # An empty partition is treated as unbounded imbalance rather than letting
    # the division raise ZeroDivisionError.
    imbalance_ratio = largest / smallest if smallest > 0 else float('inf')

    if imbalance_ratio > imbalance_threshold:
        print(f"Re-partitioning triggered due to imbalance: {partition_sizes.tolist()} with ratio {imbalance_ratio:.2f}")
        # NOTE(review): the retry uses exactly the same arguments as the first
        # call, so it may well return the same assignment - confirm intent.
        partition = dgl.metis_partition_assignment(dgl_graph_cpu, num_partitions)

    # Attach the assignment, then move the graph to the notebook-level `device`
    # (which is the CPU when no GPU is available).
    dgl_graph_cpu.ndata['part'] = partition
    dgl_graph_gpu = dgl_graph_cpu.to(device)
    return dgl_graph_gpu

In [None]:
# Training function with fully adaptive dynamic partitioning
def train_gnn(model, data, optimizer, criterion, epochs, num_partitions=4, partition_interval=5, imbalance_threshold=1.5):
    """Train ``model`` one partition at a time, periodically re-partitioning the graph.

    Args:
        model: Module called as ``model(x, edge_index)``.
        data: PyG data object providing ``x``, ``edge_index``, ``y`` and ``train_mask``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(outputs, targets)``.
        epochs: Number of passes over all partitions.
        num_partitions: Number of partitions requested from METIS.
        partition_interval: Re-partition every this many epochs; a value of 0
            disables periodic re-partitioning.
        imbalance_threshold: Forwarded to ``adaptive_partitioning``.

    Returns:
        Wall-clock training time in seconds.
    """
    model.train()
    dgl_graph = pyg_to_dgl(data)
    start_time = time.time()

    # Initial partitioning
    partition = adaptive_partitioning(dgl_graph, num_partitions, imbalance_threshold)

    for epoch in range(epochs):
        # Epoch 0 already uses the initial partitioning computed just above, so
        # only re-partition on later multiples of the interval (this matches
        # train_minibatch). The `> 0` guard avoids a modulo-by-zero.
        if epoch != 0 and partition_interval > 0 and epoch % partition_interval == 0:
            partition = adaptive_partitioning(dgl_graph, num_partitions, imbalance_threshold)

        part_ids = partition.ndata['part']

        # Train on each partition
        for part_id in range(num_partitions):
            node_mask = (part_ids == part_id)
            if node_mask.sum() == 0:
                continue

            # Index the PyG tensors with a mask that lives on their own device.
            data_mask = node_mask.to(data.train_mask.device)
            subgraph_train_mask = data.train_mask[data_mask]
            if subgraph_train_mask.sum() == 0:
                # No training nodes in this partition: a mean-reduced loss over
                # an empty selection carries no learning signal, so skip the step.
                continue

            subgraph = dgl.node_subgraph(partition, node_mask)

            # Features and edge index of the partition's induced subgraph
            x = subgraph.ndata['feat']
            edge_index = torch.stack(subgraph.edges()).to(device)

            # Forward / backward pass restricted to the partition's training nodes
            optimizer.zero_grad()
            output = model(x, edge_index)
            train_idx = subgraph_train_mask.to(output.device)
            targets = data.y[data_mask][subgraph_train_mask].to(output.device)
            loss = criterion(output[train_idx], targets)
            loss.backward()
            optimizer.step()

    end_time = time.time()
    return end_time - start_time



In [None]:
# Evaluate function
def evaluate_model(model, data, mask):
    """Measure accuracy on the masked nodes and time the full-graph forward pass.

    Args:
        model: Module called as ``model(x, edge_index)``.
        data: Object providing ``x``, ``edge_index`` and ``y``.
        mask: Boolean tensor selecting the nodes to score.

    Returns:
        ``(accuracy, inference_time)`` where ``inference_time`` is the elapsed
        wall-clock time in seconds for the forward pass and the scoring.
    """
    model.eval()
    features, edges, labels = data.x, data.edge_index, data.y

    with torch.no_grad():
        tick = time.time()
        logits = model(features, edges)
        predicted = logits[mask].argmax(dim=1)
        num_correct = (predicted == labels[mask]).sum().item()
        accuracy = num_correct / mask.sum().item()
        tock = time.time()

    return accuracy, tock - tick


In [None]:
def create_partition_loaders(data, num_partitions, batch_size):
    """Build one ``NeighborLoader`` per partition that contains training nodes.

    Args:
        data: PyG data object carrying ``part`` (per-node partition id) and ``train_mask``.
        num_partitions: Partition ids ``0 .. num_partitions - 1`` are scanned.
        batch_size: Number of seed nodes per mini-batch.

    Returns:
        List of loaders; partitions without any training node are left out.
    """
    partition_loaders = []
    for part_id in range(num_partitions):
        # Seed nodes: training nodes that belong to this partition.
        seed_mask = data.train_mask & (data.part == part_id)
        if not seed_mask.any():
            continue
        seed_nodes = seed_mask.nonzero(as_tuple=True)[0]
        partition_loaders.append(
            NeighborLoader(
                data,
                num_neighbors=[10, 10],  # up to 10 sampled neighbours per hop, two hops
                input_nodes=seed_nodes,
                batch_size=batch_size,
                shuffle=True,
            )
        )
    return partition_loaders




In [None]:
def train_minibatch(model, data, train_loaders, optimizer, criterion, epochs, partition_interval,
                    num_partitions=None, imbalance_threshold=1.5, batch_size=None):
    """Train ``model`` on per-partition mini-batches, re-partitioning periodically.

    Args:
        model: Module called as ``model(x, edge_index)``.
        data: PyG data object carrying ``x``, ``edge_index``, ``y``,
            ``train_mask`` and ``part`` (per-node partition id).
        train_loaders: Loaders to start with, one per partition.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(outputs, targets)``.
        epochs: Number of training epochs.
        partition_interval: Re-partition every this many epochs (never at
            epoch 0); a value of 0 disables re-partitioning.
        num_partitions: Partition count used when re-partitioning. Defaults to
            the count implied by ``data.part`` (largest id + 1).
        imbalance_threshold: Forwarded to ``adaptive_partitioning``.
        batch_size: Batch size for loaders rebuilt after re-partitioning.
            Defaults to the batch size of the first loader in ``train_loaders``
            (64 if there is none), so the batch size no longer changes
            mid-training.

    Prints the summed batch loss and the elapsed time after every epoch.
    """
    model.train()

    # Resolve the re-partitioning settings from the arguments instead of
    # relying on notebook globals that happen to share these names.
    if num_partitions is None:
        num_partitions = int(data.part.max().item()) + 1
    if batch_size is None:
        first_loader = train_loaders[0] if len(train_loaders) > 0 else None
        batch_size = getattr(first_loader, 'batch_size', None) or 64

    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()

        if partition_interval and epoch != 0 and epoch % partition_interval == 0:
            # Re-partition the graph and rebuild the per-partition loaders
            print(f"\n[Epoch {epoch}] Re-partitioning graph...")
            dgl_graph = pyg_to_dgl(data)
            dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)
            data.part = dgl_graph.ndata['part'].to(data.x.device)
            train_loaders = create_partition_loaders(data, num_partitions, batch_size=batch_size)

        for loader in train_loaders:
            for batch in loader:
                batch = batch.to(device)

                optimizer.zero_grad()
                out = model(batch.x, batch.edge_index)
                # The loss covers every training node present in the sampled batch.
                loss = criterion(out[batch.train_mask], batch.y[batch.train_mask])
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

        print(f"Epoch {epoch}, Loss: {total_loss:.4f}, Time: {time.time() - start_time:.2f}s")


In [None]:
def evaluate(model, data, mask):
    """Return the fraction of masked nodes whose arg-max prediction matches ``data.y``.

    Args:
        model: Module called as ``model(x, edge_index)``.
        data: Object providing ``x``, ``edge_index`` and ``y``.
        mask: Boolean tensor selecting the nodes to score.
    """
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index)
        hits = (logits[mask].argmax(dim=1) == data.y[mask]).sum().item()
        return hits / mask.sum().item()


In [None]:
import torch_sparse

In [None]:
# Report the versions of the compiled PyG extension packages that were imported.
print("pyg-lib version:", pyg_lib.__version__)
print("torch-sparse version:", torch_sparse.__version__)

pyg-lib version: 0.4.0+pt20cu117
torch-sparse version: 0.6.18+pt20cu117


# **GCN model - Cora**

In [None]:
# Experiment: GCN on Cora, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = cora_data.num_features
hidden_dim = 64
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(cora_data.y.max() - cora_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(cora_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(cora_data.x.device)
cora_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(cora_data, num_partitions, batch_size=64)

gcn_model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(gcn_model, cora_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(gcn_model, cora_data, cora_data.test_mask)

print("Test Accuracy:", test_acc)


Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.563 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.563 GB
Metis partitioning: 0.017 seconds, peak memory: 13.563 GB
Epoch 0, Loss: 149.2296, Time: 0.30s
Epoch 1, Loss: 104.3701, Time: 0.30s
Epoch 2, Loss: 60.7357, Time: 0.31s
Epoch 3, Loss: 31.8678, Time: 0.30s
Epoch 4, Loss: 18.1165, Time: 0.30s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.563 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.563 GB
Metis partitioning: 0.018 seconds, peak memory: 13.563 GB
Epoch 5, Loss: 15.6307, Time: 0.45s
Epoch 6, Loss: 8.6242, Time: 0.32s
Epoch 7, Loss: 7.0666, Time: 0.32s
Epoch 8, Loss: 4.1852, Time: 0.33s
Epoch 9, Loss: 3.0751, Time: 0.33s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.563 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.5

# **GAT model - Cora**

In [None]:
# Experiment: GAT on Cora, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = cora_data.num_features
hidden_dim = 64
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(cora_data.y.max() - cora_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(cora_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(cora_data.x.device)
cora_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(cora_data, num_partitions, batch_size=64)

gat_model = GAT(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
optimizer = torch.optim.Adam(gat_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(gat_model, cora_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(gat_model, cora_data, cora_data.test_mask)

print("Test Accuracy:", test_acc)


Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.460 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.460 GB
Metis partitioning: 0.018 seconds, peak memory: 13.460 GB
Epoch 0, Loss: 171.7317, Time: 0.50s
Epoch 1, Loss: 128.9949, Time: 0.49s
Epoch 2, Loss: 107.8815, Time: 0.50s
Epoch 3, Loss: 87.4326, Time: 0.48s
Epoch 4, Loss: 71.7495, Time: 0.48s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.461 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.461 GB
Metis partitioning: 0.019 seconds, peak memory: 13.461 GB
Epoch 5, Loss: 59.3898, Time: 0.56s
Epoch 6, Loss: 49.5070, Time: 0.46s
Epoch 7, Loss: 52.5654, Time: 0.47s
Epoch 8, Loss: 44.7127, Time: 0.46s
Epoch 9, Loss: 52.6056, Time: 0.58s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.001 seconds, peak memory: 13.461 GB
Construct multi-constraint weights: 0.000 seconds, peak memory:

# **GraphSAGE model - Cora**


Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.326 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.326 GB
Metis partitioning: 0.018 seconds, peak memory: 13.326 GB
Epoch 0, Loss: 169.3610, Time: 0.31s
Epoch 1, Loss: 161.6532, Time: 0.32s
Epoch 2, Loss: 154.3731, Time: 0.33s
Epoch 3, Loss: 145.6822, Time: 0.31s
Epoch 4, Loss: 134.3488, Time: 0.31s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.326 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.326 GB
Metis partitioning: 0.020 seconds, peak memory: 13.326 GB
Epoch 5, Loss: 123.0420, Time: 0.41s
Epoch 6, Loss: 110.6501, Time: 0.33s
Epoch 7, Loss: 94.5461, Time: 0.30s
Epoch 8, Loss: 82.7916, Time: 0.32s
Epoch 9, Loss: 69.5157, Time: 0.32s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.326 GB
Construct multi-constraint weights: 0.000 seconds, peak mem

# **CiteSeer**

In [None]:
# Load the CiteSeer dataset from the Planetoid collection (stored under data/CiteSeer).
citeseer_dataset = Planetoid(name='CiteSeer', root='data/CiteSeer')
# The dataset is assumed to contain a single graph; move it to the selected device.
citeseer_data = citeseer_dataset[0].to(device)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...
Done!


# **GCN model - CiteSeer**


In [None]:
# Experiment: GCN on CiteSeer, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = citeseer_data.num_features
hidden_dim = 256
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(citeseer_data.y.max() - citeseer_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(citeseer_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(citeseer_data.x.device)
citeseer_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(citeseer_data, num_partitions, batch_size=128)

gcn_model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(gcn_model, citeseer_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(gcn_model, citeseer_data, citeseer_data.test_mask)

print("Test Accuracy:", test_acc)


Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.650 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.650 GB
Metis partitioning: 0.017 seconds, peak memory: 13.650 GB
Epoch 0, Loss: 133.8869, Time: 0.32s
Epoch 1, Loss: 120.7798, Time: 0.87s
Epoch 2, Loss: 108.6856, Time: 0.70s
Epoch 3, Loss: 96.1708, Time: 0.41s
Epoch 4, Loss: 83.6712, Time: 0.31s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.650 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.650 GB
Metis partitioning: 0.018 seconds, peak memory: 13.650 GB
Epoch 5, Loss: 71.1657, Time: 0.44s
Epoch 6, Loss: 59.2644, Time: 0.32s
Epoch 7, Loss: 49.9215, Time: 0.32s
Epoch 8, Loss: 40.6654, Time: 0.34s
Epoch 9, Loss: 34.5397, Time: 0.43s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.001 seconds, peak memory: 13.650 GB
Construct multi-constraint weights: 0.000 seconds, peak memory:

# **GAT model - CiteSeer**

In [None]:
# Experiment: GAT on CiteSeer, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = citeseer_data.num_features
hidden_dim = 256
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(citeseer_data.y.max() - citeseer_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(citeseer_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(citeseer_data.x.device)
citeseer_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(citeseer_data, num_partitions, batch_size=128)

gat_model = GAT(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
optimizer = torch.optim.Adam(gat_model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(gat_model, citeseer_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(gat_model, citeseer_data, citeseer_data.test_mask)

print("Test Accuracy:", test_acc)


Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.933 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.933 GB
Metis partitioning: 0.017 seconds, peak memory: 13.933 GB
Epoch 0, Loss: 135.6477, Time: 0.62s
Epoch 1, Loss: 127.6758, Time: 0.62s
Epoch 2, Loss: 118.5697, Time: 0.60s
Epoch 3, Loss: 112.2784, Time: 0.61s
Epoch 4, Loss: 104.5411, Time: 0.60s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.933 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.933 GB
Metis partitioning: 0.021 seconds, peak memory: 13.933 GB
Epoch 5, Loss: 99.8715, Time: 0.71s
Epoch 6, Loss: 89.6498, Time: 0.59s
Epoch 7, Loss: 80.8371, Time: 0.59s
Epoch 8, Loss: 70.5256, Time: 0.60s
Epoch 9, Loss: 73.1796, Time: 0.60s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.933 GB
Construct multi-constraint weights: 0.000 seconds, peak memor

# **GraphSAGE model - CiteSeer**

In [None]:
# Experiment: GraphSAGE on CiteSeer, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = citeseer_data.num_features
hidden_dim = 256
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(citeseer_data.y.max() - citeseer_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(citeseer_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(citeseer_data.x.device)
citeseer_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(citeseer_data, num_partitions, batch_size=128)

sage_model = GraphSAGE(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
optimizer = torch.optim.Adam(sage_model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(sage_model, citeseer_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(sage_model, citeseer_data, citeseer_data.test_mask)

print("Test Accuracy:", test_acc)


Convert a graph into a bidirected graph: 0.001 seconds, peak memory: 13.935 GB
Construct multi-constraint weights: 0.001 seconds, peak memory: 13.935 GB
Metis partitioning: 0.026 seconds, peak memory: 13.935 GB
Epoch 0, Loss: 134.4818, Time: 0.40s
Epoch 1, Loss: 124.9051, Time: 0.32s
Epoch 2, Loss: 114.7484, Time: 0.30s
Epoch 3, Loss: 102.1520, Time: 0.29s
Epoch 4, Loss: 87.2902, Time: 0.30s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.935 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 13.935 GB
Metis partitioning: 0.019 seconds, peak memory: 13.935 GB
Epoch 5, Loss: 70.7696, Time: 0.42s
Epoch 6, Loss: 57.4887, Time: 0.29s
Epoch 7, Loss: 43.9011, Time: 0.29s
Epoch 8, Loss: 33.8285, Time: 0.29s
Epoch 9, Loss: 27.4696, Time: 0.30s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.000 seconds, peak memory: 13.935 GB
Construct multi-constraint weights: 0.000 seconds, peak memory

# **PubMed**

In [None]:
# Load the PubMed dataset from the Planetoid collection (stored under data/PubMed).
pubmed_dataset = Planetoid(name='PubMed', root='data/PubMed')
# The dataset is assumed to contain a single graph; move it to the selected device.
pubmed_data = pubmed_dataset[0].to(device)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!


# **GCN model - PubMed**

In [None]:
# Experiment: GCN on PubMed, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = pubmed_data.num_features
hidden_dim = 64
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(pubmed_data.y.max() - pubmed_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(pubmed_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(pubmed_data.x.device)
pubmed_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(pubmed_data, num_partitions, batch_size=64)

gcn_model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
# Unlike the Cora and CiteSeer runs, the PubMed runs add L2 weight decay.
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(gcn_model, pubmed_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(gcn_model, pubmed_data, pubmed_data.test_mask)

print("Test Accuracy:", test_acc)

Convert a graph into a bidirected graph: 0.003 seconds, peak memory: 18.680 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 18.680 GB
Metis partitioning: 0.057 seconds, peak memory: 18.680 GB
Epoch 0, Loss: 57.0572, Time: 0.19s
Epoch 1, Loss: 54.6426, Time: 0.19s
Epoch 2, Loss: 51.2024, Time: 0.19s
Epoch 3, Loss: 46.7737, Time: 0.24s
Epoch 4, Loss: 41.7061, Time: 0.19s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.004 seconds, peak memory: 18.680 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 18.680 GB
Metis partitioning: 0.055 seconds, peak memory: 18.680 GB
Epoch 5, Loss: 36.9964, Time: 0.34s
Epoch 6, Loss: 32.0261, Time: 0.19s
Epoch 7, Loss: 28.1203, Time: 0.20s
Epoch 8, Loss: 24.8917, Time: 0.19s
Epoch 9, Loss: 21.1926, Time: 0.19s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.003 seconds, peak memory: 18.680 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 18

# **GAT model - PubMed**

In [None]:
# Experiment: GAT on PubMed, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = pubmed_data.num_features
hidden_dim = 64
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(pubmed_data.y.max() - pubmed_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(pubmed_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(pubmed_data.x.device)
pubmed_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(pubmed_data, num_partitions, batch_size=64)

gat_model = GAT(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
# Unlike the Cora and CiteSeer runs, the PubMed runs add L2 weight decay.
optimizer = torch.optim.Adam(gat_model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(gat_model, pubmed_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(gat_model, pubmed_data, pubmed_data.test_mask)

print("Test Accuracy:", test_acc)

Convert a graph into a bidirected graph: 0.003 seconds, peak memory: 19.988 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 19.988 GB
Metis partitioning: 0.063 seconds, peak memory: 19.988 GB
Epoch 0, Loss: 57.2277, Time: 0.30s
Epoch 1, Loss: 55.8392, Time: 0.31s
Epoch 2, Loss: 53.3572, Time: 0.31s
Epoch 3, Loss: 51.0175, Time: 0.29s
Epoch 4, Loss: 46.3050, Time: 0.29s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.003 seconds, peak memory: 19.988 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 19.988 GB
Metis partitioning: 0.058 seconds, peak memory: 19.988 GB
Epoch 5, Loss: 43.6870, Time: 0.46s
Epoch 6, Loss: 37.6944, Time: 0.30s
Epoch 7, Loss: 36.8989, Time: 0.29s
Epoch 8, Loss: 39.3339, Time: 0.31s
Epoch 9, Loss: 30.8787, Time: 0.29s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.003 seconds, peak memory: 19.988 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 19

# **GraphSAGE model - PubMed**

In [None]:
# Experiment: GraphSAGE on PubMed, trained on per-partition neighbour-sampled mini-batches.
# Every name assigned in this cell is a notebook-level global.
input_dim = pubmed_data.num_features
hidden_dim = 128
# Output width = max label - min label + 1 (the class count if labels are contiguous integers).
output_dim = int(pubmed_data.y.max() - pubmed_data.y.min() + 1)
# The learning rate is passed directly to the optimizer below.
epochs = 50
num_partitions = 128  # Number of partitions for adaptive partitioning
partition_interval = 5  # Re-partition every 5 epochs
imbalance_threshold = 1.5  # Max allowed partition imbalance ratio

# Convert PyG to DGL and apply adaptive partitioning
dgl_graph = pyg_to_dgl(pubmed_data)
dgl_graph = adaptive_partitioning(dgl_graph, num_partitions=num_partitions, imbalance_threshold=imbalance_threshold)

# Copy the per-node partition ids back onto the PyG object (same device as its features)
partition_tensor = dgl_graph.ndata['part'].to(pubmed_data.x.device)
pubmed_data.part = partition_tensor

# One loader per partition that contains training nodes
train_loaders = create_partition_loaders(pubmed_data, num_partitions, batch_size=64)

sage_model = GraphSAGE(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim).to(device)
# Unlike the Cora and CiteSeer runs, the PubMed runs add L2 weight decay.
optimizer = torch.optim.Adam(sage_model.parameters(), lr=0.001, weight_decay = 5e-4)
criterion = nn.CrossEntropyLoss()


# Train, then score on the held-out test nodes using the full graph
train_minibatch(sage_model, pubmed_data, train_loaders, optimizer, criterion, epochs=epochs, partition_interval=partition_interval)
test_acc = evaluate(sage_model, pubmed_data, pubmed_data.test_mask)

print("Test Accuracy:", test_acc)

Convert a graph into a bidirected graph: 0.003 seconds, peak memory: 23.006 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 23.006 GB
Metis partitioning: 0.055 seconds, peak memory: 23.006 GB
Epoch 0, Loss: 57.1300, Time: 0.19s
Epoch 1, Loss: 55.4701, Time: 0.18s
Epoch 2, Loss: 52.4399, Time: 0.18s
Epoch 3, Loss: 47.8319, Time: 0.19s
Epoch 4, Loss: 41.2092, Time: 0.23s

[Epoch 5] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.003 seconds, peak memory: 23.006 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 23.006 GB
Metis partitioning: 0.076 seconds, peak memory: 23.006 GB
Epoch 5, Loss: 33.7639, Time: 0.42s
Epoch 6, Loss: 26.3176, Time: 0.21s
Epoch 7, Loss: 20.5659, Time: 0.19s
Epoch 8, Loss: 15.9906, Time: 0.20s
Epoch 9, Loss: 12.7823, Time: 0.24s

[Epoch 10] Re-partitioning graph...
Convert a graph into a bidirected graph: 0.004 seconds, peak memory: 23.006 GB
Construct multi-constraint weights: 0.000 seconds, peak memory: 23