In [101]:
import os
import networkx as nx
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import from_networkx
from torch_geometric.utils import subgraph
from torch_geometric.data import Data
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split

In [102]:

dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]


folder_path = 'client_subgraphs'
sub_data_list = []

for filename in os.listdir(folder_path):
    if filename.endswith('.gml'):
      
        file_path = os.path.join(folder_path, filename)
        g = nx.read_gml(file_path)

        subgraph_nodes = list(g.nodes)
        subgraph_nodes = [int(node) for node in subgraph_nodes]  # Convert to integer if they are not

        sub_edge_index, _ = subgraph(subgraph_nodes, data.edge_index, relabel_nodes=True)

        sub_data = Data(x=data.x[subgraph_nodes], edge_index=sub_edge_index, y=data.y[subgraph_nodes])
        sub_data_list.append(sub_data)


In [5]:
# #plot subgraph 0 to make sure it looks right
# import matplotlib.pyplot as plt
# import networkx as nx
# from torch_geometric.utils import to_networkx

# G = to_networkx(sub_data_list[82], to_undirected=False)
# plt.figure(figsize=(20,20))
# nx.draw(G, with_labels=True, node_size=15, node_color='g', edge_color='b')
# plt.show()


In [31]:

class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First convolutional layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        # Second convolutional layer
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [33]:
#keep a  list of training and validation loss per epoch for each subgraph
train_losses = []
val_losses = []

In [34]:
import numpy as np
def train_val_split(data):
        #handle labels with only one sample
    # `data.x` contains the node features and `data.y` contains the labels

    # Count the occurrences of each label in the dataset
    label_counts = torch.bincount(data.y)

    # Find the labels that appear only once (single occurrence)
    single_occurrence_labels = torch.nonzero(label_counts == 1).flatten()

    # Initialize empty lists to store samples
    single_sample_label = []
    other_label = []

    # Separate samples based on labels
    for i, label in enumerate(data.y):
        if label in single_occurrence_labels:
            single_sample_label.append(i)
        else:
            other_label.append(i)

    # Convert the lists of sample indices into tensors
    single_sample_label = torch.tensor(single_sample_label)
    other_label = torch.tensor(other_label)

    # Extract the corresponding node features and labels
    single_sample_x = data.x[single_sample_label]
    single_sample_y = data.y[single_sample_label]
    other_x = data.x[other_label]
    other_y = data.y[other_label]
    
    other_x_train, other_x_test, other_y_train, other_y_test = train_test_split(other_x, other_y, test_size=0.2, random_state=42)

    # Add single_sample_x and single_sample_y to the training set
    combined_x_train = torch.cat((other_x_train, single_sample_x), dim=0)
    combined_y_train = torch.cat((other_y_train, single_sample_y), dim=0)
    
    return combined_x_train, combined_y_train, other_x_test, other_y_test
                    

 


In [103]:
import torch

def transductive_split(data, train_percent=0.8):
    """
    Split graph data into training and testing sets for transductive learning.
    :param data: PyG Data object
    :param train_percent: Percentage of nodes to be used for training
    :return: data object with train_mask and test_mask attributes added
    """
    num_nodes = data.num_nodes
    train_size = int(train_percent * num_nodes)

    # Create a random permutation of node indices
    perm = torch.randperm(num_nodes)

    # Create masks for training and testing nodes
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[perm[:train_size]] = True
    test_mask[perm[train_size:]] = True

    # Add masks to data object
    data.train_mask = train_mask
    data.test_mask = test_mask

    return data


In [None]:
for i in range(0, 100):
    
    sub_data = sub_data_list[i]
    #split sub_data.x and sub_data.y into train and validation, but keep the same edge_index
    train_x, train_y, val_x, val_y = transductive_split(sub_data)
    
    continue
    sub_data_train = Data(x=train_x, edge_index=sub_data.edge_index, y=train_y)
    sub_data_val = Data(x=val_x, edge_index=sub_data.edge_index, y=val_y)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GCN(sub_data.num_node_features, dataset.num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    
    client_train_losses = []
    client_val_losses = []
    
    for epoch in range(200):
        model = model.train()
        optimizer.zero_grad()
        out = model(sub_data_train)
        loss = F.nll_loss(out, sub_data_train.y)
        loss.backward()
        optimizer.step()
        
        client_train_losses.append(loss.item())
        
        #calculate validation loss
        model = model.eval()
        out_val = model(sub_data_val)
        loss = F.nll_loss(out_val, sub_data_val.y)
        client_val_losses.append(loss.item())
        
        
    train_losses.append(client_train_losses)
    val_losses.append(client_val_losses)
    # print out metrics
    #calculate final training accuracy
    true_labels = sub_data_train.y
    _, pred = out.max(1)
    train_correct = pred.eq(true_labels).sum().item()
    print("final training loss: ", loss.item(), " for subgraph ", i)
    print("final training accuracy: ", train_correct / len(true_labels), " for subgraph ", i)
    print()
    #calculate validation accuracy and loss
    true_labels = sub_data_val.y
    _, pred = out_val.max(1)
    val_correct = pred.eq(true_labels).sum().item()
    
    
    print("final validation loss: ", loss.item(), " for subgraph ", i)
    print("final validation accuracy: ", val_correct / len(true_labels), " for subgraph ", i)
    print("------------------------------------------------------------")

In [104]:
# splitting the subgraphs into train test and val 
for i in range(0, 100):
    sub_data = sub_data_list[i]
    sub_data = transductive_split(sub_data)

print(sub_data_list[4])

Data(x=[13, 1433], edge_index=[2, 34], y=[13], train_mask=[13], test_mask=[13])


In [105]:
from torch_geometric.nn import GCNConv
    
    
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training) # p = 0.25
        x = self.conv2(x, edge_index)
        return x
    


model = GCN(sub_data.num_node_features, dataset.num_classes).to(device)

In [107]:


def train(sub_data, model, optimizer, criterion):
      model.train()
      optimizer.zero_grad() 
      out = model(sub_data.x, sub_data.edge_index)  # Perform a single forward pass.
      loss = criterion(out[sub_data.train_mask], sub_data.y[sub_data.train_mask])  # Compute the loss solely based on the training nodes.
      loss.backward() 
      optimizer.step() 
      return loss

def test(sub_data):
      model.eval()
      out = model(sub_data.x, sub_data.edge_index)
      pred = out.argmax(dim=1)  # Use the class with highest probability.
      test_correct = pred[sub_data.test_mask] == data.y[sub_data.test_mask]  # Check against ground-truth labels.
      test_acc = int(test_correct.sum()) / int(sub_data.test_mask.sum())  # Derive ratio of correct predictions.
      return test_acc

for i in range(0, 100):
    sub_data = sub_data_list[i]
    for epoch in range(1, 101):
      model = GCN(sub_data.num_node_features, dataset.num_classes).to(device)
      optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
      criterion = torch.nn.CrossEntropyLoss()
      loss = train(sub_data, model, optimizer, criterion)
      print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')



Epoch: 001, Loss: 1.9410
Epoch: 002, Loss: 1.9889
Epoch: 003, Loss: 1.9393
Epoch: 004, Loss: 1.9405
Epoch: 005, Loss: 1.9264
Epoch: 006, Loss: 1.8114
Epoch: 007, Loss: 2.0334
Epoch: 008, Loss: 1.9695
Epoch: 009, Loss: 1.9313
Epoch: 010, Loss: 1.9431
Epoch: 011, Loss: 1.9321
Epoch: 012, Loss: 1.9513
Epoch: 013, Loss: 1.9474
Epoch: 014, Loss: 1.8770
Epoch: 015, Loss: 2.0152
Epoch: 016, Loss: 1.9069
Epoch: 017, Loss: 1.9436
Epoch: 018, Loss: 1.8856
Epoch: 019, Loss: 1.9770
Epoch: 020, Loss: 1.9958
Epoch: 021, Loss: 2.0753
Epoch: 022, Loss: 2.1159
Epoch: 023, Loss: 2.0009
Epoch: 024, Loss: 1.9643
Epoch: 025, Loss: 1.9048
Epoch: 026, Loss: 1.9817
Epoch: 027, Loss: 1.9589
Epoch: 028, Loss: 2.0235
Epoch: 029, Loss: 1.8368
Epoch: 030, Loss: 1.9778
Epoch: 031, Loss: 1.9666
Epoch: 032, Loss: 1.9080
Epoch: 033, Loss: 1.9079
Epoch: 034, Loss: 1.9902
Epoch: 035, Loss: 1.9942
Epoch: 036, Loss: 2.0589
Epoch: 037, Loss: 1.9325
Epoch: 038, Loss: 2.0178
Epoch: 039, Loss: 1.9143
Epoch: 040, Loss: 1.9139
