In [None]:
# Peptides-struct is a graph regression dataset:
# the goal is to predict aggregated 3D properties of the peptides at the graph level.
# There are 11 regression tasks (targets) per graph.
# The graphs have on average 150.94 nodes and 307.30 edges.
# The full dataset should contain 15,535 graphs (all splits combined).

In [None]:
! pip install torch-geometric

In [None]:
# Inspect the dataset
import torch
from torch_geometric.datasets import LRGBDataset

# LRGBDataset loads one split at a time and defaults to split='train'.
# Request it explicitly: `dataset` holds only the training graphs, not the whole benchmark.
dataset = LRGBDataset(root='data/LRGBDataset', name='Peptides-struct', split='train')
print(dataset)
print(f"Number of graphs {len(dataset)}")
# 10873 graphs in the train split

# Report every split so the sizes can be checked against the expected dataset total.
split_sizes = {
    split: len(LRGBDataset(root='data/LRGBDataset', name='Peptides-struct', split=split))
    for split in ('train', 'val', 'test')
}
print(f"Graphs per split: {split_sizes}")
print(f"Total number of graphs {sum(split_sizes.values())}")

In [None]:
# Training split of Peptides-struct, rooted at data/LRGBDataset
train_dataset = LRGBDataset(
    name='Peptides-struct',
    root='data/LRGBDataset',
    split='train',
)

In [None]:
# Analyzing the graphs
# Inspect a single sample instead of looping over the whole split with an empty body.
data = train_dataset[0]
print(data)
# Example output recorded for one graph of this split:
#   Data(x=[338, 9], edge_index=[2, 682], edge_attr=[682, 3], y=[1, 11])
#   x: 338 nodes with a 9 dim. feature vector
#   edge_index: 682 edges represented as pairs of node indices
#   edge_attr: 682 edges have a 3 dim. feature vector
#   y: 11 output tasks

In [None]:
# Node-Level Clustering
## apply Kmeans clustering on the train_dataset, then coarsen each graph so that
## every cluster of nodes becomes a single node
import warnings
from sklearn.cluster import KMeans
from torch_geometric.nn.pool import max_pool

# NOTE(review): presumably silences scikit-learn's FutureWarning about KMeans defaults — confirm.
warnings.filterwarnings("ignore", category=FutureWarning)

n_clusters = 3   # requested number of clusters per graph (loop-invariant)
max_preview = 3  # print only the first few pooled graphs instead of one line per graph

cluster_graphs = []  # one pooled graph per training graph, in dataset order
labels_list = []     # per-graph cluster assignment of every node
for graph_idx, data in enumerate(train_dataset):
    node_features = data.x.detach().cpu().numpy()  # convert to numpy array

    # KMeans raises if asked for more clusters than there are samples (nodes)
    k = min(n_clusters, node_features.shape[0])
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(node_features)  # index of the cluster each node belongs to
    tensor_labels = torch.from_numpy(labels).long().contiguous()

    # ensure edge_index is contiguous before pooling
    data.edge_index = data.edge_index.contiguous()

    # shrink graph based on clustering
    cluster_graph = max_pool(tensor_labels, data, transform=None)
    # max_pool builds the pooled graph from x / edge_index / edge_attr / pos / batch only,
    # so the graph-level regression targets have to be carried over explicitly
    cluster_graph.y = data.y

    if graph_idx < max_preview:
        print(cluster_graph)

    cluster_graphs.append(cluster_graph)
    labels_list.append(tensor_labels)

print(f"Pooled {len(cluster_graphs)} graphs")