In [1]:
import torch
import numpy as np
from scipy.io import savemat
from torch_geometric.utils import to_dense_adj



In [2]:
# Load the serialized dataset from disk. The export cell further down iterates
# over it and reads `edge_index`, `x` and `y` from every item.
# NOTE(review): weights_only=False makes torch.load fall back to full pickle
# deserialization, which can execute arbitrary code; only load files that come
# from a trusted source.
dataset = torch.load('data/dataset_python_node.pt', weights_only=False) 

In [3]:
# # Load dataset
# dataset = torch.load('data/dataset_python_node.pt', weights_only=False) 

# # Parameter: keep only the first n_keep features from each node's feature vector.
# # Adjust this value to drop the remaining features.
# n_keep = 1  # for example, n_keep = 1 keeps only the first feature column

# # Prepare plain Python lists for each attribute
# edge_indices = []
# features = []
# labels = []

# # Iterate through each Data object in the dataset
# for data in dataset:
#     # Convert to NumPy arrays if needed
#     edge_index = data.edge_index.numpy() if isinstance(data.edge_index, torch.Tensor) else data.edge_index
#     feature = data.x.numpy() if isinstance(data.x, torch.Tensor) else data.x
#     # Optionally drop some features, here keeping only the first n_keep columns
#     if feature.shape[1] > n_keep:
#         feature = feature[:, :n_keep]
        
#     edge_index_tensor = torch.tensor(edge_index, dtype=torch.long)
#     adj_matrix = to_dense_adj(edge_index_tensor)[0].numpy()
#     edge_index = adj_matrix
    
#     label = data.y.numpy() if isinstance(data.y, torch.Tensor) else data.y
#     edge_indices.append(edge_index)
#     features.append(feature)
#     labels.append(label)

# # Manually create object arrays to avoid broadcasting issues
# edge_indices_obj = np.empty(len(edge_indices), dtype=object)
# features_obj = np.empty(len(features), dtype=object)
# labels_obj = np.empty(len(labels), dtype=object)

# for i in range(len(edge_indices)):
#     edge_indices_obj[i] = edge_indices[i]
#     features_obj[i] = features[i]
#     labels_obj[i] = labels[i]

# # Build the dictionary for saving to .mat
# data_dict = {
#     'edge_indices': edge_indices_obj,  # MATLAB cell array equivalent
#     'features': features_obj,          # Object array of (possibly reduced) feature matrices
#     'labels': labels_obj               # Object array of labels
# }

# # Save to .mat file
# savemat('data/node.mat', data_dict)
# print("Conversion to .mat complete!")


In [9]:
import torch
import numpy as np
from torch_geometric.utils import to_dense_adj
from scipy.io import savemat

# Export every graph in `dataset` that has exactly EXPECTED_NUM_NODES nodes to
# data/node.mat as three parallel MATLAB cell arrays: dense adjacency matrices,
# node-feature matrices and labels.

# Required node count per graph. It is also the side length of every dense
# adjacency matrix, so the filter, the matrix size and the log message are all
# driven by this single value.
EXPECTED_NUM_NODES = 18


def _as_numpy(value):
    """Return `value` as a NumPy array if it is a torch.Tensor, else unchanged."""
    if isinstance(value, torch.Tensor):
        # detach()/cpu() keep the conversion working for tensors that track
        # gradients or live on an accelerator; a bare .numpy() raises for those.
        return value.detach().cpu().numpy()
    return value


def _as_object_array(items):
    """Pack `items` into a 1-D object array holding one item per slot.

    Filling a pre-allocated object array element by element stops NumPy from
    trying to stack equally-shaped matrices into one multi-dimensional array.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


# Per-graph payloads, kept in matching order.
edge_indices = []  # dense adjacency matrices (name matches the .mat key)
features = []
labels = []

# Per-graph statistics, collected for manual inspection after the loop.
graph_node_counts = []
graph_feature_counts = []
graph_edge_counts = []  # number of edges per graph
graph_degrees = []      # degree vector per graph

for idx, data in enumerate(dataset):
    edge_index = _as_numpy(data.edge_index)
    feature = _as_numpy(data.x)

    # Feature matrix is laid out as (nodes x features).
    num_nodes, num_features = feature.shape

    # Keep only graphs with exactly the expected number of nodes.
    if num_nodes != EXPECTED_NUM_NODES:
        print(f"Skipping graph {idx} due to node count != {EXPECTED_NUM_NODES} (found {num_nodes})")
        continue

    # Build the dense adjacency matrix. max_num_nodes pins the output size even
    # when the edge list does not mention every node. as_tensor avoids the
    # extra copy that torch.tensor would make of the NumPy array.
    edge_index_tensor = torch.as_tensor(edge_index, dtype=torch.long)
    adj_matrix = to_dense_adj(edge_index_tensor, max_num_nodes=EXPECTED_NUM_NODES)[0].numpy()

    # Row sums give node degrees (assuming the adjacency matrix is binary).
    degrees = np.sum(adj_matrix, axis=1)
    # For undirected graphs each edge appears twice in the degree sum.
    total_edges = int(np.sum(degrees) // 2)

    edge_indices.append(adj_matrix)
    features.append(feature)
    label = _as_numpy(data.y)
    labels.append(label)

    graph_node_counts.append(num_nodes)
    graph_feature_counts.append(num_features)
    graph_edge_counts.append(total_edges)
    graph_degrees.append(degrees)

# Object arrays avoid broadcasting issues in savemat.
edge_indices_obj = _as_object_array(edge_indices)
features_obj = _as_object_array(features)
labels_obj = _as_object_array(labels)

# Build the dictionary for saving to .mat
data_dict = {
    'edge_indices': edge_indices_obj,  # MATLAB will see this as a cell array of adjacency matrices
    'features': features_obj,          # MATLAB cell array of feature matrices
    'labels': labels_obj               # MATLAB cell array of labels
}

# Save to .mat file
savemat('data/node.mat', data_dict)
print("Conversion to .mat complete!")


Conversion to .mat complete!
