In [3]:
import networkx as nx
import torch
from torch_geometric.data import HeteroData
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [14]:
G = nx.read_graphml("graph.graphml")


In [15]:
# 🔥 Step 2: Identify Node and Edge Types
node_labels = set(nx.get_node_attributes(G, "label").values())  # Extract unique node labels
edge_labels = set((G[u][v]["label"] for u, v in G.edges if "label" in G[u][v]))  # Unique edge labels

print(f"Node labels: {node_labels}")
print(f"Edge labels: {edge_labels}")

Node labels: {':Movie', ':Person', ':Cinema'}
Edge labels: {'PRODUCED', 'RELEASED', 'ACTED_IN', 'REVIEWED', 'DIRECTED', 'WROTE', 'FOLLOWS'}


In [16]:
# 🔥 Step 3: Process Nodes and Convert to PyTorch Format
data = HeteroData()

# Store node index mappings
node_mappings = {}  # {node_label: {original_id: new_index}}

for node_label in node_labels:
    # Extract nodes of this label
    nodes = [n for n in G.nodes if G.nodes[n]["label"] == node_label]
    
    # Identify all possible features for this node type
    all_features = set()
    for n in nodes:
        all_features.update(G.nodes[n].keys())
    all_features -= {"type"}  # Remove "type" field

    # Prepare data storage
    num_features, cat_features = [], []
    cat_feature_encoders = {}  # Store encoders for categorical features
    feature_matrix = []
    
    for n in nodes:
        feature_vector = []
        for key in all_features:
            if key not in G.nodes[n]:  # Handle missing values
                feature_vector.append(0)  # Default numeric value
            else:
                value = G.nodes[n][key]
                if isinstance(value, (int, float)):  # Numeric feature
                    feature_vector.append(value)
                else:  # Categorical feature
                    if key not in cat_feature_encoders:
                        cat_feature_encoders[key] = OneHotEncoder( handle_unknown="ignore")
                    encoded = cat_feature_encoders[key].fit_transform([[value]])[0]
                    feature_vector.extend(encoded)

        feature_matrix.append(feature_vector)
  # Convert numerical features to NumPy array
    if num_features:
        num_features = np.array(num_features).T
        num_features = StandardScaler().fit_transform(num_features)
    
    # Convert categorical features to NumPy array and concatenate with numerical
    if cat_features:
        cat_features_array = np.hstack(list(cat_features.values()))
    else:
        cat_features_array = None

    if num_features is not None and cat_features_array is not None:
        feature_matrix = np.hstack([num_features, cat_features_array])
    elif num_features is not None:
        feature_matrix = num_features
    elif cat_features_array is not None:
        feature_matrix = cat_features_array

    # Convert to PyTorch tensor
    if len(feature_matrix) > 0:
        data[node_label].x = torch.tensor(feature_matrix, dtype=torch.float)

    # Store mapping from original node ID to new index
    node_mappings[node_label] = {n: i for i, n in enumerate(nodes)}

In [20]:
for edge_label in edge_labels:
    edge_list = [(str(u), str(v)) for u, v in G.edges if G[u][v]["label"] == edge_label]

    if not edge_list:
        continue

    # Validate that the first edge's nodes exist in the graph
    src_node, dst_node = edge_list[0]
    if src_node not in G.nodes or dst_node not in G.nodes:
        print(f"⚠️ Warning: Edge references missing nodes ({src_node}, {dst_node})! Skipping this edge label.")
        continue

    src_label, dst_label = G.nodes[src_node]["label"], G.nodes[dst_node]["label"]

    # Skip edges where nodes are not mapped
    valid_edges = [
        (u, v) for u, v in edge_list if u in node_mappings[src_label] and v in node_mappings[dst_label]
    ]

    if not valid_edges:
        print(f"⚠️ Warning: No valid edges found for edge label {edge_label}. Skipping.")
        continue

    # Convert to edge index format
    edge_index = torch.tensor([
        [node_mappings[src_label][u], node_mappings[dst_label][v]]
        for u, v in valid_edges
    ], dtype=torch.long).T

    # Store in data object
    data[(src_label, edge_label, dst_label)].edge_index = edge_index


In [21]:
# ✅ Dataset is ready!
print(data)

HeteroData(
  (:Movie, PRODUCED, :Person)={ edge_index=[2, 2] },
  (:Movie, RELEASED, :Cinema)={ edge_index=[2, 644] },
  (:Movie, ACTED_IN, :Person)={ edge_index=[2, 99] },
  (:Movie, REVIEWED, :Person)={ edge_index=[2, 9] },
  (:Movie, DIRECTED, :Person)={ edge_index=[2, 23] },
  (:Person, WROTE, :Movie)={ edge_index=[2, 5] },
  (:Person, FOLLOWS, :Person)={ edge_index=[2, 3] }
)


In [22]:
torch.save(data, "data.pth")

In [23]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GATConv, Linear
from torch_geometric.data import HeteroData
from torch_geometric.loader import NeighborLoader
from torch_geometric.transforms import ToUndirected

In [25]:
# Convert graph to undirected (optional but useful)
data = ToUndirected()(data)

In [29]:
# ✅ Define HeteroGAT for Node Classification
class HeteroGATNodeClassifier(torch.nn.Module):
    def __init__(self, metadata, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = HeteroConv({
            edge_type: GATConv((-1, -1), hidden_channels, heads=2, concat=True,add_self_loops=False)
            for edge_type in metadata[1]
        })
        self.conv2 = HeteroConv({
            edge_type: GATConv((-1, -1), out_channels, heads=1, concat=False,add_self_loops=False)
            for edge_type in metadata[1]
        })
        self.classifier = Linear(out_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.elu(x) for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return self.classifier(x_dict)

In [30]:
# ✅ Define HeteroGAT for Link Prediction
class HeteroGATLinkPredictor(torch.nn.Module):
    def __init__(self, metadata, hidden_channels):
        super().__init__()
        self.conv1 = HeteroConv({
            edge_type: GATConv((-1, -1), hidden_channels, heads=2, concat=True,add_self_loops=False)
            for edge_type in metadata[1]
        })
        self.conv2 = HeteroConv({
            edge_type: GATConv((-1, -1), hidden_channels, heads=1, concat=False,add_self_loops=False)
            for edge_type in metadata[1]
        })

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.elu(x) for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

    def predict_link(self, x_dict, edge_label_index):
        src, dst = edge_label_index
        return (x_dict[src] * x_dict[dst]).sum(dim=-1).sigmoid()

In [31]:
# 🔥 Initialize Models
hidden_channels = 64
num_classes = 3  # Adjust based on your node classification task

node_model = HeteroGATNodeClassifier(data.metadata(), hidden_channels, num_classes)
link_model = HeteroGATLinkPredictor(data.metadata(), hidden_channels)

# Optimizers
node_optimizer = torch.optim.Adam(node_model.parameters(), lr=0.005)
link_optimizer = torch.optim.Adam(link_model.parameters(), lr=0.005)

In [32]:
# 🔥 Node Classification Training Loop
def train_node_classifier():
    node_model.train()
    node_optimizer.zero_grad()

    out = node_model(data.x_dict, data.edge_index_dict)
    mask = data["Movie"].train_mask  # Example: Classify Movies
    loss = F.cross_entropy(out["Movie"][mask], data["Movie"].y[mask])

    loss.backward()
    node_optimizer.step()
    return loss.item()


In [33]:
# 🔥 Link Prediction Training Loop
def train_link_predictor():
    link_model.train()
    link_optimizer.zero_grad()

    x_dict = link_model(data.x_dict, data.edge_index_dict)
    edge_label_index = data["Movie", "ACTED_IN", "Person"].edge_index

    pred = link_model.predict_link(x_dict, edge_label_index)
    target = torch.ones(pred.shape)  # Positive links

    loss = F.binary_cross_entropy(pred, target)
    loss.backward()
    link_optimizer.step()
    return loss.item()

In [35]:
# Ensure every node type has an `x` feature
for node_label in data.node_labels:
    if "x" not in data[node_label]:
        num_nodes = data[node_label].num_nodes
        feature_dim = 64  # Adjust based on your model input size
        print(f"⚠️ Warning: No features found for {node_label}. Assigning random features.")
        data[node_label].x = torch.randn(num_nodes, feature_dim)  # Initialize with random embeddings


AttributeError: 'HeteroData' has no attribute 'node_labels'

In [34]:
# 🔥 Training Execution
for epoch in range(50):
    node_loss = train_node_classifier()
    link_loss = train_link_predictor()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Node Loss = {node_loss:.4f}, Link Loss = {link_loss:.4f}")


KeyError: "Tried to collect 'x' but did not find any occurrences of it in any node and/or edge type"

In [None]:
# 🔥 Evaluation for Node Classification
node_model.eval()
out = node_model(data.x_dict, data.edge_index_dict)
pred = out["Movie"].argmax(dim=-1)
accuracy = (pred[data["Movie"].test_mask] == data["Movie"].y[data["Movie"].test_mask]).sum() / data["Movie"].test_mask.sum()
print(f"Node Classification Accuracy: {accuracy:.4f}")

In [None]:
# 🔥 Evaluation for Link Prediction
link_model.eval()
x_dict = link_model(data.x_dict, data.edge_index_dict)
edge_label_index = data["Movie", "ACTED_IN", "Person"].edge_index
pred = link_model.predict_link(x_dict, edge_label_index)

print(f"Example Link Predictions: {pred[:5]}")