In [1]:
# Install required packages.
import os
import torch
import numpy as np
os.environ['TORCH'] = torch.__version__
os.environ['DGLBACKEND'] = "pytorch"

# Install the CPU version.
device = torch.device("cpu")
!pip install --pre dgl -f https://data.dgl.ai/wheels-test/repo.html

try:
    import dgl
    import dgl.graphbolt as gb
    installed = True
except ImportError as error:
    installed = False
    print(error)
print("DGL installed!" if installed else "DGL not found!")

[33mDEPRECATION: Loading egg at /Users/kushaldsouza/miniconda3/envs/dgl/lib/python3.12/site-packages/dgl-2.3-py3.12-macosx-11.0-arm64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mLooking in links: https://data.dgl.ai/wheels-test/repo.html
DGL installed!


In [2]:
# Every file of the on-disk dataset is written beneath this folder.
base_dir = './ondisk_dataset_heterograph'
# `exist_ok=True` makes re-running this cell harmless.
os.makedirs(name=base_dir, exist_ok=True)
print(f"Created base directory: {base_dir}")

Created base directory: ./ondisk_dataset_heterograph


In [3]:
import numpy as np
import pandas as pd

# Synthetic heterogeneous graph used throughout this notebook:
#   node types: `user`, `item`
#   edge types: `user:like:item`, `user:follow:user`
# Every node type has `num_nodes` nodes and every edge type has `num_edges` edges.
num_nodes = 1000
num_edges = num_nodes * 10

# ---- Edge type "user:like:item" ----
# Each row is one edge given as a pair of random node ids in [0, num_nodes).
like_edges = np.random.randint(
    low=0, high=num_nodes, size=(num_edges, 2), dtype=np.int32
)
like_edges_path = os.path.join(base_dir, "like-edges.csv")
print(f"Part of [user:like:item] edges: {like_edges[:5, :]}\n")

# Written as a plain two-column CSV: no header row, no index column.
df = pd.DataFrame(like_edges)
df.to_csv(like_edges_path, header=False, index=False)
print(f"[user:like:item] edges are saved into {like_edges_path}\n")

# ---- Edge type "user:follow:user" ----
follow_edges = np.random.randint(
    low=0, high=num_nodes, size=(num_edges, 2), dtype=np.int32
)
follow_edges_path = os.path.join(base_dir, "follow-edges.csv")
print(f"Part of [user:follow:user] edges: {follow_edges[:5, :]}\n")

df = pd.DataFrame(follow_edges)
df.to_csv(follow_edges_path, header=False, index=False)
print(f"[user:follow:user] edges are saved into {follow_edges_path}\n")

Part of [user:like:item] edges: [[707 170]
 [614 564]
 [514 914]
 [123  60]
 [ 67  26]]

[user:like:item] edges are saved into ./ondisk_dataset_heterograph/like-edges.csv

Part of [user:follow:user] edges: [[108 920]
 [868 960]
 [ 47 320]
 [588 646]
 [893 254]]

[user:follow:user] edges are saved into ./ondisk_dataset_heterograph/follow-edges.csv



In [4]:
def _save_numpy_feat(tag, file_name, num_rows):
    """Create a random `(num_rows, 5)` NumPy feature `feat_0` and save it as .npy.

    `tag` is the lower-case label used in the log lines (e.g. `node[user]`);
    its first letter is upper-cased for the "saved to" message.
    Returns the tuple `(path, feature)`.
    """
    path = os.path.join(base_dir, file_name)
    feat = np.random.rand(num_rows, 5)
    print(f"Part of {tag} feature [feat_0]: {feat[:3, :]}")
    np.save(path, feat)
    print(f"{tag[:1].upper()}{tag[1:]} feature [feat_0] is saved to {path}\n")
    return path, feat


def _save_torch_feat(tag, file_name, num_rows):
    """Create a random float32 `(num_rows, 5)` torch feature `feat_1` and save it as .pt.

    Same logging convention as `_save_numpy_feat`. Returns `(path, feature)`.
    """
    path = os.path.join(base_dir, file_name)
    feat = torch.rand(num_rows, 5, dtype=torch.float32)
    print(f"Part of {tag} feature [feat_1]: {feat[:3, :]}")
    torch.save(feat, path)
    print(f"{tag[:1].upper()}{tag[1:]} feature [feat_1] is saved to {path}\n")
    return path, feat


# Node features: one NumPy and one torch feature per node type.
node_user_feat_0_path, node_user_feat_0 = _save_numpy_feat(
    "node[user]", "node-user-feat-0.npy", num_nodes
)
node_user_feat_1_path, node_user_feat_1 = _save_torch_feat(
    "node[user]", "node-user-feat-1.pt", num_nodes
)
node_item_feat_0_path, node_item_feat_0 = _save_numpy_feat(
    "node[item]", "node-item-feat-0.npy", num_nodes
)
node_item_feat_1_path, node_item_feat_1 = _save_torch_feat(
    "node[item]", "node-item-feat-1.pt", num_nodes
)

# Edge features: one NumPy and one torch feature per edge type.
edge_like_feat_0_path, edge_like_feat_0 = _save_numpy_feat(
    "edge[user:like:item]", "edge-like-feat-0.npy", num_edges
)
edge_like_feat_1_path, edge_like_feat_1 = _save_torch_feat(
    "edge[user:like:item]", "edge-like-feat-1.pt", num_edges
)
edge_follow_feat_0_path, edge_follow_feat_0 = _save_numpy_feat(
    "edge[user:follow:user]", "edge-follow-feat-0.npy", num_edges
)
edge_follow_feat_1_path, edge_follow_feat_1 = _save_torch_feat(
    "edge[user:follow:user]", "edge-follow-feat-1.pt", num_edges
)

Part of node[user] feature [feat_0]: [[0.57092506 0.10825544 0.20711292 0.10143901 0.21266508]
 [0.41212746 0.07993924 0.56151127 0.39376572 0.74892435]
 [0.24510754 0.11330935 0.99739147 0.84641067 0.64607123]]
Node[user] feature [feat_0] is saved to ./ondisk_dataset_heterograph/node-user-feat-0.npy

Part of node[user] feature [feat_1]: tensor([[0.0602, 0.5338, 0.3941, 0.7655, 0.6784],
        [0.5228, 0.7884, 0.1449, 0.5727, 0.9401],
        [0.2180, 0.7978, 0.5308, 0.9276, 0.8153]])
Node[user] feature [feat_1] is saved to ./ondisk_dataset_heterograph/node-user-feat-1.pt

Part of node[item] feature [feat_0]: [[0.72391364 0.5313694  0.02139435 0.71943807 0.32351023]
 [0.60507041 0.0982607  0.98794344 0.82759357 0.47551857]
 [0.0628069  0.80526314 0.87943115 0.97145407 0.95838395]]
Node[item] feature [feat_0] is saved to ./ondisk_dataset_heterograph/node-item-feat-0.npy

Part of node[item] feature [feat_1]: tensor([[0.5728, 0.7867, 0.9580, 0.5888, 0.0401],
        [0.7069, 0.6441, 0.30

In [5]:
# Node-classification item sets for each node type:
# 60% of the nodes for training, 20% for validation, the remainder for testing.
num_trains = int(num_nodes * 0.6)
num_vals = int(num_nodes * 0.2)
num_tests = num_nodes - num_trains - num_vals

# Independently shuffled id permutations, one per node type.
user_ids = np.arange(num_nodes)
np.random.shuffle(user_ids)

item_ids = np.arange(num_nodes)
np.random.shuffle(item_ids)


def _save_nc_split(split, ntype, ids, count):
    """Save the seed ids and random labels of one (split, node type) pair.

    `ids` is written as `nc-<split>-<ntype>-ids.npy`; `count` random int32
    labels in [0, 10) are written as `nc-<split>-<ntype>-labels.pt`.
    Returns `(ids_path, ids, labels_path, labels)`.
    """
    ids_path = os.path.join(base_dir, f"nc-{split}-{ntype}-ids.npy")
    print(f"Part of {split} ids[{ntype}] for node classification: {ids[:3]}")
    np.save(ids_path, ids)
    print(f"NC {split} ids[{ntype}] are saved to {ids_path}\n")

    labels_path = os.path.join(base_dir, f"nc-{split}-{ntype}-labels.pt")
    labels = torch.randint(0, 10, (count,), dtype=torch.int32)
    print(f"Part of {split} labels[{ntype}] for node classification: {labels[:3]}")
    torch.save(labels, labels_path)
    print(f"NC {split} labels[{ntype}] are saved to {labels_path}\n")
    return ids_path, ids, labels_path, labels


# Train split.
(nc_train_user_ids_path, nc_train_user_ids,
 nc_train_user_labels_path, nc_train_user_labels) = _save_nc_split(
    "train", "user", user_ids[:num_trains], num_trains
)
(nc_train_item_ids_path, nc_train_item_ids,
 nc_train_item_labels_path, nc_train_item_labels) = _save_nc_split(
    "train", "item", item_ids[:num_trains], num_trains
)

# Validation split.
(nc_val_user_ids_path, nc_val_user_ids,
 nc_val_user_labels_path, nc_val_user_labels) = _save_nc_split(
    "val", "user", user_ids[num_trains:num_trains+num_vals], num_vals
)
(nc_val_item_ids_path, nc_val_item_ids,
 nc_val_item_labels_path, nc_val_item_labels) = _save_nc_split(
    "val", "item", item_ids[num_trains:num_trains+num_vals], num_vals
)

# Test split.
(nc_test_user_ids_path, nc_test_user_ids,
 nc_test_user_labels_path, nc_test_user_labels) = _save_nc_split(
    "test", "user", user_ids[-num_tests:], num_tests
)
(nc_test_item_ids_path, nc_test_item_ids,
 nc_test_item_labels_path, nc_test_item_labels) = _save_nc_split(
    "test", "item", item_ids[-num_tests:], num_tests
)

Part of train ids[user] for node classification: [ 98 664 263]
NC train ids[user] are saved to ./ondisk_dataset_heterograph/nc-train-user-ids.npy

Part of train labels[user] for node classification: tensor([6, 3, 0], dtype=torch.int32)
NC train labels[user] are saved to ./ondisk_dataset_heterograph/nc-train-user-labels.pt

Part of train ids[item] for node classification: [940 268 474]
NC train ids[item] are saved to ./ondisk_dataset_heterograph/nc-train-item-ids.npy

Part of train labels[item] for node classification: tensor([3, 5, 9], dtype=torch.int32)
NC train labels[item] are saved to ./ondisk_dataset_heterograph/nc-train-item-labels.pt

Part of val ids[user] for node classification: [116 974  34]
NC val ids[user] are saved to ./ondisk_dataset_heterograph/nc-val-user-ids.npy

Part of val labels[user] for node classification: tensor([1, 0, 4], dtype=torch.int32)
NC val labels[user] are saved to ./ondisk_dataset_heterograph/nc-val-user-labels.pt

Part of val ids[item] for node classi

In [6]:
# For illustration, let's generate item sets for each edge type.
# The edges of every type are split by position: the first 60% for training,
# the next 20% for validation and the remainder for testing.
# NOTE: this rebinds `num_trains` / `num_vals` / `num_tests`, which now count
# edges rather than nodes.
num_trains = int(num_edges * 0.6)
num_vals = int(num_edges * 0.2)
num_tests = num_edges - num_trains - num_vals

# Train node pairs for user:like:item.
lp_train_like_node_pairs_path = os.path.join(base_dir, "lp-train-like-node-pairs.npy")
lp_train_like_node_pairs = like_edges[:num_trains, :]
print(f"Part of train node pairs[user:like:item] for link prediction: {lp_train_like_node_pairs[:3]}")
np.save(lp_train_like_node_pairs_path, lp_train_like_node_pairs)
print(f"LP train node pairs[user:like:item] are saved to {lp_train_like_node_pairs_path}\n")

# Train node pairs for user:follow:user.
lp_train_follow_node_pairs_path = os.path.join(base_dir, "lp-train-follow-node-pairs.npy")
lp_train_follow_node_pairs = follow_edges[:num_trains, :]
print(f"Part of train node pairs[user:follow:user] for link prediction: {lp_train_follow_node_pairs[:3]}")
np.save(lp_train_follow_node_pairs_path, lp_train_follow_node_pairs)
print(f"LP train node pairs[user:follow:user] are saved to {lp_train_follow_node_pairs_path}\n")

# Val node pairs for user:like:item.
lp_val_like_node_pairs_path = os.path.join(base_dir, "lp-val-like-node-pairs.npy")
lp_val_like_node_pairs = like_edges[num_trains:num_trains+num_vals, :]
print(f"Part of val node pairs[user:like:item] for link prediction: {lp_val_like_node_pairs[:3]}")
np.save(lp_val_like_node_pairs_path, lp_val_like_node_pairs)
print(f"LP val node pairs[user:like:item] are saved to {lp_val_like_node_pairs_path}\n")

# Val negative dsts for user:like:item: 10 random node ids per validation edge.
lp_val_like_neg_dsts_path = os.path.join(base_dir, "lp-val-like-neg-dsts.pt")
lp_val_like_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10), dtype=torch.int32)
print(f"Part of val negative dsts[user:like:item] for link prediction: {lp_val_like_neg_dsts[:3]}")
torch.save(lp_val_like_neg_dsts, lp_val_like_neg_dsts_path)
print(f"LP val negative dsts[user:like:item] are saved to {lp_val_like_neg_dsts_path}\n")

# Val node pairs for user:follow:user.
lp_val_follow_node_pairs_path = os.path.join(base_dir, "lp-val-follow-node-pairs.npy")
lp_val_follow_node_pairs = follow_edges[num_trains:num_trains+num_vals, :]
print(f"Part of val node pairs[user:follow:user] for link prediction: {lp_val_follow_node_pairs[:3]}")
np.save(lp_val_follow_node_pairs_path, lp_val_follow_node_pairs)
print(f"LP val node pairs[user:follow:user] are saved to {lp_val_follow_node_pairs_path}\n")

# Val negative dsts for user:follow:user: 10 random node ids per validation edge.
lp_val_follow_neg_dsts_path = os.path.join(base_dir, "lp-val-follow-neg-dsts.pt")
lp_val_follow_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10), dtype=torch.int32)
print(f"Part of val negative dsts[user:follow:user] for link prediction: {lp_val_follow_neg_dsts[:3]}")
torch.save(lp_val_follow_neg_dsts, lp_val_follow_neg_dsts_path)
print(f"LP val negative dsts[user:follow:user] are saved to {lp_val_follow_neg_dsts_path}\n")

# Test node pairs for user:like:item.
# The row index must be the slice `-num_tests:`; the bare integer `-num_tests`
# would select one single edge of shape (2,) instead of the last `num_tests` edges.
lp_test_like_node_pairs_path = os.path.join(base_dir, "lp-test-like-node-pairs.npy")
lp_test_like_node_pairs = like_edges[-num_tests:, :]
assert lp_test_like_node_pairs.shape == (num_tests, 2)
print(f"Part of test node pairs[user:like:item] for link prediction: {lp_test_like_node_pairs[:3]}")
np.save(lp_test_like_node_pairs_path, lp_test_like_node_pairs)
print(f"LP test node pairs[user:like:item] are saved to {lp_test_like_node_pairs_path}\n")

# Test negative dsts for user:like:item: 10 random node ids per test edge.
lp_test_like_neg_dsts_path = os.path.join(base_dir, "lp-test-like-neg-dsts.pt")
lp_test_like_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10), dtype=torch.int32)
print(f"Part of test negative dsts[user:like:item] for link prediction: {lp_test_like_neg_dsts[:3]}")
torch.save(lp_test_like_neg_dsts, lp_test_like_neg_dsts_path)
print(f"LP test negative dsts[user:like:item] are saved to {lp_test_like_neg_dsts_path}\n")

# Test node pairs for user:follow:user (slice, not a single row — see above).
lp_test_follow_node_pairs_path = os.path.join(base_dir, "lp-test-follow-node-pairs.npy")
lp_test_follow_node_pairs = follow_edges[-num_tests:, :]
assert lp_test_follow_node_pairs.shape == (num_tests, 2)
print(f"Part of test node pairs[user:follow:user] for link prediction: {lp_test_follow_node_pairs[:3]}")
np.save(lp_test_follow_node_pairs_path, lp_test_follow_node_pairs)
print(f"LP test node pairs[user:follow:user] are saved to {lp_test_follow_node_pairs_path}\n")

# Test negative dsts for user:follow:user: 10 random node ids per test edge.
lp_test_follow_neg_dsts_path = os.path.join(base_dir, "lp-test-follow-neg-dsts.pt")
lp_test_follow_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10), dtype=torch.int32)
print(f"Part of test negative dsts[user:follow:user] for link prediction: {lp_test_follow_neg_dsts[:3]}")
torch.save(lp_test_follow_neg_dsts, lp_test_follow_neg_dsts_path)
print(f"LP test negative dsts[user:follow:user] are saved to {lp_test_follow_neg_dsts_path}\n")

Part of train node pairs[user:like:item] for link prediction: [[707 170]
 [614 564]
 [514 914]]
LP train node pairs[user:like:item] are saved to ./ondisk_dataset_heterograph/lp-train-like-node-pairs.npy

Part of train node pairs[user:follow:user] for link prediction: [[108 920]
 [868 960]
 [ 47 320]]
LP train node pairs[user:follow:user] are saved to ./ondisk_dataset_heterograph/lp-train-follow-node-pairs.npy

Part of val node pairs[user:like:item] for link prediction: [[881 188]
 [552 518]
 [206 929]]
LP val node pairs[user:like:item] are saved to ./ondisk_dataset_heterograph/lp-val-like-node-pairs.npy

Part of val negative dsts[user:like:item] for link prediction: tensor([[758, 911, 597, 192, 221, 547, 495, 159, 719, 710],
        [ 65, 279, 870, 578, 602, 597, 358, 972, 535, 256],
        [785, 866, 650, 562, 389, 596, 232, 672, 224, 798]], dtype=torch.int32)
LP val negative dsts[user:like:item] are saved to ./ondisk_dataset_heterograph/lp-val-like-neg-dsts.pt

Part of val node pair

In [7]:
# Build the text of `metadata.yaml`, which describes the dataset files:
#   - graph:        node counts per type and the CSV edge list of each edge type
#   - feature_data: the node/edge feature files and their storage format
#   - tasks:        train/validation/test item sets of each task
# Every path is reduced to its basename, i.e. it is relative to `base_dir`,
# the folder the YAML file itself is written to.
# NOTE: the link_prediction task lists only a train_set and a validation_set;
# the `lp-test-*` files generated earlier are not referenced here.
yaml_content = f"""
    dataset_name: heterogeneous_graph_nc_lp
    graph:
      nodes:
        - type: user
          num: {num_nodes}
        - type: item
          num: {num_nodes}
      edges:
        - type: "user:like:item"
          format: csv
          path: {os.path.basename(like_edges_path)}
        - type: "user:follow:user"
          format: csv
          path: {os.path.basename(follow_edges_path)}
    feature_data:
      - domain: node
        type: user
        name: feat_0
        format: numpy
        path: {os.path.basename(node_user_feat_0_path)}
      - domain: node
        type: user
        name: feat_1
        format: torch
        path: {os.path.basename(node_user_feat_1_path)}
      - domain: node
        type: item
        name: feat_0
        format: numpy
        path: {os.path.basename(node_item_feat_0_path)}
      - domain: node
        type: item
        name: feat_1
        format: torch
        path: {os.path.basename(node_item_feat_1_path)}
      - domain: edge
        type: "user:like:item"
        name: feat_0
        format: numpy
        path: {os.path.basename(edge_like_feat_0_path)}
      - domain: edge
        type: "user:like:item"
        name: feat_1
        format: torch
        path: {os.path.basename(edge_like_feat_1_path)}
      - domain: edge
        type: "user:follow:user"
        name: feat_0
        format: numpy
        path: {os.path.basename(edge_follow_feat_0_path)}
      - domain: edge
        type: "user:follow:user"
        name: feat_1
        format: torch
        path: {os.path.basename(edge_follow_feat_1_path)}
    tasks:
      - name: node_classification
        num_classes: 10
        train_set:
          - type: user
            data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_train_user_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_train_user_labels_path)}
          - type: item
            data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_train_item_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_train_item_labels_path)}
        validation_set:
          - type: user
            data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_val_user_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_val_user_labels_path)}
          - type: item
            data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_val_item_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_val_item_labels_path)}
        test_set:
          - type: user
            data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_test_user_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_test_user_labels_path)}
          - type: item
            data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_test_item_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_test_item_labels_path)}
      - name: link_prediction
        num_classes: 10
        train_set:
          - type: "user:like:item"
            data:
              - name: node_pairs
                format: numpy
                path: {os.path.basename(lp_train_like_node_pairs_path)}
          - type: "user:follow:user"
            data:
              - name: node_pairs
                format: numpy
                path: {os.path.basename(lp_train_follow_node_pairs_path)}
        validation_set:
          - type: "user:like:item"
            data:
              - name: node_pairs
                format: numpy
                path: {os.path.basename(lp_val_like_node_pairs_path)}
              - name: negative_dsts
                format: torch
                path: {os.path.basename(lp_val_like_neg_dsts_path)}
          - type: "user:follow:user"
            data:
              - name: node_pairs
                format: numpy
                path: {os.path.basename(lp_val_follow_node_pairs_path)}
              - name: negative_dsts
                format: torch
                path: {os.path.basename(lp_val_follow_neg_dsts_path)}
"""
# Write the description next to the data files it refers to.
metadata_path = os.path.join(base_dir, "metadata.yaml")
with open(metadata_path, "w") as f:
  f.write(yaml_content)

In [8]:
# Load the on-disk dataset stored under `base_dir`.
dataset = gb.OnDiskDataset(base_dir).load()

# `dataset.graph` and `dataset.feature` hold the loaded graph and feature
# store; they are consumed later when the data pipeline is built.

# The tasks are indexed positionally; index 1 is the link prediction task
# (its training item set is made of `node_pairs`, as the output shows).
tasks = dataset.tasks
lp_task = tasks[1].train_set
print(f"Loaded link prediction task: {lp_task}\n")

The on-disk dataset is re-preprocessing, so the existing preprocessed dataset has been removed.
Start to preprocess the on-disk dataset.
Finish preprocessing the on-disk dataset.
Loaded link prediction task: HeteroItemSet(
    itemsets={'user:like:item': ItemSet(
                 items=(tensor([[707, 170],
                     [614, 564],
                     [514, 914],
                     ...,
                     [  5, 609],
                     [563,  50],
                     [830, 383]], dtype=torch.int32),),
                 names=('node_pairs',),
             ), 'user:follow:user': ItemSet(
                 items=(tensor([[108, 920],
                     [868, 960],
                     [ 47, 320],
                     ...,
                     [596, 313],
                     [ 26, 753],
                     [539, 335]], dtype=torch.int32),),
                 names=('node_pairs',),
             )},
    names=('node_pairs',),
)





In [9]:
from dgl import graphbolt as gb
# Data pipeline over the link prediction training set, built as one chain:
#   1. batch the seed node pairs, 5 per minibatch, in stored order;
#   2. sample neighbors with a single layer and a fanout of 2;
#   3. fetch the `feat_0` feature of `user` nodes.
item_sampler = (
    gb.ItemSampler(lp_task, batch_size=5, shuffle=False, drop_last=False)
    .sample_neighbor(dataset.graph, [2])
    .fetch_feature(dataset.feature, node_feature_keys={'user': ['feat_0']})
)

In [10]:
# Pull the first minibatch out of the `item_sampler` pipeline.
test = next(iter(item_sampler))



In [11]:
# Show the fetched minibatch (seeds, sampled subgraphs, ...).
print(test)

MiniBatch(seeds={'user:like:item': tensor([[707, 170],
                        [614, 564],
                        [514, 914],
                        [123,  60],
                        [ 67,  26]], dtype=torch.int32)},
          sampled_subgraphs=[SampledSubgraphImpl(sampled_csc={'user:follow:user': CSCFormatBase(indptr=tensor([ 0,  2,  4,  6,  8, 10], dtype=torch.int32),
                                                                         indices=tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14], dtype=torch.int32),
                                                           ), 'user:like:item': CSCFormatBase(indptr=tensor([ 0,  2,  4,  6,  8, 10], dtype=torch.int32),
                                                                         indices=tensor([15, 16, 17, 18, 19,  2, 20, 21,  4, 22], dtype=torch.int32),
                                                           )},
                                               original_row_node_ids={'user': tensor([707, 614, 514, 123, 

In [14]:
# Node types of the first block derived from the minibatch. The recorded
# output lists 'item' and 'user' twice — presumably once for the source side
# and once for the destination side of the block; TODO confirm.
test.blocks[0].ntypes

['item', 'user', 'item', 'user']

In [None]:
# Compacted seeds of the `user:like:item` edge type, transposed so that the
# two entries of every pair end up in row 0 and row 1 respectively
# (assumes the seeds are stored as one pair per row — TODO confirm).
compacted_seeds = test.compacted_seeds['user:like:item'].T

In [None]:
# First row of the transposed seeds — presumably the `user` endpoints; TODO confirm.
compacted_seeds[0]

In [None]:
# Second row of the transposed seeds — presumably the `item` endpoints; TODO confirm.
compacted_seeds[1]

In [None]:
# Scratch experiment: multiply the two rows of the transposed seeds together.
compacted_seeds[0] * compacted_seeds[1]

In [None]:
import dgl
import torch as th

# Toy heterograph. Each key is a (source type, relation, destination type)
# triple; each value is a pair of tensors with the edge endpoints.
graph_data = {}
graph_data[('drug', 'interacts', 'drug')] = (th.tensor([0, 1]), th.tensor([1, 2]))
graph_data[('drug', 'interacts', 'gene')] = (th.tensor([0, 1]), th.tensor([2, 3]))
graph_data[('drug', 'treats', 'disease')] = (th.tensor([1]), th.tensor([2]))
g = dgl.heterograph(graph_data)

In [None]:
# Query the edges of the ('drug', 'interacts', 'drug') edge type via the call form.
g.edges(etype=('drug', 'interacts', 'drug'))

In [None]:
# Same edge type, but using index syntax on `g.edges` instead of calling it.
# NOTE(review): presumably this yields the edge-data view rather than the
# endpoint tensors — TODO confirm against the DGL documentation.
g.edges['drug', 'interacts', 'drug']

Out of the 98 million rows in the data, only 0.54% have bindings. I want to use a margin loss and construct two graphs: one with positive bindings and one with negative bindings. The dataset has 3 proteins for each molecule to bind against, and each molecule is repeated in three rows, where each row represents binding with one protein block. How can I modify the following code so that it goes through the file and first restricts the data to only those molecules that bind with at least one protein? After this, I want to group by molecule and split the groups into train, test and validation sets. Finally, I want to construct a positive and a negative graph for each of the train, test and validation sets, where, for each molecule, the positive graph has a binding and the corresponding negative graph has the same node id with a binding to a protein that the molecule does not bind against.



In [16]:
# Two random tensors with different shapes, (3, 2) and (1, 3), for the
# shape experiments below.
a = torch.rand(3,2)
b = torch.rand(1, 3)

In [18]:
# NOTE(review): `torch.Size` values compare like tuples (element by element,
# left to right), so this does not compare element counts: (1, 3) > (3, 2)
# is False because 1 < 3.
b.shape > a.shape

False

In [19]:
# Shape of `a` as a plain Python list.
list (a.shape)


[3, 2]

In [20]:
# Shape of `b` as a plain Python list.
list(b.shape)

[1, 3]

In [23]:
# Rebind `a` to a fresh random (4, 2) tensor for the padding experiment.
a = torch.rand(4, 2)
print(a.shape)

torch.Size([4, 2])


In [24]:
from torch.nn.functional import pad

In [25]:
# Values of `a` before padding.
print(a)

tensor([[0.8824, 0.4688],
        [0.7484, 0.8062],
        [0.7821, 0.9220],
        [0.6811, 0.0371]])


In [33]:
# The pad list is read from the last dimension backwards: (0, 0) leaves the
# columns untouched, (2, 1) adds 2 rows before and 1 row after, all filled
# with the constant 3. The (4, 2) input therefore becomes (7, 2).
b = pad(a, [0,0,2,1], "constant", 3)

In [34]:
# Padded result: two rows of 3s on top, the original rows, one row of 3s below.
print(b)

tensor([[3.0000, 3.0000],
        [3.0000, 3.0000],
        [0.8824, 0.4688],
        [0.7484, 0.8062],
        [0.7821, 0.9220],
        [0.6811, 0.0371],
        [3.0000, 3.0000]])


In [35]:
# Shape after padding: 2 + 4 + 1 rows, 2 columns.
b.shape

torch.Size([7, 2])

In [39]:
# `/` is true division and yields a float (1.5), not an integer.
3/2

1.5