In [24]:
import dgl
import torch as th
import os
import numpy as np
import pickle
import random
from pprint import pprint
from scipy import sparse
from scipy import io as sio
from dgl import save_graphs, load_graphs
from dgl.data.utils import makedirs, save_info, load_info
import errno
from ogb.nodeproppred import DglNodePropPredDataset
from sklearn.metrics import f1_score

In [2]:
def get_binary_mask(total_size, indices):
    """Build a 0/1 membership mask.

    Args:
        total_size: Size of the mask to allocate (number of entries).
        indices: Positions to flag; anything usable to index a torch tensor.

    Returns:
        A ``torch.uint8`` tensor of size ``total_size`` holding 1 at
        ``indices`` and 0 everywhere else.
    """
    # Allocate directly as uint8 rather than converting a float tensor afterwards.
    flags = th.zeros(total_size, dtype=th.uint8)
    flags[indices] = 1
    return flags

In [3]:
# Load the OGB ogbn-mag node-property-prediction dataset.
dataset = DglNodePropPredDataset(name='ogbn-mag')

# Predefined split; for ogbn-mag each entry is a dict keyed by node type ('paper').
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

# The dataset holds one heterogeneous DGL graph. Its labels come back as a
# dict keyed by node type ('paper' -> label tensor), not as a bare tensor.
graph, label = dataset[0]

In [4]:
# Inspect the labels: a dict whose single 'paper' entry is the label tensor.
print(label)

{'paper': tensor([[246],
        [131],
        [189],
        ...,
        [266],
        [289],
        [  1]])}


In [5]:
# Show the node-type / edge-type summary of the original ogbn-mag heterograph.
graph

Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'affiliated_with', 'institution'): 1043998, ('author', 'writes', 'paper'): 7145660, ('paper', 'cites', 'paper'): 5416271, ('paper', 'has_topic', 'field_of_study'): 7505078},
      metagraph=[('author', 'institution', 'affiliated_with'), ('author', 'paper', 'writes'), ('paper', 'paper', 'cites'), ('paper', 'field_of_study', 'has_topic')])

In [6]:
# List the relation (edge type) names of the original graph.
graph.etypes

['affiliated_with', 'writes', 'cites', 'has_topic']

In [7]:
# Pull the (source IDs, destination IDs) tensors of each original relation.
u1, v1 = graph.edges(etype='affiliated_with', form='uv')  # author -> institution
u2, v2 = graph.edges(etype='writes', form='uv')           # author -> paper
u3, v3 = graph.edges(etype='cites', form='uv')            # paper  -> paper
u4, v4 = graph.edges(etype='has_topic', form='uv')        # paper  -> field_of_study

In [8]:
# Register every relation twice: once in its original direction and once
# reversed (endpoints swapped), each under a short two-letter edge-type name.
# Rows: (source type, forward name, reverse name, destination type, src IDs, dst IDs).
data_dict = {
    edge_key: endpoints
    for src_type, fwd_name, rev_name, dst_type, src_ids, dst_ids in (
        ('author', 'ai', 'ia', 'institution', u1, v1),
        ('author', 'ap', 'pa', 'paper', u2, v2),
        ('paper', 'pP', 'Pp', 'paper', u3, v3),
        ('paper', 'pf', 'fp', 'field_of_study', u4, v4),
    )
    for edge_key, endpoints in (
        ((src_type, fwd_name, dst_type), (src_ids, dst_ids)),
        ((dst_type, rev_name, src_type), (dst_ids, src_ids)),
    )
}

In [9]:
# Build the bidirectional heterograph. The node counts of the source graph are
# passed explicitly: without them dgl.heterograph infers each count from the
# largest node ID present in the edge lists, so a highest-numbered node with
# no edges would be dropped and per-node features, labels and masks would no
# longer line up with the graph.
graph_new = dgl.heterograph(
    data_dict,
    num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes},
)

In [10]:
# Summary of the rebuilt graph: each original relation plus its reverse edge type.
graph_new

Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'ai', 'institution'): 1043998, ('author', 'ap', 'paper'): 7145660, ('field_of_study', 'fp', 'paper'): 7505078, ('institution', 'ia', 'author'): 1043998, ('paper', 'Pp', 'paper'): 5416271, ('paper', 'pP', 'paper'): 5416271, ('paper', 'pa', 'author'): 7145660, ('paper', 'pf', 'field_of_study'): 7505078},
      metagraph=[('author', 'institution', 'ai'), ('author', 'paper', 'ap'), ('institution', 'author', 'ia'), ('paper', 'paper', 'Pp'), ('paper', 'paper', 'pP'), ('paper', 'author', 'pa'), ('paper', 'field_of_study', 'pf'), ('field_of_study', 'paper', 'fp')])

In [11]:
# Peek at the 'feat' tensor stored on the 'paper' nodes of the original graph.
graph.nodes['paper'].data['feat']

tensor([[-0.0954,  0.0408, -0.2109,  ...,  0.0616, -0.0277, -0.1338],
        [-0.1510, -0.1073, -0.2220,  ...,  0.3458, -0.0277, -0.2185],
        [-0.1148, -0.1760, -0.2606,  ...,  0.1731, -0.1564, -0.2780],
        ...,
        [ 0.0228, -0.0865,  0.0981,  ..., -0.0547, -0.2077, -0.2305],
        [-0.2891, -0.2029, -0.1525,  ...,  0.1042,  0.2041, -0.3528],
        [-0.0890, -0.0348, -0.2642,  ...,  0.2601, -0.0875, -0.5171]])

In [12]:
# Carry the 'paper' node features over from the original graph to the rebuilt one.
graph_new.nodes['paper'].data['feat'] = graph.nodes['paper'].data['feat']

# Display the features now attached to the rebuilt graph (dict keyed by node type).
graph_new.ndata['feat']

{'paper': tensor([[-0.0954,  0.0408, -0.2109,  ...,  0.0616, -0.0277, -0.1338],
         [-0.1510, -0.1073, -0.2220,  ...,  0.3458, -0.0277, -0.2185],
         [-0.1148, -0.1760, -0.2606,  ...,  0.1731, -0.1564, -0.2780],
         ...,
         [ 0.0228, -0.0865,  0.0981,  ..., -0.0547, -0.2077, -0.2305],
         [-0.2891, -0.2029, -0.1525,  ...,  0.1042,  0.2041, -0.3528],
         [-0.0890, -0.0348, -0.2642,  ...,  0.2601, -0.0875, -0.5171]])}

In [13]:
# Persist the rebuilt graph together with its labels so that the later
# load_graphs("./mag_mp.bin", ...) cell also works after Restart & Run All on a
# machine where the file has not been produced yet. An existing file is left
# untouched so it is not rewritten on every run.
# NOTE: delete ./mag_mp.bin manually to force a re-save after changing the graph.
if not os.path.exists('./mag_mp.bin'):
    save_graphs('./mag_mp.bin', [graph_new], label)

In [14]:
# Disabled experiment: the mini-batch dataloader setup below is wrapped in a
# string literal, so it is not executed; the cell merely echoes the string.
"""sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)
dataloader = dgl.dataloading.NodeDataLoader(
    graph_new, train_idx, sampler,
    batch_size=1024,
    shuffle=True,
    drop_last=False,
    num_workers=4)"""

'sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)\ndataloader = dgl.dataloading.NodeDataLoader(\n    graph_new, train_idx, sampler,\n    batch_size=1024,\n    shuffle=True,\n    drop_last=False,\n    num_workers=4)'

In [15]:
# Training split: a dict mapping 'paper' to a tensor of node IDs.
print(train_idx)

{'paper': tensor([     0,      1,      2,  ..., 736386, 736387, 736388])}


In [16]:
# Validation split: a dict mapping 'paper' to a tensor of node IDs.
print(valid_idx)

{'paper': tensor([   332,    756,    784,  ..., 736364, 736367, 736370])}


In [17]:
# Test split: a dict mapping 'paper' to a tensor of node IDs.
print(test_idx)

{'paper': tensor([   359,    411,    608,  ..., 736358, 736384, 736385])}


In [18]:
# 0/1 mask over all 'paper' nodes of the rebuilt graph, flagging the training split.
train_mask = get_binary_mask(
    total_size=graph_new.num_nodes('paper'),
    indices=train_idx['paper'],
)
train_mask

tensor([1, 1, 1,  ..., 1, 1, 1], dtype=torch.uint8)

In [19]:
# 0/1 mask over all 'paper' nodes of the rebuilt graph, flagging the validation split.
val_mask = get_binary_mask(
    total_size=graph_new.num_nodes('paper'),
    indices=valid_idx['paper'],
)
val_mask

tensor([0, 0, 0,  ..., 0, 0, 0], dtype=torch.uint8)

In [20]:
# 0/1 mask over all 'paper' nodes of the rebuilt graph, flagging the test split.
test_mask = get_binary_mask(
    total_size=graph_new.num_nodes('paper'),
    indices=test_idx['paper'],
)
test_mask

tensor([0, 0, 0,  ..., 0, 0, 0], dtype=torch.uint8)

In [21]:
# Show the dataset object's printed representation.
print(dataset)

DglNodePropPredDataset(1)


In [22]:
# Read the graph at index 0, plus the stored labels, back from the DGL binary file.
# NOTE(review): this requires ./mag_mp.bin to already exist on disk.
hg, labels  = load_graphs("./mag_mp.bin", [0])

In [23]:
# Summary of the reloaded graph (hg is indexed here to pick its first element).
hg[0]

Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'ai', 'institution'): 1043998, ('author', 'ap', 'paper'): 7145660, ('field_of_study', 'fp', 'paper'): 7505078, ('institution', 'ia', 'author'): 1043998, ('paper', 'Pp', 'paper'): 5416271, ('paper', 'pP', 'paper'): 5416271, ('paper', 'pa', 'author'): 7145660, ('paper', 'pf', 'field_of_study'): 7505078},
      metagraph=[('author', 'institution', 'ai'), ('author', 'paper', 'ap'), ('institution', 'author', 'ia'), ('paper', 'paper', 'Pp'), ('paper', 'paper', 'pP'), ('paper', 'author', 'pa'), ('paper', 'field_of_study', 'pf'), ('field_of_study', 'paper', 'fp')])