In [4]:
import dgl
import dgl.function as fn
from dgl.nn import GATConv
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.data.utils import load_graphs
import numpy as np

In [5]:
from ogb.lsc import MAG240MDataset
dataset = MAG240MDataset(root = 'dataset/')

number_of_papers = 100000

In [6]:
#https://ogb.stanford.edu/kddcup2021/mag240m/

# print(dataset.num_papers) # number of paper nodes
# print(dataset.num_authors) # number of author nodes
# print(dataset.num_institutions) # number of institution nodes
# print(dataset.num_paper_features) # dimensionality of paper features
# print(dataset.num_classes) # number of subject area classes

def node_level_subsampling(g, list_of_nodes, node_numbers):
    subsample_data = {}
    if len(list_of_nodes) == 0:
        raise Error('list of nodes are empty')
    
    for node_type in list_of_nodes:
        subsample_data[node_type]=g.nodes(node_type)[:node_numbers]
    
    return dgl.node_subgraph(g,subsample_data)

In [7]:

# edge_index_writes = dataset.edge_index('author', 'writes', 'paper') 
# edge_index_writes = dataset.edge_index('author', 'paper') # edge type can be omitted and inferred by our package.
# edge_index_cites = dataset.edge_index('paper', 'paper')
# edge_index_affiliated_with = dataset.edge_index('author', 'institution')


In [8]:
# split_dict = dataset.get_idx_split()
# train_idx = split_dict['train'] # numpy array storing indices of training paper nodes
# valid_idx = split_dict['valid'] # numpy array storing indices of validation paper nodes
# test_idx = split_dict['test'] # numpy array storing indices of testing paper nodes

In [9]:
# from pprint import pprint
# pprint(vars(dataset))

In [10]:
ei_writes = dataset.edge_index('author', 'writes', 'paper')
ei_cites = dataset.edge_index('paper', 'paper')
ei_affiliated = dataset.edge_index('author', 'institution')

# We sort the nodes starting with the papers, then the authors, then the institutions.




In [11]:
author_offset = 0
inst_offset = author_offset + dataset.num_authors
paper_offset = inst_offset + dataset.num_institutions
# print(inst_offset)
# print(paper_offset)

In [12]:
ei_writes_src = ei_writes[0][:number_of_papers]
ei_writes_dst = ei_writes[1][:number_of_papers]
ei_affiliated_src = ei_writes[0][:number_of_papers]
ei_affiliated_dst = ei_writes[1][:number_of_papers]
ei_cites_src = ei_writes[0][:number_of_papers]
ei_cites_dst = ei_writes[1][:number_of_papers]


In [13]:
g = dgl.heterograph({
    ('author', 'write', 'paper'): (ei_writes_src, ei_writes_dst),
    ('paper', 'write-by', 'author'): (ei_writes_dst, ei_writes_src),
    ('author', 'affiliate-with', 'institution'): (ei_affiliated_src, ei_affiliated_dst),
    ('institution', 'affiliate', 'author'): (ei_affiliated_dst, ei_affiliated_src),
    ('paper', 'cite', 'paper'): (np.concatenate([ei_cites_src, ei_cites_dst]), np.concatenate([ei_cites_dst, ei_cites_src]))
    })

# Graph(num_nodes={'author': 1527, 'institution': 121750817, 'paper': 121750817},
#      num_edges={('author', 'affiliate-with', 'institution'): 100000, 
#                 ('author', 'write', 'paper'): 100000, 
#                 ('institution', 'affiliate', 'author'): 100000,
#('paper', 'cite', 'paper'): 200000, ('paper', 'write-by', 'author'): 100000},
#      metagraph=[('author', 'institution', 'affiliate-with'), ('author', 'paper', 'write'), ('institution', 'author', 'affiliate'), ('paper', 'paper', 'cite'), ('paper', 'author', 'write-by')])


In [14]:
# print(g)

In [15]:
# new_graph = node_level_subsampling(g, ['author','paper','institution'], 10)

In [16]:
# print(new_graph)

In [17]:
# paper_feat = dataset.paper_feat
# author_feat = np.memmap(args.author_output_path, mode='w+', dtype='float16', shape=(dataset.num_authors, dataset.num_paper_features))
# inst_feat = np.memmap(args.inst_output_path, mode='w+', dtype='float16', shape=(dataset.num_institutions, dataset.num_paper_features))


In [18]:
sub_array = np.arange(number_of_papers)
paper_feat = dataset.paper_feat[sub_array]


In [None]:
author_feat = np.memmap(args.author_output_path, mode='w+', dtype='float16', shape=(dataset.num_authors, dataset.num_paper_features))
inst_feat = np.memmap(args.inst_output_path, mode='w+', dtype='float16', shape=(dataset.num_institutions, dataset.num_paper_features))
