In [1]:
import dgl
import numpy as np
import torch
import pandas as pd
import datetime
import os
from dgl.data.utils import load_graphs
from ogb.nodeproppred import DglNodePropPredDataset
from ogb.nodeproppred import Evaluator

# Instantiate the ogbn-mag dataset together with its matching evaluator.
dataset = DglNodePropPredDataset(name='ogbn-mag')
evaluator = Evaluator(name='ogbn-mag')

# Index splits provided by the dataset, kept both as the full mapping and as
# one module-level name per split.
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

def node_level_subsampling(g, list_of_nodes, node_numbers):
    """Return the subgraph induced by the leading nodes of selected node types.

    Args:
        g: Graph to subsample; must provide ``g.nodes(node_type)``.
        list_of_nodes: Node type names to keep in the subgraph.
        node_numbers: How many nodes to keep per type (the first
            ``node_numbers`` entries of ``g.nodes(node_type)``).

    Returns:
        The result of ``dgl.node_subgraph`` on the selected nodes.

    Raises:
        ValueError: If ``list_of_nodes`` is empty.
    """
    if not len(list_of_nodes):
        raise ValueError('list of nodes are empty')

    kept_nodes = {
        ntype: g.nodes(ntype)[:node_numbers] for ntype in list_of_nodes
    }
    return dgl.node_subgraph(g, kept_nodes)
    

def create_mask_from_idx(idx, total_nodes):
    """Build a boolean NumPy mask that is True exactly at positions ``idx``.

    Args:
        idx: Index (or collection of indices) to switch on.
        total_nodes: Length of the resulting mask.

    Returns:
        A 1-D ``bool`` array of length ``total_nodes``.
    """
    selected = np.full(total_nodes, False, dtype=bool)
    selected[idx] = True
    return selected

def load_mag_data():
    """Load the saved MAG graph plus paper features, labels and train mask.

    Reads the graph from ``./mag_mp.bin`` and takes the labels from the
    module-level ``dataset``; the training mask is derived from the
    module-level ``train_idx``.

    Returns:
        Tuple ``(g, features, labels, mask)`` where ``features`` and ``labels``
        are the 'paper' entries and ``mask`` is a ``torch.BoolTensor`` that is
        True at the training paper indices.
    """
    graph_list, _ = load_graphs('./mag_mp.bin')
    g = graph_list[0]
    print(g)

    # Only the label dictionary of the first dataset entry is needed here.
    label_dict = dataset[0][1]

    paper_train_mask = create_mask_from_idx(train_idx['paper'], g.num_nodes('paper'))
    return (
        g,
        g.ndata['feat']['paper'],
        label_dict['paper'],
        torch.BoolTensor(paper_train_mask),
    )

def prepare_accuracy_csv(model_name, train_acc, validation_acc, test_acc, total_epoch, total_duration, timestamp):
    """Package one run's final metrics as a single stringified CSV row.

    Returns:
        A one-element list holding the row
        ``[model_name, train_acc, validation_acc, test_acc, total_epoch,
        total_duration, timestamp]`` with every value converted via ``str``.
    """
    fields = (model_name, train_acc, validation_acc, test_acc,
              total_epoch, total_duration, timestamp)
    return [[str(value) for value in fields]]

def write_csv(file_name, columns_name, data):
    """Append rows to a CSV file, writing the header only when creating it.

    Args:
        file_name: Path of the CSV file to create or extend.
        columns_name: Column labels for ``data``.
        data: Row data accepted by ``pandas.DataFrame`` (e.g. a list of rows).

    Note:
        The DataFrame index is written as the first column, as before, so
        files produced by earlier runs stay compatible.
    """
    df_csv = pd.DataFrame(data, columns=columns_name)
    if not os.path.isfile(file_name):
        # New file: emit the column names. ``header`` takes a bool (or a list
        # of aliases); the previous string value only worked by being truthy.
        df_csv.to_csv(file_name, header=True)
    else:
        # Existing file: append the rows without repeating the header.
        df_csv.to_csv(file_name, header=False, mode='a')
    
def prepare_epoch_csv(epoch_number, current_loss, total_duration, timestamp):
    """Package one epoch's statistics as a single stringified CSV row.

    Returns:
        A one-element list holding the row
        ``[epoch_number, current_loss, total_duration, timestamp]`` with every
        value converted via ``str``.
    """
    fields = (epoch_number, current_loss, total_duration, timestamp)
    return [list(map(str, fields))]

def test_accuracy_HeteroRGCN(model, x_dict, y_true, split_idx, evaluator):
    model.eval()

    out = model(x_dict)
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': y_true[split_idx['train']['paper']],
        'y_pred': y_pred[split_idx['train']['paper']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': y_true[split_idx['valid']['paper']],
        'y_pred': y_pred[split_idx['valid']['paper']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': y_true[split_idx['test']['paper']],
        'y_pred': y_pred[split_idx['test']['paper']],
    })['acc']

    return train_acc, valid_acc, test_acc

def test_accuracy_RGCN(model, x_dict, adj_t_dict, y_true, split_idx, evaluator):
    model.eval()

    out = model(x_dict, adj_t_dict)['paper']
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': y_true[split_idx['train']['paper']],
        'y_pred': y_pred[split_idx['train']['paper']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': y_true[split_idx['valid']['paper']],
        'y_pred': y_pred[split_idx['valid']['paper']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': y_true[split_idx['test']['paper']],
        'y_pred': y_pred[split_idx['test']['paper']],
    })['acc']

    return train_acc, valid_acc, test_acc

# Column layout of the per-run summary CSV (one row per finished experiment).
accuracy_columns = [
    'Model_Name',
    'Train_Acc',
    'Validation_Acc',
    'Test_Acc',
    'Total_Epoch',
    'Total_Duration',
    'TimeStamp',
]
# Column layout of the per-epoch training log CSV.
epoch_columns = [
    'Epoch_No',
    'NLL_loss',
    'Total_Duration',
    'TimeStamp',
]
# Default output file names for the two CSV logs.
modelaccuracy_file_name = 'modelaccuracy.csv'
epoch_file_name = 'epochdetails.csv'

Using backend: pytorch


In [2]:
# Define a two-layer RGCN model built from DGL's HeteroGraphConv (one GraphConv per relation)
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn

class RGCN(nn.Module):
    """Two stacked ``HeteroGraphConv`` layers with one ``GraphConv`` per relation.

    Args:
        in_feats: Input feature size of the first layer.
        hid_feats: Output size of the first layer / input size of the second.
        out_feats: Output feature size of the second layer.
        rel_names: Relation names; each gets its own ``GraphConv`` in both layers.
    """

    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        def _relational_conv(n_in, n_out):
            # One GraphConv per relation, results combined with 'sum'.
            per_relation = {}
            for rel in rel_names:
                per_relation[rel] = dglnn.GraphConv(n_in, n_out)
            return dglnn.HeteroGraphConv(per_relation, aggregate='sum')

        self.conv1 = _relational_conv(in_feats, hid_feats)
        self.conv2 = _relational_conv(hid_feats, out_feats)

    def forward(self, graph, inputs):
        """Apply both layers with a ReLU in between.

        Args:
            graph: Graph passed to each ``HeteroGraphConv`` layer.
            inputs: Node features passed to the first layer.

        Returns:
            The output of the second ``HeteroGraphConv`` layer.
        """
        hidden = self.conv1(graph, inputs)
        activated = {}
        for ntype, feats in hidden.items():
            activated[ntype] = F.relu(feats)
        return self.conv2(graph, activated)

In [3]:
# Define a HeteroRGCN model: per-relation linear layers with mean/sum message passing over trainable node embeddings
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn

class HeteroRGCNLayer(nn.Module):
    """One relational message-passing layer with a linear map per edge type.

    Args:
        in_size: Input feature size of every per-relation ``nn.Linear``.
        out_size: Output feature size of every per-relation ``nn.Linear``.
        etypes: Edge type names; each one gets its own ``nn.Linear``.
    """

    def __init__(self, in_size, out_size, etypes):
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
            name: nn.Linear(in_size, out_size) for name in etypes
        })

    def forward(self, G, feat_dict):
        """Run one round of message passing over every relation of ``G``.

        Args:
            G: Heterogeneous graph to pass messages on.
            feat_dict: Mapping from node type to that type's input features.

        Returns:
            Mapping from every node type in ``G.ntypes`` to its updated 'h'
            feature.
        """
        # Keep the temporary 'Wh_*' and 'h' features out of the caller's graph:
        # anything written inside the scope is dropped from G on exit, so no
        # stale tensors (and their autograd history) stay attached to it.
        with G.local_scope():
            funcs = {}
            for srctype, etype, dsttype in G.canonical_etypes:
                message_key = 'Wh_%s' % etype
                # Compute W_r * h and stage it on the source nodes.
                G.nodes[srctype].data[message_key] = self.weight[etype](feat_dict[srctype])
                # Per-relation (message_func, reduce_func). Every relation
                # reduces into the same destination field 'h', which the
                # cross-type reducer below then combines.
                funcs[etype] = (fn.copy_u(message_key, 'm'), fn.mean('m', 'h'))
            # Trigger message passing for all relations; the second argument is
            # the cross-type reducer ("sum", "max", "min", "mean" or "stack").
            G.multi_update_all(funcs, 'sum')
            # Collect the results before the scope discards them from G.
            return {ntype: G.nodes[ntype].data['h'] for ntype in G.ntypes}

class HeteroRGCN(nn.Module):
    """Two ``HeteroRGCNLayer`` layers on top of trainable per-node embeddings.

    Args:
        G: Graph supplying ``ntypes``, ``etypes`` and per-type node counts.
        in_size: Width of the trainable node embeddings.
        hidden_size: Output size of the first layer.
        out_size: Output size of the second layer.
    """

    def __init__(self, G, in_size, hidden_size, out_size):
        super(HeteroRGCN, self).__init__()

        # Featureless input: one Xavier-initialised embedding table per node type.
        embeddings = {}
        for ntype in G.ntypes:
            table = nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), in_size))
            nn.init.xavier_uniform_(table)
            embeddings[ntype] = table
        self.embed = nn.ParameterDict(embeddings)

        # Message-passing layers.
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes)

    def forward(self, G):
        """Return the second layer's output for the 'paper' node type."""
        hidden = self.layer1(G, self.embed)
        activated = {}
        for ntype, feats in hidden.items():
            activated[ntype] = F.leaky_relu(feats)
        # Only the paper logits are returned.
        return self.layer2(G, activated)['paper']

In [4]:
def prepare_model(model_name):
    """Load the MAG data and build the requested model with its input features.

    Args:
        model_name: ``'HeteroRGCN'`` or ``'RGCN'`` (case-insensitive).

    Returns:
        Tuple ``(g, labels, node_features, model)`` where ``node_features``
        maps each node type to a feature tensor: the loaded features for
        'paper' and all-zero 128-wide tensors for 'author', 'field_of_study'
        and 'institution'.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    model_key = model_name.upper()
    # Validate first so a typo fails immediately instead of after the data
    # load (and with a clear error rather than an unbound ``model``).
    if model_key not in ('HETERORGCN', 'RGCN'):
        raise ValueError('Model Name is not recognized')

    g, features, labels, _ = load_mag_data()
    if model_key == 'HETERORGCN':
        model = HeteroRGCN(g, 128, 20, 349)
    else:
        model = RGCN(128, 20, 349, g.etypes)

    node_features = {
        'paper': features,
        'author': torch.zeros([g.num_nodes('author'), 128]),
        'field_of_study': torch.zeros([g.num_nodes('field_of_study'), 128]),
        'institution': torch.zeros([g.num_nodes('institution'), 128]),
    }
    return g, labels, node_features, model

In [5]:
#prepare_model('HeteroRGCN')

In [6]:
# Load the graph once and build both candidate models on it.
g, features, labels, mask = load_mag_data()
h_model = HeteroRGCN(g, 128, 20, 349)
r_model = RGCN(128, 20, 349, g.etypes)

# Paper nodes use the loaded features; every other node type gets an all-zero
# 128-wide placeholder tensor.
paper_feats, train_mask = features, mask
author_feats, fos_feats, institute_feats = (
    torch.zeros([g.num_nodes(node_type), 128])
    for node_type in ('author', 'field_of_study', 'institution')
)
print(paper_feats.shape)
print(author_feats.shape)

node_features = dict(
    zip(
        ('paper', 'author', 'field_of_study', 'institution'),
        (paper_feats, author_feats, fos_feats, institute_feats),
    )
)
# Cell output: the graph's edge type names.
g.etypes


Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'ai', 'institution'): 1043998, ('author', 'ap', 'paper'): 7145660, ('field_of_study', 'fp', 'paper'): 7505078, ('institution', 'ia', 'author'): 1043998, ('paper', 'pa', 'author'): 7145660, ('paper', 'pf', 'field_of_study'): 7505078, ('paper', 'pp', 'paper'): 5416271},
      metagraph=[('author', 'institution', 'ai'), ('author', 'paper', 'ap'), ('institution', 'author', 'ia'), ('paper', 'author', 'pa'), ('paper', 'field_of_study', 'pf'), ('paper', 'paper', 'pp'), ('field_of_study', 'paper', 'fp')])
torch.Size([736389, 128])
torch.Size([1134649, 128])


['ai', 'ap', 'fp', 'ia', 'pa', 'pf', 'pp']

In [7]:
def run_experiment(total_epoch, model_name, lr=0.05, weight_decay=5e-4):
    """Train one of the notebook's models full-batch and log the results.

    Relies on module-level state: ``h_model``, ``r_model``, ``g``,
    ``node_features``, ``labels``, ``train_idx``, ``split_idx``, ``evaluator``,
    ``epoch_columns``, ``accuracy_columns`` and ``modelaccuracy_file_name``.
    The selected model object is trained in place, so calling this again
    continues from the already-trained weights.

    Args:
        total_epoch: Number of optimisation steps (one full-batch step each).
        model_name: ``'HeteroRGCN'`` or ``'RGCN'`` (case-insensitive).
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.

    Side effects:
        Prints progress, appends one row per epoch to a run-specific
        ``Training_Epoch_(...)`` CSV file and one summary row to
        ``modelaccuracy_file_name``.
    """
    model_key = model_name.upper()
    if model_key == 'HETERORGCN':
        model = h_model
    elif model_key == 'RGCN':
        model = r_model
    else:
        raise ValueError('Model Name is not recognized')
    is_hetero = model_key == 'HETERORGCN'

    start_time = datetime.datetime.now().replace(microsecond=0)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Run-specific epoch log; local name, the module-level default is not used.
    epoch_file_name = (
        "Training_Epoch_(" + model_name + ")_" + str(total_epoch) + "__"
        + start_time.strftime("%d_%m_%Y_%H_%M_%S") + ".csv"
    )

    # Loop-invariant: the training nodes and their labels never change.
    train_nodes = train_idx['paper']
    train_labels = labels[train_nodes].squeeze()

    for epoch in range(total_epoch):
        epoch_start_time = datetime.datetime.now().replace(microsecond=0)
        model.train()
        opt.zero_grad()
        if is_hetero:
            logits = model(g)
        else:
            logits = model(g, node_features)['paper']
        # cross_entropy applies log-softmax itself, so raw scores are passed.
        loss = F.cross_entropy(logits[train_nodes], train_labels)
        loss_value = loss.item()
        print("Loss :", loss_value)
        loss.backward()
        opt.step()

        epoch_end_time = datetime.datetime.now().replace(microsecond=0)
        print("Epoch Completed : ", epoch, "         Current Time: ", epoch_end_time)
        data_epoch = prepare_epoch_csv(
            epoch, loss_value, epoch_end_time - epoch_start_time, epoch_end_time
        )
        write_csv(epoch_file_name, epoch_columns, data_epoch)

    # Final evaluation needs no gradients.
    with torch.no_grad():
        if is_hetero:
            train_acc, valid_acc, test_acc = test_accuracy_HeteroRGCN(
                model, g, labels, split_idx, evaluator)
        else:
            train_acc, valid_acc, test_acc = test_accuracy_RGCN(
                model, g, node_features, labels, split_idx, evaluator)

    end_time = datetime.datetime.now().replace(microsecond=0)
    total_time = end_time - start_time
    print ("Train_Acc :", train_acc)
    print ("Validation_Acc :",valid_acc)
    print ("Test_Acc :",test_acc)
    data_accuracy = prepare_accuracy_csv(
        model_name, train_acc, valid_acc, test_acc, total_epoch, total_time, end_time)
    write_csv(modelaccuracy_file_name, accuracy_columns, data_accuracy)
    print ("Finished")

In [None]:
# Train the RGCN model for 100 epochs.
# Alternative run: run_experiment(25, 'HeteroRGCN')
run_experiment(total_epoch=100, model_name='RGCN')

Loss : 3.4757847785949707
Epoch Completed :  0          Current Time:  2021-06-20 17:54:41
Loss : 4.850241661071777
Epoch Completed :  1          Current Time:  2021-06-20 17:54:55
Loss : 4.101527690887451
Epoch Completed :  2          Current Time:  2021-06-20 17:55:10
Loss : 3.8440663814544678
Epoch Completed :  3          Current Time:  2021-06-20 17:55:24
Loss : 3.8339908123016357
Epoch Completed :  4          Current Time:  2021-06-20 17:55:38
Loss : 3.7999181747436523
Epoch Completed :  5          Current Time:  2021-06-20 17:55:52


In [None]:
#torch.cuda.is_available()
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device